In [2]:
%reset -f

In [3]:
import pandas as pd
import numpy as np
import random
import re
import gc

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import cross_val_score, KFold

from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification

In [4]:
gc.collect()

0

In [5]:
listings = pd.read_csv("train.csv")

In [6]:
listings

Unnamed: 0,Id,price,name,neighborhood_overview,host_id,host_since,host_location,host_about,host_response_time,host_response_rate,...,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
0,KNXTO,143.0,Guesthouse in Oakland · ★4.93 · 1 bedroom · 1 ...,You will be in the Crocker Highlands neighborh...,4211733,2012-11-21,"Oakland, CA",,within a few hours,100%,...,2023-04-28,2023-11-26,4.93,5.00,4.86,4.86,5.00,4.93,4.86,f
1,ZMPFX,103.0,Rental unit in San Francisco · 1 bedroom · 1 b...,,1257432,2011-10-06,"San Francisco, CA","Our Company is San Francisco Life Real Estate,...",within an hour,98%,...,2023-09-05,2023-10-31,5.00,4.50,4.50,5.00,5.00,5.00,4.50,f
2,GZZJO,55.0,Home in San Francisco · ★5.0 · 1 bedroom · 1 b...,,9219277,2013-10-04,"San Francisco, CA",Hi there! My name's Henry and my wife and I ha...,within an hour,100%,...,2021-12-17,2022-11-10,5.00,4.75,4.75,5.00,5.00,5.00,3.75,f
3,6FFE0,80.0,Home in Palo Alto · ★5.0 · 1 bedroom · 1 bed ·...,This is a very safe neighborhood with low crim...,13149124,2014-03-15,"Palo Alto, CA",Hi! I work in tech but enjoy running Airbnb on...,within a day,90%,...,2021-08-28,2023-05-27,5.00,5.00,5.00,5.00,5.00,4.86,5.00,f
4,YIIMB,259.0,Hotel in Sunnyvale · ★4.75 · 1 bedroom · 2 bed...,De Anza College – 4.3 miles; <br />Computer Hi...,501999278,2023-02-20,,,within an hour,100%,...,2023-06-11,2023-11-26,4.75,4.88,4.75,5.00,4.88,4.88,4.88,t
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15269,3HWAB,400.0,Condo in San Francisco · ★4.89 · 3 bedrooms · ...,The Mission district is a fun and lively neigh...,24910,2009-07-08,"San Francisco, CA",Digital health tech entrepreneur and mom. Asia...,within an hour,100%,...,2016-10-17,2023-08-24,4.89,4.98,4.93,4.98,5.00,4.80,4.78,f
15270,HFG5D,135.0,Home in Daly City · ★4.73 · 2 bedrooms · 2 bed...,,378727622,2020-12-07,"Monterey Park, CA",世界那麼大，我們應該去看看,within an hour,100%,...,2023-10-22,2023-12-09,4.73,4.73,4.82,4.45,4.91,4.55,4.82,t
15271,ISHO7,2064.0,Villa in Saratoga · ★4.96 · 7 bedrooms · 13 be...,You can’t see any neighbours as it is a bit fa...,471777228,2022-07-27,"Saratoga, CA",I am an entrepreneur and real estate investor.,within an hour,90%,...,2022-08-24,2023-10-29,4.96,4.91,4.96,4.96,4.96,4.87,4.83,t
15272,SHCPL,332.0,Home in Santa Cruz · ★4.99 · 2 bedrooms · 2 be...,"Too much fun! Bike, walk, surf, eat, play on ...",84170936,2016-07-16,,I am a semi retired real estate agent that lov...,within an hour,100%,...,2021-05-12,2023-12-26,4.99,4.99,4.97,5.00,4.99,4.96,4.88,f


In [7]:
random.sample(list(listings["neighborhood_overview"].values), 1)

['Conveniently located 10 minutes from sfo international airport, highway 101 and 280. 1 minute walk from muni bus line 8 (goes to Chase arena, AT&T park, Alcatraz Island, Fisherman’s wharf, Union Square). We’re also a 5 min drive 29 min walk from Caltrain Bayshore Station. 8 min walk to Muni metro KT line. You can also use ride service app (Lyft/Uber), since we’re located near freeway ride access is quick.']

In [8]:
positive_words = np.array(["close", "night life", "favorite", "famous",
                           "!", "amazing", "easy", "lovely", "easily",
                           "great", "shopping", "restaurants", "explore",
                           "historic", "hotspot", "relax",
                           "perfect", "unforgettable", "inspiring", "convenient",
                           "theaters", "attractions", "walking", "playground",
                           "park", "clubs", "quiet", "market",
                           "best", "library", "local", "sights",
                           "heart", "charming", "museum"])

In [9]:
def parse_descriptions(descriptions):
    parsed_info = {
        'bedrooms': [],
        'beds': [],
        'baths': []
    }

    for description in descriptions:
        brms = re.split(r'(\d+)\s*bedroom',description)
        bds = re.split(r'(\d+)\s*bed(?:s\b)?(?!room)',description)
        bths = re.split(r'(\d+(\.\d+)?)\s*(shared|private)?\s*bath',description)

        # print(brms)
        # print(bds)
        # print(bths)

        if len(brms) > 1:
            parsed_info["bedrooms"].append(float(brms[1]))
        else:
            parsed_info["bedrooms"].append(0)
        if len(bds) > 1:
            parsed_info["beds"].append(float(bds[1]))
        else: 
            parsed_info["beds"].append(0)
        if len(bths) > 1:
            parsed_info["baths"].append(float(bths[1]))
        else:
            parsed_info["baths"].append(0)

    return parsed_info

In [10]:
descriptions = listings["name"].values
apartment_stats  = parse_descriptions(descriptions)

In [11]:
listings_cleaned = pd.concat([listings.drop(columns=["bedrooms"]), pd.DataFrame(apartment_stats)[["bedrooms", "baths"]]], axis=1)

In [12]:
listings_cleaned["host_response_rate"] = listings_cleaned["host_response_rate"].apply(lambda r: float(r.split("%")[0]) if isinstance(r, str) else r)

In [13]:
listings_cleaned.columns

Index(['Id', 'price', 'name', 'neighborhood_overview', 'host_id', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_listings_count', 'host_verifications', 'host_has_profile_pic',
       'host_identity_verified', 'neighbourhood_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates',
       'bathrooms_text', 'beds', 'amenities', 'minimum_nights',
       'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'number_of_reviews_ltm',
       'number_of_reviews_l30d', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 

In [14]:
listings_cleaned["last_review"] = pd.to_datetime(listings_cleaned["last_review"])
most_recent_date = listings_cleaned["last_review"].max()
listings_cleaned["recency"] = 1+(most_recent_date-listings_cleaned["last_review"]).dt.days

In [15]:
def apt_type(description):
    return description.split(" ")[0].upper()

listings_cleaned["apt_type"] = listings_cleaned["name"].apply(lambda d: apt_type(d))

In [16]:
listings_cleaned["neighborhood_positivity"] = listings_cleaned["neighborhood_overview"].apply(
    lambda d: np.sum([word in d.lower() for word in positive_words]) if not pd.isna(d) else 0
)

In [17]:
listings_cleaned["host_description_score"] = listings_cleaned["host_about"].apply(
    lambda d: len(d) if not pd.isna(d) else 0
)

listings_cleaned["host_since"] = pd.to_datetime(listings_cleaned["host_since"])
listings_cleaned["host_recency"] = 1+(most_recent_date-listings_cleaned["host_since"]).dt.days

In [18]:
#Spatial dependency
neighborhood_prices = []

epsilon = 1
R = 6371

for i, data in listings_cleaned.iterrows():
    x = np.deg2rad(data["latitude"])
    y = np.deg2rad(data["longitude"])
    lat = np.deg2rad(listings_cleaned["latitude"].values)
    long = np.deg2rad(listings_cleaned["longitude"].values)
    xdiff = x-lat
    ydiff = y-long
    
    d = np.sin(xdiff / 2)**2 + np.cos(x) * np.cos(lat) * np.sin(ydiff / 2)**2
    c = 2 * np.arctan2(np.sqrt(d), np.sqrt(1-d))
    km = R*c

    indices = np.where(km <= epsilon)[0]
    neighborhood = indices[indices != i]
    neighborhood_price = np.mean(listings_cleaned.iloc[neighborhood]["price"])

    prices = neighborhood_prices.append(neighborhood_price)

listings_cleaned["neighborhood_price"] = neighborhood_prices


In [19]:
listings_cleaned.columns

Index(['Id', 'price', 'name', 'neighborhood_overview', 'host_id', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_listings_count', 'host_verifications', 'host_has_profile_pic',
       'host_identity_verified', 'neighbourhood_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates',
       'bathrooms_text', 'beds', 'amenities', 'minimum_nights',
       'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'number_of_reviews_ltm',
       'number_of_reviews_l30d', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 

In [20]:
dummies = ["host_location", "host_identity_verified", "property_type",
                  "room_type", "has_availability"]
continuous = [    "accommodates", "baths", "bedrooms", "beds", "recency", "neighborhood_price"]

In [21]:
listings_cleaned[["price"]+continuous].corr()

Unnamed: 0,price,accommodates,baths,bedrooms,beds,recency,neighborhood_price
price,1.0,0.066758,0.05363,0.046132,0.063849,0.055093,0.118464
accommodates,0.066758,1.0,0.5853,0.795649,0.870187,-0.080264,0.040733
baths,0.05363,0.5853,1.0,0.642599,0.595881,0.011474,0.032085
bedrooms,0.046132,0.795649,0.642599,1.0,0.791106,-0.041249,-0.015432
beds,0.063849,0.870187,0.595881,0.791106,1.0,-0.05628,0.028224
recency,0.055093,-0.080264,0.011474,-0.041249,-0.05628,1.0,0.122148
neighborhood_price,0.118464,0.040733,0.032085,-0.015432,0.028224,0.122148,1.0


In [22]:
pipe_numeric = Pipeline(steps=[
  ('impute',KNNImputer()),
  ('standardize', StandardScaler())
])

pipe_categorical = Pipeline(steps=[
  ('impute',SimpleImputer(strategy='most_frequent')),
  ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", pipe_numeric, continuous),
        ("categorical", pipe_categorical, dummies)
    ]
)

knn_pipe = Pipeline(steps=[
    ("preprocessors", preprocessor),
    ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
    ('model', KNeighborsRegressor(n_neighbors=10, weights='distance'))
])

In [23]:
params = {
  #'preprocessors__categorical__select_percentile__percentile': [50],
  'preprocessors__numerical__impute__n_neighbors': list(range(5, 101, 5)),
  'poly_features__degree': [3],
  'model__n_neighbors': list(range(5, 101, 5)),
  'model__weights': ['distance']
}

X = listings_cleaned[continuous+dummies]
Y = listings_cleaned["price"]
mae = make_scorer(mean_absolute_error, greater_is_better=False)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=486)
# knn_pipe.fit(X_train, Y_train)

In [24]:
rs = RandomizedSearchCV(knn_pipe, param_distributions=params, scoring=mae, cv=5, random_state=486)

In [25]:
#We don't have enough memory to run this
#rs.fit(X,Y)

In [26]:
#rs.best_params_

In [27]:
#KNN Model 1
cv = KFold(n_splits=5,
           shuffle=True,
           random_state=0)

pipe_numeric = Pipeline(steps=[
  ('impute',KNNImputer(n_neighbors=10)),
  ('standardize', StandardScaler())   
])

knn_pipe = Pipeline(steps=[
    ("preprocessors", preprocessor),
    ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
    ('model', KNeighborsRegressor(n_neighbors=10, weights='distance'))
])

# knn_pipe.fit(X_train, Y_train)
# knn_yhat = knn_pipe.predict(X_test)

#knn_cv_error = cross_val_score(knn_pipe, X, Y, cv=cv, scoring=mae)
#-np.mean(knn_cv_error)

In [28]:
#KNN Model 2
cv = KFold(n_splits=5,
           shuffle=True,
           random_state=0)

pipe_numeric = Pipeline(steps=[
  ('impute',KNNImputer(n_neighbors=10)),
  ('standardize', StandardScaler())   
])

knn_pipe = Pipeline(steps=[
    ("preprocessors", preprocessor),
    ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
    ('model', KNeighborsRegressor(n_neighbors=20, weights='distance'))
])

# knn_pipe.fit(X_train, Y_train)
# knn_yhat = knn_pipe.predict(X_test)

#knn_cv_error = cross_val_score(knn_pipe, X, Y, cv=cv, scoring=mae)
#-np.mean(knn_cv_error)

In [29]:
#KNN Model 3
cv = KFold(n_splits=10,
           shuffle=True,
           random_state=0)

pipe_numeric = Pipeline(steps=[
  ('impute',KNNImputer(n_neighbors=15)),
  ('standardize', StandardScaler())   
])

knn_pipe = Pipeline(steps=[
    ("preprocessors", preprocessor),
    ('poly_features', PolynomialFeatures(degree=1, include_bias=False)),
    ('model', KNeighborsRegressor(n_neighbors=15, weights='distance'))
])

# knn_pipe.fit(X_train, Y_train)
# knn_yhat = knn_pipe.predict(X_test)

#knn_cv_error = cross_val_score(knn_pipe, X, Y, cv=cv, scoring=mae)
#-np.mean(knn_cv_error)

In [30]:
#Ridge regression
# alphas = np.logspace(-6, 6, 30)

# ridge_pipe = Pipeline(steps=[
#     ("preprocessors", preprocessor),
#     ('poly_features', PolynomialFeatures(degree=1, include_bias=False)),
#     ('model', RidgeCV(alphas=np.logspace(-6, 6, 30), cv=10))
# ])

# ridge_pipe.fit(X_train, Y_train)
# yhat = ridge_pipe.predict(X_test)
# mean_absolute_error(Y_test, yhat)


In [31]:
#Lasso regression
# lasso_pipe = Pipeline(steps=[
#     ("preprocessors", preprocessor),
#     ('poly_features', PolynomialFeatures(degree=1, include_bias=False)),
#     ('model', LassoCV(alphas=np.logspace(-6, 6, 30), cv=10))
# ])

# lasso_pipe.fit(X_train, Y_train)
# lasso_yhat = ridge_pipe.predict(X_test)
# mean_absolute_error(Y_test, lasso_yhat)

In [32]:
#Final model
pipe_numeric = Pipeline(steps=[
  ('impute',KNNImputer(n_neighbors=10)),
  ('standardize', StandardScaler())   
])

knn_pipe = Pipeline(steps=[
    ("preprocessors", preprocessor),
    ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
    ('model', KNeighborsRegressor(n_neighbors=10, weights='distance'))
])

# knn_pipe = Pipeline(steps=[
#     ("preprocessors", preprocessor),
#     ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
#     ('model', SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2'))
# ])

test = pd.read_csv("test.csv")

In [33]:
test_cleaned = pd.concat([test.drop(columns=["bedrooms"]), pd.DataFrame(parse_descriptions(test["name"].values))[["bedrooms", "baths"]]], axis=1)
test_cleaned["host_response_rate"] = test_cleaned["host_response_rate"].apply(lambda r: float(r.split("%")[0]) if isinstance(r, str) else r)
test_cleaned["last_review"] = pd.to_datetime(test_cleaned["last_review"])
most_recent_date = test_cleaned["last_review"].max()
test_cleaned["recency"] = 1+(most_recent_date-test_cleaned["last_review"]).dt.days
test_cleaned["apt_type"] = test_cleaned["name"].apply(lambda d: apt_type(d))
test_cleaned["neighborhood_positivity"] = test_cleaned["neighborhood_overview"].apply(
    lambda d: np.sum([word in d.lower() for word in positive_words]) if not pd.isna(d) else 0
)
test_cleaned["host_description_score"] = test_cleaned["host_about"].apply(
    lambda d: len(d) if not pd.isna(d) else 0
)

test_cleaned["host_since"] = pd.to_datetime(test_cleaned["host_since"])
test_cleaned["host_recency"] = 1+(most_recent_date-test_cleaned["host_since"]).dt.days

#Spatial dependency
neighborhood_prices = []

epsilon = 1
R = 6371

for i, data in test_cleaned.iterrows():
    x = np.deg2rad(data["latitude"])
    y = np.deg2rad(data["longitude"])
    lat = np.deg2rad(test_cleaned["latitude"].values)
    long = np.deg2rad(test_cleaned["longitude"].values)
    xdiff = x-lat
    ydiff = y-long
    
    d = np.sin(xdiff / 2)**2 + np.cos(x) * np.cos(lat) * np.sin(ydiff / 2)**2
    c = 2 * np.arctan2(np.sqrt(d), np.sqrt(1-d))
    km = R*c

    indices = np.where(km <= epsilon)[0]
    neighborhood = indices[indices != i]
    neighborhood_price = np.mean(listings_cleaned.iloc[neighborhood]["price"])

    prices = neighborhood_prices.append(neighborhood_price)

test_cleaned["neighborhood_price"] = neighborhood_prices

In [37]:
shuffle_indices = np.random.permutation(len(X))

X_shuffled = X.iloc[shuffle_indices].reset_index(drop=True)
Y_shuffled = Y.iloc[shuffle_indices].reset_index(drop=True)

batch_size = 1000

for i in range(0, len(X_shuffled), batch_size):
    
    print(i)
    
    X_batch = X_shuffled.iloc[i:i+batch_size]
    Y_batch = Y_shuffled.iloc[i:i+batch_size]
    
    knn_pipe.fit(X_batch, Y_batch)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000


In [41]:
Xtest = test_cleaned[continuous+dummies]

#MAE of our data set
yhat = knn_pipe.predict(X)
mean_absolute_error(Y, yhat)

175.13812785493835

In [53]:
predictions = pd.DataFrame(test_cleaned["Id"].copy())
prices = knn_pipe.predict(Xtest)

In [59]:
predictions["price"] = prices
predictions

0                                                    PSJEN
1                                                    PVZW7
2                                                    EJLAM
3                                                    SDHPB
4                                                    MJGYX
                               ...                        
6543                                                 YFHBN
6544                                                 Z0TCA
6545                                                 VBCBJ
6546                                                 S2ZQX
price    [83.32159042610631, 160.65830752093632, 61.010...
Name: Id, Length: 6548, dtype: object

In [58]:
predictions.to_csv("predictions.csv", index=False)

##### 

#### 