In [32]:
import pandas as pd
import numpy as np
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import lars_path, LinearRegression, Lasso, LassoCV
from sklearn.metrics import r2_score, mean_squared_error
import scipy.stats as stats
import matplotlib.pyplot as plt
import math
import requests, io, re

from patsy import dmatrices
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import partial_dependence, permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

In [33]:
data_apr = pd.read_csv("/Users/Admin/Desktop/listings_APR_FINAL.csv")
data_apr.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,...,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,ln_price,property_House,property_Private room,property_Shared Room
0,100%,100%,f,2.0,5.0,"['email', 'phone']",t,t,Canarsie,Brooklyn,...,4.94,2,1,1,0,4.46,3.401197,False,True,False
1,,,f,1.0,5.0,"['email', 'phone']",t,t,Civic Center,Manhattan,...,,1,0,1,0,0.01,3.951244,False,True,False
2,,,f,1.0,1.0,"['email', 'phone']",t,t,Greenpoint,Brooklyn,...,5.0,1,0,1,0,0.11,4.787492,False,True,False
3,,,f,1.0,1.0,"['email', 'phone']",t,f,Upper West Side,Manhattan,...,4.9,1,1,0,0,0.12,5.010635,True,False,False
4,,,f,1.0,1.0,"['email', 'phone']",t,t,Washington Heights,Manhattan,...,5.0,1,0,1,0,0.01,3.218876,False,True,False


In [34]:
#Drop columns that are not needed

columns_to_drop = ['host_verifications', 'latitude', 'longitude', 'neighbourhood_group_cleansed', 'host_listings_count',
                   'host_total_listings_count', 'maximum_nights_avg_ntm', 'minimum_minimum_nights', 'calendar_last_scraped',
                   'first_review']

data_apr.drop(columns_to_drop, axis=1, inplace=True)


In [35]:
# Formatting columns    
for binary in [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "has_availability",
]:
    data_apr[binary] = data_apr[binary].map({"t": True, "f": False})

data_apr["f_property_type"] = data_apr["property_type"].astype("category")
data_apr["f_neighbourhood_cleansed"] = data_apr["neighbourhood_cleansed"].astype("category")
data_apr['n_bathroom'] = data_apr['bathrooms_text'].str.extract('(\d+\.?\d*)').astype(float)

In [36]:
# add new numeric columns from certain columns

numericals = [
    "accommodates",
    "review_scores_value",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "review_scores_location",
    "review_scores_communication",
    "review_scores_checkin",
    "review_scores_cleanliness",
    "reviews_per_month",
    "minimum_nights",
    "beds",
    "bedrooms",
]

for col in numericals:
    data_apr["n_" + col] = pd.to_numeric(data_apr[col], errors="coerce")

In [37]:
#Assign median values to missing ones

data_apr["n_review_scores_value"].fillna(data_apr["n_review_scores_value"].mean(), inplace=True)
data_apr["n_review_scores_location"].fillna(data_apr["n_review_scores_location"].mean(), inplace=True)
data_apr["n_review_scores_communication"].fillna(data_apr["n_review_scores_communication"].mean(), inplace=True)
data_apr["n_review_scores_checkin"].fillna(data_apr["n_review_scores_checkin"].mean(), inplace=True)
data_apr["n_reviews_per_month"].fillna(data_apr["n_reviews_per_month"].mean(), inplace=True)

data_apr = data_apr.assign(
    flag_review_scores_value=np.multiply(data_apr.n_review_scores_value.isna(), 1),
    n_review_scores_rating=data_apr.n_review_scores_value.fillna(
        np.mean(data_apr.n_review_scores_value.dropna())
    ),
    flag_review_scores_location=np.multiply(data_apr.n_review_scores_location.isna(), 1),
    n_review_scores_location=data_apr.n_review_scores_location.fillna(
        np.mean(data_apr.n_review_scores_location.dropna())
    ),

    flag_review_scores_communication=np.multiply(data_apr.n_review_scores_communication.isna(), 1),
    n_review_scores_communication=data_apr.n_review_scores_communication.fillna(
        np.mean(data_apr.n_review_scores_communication.dropna())
    ),

    flag_review_scores_checkin=np.multiply(data_apr.n_review_scores_checkin.isna(), 1),
    n_review_scores_checkin=data_apr.n_review_scores_checkin.fillna(
        np.mean(data_apr.n_review_scores_checkin.dropna())
    ),

    flag_reviews_per_month=np.multiply(data_apr.n_reviews_per_month.isna(), 1),
    n_reviews_per_month=data_apr.n_reviews_per_month.fillna(
        np.mean(data_apr.n_reviews_per_month.dropna())
    ),

    flag_review_scores_cleanliness=np.multiply(data_apr.n_review_scores_cleanliness.isna(), 1),
    n_review_scores_cleanliness=data_apr.n_review_scores_cleanliness.fillna(
        np.mean(data_apr.n_review_scores_cleanliness.dropna())
    ),
)

In [38]:
variable_names = data_apr.columns.tolist()
variable_table = pd.DataFrame(variable_names, columns=["Variable Names"])
pd.set_option('display.max_rows', None)
print(variable_table)

                                  Variable Names
0                             host_response_rate
1                           host_acceptance_rate
2                              host_is_superhost
3                           host_has_profile_pic
4                         host_identity_verified
5                         neighbourhood_cleansed
6                                  property_type
7                                   accommodates
8                                 bathrooms_text
9                                       bedrooms
10                                          beds
11                                     amenities
12                                         price
13                                minimum_nights
14                                maximum_nights
15                              has_availability
16                               availability_30
17                               availability_60
18                               availability_90
19                  

In [39]:
# Remove various strings, split amenities and turn into dtype:object

replace_str_dict = {
    '"' : "",
    ", " : ",",
    "\\\\" : "",
    ":" : "",
    "\\+" : "_",
}

data_apr["amenities"] = data_apr["amenities"].replace(replace_str_dict, regex=True).str.strip("[]").str.split(",")

In [40]:
# Function to merge amenities containing any keyword from a dictionary (case-insensitive)
def merge_items_with_keywords(amenities_list, merge_dict):
    merged_amenities = []
    for amenities in amenities_list:
        merged_item = []
        for amenity in amenities:
            lower_amenity = amenity.lower()
            for new_category, old_categories in merge_dict.items():
                if any(old_category in lower_amenity for old_category in old_categories):
                    merged_item.append(new_category)
                    break
            else:
                merged_item.append(amenity)
        merged_amenities.append(list(set(merged_item)))
    return merged_amenities

In [41]:
# Dictionary to merge amenities

merge_dict = {
    'wifi': ['wifi'],
    'kitchen': ['kitchen', 'kitchenette'],
    'stove': ['stove'],
    'oven': ['oven'],
    'microwave': ['microwave'],
    'refrigerator': ['refrigerator', 'fridge'],
    'dishwasher': ['dishwasher'],
    'kettel': ['kettle'],
    'toaster': ['toaster'],
    'coffee': ['coffee maker', 'machine', 'coffee', 'espresso', 'nespresso'],
    'tv': ['tv'],
    'sound_system': ['speaker', 'sound'],
    'game_console': ['game console', 'ps2', 'ps3', 'ps4', 'ps5', 
                     'playstation', 'wii', 'xbox'],
    'baby': ['baby', 'toys'],
    'body_wash': ['body', 'soap', 'shower gel'],
    'shampoo': ['shampoo'],
    'conditioner': ['conditioner'],
    'hair dryer': ['hair dryer'],
    'laundry': ['washer', 'laundry'],
    'backyard': ['backyard'],
    'grill': ['grill'],
    'breakfast': ['breakfast'],
    'clothing_storage': ['clothing storage'],
    'ac': ['ac - split type ductless system', 'air conditioning', 'central air conditioning', 'window ac unit'],
    'heating': ['heating']
}

In [42]:
# Merge some amenities together

data_apr['amenities'] = merge_items_with_keywords(data_apr['amenities'], merge_dict)

In [43]:
# Generate dummies by amenities

dummies = data_apr['amenities'].str.join('|').str.get_dummies()
dummies.columns = "d_" + dummies.columns.str.replace('/', '_').str.replace(' ', '_').str.replace('-', '_').str.replace('\\\\', '')

In [44]:
dummies.head()

Unnamed: 0,d__R&Co,d_100%_Vegan,d_1802_Beekman_toiletries,d_2_5_years_old,d_24_hour_fitness_center,d_26.4_QT_25_L,d_365,d_5_10_years_old,d_AVON,d_Acca_Kappa_toiletries,...,d_stove,d_tesla_only,d_toaster,d_treadmill,d_tv,d_walk_in_closet,d_wardrobe,d_wifi,d_wood_burning,d_yoga_mat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [45]:
# Top amenities in NYC
top_150_columns = dummies.sum().sort_values(ascending=False).head(150).index
final_dummies = dummies[top_150_columns]

In [46]:
final_dummies.head()

Unnamed: 0,d_wifi,d_Smoke_alarm,d_kitchen,d_heating,d_ac,d_Essentials,d_tv,d_Carbon_monoxide_alarm,d_Hangers,d_Hot_water,...,d_closet,d_EV_charger,d_Fireplace_guards,d_Exercise_equipment_yoga_mat,d_and_closet,d_Sun_loungers,d_dresser,d_Beach_access,d_and_dresser,d_treadmill
0,1,1,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
#Add dummies amneties to data_apr
data_apr = pd.concat([data_apr, final_dummies], axis=1)

In [48]:
data_apr.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,property_type,accommodates,bathrooms_text,bedrooms,...,d_closet,d_EV_charger,d_Fireplace_guards,d_Exercise_equipment_yoga_mat,d_and_closet,d_Sun_loungers,d_dresser,d_Beach_access,d_and_dresser,d_treadmill
0,100%,100%,False,True,True,Canarsie,Private room,1,1 shared bath,1.0,...,0,0,0,0,0,0,0,0,0,0
1,,,False,True,True,Civic Center,Private room,1,1 shared bath,5.0,...,0,0,0,0,0,0,0,0,0,0
2,,,False,True,True,Greenpoint,Private room,2,2 baths,1.0,...,0,0,0,0,0,0,0,0,0,0
3,,,False,True,False,Upper West Side,House,3,1 bath,1.0,...,0,0,0,0,0,0,0,0,0,0
4,,,False,True,True,Washington Heights,Private room,1,1 bath,1.0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Basic Variables inc neighnourhood
basic_vars = [
    "n_accommodates",
    "n_beds",
    "n_bathroom",
    "n_beds",
    "f_property_type",
    "f_neighbourhood_cleansed"
]

# reviews
reviews = [
    "n_review_scores_value",
    "flag_review_scores_value",
    "n_review_scores_location",
    "flag_review_scores_location",
    "n_review_scores_communication",
    "flag_review_scores_communication",
    "n_review_scores_checkin",
    "flag_review_scores_checkin",
    "n_review_scores_cleanliness",
    "flag_review_scores_cleanliness",
    "n_reviews_per_month",
    "flag_reviews_per_month"
]

# Dummy variables
amenities = [col for col in data_apr if col.startswith("d_")]

# interactions for the LASSO
# from ch14
X1 = [
    "n_accommodates:f_property_type",
    "d_breakfast:f_property_type",
    "d_heating:n_accommodates",
    "d_ac:f_property_type",
]
# with neighbourhood
X2 = [
    "f_property_type:f_neighbourhood_cleansed",
    "n_accommodates:f_neighbourhood_cleansed",
    "d_wifi:f_neighbourhood_cleansed",
    "d_Smoke_alarm:f_neighbourhood_cleansed",
]

In [50]:
predictors_1 = basic_vars
predictors_2 = basic_vars + reviews + amenities
predictors_E = basic_vars + reviews + amenities + X1 + X2

In [51]:
amenities

['d_wifi',
 'd_Smoke_alarm',
 'd_kitchen',
 'd_heating',
 'd_ac',
 'd_Essentials',
 'd_tv',
 'd_Carbon_monoxide_alarm',
 'd_Hangers',
 'd_Hot_water',
 'd_hair_dryer',
 'd_Iron',
 'd_refrigerator',
 'd_shampoo',
 'd_Dishes_and_silverware',
 'd_Cooking_basics',
 'd_coffee',
 'd_Bed_linens',
 'd_microwave',
 'd_Fire_extinguisher',
 'd_stove',
 'd_oven',
 'd_Dedicated_workspace',
 'd_First_aid_kit',
 'd_laundry',
 'd_Free_street_parking',
 'd_Self_check_in',
 'd_Extra_pillows_and_blankets',
 'd_Long_term_stays_allowed',
 'd_body_wash',
 'd_Cleaning_products',
 'd_Bathtub',
 'd_clothing_storage',
 'd_Freezer',
 'd_Security_cameras_on_property',
 'd_Private_entrance',
 'd_Dryer',
 'd_Luggage_dropoff_allowed',
 'd_Dining_table',
 'd_Lock_on_bedroom_door',
 'd_Laundromat_nearby',
 'd_kettel',
 'd_dishwasher',
 'd_conditioner',
 'd_Elevator',
 'd_Wine_glasses',
 'd_Lockbox',
 'd_Pets_allowed',
 'd_toaster',
 'd_Room_darkening_shades',
 'd_backyard',
 'd_Baking_sheet',
 'd_Free_parking_on_premis

In [52]:
#Split the data
data_train, data_holdout = train_test_split(data_apr, train_size=0.8, random_state=42)

# Random Forest

# OLS

In [53]:
#print(data_apr[basic_vars].dtypes)
print(data_apr[reviews].dtypes)

n_review_scores_value               float64
flag_review_scores_value              int32
n_review_scores_location            float64
flag_review_scores_location           int32
n_review_scores_communication       float64
flag_review_scores_communication      int32
n_review_scores_checkin             float64
flag_review_scores_checkin            int32
n_review_scores_cleanliness         float64
flag_review_scores_cleanliness        int32
n_reviews_per_month                 float64
flag_reviews_per_month                int32
dtype: object


In [54]:
y, X = dmatrices("price ~ " + " + ".join(predictors_2), data_train)

ols_model = LinearRegression().fit(X,y)

#y_test, X_test = dmatrices("price ~ " + " + ".join(predictors_2), data_holdout)

y_hat = ols_model.predict(X)

ols_rmse = mean_squared_error(y,y_hat,squared=False)
ols_rmse

332.3087698879969

In [55]:
mod1 = smf.ols("price ~ " + " + ".join(amenities), data=data_train)

In [56]:
ols_model_coeffs_df = pd.DataFrame(
    ols_model.coef_.tolist()[0],
    index=X.design_info.column_names,
    columns=["ols_coefficient"],
).assign(ols_coefficient=lambda x: x.ols_coefficient.round(3))

In [57]:
ols_model_coeffs_df

Unnamed: 0,ols_coefficient
Intercept,0.0
f_property_type[T.Private room],-16.232
f_property_type[T.Shared Room],-20.751
f_neighbourhood_cleansed[T.Arden Heights],-38.88
f_neighbourhood_cleansed[T.Arrochar],32.716
f_neighbourhood_cleansed[T.Arverne],10.096
f_neighbourhood_cleansed[T.Astoria],36.421
f_neighbourhood_cleansed[T.Bath Beach],80.329
f_neighbourhood_cleansed[T.Battery Park City],71.127
f_neighbourhood_cleansed[T.Bay Ridge],60.842


# LASSO

In [58]:
from sklearn.linear_model import ElasticNet

In [59]:
lasso_model_cv = GridSearchCV(
    lasso_model,
    {"alpha": [i / 100 for i in range(1, 26, 1)]},
    cv=5,
    scoring="neg_root_mean_squared_error",
    verbose=3,
)


NameError: name 'lasso_model' is not defined

In [None]:
y, X = dmatrices("price ~ " + " + ".join(predictors_E), data_train)

In [None]:
lasso_model_cv.fit(X, y.ravel())