In [1]:
import pandas as pd
import numpy as np
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import lars_path, LinearRegression, Lasso, LassoCV
from sklearn.metrics import r2_score, mean_squared_error
import scipy.stats as stats
import matplotlib.pyplot as plt
import math
import requests, io, re

from patsy import dmatrices
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import partial_dependence, permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

In [2]:
#data_apr = pd.read_csv("/Users/Admin/Desktop/listings_APR_FINAL.csv")
data_apr = pd.read_csv("/Users/jacopobinati/Desktop/HM2/listings_APR_FINAL.csv")
data_apr.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,...,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,ln_price,property_House,property_Private room,property_Shared Room
0,100%,100%,f,2.0,5.0,"['email', 'phone']",t,t,Canarsie,Brooklyn,...,4.94,2,1,1,0,4.46,3.401197,False,True,False
1,,,f,1.0,5.0,"['email', 'phone']",t,t,Civic Center,Manhattan,...,,1,0,1,0,0.01,3.951244,False,True,False
2,,,f,1.0,1.0,"['email', 'phone']",t,t,Greenpoint,Brooklyn,...,5.0,1,0,1,0,0.11,4.787492,False,True,False
3,,,f,1.0,1.0,"['email', 'phone']",t,f,Upper West Side,Manhattan,...,4.9,1,1,0,0,0.12,5.010635,True,False,False
4,,,f,1.0,1.0,"['email', 'phone']",t,t,Washington Heights,Manhattan,...,5.0,1,0,1,0,0.01,3.218876,False,True,False


In [3]:
#Drop columns that are not needed

columns_to_drop = ['host_verifications', 'latitude', 'longitude', 'neighbourhood_group_cleansed', 'host_listings_count',
                   'host_total_listings_count', 'maximum_nights_avg_ntm', 'minimum_minimum_nights', 'calendar_last_scraped',
                   'first_review']

data_apr.drop(columns_to_drop, axis=1, inplace=True)


In [4]:
# Keep obs with 2 < accommodates < 6, and property_House = 1 only

data_apr = data_apr[(data_apr['accommodates'] >= 2) & (data_apr['accommodates'] <= 6) & (data_apr['property_House'] == 1)]


In [5]:
# Formatting columns    
for binary in [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "has_availability",
]:
    data_apr[binary] = data_apr[binary].map({"t": True, "f": False})

data_apr["f_property_type"] = data_apr["property_type"].astype("category")
data_apr["f_neighbourhood_cleansed"] = data_apr["neighbourhood_cleansed"].astype("category")
data_apr['n_bathroom'] = data_apr['bathrooms_text'].str.extract('(\d+\.?\d*)').astype(float)

In [6]:
# add new numeric columns from certain columns

numericals = [
    "accommodates",
    "review_scores_value",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "review_scores_location",
    "review_scores_communication",
    "review_scores_checkin",
    "review_scores_cleanliness",
    "reviews_per_month",
    "minimum_nights",
    "beds",
    "bedrooms",
]

for col in numericals:
    data_apr["n_" + col] = pd.to_numeric(data_apr[col], errors="coerce")

In [7]:
#Assign median values to missing ones

data_apr["n_review_scores_value"].fillna(data_apr["n_review_scores_value"].mean(), inplace=True)
data_apr["n_review_scores_location"].fillna(data_apr["n_review_scores_location"].mean(), inplace=True)
data_apr["n_review_scores_communication"].fillna(data_apr["n_review_scores_communication"].mean(), inplace=True)
data_apr["n_review_scores_checkin"].fillna(data_apr["n_review_scores_checkin"].mean(), inplace=True)
data_apr["n_reviews_per_month"].fillna(data_apr["n_reviews_per_month"].mean(), inplace=True)

data_apr = data_apr.assign(
    flag_review_scores_value=np.multiply(data_apr.n_review_scores_value.isna(), 1),
    n_review_scores_rating=data_apr.n_review_scores_value.fillna(
        np.mean(data_apr.n_review_scores_value.dropna())
    ),
    flag_review_scores_location=np.multiply(data_apr.n_review_scores_location.isna(), 1),
    n_review_scores_location=data_apr.n_review_scores_location.fillna(
        np.mean(data_apr.n_review_scores_location.dropna())
    ),

    flag_review_scores_communication=np.multiply(data_apr.n_review_scores_communication.isna(), 1),
    n_review_scores_communication=data_apr.n_review_scores_communication.fillna(
        np.mean(data_apr.n_review_scores_communication.dropna())
    ),

    flag_review_scores_checkin=np.multiply(data_apr.n_review_scores_checkin.isna(), 1),
    n_review_scores_checkin=data_apr.n_review_scores_checkin.fillna(
        np.mean(data_apr.n_review_scores_checkin.dropna())
    ),

    flag_reviews_per_month=np.multiply(data_apr.n_reviews_per_month.isna(), 1),
    n_reviews_per_month=data_apr.n_reviews_per_month.fillna(
        np.mean(data_apr.n_reviews_per_month.dropna())
    ),

    flag_review_scores_cleanliness=np.multiply(data_apr.n_review_scores_cleanliness.isna(), 1),
    n_review_scores_cleanliness=data_apr.n_review_scores_cleanliness.fillna(
        np.mean(data_apr.n_review_scores_cleanliness.dropna())
    ),
)

In [8]:
variable_names = data_apr.columns.tolist()
variable_table = pd.DataFrame(variable_names, columns=["Variable Names"])
pd.set_option('display.max_rows', None)
print(variable_table)

                                  Variable Names
0                             host_response_rate
1                           host_acceptance_rate
2                              host_is_superhost
3                           host_has_profile_pic
4                         host_identity_verified
5                         neighbourhood_cleansed
6                                  property_type
7                                   accommodates
8                                 bathrooms_text
9                                       bedrooms
10                                          beds
11                                     amenities
12                                         price
13                                minimum_nights
14                                maximum_nights
15                              has_availability
16                               availability_30
17                               availability_60
18                               availability_90
19                  

In [9]:
# Remove various strings, split amenities and turn into dtype:object

replace_str_dict = {
    '"' : "",
    ", " : ",",
    "\\\\" : "",
    ":" : "",
    "\\+" : "_",
}

data_apr["amenities"] = data_apr["amenities"].replace(replace_str_dict, regex=True).str.strip("[]").str.split(",")

In [10]:
# Function to merge amenities containing any keyword from a dictionary (case-insensitive)
def merge_items_with_keywords(amenities_list, merge_dict):
    merged_amenities = []
    for amenities in amenities_list:
        merged_item = []
        for amenity in amenities:
            lower_amenity = amenity.lower()
            for new_category, old_categories in merge_dict.items():
                if any(old_category in lower_amenity for old_category in old_categories):
                    merged_item.append(new_category)
                    break
            else:
                merged_item.append(amenity)
        merged_amenities.append(list(set(merged_item)))
    return merged_amenities

In [11]:
# Dictionary to merge amenities

merge_dict = {
    'wifi': ['wifi'],
    'kitchen': ['kitchen', 'kitchenette'],
    'stove': ['stove'],
    'oven': ['oven'],
    'microwave': ['microwave'],
    'refrigerator': ['refrigerator', 'fridge'],
    'dishwasher': ['dishwasher'],
    'kettel': ['kettle'],
    'toaster': ['toaster'],
    'coffee': ['coffee maker', 'machine', 'coffee', 'espresso', 'nespresso'],
    'tv': ['tv'],
    'sound_system': ['speaker', 'sound'],
    'game_console': ['game console', 'ps2', 'ps3', 'ps4', 'ps5', 
                     'playstation', 'wii', 'xbox'],
    'baby': ['baby', 'toys'],
    'body_wash': ['body', 'soap', 'shower gel'],
    'shampoo': ['shampoo'],
    'conditioner': ['conditioner'],
    'hair dryer': ['hair dryer'],
    'laundry': ['washer', 'laundry'],
    'backyard': ['backyard'],
    'grill': ['grill'],
    'breakfast': ['breakfast'],
    'clothing_storage': ['clothing storage'],
    'ac': ['ac - split type ductless system', 'air conditioning', 'central air conditioning', 'window ac unit'],
    'heating': ['heating']
}

In [12]:
# Merge some amenities together

data_apr['amenities'] = merge_items_with_keywords(data_apr['amenities'], merge_dict)

In [13]:
# Generate dummies by amenities

dummies = data_apr['amenities'].str.join('|').str.get_dummies()
dummies.columns = "d_" + dummies.columns.str.replace('/', '_').str.replace(' ', '_').str.replace('-', '_').str.replace('\\\\', '')

In [14]:
dummies.head()

Unnamed: 0,d__R&Co,d_2_5_years_old,d_365,d_5_10_years_old,d_Aiwa,d_Amazon_Prime_Video,d_Amika,d_Avanti,d_Aveda,d_Baking_sheet,...,d_stove,d_tesla_only,d_toaster,d_treadmill,d_tv,d_walk_in_closet,d_wardrobe,d_wifi,d_wood_burning,d_yoga_mat
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [15]:
# Top amenities in NYC
top_150_columns = dummies.sum().sort_values(ascending=False).head(150).index
final_dummies = dummies[top_150_columns]

In [16]:
final_dummies.head()

Unnamed: 0,d_wifi,d_kitchen,d_ac,d_Smoke_alarm,d_heating,d_Essentials,d_tv,d_Carbon_monoxide_alarm,d_Hangers,d_hair_dryer,...,d_Exercise_equipment_yoga_mat,d_stationary_bike,d_Bread_maker,d_Sun_loungers,d_EV_charger,d_Beach_access,d_and_dresser,d_Paid_parking_garage_on_premises,d_treadmill,d_Private_gym_in_building
3,1,1,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
7,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
8,1,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,1,1,1,0,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
#Add dummies amneties to data_apr
data_apr = pd.concat([data_apr, final_dummies], axis=1)

In [18]:
data_apr.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,property_type,accommodates,bathrooms_text,bedrooms,...,d_Exercise_equipment_yoga_mat,d_stationary_bike,d_Bread_maker,d_Sun_loungers,d_EV_charger,d_Beach_access,d_and_dresser,d_Paid_parking_garage_on_premises,d_treadmill,d_Private_gym_in_building
3,,,False,True,False,Upper West Side,House,3,1 bath,1.0,...,0,0,0,0,0,0,0,0,0,0
5,,,False,True,True,Williamsburg,House,3,1 bath,1.0,...,0,0,0,0,0,0,0,0,0,0
7,,,False,True,True,Long Island City,House,4,1 bath,2.0,...,0,0,0,0,0,0,0,0,0,0
8,,,False,True,True,Fort Greene,House,2,1 bath,,...,0,0,0,0,0,0,0,0,0,0
10,,,False,True,False,East Village,House,4,1 bath,1.0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Basic Variables inc neighnourhood
basic_vars = [
    "n_accommodates",
    "n_beds",
    "n_bathroom",
    "n_beds",
    "f_property_type",
    "f_neighbourhood_cleansed"
]

# reviews
reviews = [
    "n_review_scores_value",
    "flag_review_scores_value",
    "n_review_scores_location",
    "flag_review_scores_location",
    "n_review_scores_communication",
    "flag_review_scores_communication",
    "n_review_scores_checkin",
    "flag_review_scores_checkin",
    "n_review_scores_cleanliness",
    "flag_review_scores_cleanliness",
    "n_reviews_per_month",
    "flag_reviews_per_month"
]

# Dummy variables
amenities = [col for col in data_apr if col.startswith("d_")]

# interactions for the LASSO
# from ch14
X1 = [
    "n_accommodates:f_property_type",
    "d_breakfast:f_property_type",
    "d_heating:n_accommodates",
    "d_ac:f_property_type",
    "minimum_nights",
    "minimum_nights:n_accommodates",
    "availability_365:minimum_nights",
]
# with neighbourhood
X2 = [
    "f_property_type:f_neighbourhood_cleansed",
    "n_accommodates:f_neighbourhood_cleansed",
    "d_wifi:f_neighbourhood_cleansed",
    "d_Smoke_alarm:f_neighbourhood_cleansed",
]

In [20]:
predictors_1 = basic_vars
predictors_2 = basic_vars + reviews + amenities
predictors_3 = basic_vars + reviews + amenities + X1 + X2

In [21]:
amenities

['d_wifi',
 'd_kitchen',
 'd_ac',
 'd_Smoke_alarm',
 'd_heating',
 'd_Essentials',
 'd_tv',
 'd_Carbon_monoxide_alarm',
 'd_Hangers',
 'd_hair_dryer',
 'd_Hot_water',
 'd_Iron',
 'd_refrigerator',
 'd_Dishes_and_silverware',
 'd_shampoo',
 'd_Cooking_basics',
 'd_coffee',
 'd_Bed_linens',
 'd_microwave',
 'd_oven',
 'd_stove',
 'd_laundry',
 'd_Fire_extinguisher',
 'd_Dedicated_workspace',
 'd_Self_check_in',
 'd_Free_street_parking',
 'd_First_aid_kit',
 'd_Long_term_stays_allowed',
 'd_Extra_pillows_and_blankets',
 'd_body_wash',
 'd_Bathtub',
 'd_Cleaning_products',
 'd_Private_entrance',
 'd_Freezer',
 'd_clothing_storage',
 'd_dishwasher',
 'd_Dining_table',
 'd_Wine_glasses',
 'd_Lockbox',
 'd_Dryer',
 'd_Elevator',
 'd_kettel',
 'd_Pets_allowed',
 'd_conditioner',
 'd_Laundromat_nearby',
 'd_toaster',
 'd_Room_darkening_shades',
 'd_Security_cameras_on_property',
 'd_Luggage_dropoff_allowed',
 'd_Baking_sheet',
 'd_backyard',
 'd_Books_and_reading_material',
 'd_Blender',
 'd_Fr

In [22]:
#data_oct.to_csv('/Users/Admin/Desktop/listings_APR_FINAL2.csv', index=False)

In [23]:
#Split the data
data_train, data_holdout = train_test_split(data_apr, train_size=0.8, random_state=42)

# Random Forest

In [24]:
ncores = 3

In [26]:
rfr = RandomForestRegressor(
    random_state=42,
    criterion="squared_error",
    n_estimators=30,
    oob_score=True,
    n_jobs=-1,
)

tune_grid = {
    "max_features": [10, 12, 14],
    "min_samples_split": [6, 11, 16],
}

rf_random = GridSearchCV(
    rfr,
    tune_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    verbose=3,
)

y, X = dmatrices("price ~ " + " + ".join(predictors_2), data_train)

rf_model_2 = rf_random.fit(X, y.ravel())

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_features=10, min_samples_split=6;, score=-151.785 total time=   1.1s
[CV 2/5] END max_features=10, min_samples_split=6;, score=-321.036 total time=   1.1s
[CV 3/5] END max_features=10, min_samples_split=6;, score=-370.280 total time=   1.3s
[CV 4/5] END max_features=10, min_samples_split=6;, score=-297.686 total time=   1.3s
[CV 5/5] END max_features=10, min_samples_split=6;, score=-242.161 total time=   1.1s
[CV 1/5] END max_features=10, min_samples_split=11;, score=-151.042 total time=   1.3s
[CV 2/5] END max_features=10, min_samples_split=11;, score=-319.464 total time=   1.1s
[CV 3/5] END max_features=10, min_samples_split=11;, score=-367.699 total time=   1.3s
[CV 4/5] END max_features=10, min_samples_split=11;, score=-297.634 total time=   1.1s
[CV 5/5] END max_features=10, min_samples_split=11;, score=-240.920 total time=   1.5s
[CV 1/5] END max_features=10, min_samples_split=16;, score=-150.787 total t

In [27]:
rfr = RandomForestRegressor(
    random_state=42,
    criterion="squared_error",
    n_estimators=30,
    oob_score=True,
    n_jobs=-1,
)

tune_grid = {
    "max_features": [10, 12, 14],
    "min_samples_split": [6, 11, 16],
}

rf_random = GridSearchCV(
    rfr,
    tune_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    verbose=3,
)

y, X = dmatrices("price ~ " + " + ".join(predictors_3), data_train)

rf_model_3 = rf_random.fit(X, y.ravel())

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_features=10, min_samples_split=6;, score=-149.681 total time=   2.3s
[CV 2/5] END max_features=10, min_samples_split=6;, score=-317.141 total time=   2.3s
[CV 3/5] END max_features=10, min_samples_split=6;, score=-369.097 total time=   2.5s
[CV 4/5] END max_features=10, min_samples_split=6;, score=-297.057 total time=   2.0s
[CV 5/5] END max_features=10, min_samples_split=6;, score=-242.504 total time=   2.2s
[CV 1/5] END max_features=10, min_samples_split=11;, score=-148.940 total time=   2.7s
[CV 2/5] END max_features=10, min_samples_split=11;, score=-319.273 total time=   2.4s
[CV 3/5] END max_features=10, min_samples_split=11;, score=-368.721 total time=   2.3s
[CV 4/5] END max_features=10, min_samples_split=11;, score=-297.071 total time=   2.0s
[CV 5/5] END max_features=10, min_samples_split=11;, score=-241.515 total time=   2.4s
[CV 1/5] END max_features=10, min_samples_split=16;, score=-150.150 total t

In [29]:
pd.DataFrame(rf_model_2.cv_results_)[
    ["param_max_features", "param_min_samples_split", "mean_test_score"]
].assign(
    mean_test_score=lambda x: x["mean_test_score"] * -1,
    Variables=lambda x: x["param_max_features"],
    Min_nodes=lambda x: x["param_min_samples_split"] - 1,
).pivot(
    index="Min_nodes", columns="Variables", values="mean_test_score"
).round(
    2
)

Variables,10,12,14
Min_nodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,276.59,275.54,274.35
10,275.35,275.76,274.43
15,276.08,274.96,274.28


In [30]:
pd.DataFrame(rf_model_3.cv_results_)[
    ["param_max_features", "param_min_samples_split", "mean_test_score"]
].assign(
    mean_test_score=lambda x: x["mean_test_score"] * -1,
    Variables=lambda x: x["param_max_features"],
    Min_nodes=lambda x: x["param_min_samples_split"] - 1,
).pivot(
    index="Min_nodes", columns="Variables", values="mean_test_score"
).round(
    2
)

Variables,10,12,14
Min_nodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,275.1,275.55,275.65
10,275.1,275.33,274.27
15,275.46,275.12,274.9


In [31]:
pd.DataFrame(
    {
        "Min vars": [
            rf_model_2.best_estimator_.max_features,
            rf_model_3.best_estimator_.max_features,
        ],
        "Min nodes": [,
            rf_model_2.best_estimator_.min_samples_split - 1,
            rf_model_3.best_estimator_.min_samples_split - 1,
        ],
    },
    ["Model B", "Model C"],
)

Unnamed: 0,Min vars,Min nodes
Model A,8,10
Model B,14,15
Model C,14,10


In [32]:
pd.DataFrame(
    {
        "RMSE": [
            rf_model_2.cv_results_["mean_test_score"].min(),
            rf_model_3.cv_results_["mean_test_score"].min(),
        ]
    },
    ["Model A", "Model B", "Model C"],
).round(2) * -1

Unnamed: 0,RMSE
Model A,284.44
Model B,276.59
Model C,275.65


# OLS

In [33]:
#print(data_apr[basic_vars].dtypes)
print(data_apr[reviews].dtypes)

n_review_scores_value               float64
flag_review_scores_value              int64
n_review_scores_location            float64
flag_review_scores_location           int64
n_review_scores_communication       float64
flag_review_scores_communication      int64
n_review_scores_checkin             float64
flag_review_scores_checkin            int64
n_review_scores_cleanliness         float64
flag_review_scores_cleanliness        int64
n_reviews_per_month                 float64
flag_reviews_per_month                int64
dtype: object


In [34]:
'''
y, X = dmatrices("price ~ " + " + ".join(predictors_1), data_train)
ols_model1 = LinearRegression().fit(X,y)
y_hat1 = ols_model1.predict(X)
ols_rmse1 = mean_squared_error(y,y_hat1,squared=False)

y_test, X_test = dmatrices("price ~ " + " + ".join(predictors_1), data_holdout)
ols_test1 = LinearRegression().fit(X_test,y_test)
y_hat_test1 = ols_test1.predict(X_test)
ols_cv_rmse1 = mean_squared_error(y_test,y_hat_test1,squared=False)
#-----------------------------------#

y, X = dmatrices("price ~ " + " + ".join(predictors_2), data_train)
ols_model2 = LinearRegression().fit(X,y)
y_hat2 = ols_model2.predict(X)
ols_rmse2 = mean_squared_error(y,y_hat2,squared=False)


y_test, X_test = dmatrices("price ~ " + " + ".join(predictors_2), data_holdout)
ols_test2 = LinearRegression().fit(X_test,y_test)
y_hat_test2 = ols_test2.predict(X_test)
ols_cv_rmse2 = mean_squared_error(y_test,y_hat_test2,squared=False)
#-----------------------------------#

y, X = dmatrices("price ~ " + " + ".join(predictors_3), data_train)
ols_model3 = LinearRegression().fit(X,y)
y_hat3 = ols_model3.predict(X)
ols_rmse3 = mean_squared_error(y,y_hat3,squared=False)

y_test, X_test = dmatrices("price ~ " + " + ".join(predictors_3), data_holdout)
ols_test3 = LinearRegression().fit(X_test,y_test)
y_hat_test3 = ols_test3.predict(X_test)
ols_cv_rmse3 = mean_squared_error(y_test,y_hat_test3,squared=False)
'''

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def cross_validate(data, formula, model, kf):
    rmse_values = []
    for train_index, test_index in kf.split(data):
        data_train, data_test = data.iloc[train_index], data.iloc[test_index]

        y_train, X_train = patsy.dmatrices(formula, data_train)
        y_test, X_test = patsy.dmatrices(formula, data_test)

        results = model(y_train, X_train).fit()

        y_pred = results.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_values.append(rmse)

    average_rmse = np.mean(rmse_values)
    return average_rmse

In [None]:
formulas = [
    ("OLS 1", "price ~ " + " + ".join(predictors_1)),
    ("OLS 2", "price ~ " + " + ".join(predictors_2)),
    ("OLS 3", "price ~ " + " + ".join(predictors_3))
]

rmse_values = []
for model_name, formula in formulas:
    rmse = cross_validate(data_apr, formula, sm.OLS, kf)
    rmse_values.append(rmse)

In [None]:
rmse_values

# LASSO

In [35]:
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [36]:
lambdas = np.arange(0.05, 1.01, 0.05)
print(lambdas)

[0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65 0.7
 0.75 0.8  0.85 0.9  0.95 1.  ]


In [37]:
y, X = dmatrices("price ~ " + " + ".join(predictors_3), data_train)
X_featnames = X.design_info.column_names
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [38]:
lasso_fit = LassoCV(alphas=lambdas, cv=5, random_state=42).fit(X, y)

  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [39]:
lasso_fit.alpha_

1.0

In [40]:
rmse_lambda_folds = (
    pd.DataFrame(lasso_fit.mse_path_, index=lambdas[::-1])
    .apply(np.sqrt)
    .mean(axis=1)
    .to_frame(name="Test RMSE")
    .round(2)
)

In [41]:
lasso_model = ElasticNet(l1_ratio=1, fit_intercept=True)

lasso_model_cv = GridSearchCV(
    lasso_model,
    {"alpha": [i / 100 for i in range(1, 26, 1)]},
    cv=5,
    scoring="neg_root_mean_squared_error",
    verbose=3,
    n_jobs=-1,
)

y, X = dmatrices("price ~ " + " + ".join(predictors_3), data_train)
y, X = np.array(y), np.array(X)

lasso_model_cv.fit(X, y.ravel())

lasso_rmse = (
    pd.DataFrame(lasso_model_cv.cv_results_)
    .loc[lambda x: x.param_alpha == lasso_model_cv.best_estimator_.alpha]
    .mean_test_score.values[0]
    * -1
)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 3/5] END .....................alpha=0.01;, score=-372.062 total time=  25.7s




[CV 1/5] END .....................alpha=0.01;, score=-152.977 total time=  34.4s


  model = cd_fast.enet_coordinate_descent(


[CV 2/5] END .....................alpha=0.01;, score=-320.944 total time=  42.6s
[CV 4/5] END .....................alpha=0.01;, score=-298.673 total time=  33.0s
[CV 5/5] END .....................alpha=0.01;, score=-244.471 total time=  28.9s
[CV 1/5] END .....................alpha=0.02;, score=-151.773 total time=  25.2s
[CV 2/5] END .....................alpha=0.02;, score=-320.471 total time=  19.6s
[CV 3/5] END .....................alpha=0.02;, score=-371.381 total time=  19.3s
[CV 4/5] END .....................alpha=0.02;, score=-297.844 total time=  25.9s
[CV 5/5] END .....................alpha=0.02;, score=-243.872 total time=  21.8s
[CV 1/5] END .....................alpha=0.03;, score=-150.874 total time=  20.3s
[CV 2/5] END .....................alpha=0.03;, score=-320.072 total time=  18.1s
[CV 3/5] END .....................alpha=0.03;, score=-370.814 total time=  20.2s
[CV 4/5] END .....................alpha=0.03;, score=-297.287 total time=  20.3s
[CV 5/5] END ...............

In [42]:
lasso_rmse

272.4782381349003

# Final Comparison for April

In [43]:
rmse_values.extend([rmse_lambda_folds.loc[lasso_fit.alpha_].values[0], lasso_rmse, rf_model_3.cv_results_["mean_test_score"].min() * -1])
model_names = ['OLS 1', 'OLS 2', 'OLS 3', 'LASSO 1', 'LASSO 2', 'RF']

df = pd.DataFrame({'Model': model_names, 'Test RMSE': rmse_values})

df['Test RMSE'] = df['Test RMSE'].round(2)

print(df)  

     Model  Train RMSE  Test RMSE
0    OLS 1      282.08     227.74
1    OLS 2      277.31     220.18
2    OLS 3      271.67     212.12
3  LASSO 1         NaN     273.90
4  LASSO 2         NaN     272.48
5       RF         NaN     275.65


In [None]:
rf_model_3_var_imp_df = (
    pd.DataFrame(
        rf_model_3.best_estimator_.feature_importances_, X.design_info.column_names
    )
    .reset_index()
    .rename({"index": "varname", 0: "imp"}, axis=1)
    .assign(
        imp_percentage=lambda x: x["imp"] / x["imp"].sum(),
        varname=lambda x: x.varname.str.replace(
            "f_room_type[T.", "Room type:", regex=False
        )
        .str.replace("f_neighbourhood_cleansed[T.", "Borough:", regex=False)
        .str.replace("]", "", regex=False),
    )
    .sort_values(by=["imp"], ascending=False)
)

In [None]:
subset_df = rf_model_3_var_imp_df.iloc[:10, :]
color = ['blue']
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(subset_df['varname'], subset_df['imp_percentage'], color=color[0], s=60, label='Variable Importance')
for index, row in subset_df.iterrows():
    ax.plot([row['varname'], row['varname']], [0, row['imp_percentage']], color=color[0], linewidth=2)

ax.set_ylabel('Importance (Percent)')
ax.set_xlabel('Variable Name')
ax.set_title('Variable Importance')
plt.xticks(rotation=90)
ax.grid(True)
plt.tight_layout()
plt.show()