In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

from utils.data_cleaning import load_and_clean
from utils.pipeline import create_pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from utils._feature_types import *

## First Run

Aimed at showing the basic functionality of the custom functions, pipeline and data cleaning.

In [2]:
# Load the full dataset and split into a train and test set

df = load_and_clean(verbose=True)

target = 'price'
y = df[target]
X = df.drop(target, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

Features dropped due to redundancy:
['calendar_last_scraped' 'description' 'host_about' 'host_id'
 'host_location' 'host_name' 'host_neighbourhood' 'host_picture_url'
 'host_thumbnail_url' 'host_url' 'id' 'last_scraped' 'listing_url' 'name'
 'neighborhood_overview' 'picture_url' 'scrape_id' 'source']

Features dropped due to high correlation (>0.8):
['host_total_listings_count', 'minimum_minimum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_60', 'availability_90', 'review_scores_accuracy', 'review_scores_value', 'calculated_host_listings_count_entire_homes']

Features dropped due to amount of missing values (>50.0%):
['neighbourhood', 'neighbourhood_group_cleansed', 'calendar_updated', 'license']

Features dropped due to only one unique value:
['has_availability']

Features dropped due to being text:
['bathrooms_text']

Number of outliers removed above quantile 0.99 ($1077.0): 626
number of rows removed due to missing price: 32367
Number of rows left after 

In [3]:
# Create the custom pipeline (see utils/pipeline.py for more details), and fit the data

pipe = create_pipeline(df)
pipe.fit(X_train, y_train)

In [4]:
# Predict the y-values for the training and test data
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

In [5]:
# Compute some metric for the model performance with train and test data to show overfitting
train_results = pd.DataFrame({'train_predict': y_train_pred, 'train_actual': y_train, 'abs_difference': abs(y_train_pred - y_train)})
test_results = pd.DataFrame({'test_predict': y_test_pred, 'test_actual': y_test, 'abs_difference': abs(y_test_pred - y_test)})

print("Train Results:")
display(train_results)
print(f"{'RMSE:': <5} {root_mean_squared_error(y_train, y_train_pred)}")
print(f"{'MAE:': <5} {mean_absolute_error(y_train, y_train_pred)}")
print(f"{'R2:': <5} {r2_score(y_train, y_train_pred)}")
print("\n\nTest Results:")
display(test_results)
print(f"{'RMSE:': <5} {root_mean_squared_error(y_test, y_test_pred)}")
print(f"{'MAE:': <5} {mean_absolute_error(y_test, y_test_pred)}")
print(f"{'R2:': <5} {r2_score(y_test, y_test_pred)}")

Train Results:


Unnamed: 0,train_predict,train_actual,abs_difference
49807,40.0,40.0,0.0
17544,286.0,286.0,0.0
81013,75.0,75.0,0.0
81042,190.0,190.0,0.0
87730,158.0,158.0,0.0
...,...,...,...
45123,114.0,114.0,0.0
90597,332.0,332.0,0.0
37783,144.0,144.0,0.0
1456,65.0,65.0,0.0


RMSE: 0.534673425676004
MAE:  0.004713249393244566
R2:   0.9999866823448859


Test Results:


Unnamed: 0,test_predict,test_actual,abs_difference
70028,45.0,45.0,0.0
47941,30.0,65.0,35.0
44383,141.0,163.0,22.0
27924,290.0,313.0,23.0
8291,163.0,314.0,151.0
...,...,...,...
36177,311.0,357.0,46.0
35599,130.0,190.0,60.0
49133,55.0,67.0,12.0
24446,66.0,83.0,17.0


RMSE: 117.81510563185117
MAE:  63.16838294448914
R2:   0.33346087924576384


In [6]:
# Extract the feature importance from the pipeline model
feats_out = pipe.steps[-2][1].get_feature_names_out()
feats_imp = pipe.steps[-1][1].feature_importances_

feature_importance = {feat: imp for feat, imp in zip(feats_out, feats_imp)}
feature_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

pd.DataFrame(feature_importance, columns=['feature name', 'importance'])

Unnamed: 0,feature name,importance
0,bedrooms,0.258836
1,neighbourhood_cleansed,0.129721
2,bathrooms,0.092115
3,property_type,0.065145
4,amenities,0.061319
5,longitude,0.038952
6,availability_30,0.034472
7,latitude,0.03225
8,accommodates,0.025803
9,availability_365,0.023047


In [7]:
# Rerun all steps, but this time with the top 20 features only
remove_feats = [feat for feat, _ in feature_importance[:20]]

new_df = df.drop(remove_feats + ['host_verifications', 'room_type'], axis=1)
display(new_df)

new_X = new_df.drop('price', axis=1)
new_y = new_df['price']
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=42)

new_pipe = create_pipeline(new_df)
new_pipe.fit(new_X_train, new_y_train)
display(new_pipe)

new_y_train_pred = new_pipe.predict(new_X_train)
new_y_test_pred = new_pipe.predict(new_X_test)
print("Train Results:")
print(f"{'RMSE:': <5} {root_mean_squared_error(new_y_train, new_y_train_pred)}")
print(f"{'MAE:': <5} {mean_absolute_error(new_y_train, new_y_train_pred)}")
print(f"{'R2:': <5} {r2_score(new_y_train, new_y_train_pred)}")
print("\nTest Results:")
print(f"{'RMSE:': <5} {root_mean_squared_error(new_y_test, new_y_test_pred)}")
print(f"{'MAE:': <5} {mean_absolute_error(new_y_test, new_y_test_pred)}")
print(f"{'R2:': <5} {r2_score(new_y_test, new_y_test_pred)}")

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,beds,price,maximum_nights,maximum_maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,instant_bookable,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
0,within a day,99%,f,t,t,1.0,89.0,365,365.0,145,42,1,4.71,4.67,4.88,4.79,4.92,f,1,0
1,within a few hours,100%,,t,t,3.0,220.0,1125,1125.0,178,3,1,4.52,4.60,4.80,4.74,4.63,t,0,0
2,within a day,99%,f,t,t,1.0,88.0,730,730.0,170,57,1,4.61,4.69,4.87,4.78,4.91,f,1,0
3,within a day,99%,f,t,t,1.0,75.0,1125,1125.0,186,44,2,4.65,4.76,4.87,4.77,4.92,f,1,0
4,,,f,t,t,1.0,53.0,730,730.0,13,0,0,4.46,4.77,4.62,4.85,4.62,f,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95139,,,f,f,t,2.0,236.0,365,365.0,0,0,0,,,,,,t,0,0
95140,,,f,t,t,2.0,88.0,30,30.0,0,0,0,,,,,,f,0,0
95141,within an hour,100%,f,t,t,1.0,93.0,7,1125.0,0,0,0,,,,,,f,71,0
95142,,,f,f,f,1.0,79.0,365,365.0,0,0,0,,,,,,f,1,0


Train Results:
RMSE: 50.719584472044204
MAE:  13.622090722783113
R2:   0.8798808209621866

Test Results:
RMSE: 150.8784764977713
MAE:  88.49822000782609
R2:   -0.08274238182279037


## TODO
- What features to use (~20) in the final models, and how to select them?
- What imputing techniques to use per category? Is there a different prefered method between models?
- How to encode categorical features, both low cardinality and high cardinality? Is there a downside to negative values (encode missing)?
- Importance of scaling, and how to scale?
- Which metrics to use and why?
- How to tune the models (GridSearchCV, RandomSearchCV, etc.)?
- Train-test-validate split or not?