# Transformation pipeline

In [1]:
# libraries
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

In [2]:
# read the training data from the project's data folder
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [3]:
# Remove the columns that are not used as features
# (the stated reason: they have too many unique values).
df = df.drop(
    columns=[
        'id', 'zipcode', 'thumbnail_url', 'amenities', 'description',
        'host_since', 'name', 'longitude', 'latitude',
    ]
)

First, the categorical columns with too many distinct values are reduced to their five most frequent values; every other value is grouped under 'Other'.

In [4]:
# five most frequent property types
df['property_type'].value_counts().head(5)

Apartment      39158
House          13295
Condominium     2103
Townhouse       1341
Loft             992
Name: property_type, dtype: int64

In [5]:
# five most frequent neighbourhoods
df['neighbourhood'].value_counts().head(5)

Williamsburg          2276
Bedford-Stuyvesant    1686
Bushwick              1281
Upper West Side       1151
Harlem                1112
Name: neighbourhood, dtype: int64

In [6]:
# Keep only the five most frequent labels of each column; every other value
# (missing values included, since they are not in the list) becomes 'Other'.
top_5 = ['Apartment', 'House', 'Condominium', 'Townhouse', 'Loft']
df['property_type'] = df['property_type'].where(df['property_type'].isin(top_5), 'Other')

top_5 = ['Williamsburg', 'Bedford-Stuyvesant', 'Bushwick', 'Upper West Side', 'Harlem']
df['neighbourhood'] = df['neighbourhood'].where(df['neighbourhood'].isin(top_5), 'Other')

Next, the boolean-like columns are converted to 0/1 integers.

In [7]:
# cleaning_fee holds True/False values; casting to int turns them into 1/0
df['cleaning_fee'] = df['cleaning_fee'].astype(int)

In [8]:
# A missing review date is treated as "no reviews": each column becomes a
# 0/1 flag that is 1 when a review date is present and 0 when it is NaN.
#
# The result is assigned back to the frame rather than calling
# fillna(..., inplace=True) on the selected column: that chained in-place
# form is deprecated and does not update the DataFrame when pandas
# Copy-on-Write is enabled.
for review_col in ('first_review', 'last_review'):
    df[review_col] = df[review_col].notna().astype(int)

In [9]:
# 't' becomes 1; anything else ('f' or a missing value) becomes 0
for flag_col in ('host_has_profile_pic', 'host_identity_verified', 'instant_bookable'):
    df[flag_col] = df[flag_col].apply(lambda flag: int(flag == 't'))

The column host_response_rate will be changed to float and nan is going to be treated as no response.

In [10]:
# Convert percentage strings such as '80%' into fractions (0.8); a missing
# rate is treated as "no response" and becomes 0.
#
# This is done in one assigned expression instead of
# fillna(..., inplace=True) on the selected column, which is a deprecated
# chained assignment: with pandas Copy-on-Write the NaNs would stay in the
# frame and the later string handling would fail on them.
df['host_response_rate'] = (
    df['host_response_rate']
    .str.strip('%')
    .astype(float)
    .div(100)
    .fillna(0)
)

Finally, we will create dummy columns for all categorical columns so that we can process them with our machine learning model.

In [11]:
# One-hot encode the categorical columns (the first level of each is dropped),
# then replace any remaining missing values with 0.
df_dummy = pd.get_dummies(
    df,
    columns=[
        'property_type', 'room_type', 'bed_type',
        'cancellation_policy', 'city', 'neighbourhood',
    ],
    drop_first=True,
).fillna(0)

df_dummy.head()

Unnamed: 0,log_price,accommodates,bathrooms,cleaning_fee,first_review,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,last_review,number_of_reviews,review_scores_rating,bedrooms,beds,property_type_Condominium,property_type_House,property_type_Loft,property_type_Other,property_type_Townhouse,room_type_Private room,room_type_Shared room,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,city_Chicago,city_DC,city_LA,city_NYC,city_SF,neighbourhood_Bushwick,neighbourhood_Harlem,neighbourhood_Other,neighbourhood_Upper West Side,neighbourhood_Williamsburg
0,4.49981,2,1.0,1,1,1,1,0.8,0,1,9,96.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1,3.970292,2,1.0,1,1,1,1,0.0,0,1,8,100.0,1.0,1.0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
2,4.787492,4,1.0,1,1,1,1,1.0,0,1,12,100.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0
3,4.70048,3,0.0,0,1,1,0,0.0,0,1,2,90.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0
4,5.192957,3,1.0,1,1,1,0,0.0,0,1,1,80.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0


# Training

In [12]:
# libraries
from sklearn.model_selection import train_test_split as tts

In [13]:
# target is log_price; the features are every other column
y = df_dummy['log_price']

X = df_dummy.drop(columns=['log_price'])

In [14]:
# 80/20 train/test split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = tts(
    X, y, random_state=42, test_size=0.2, train_size=0.8
)

To see which model fits best, we will run the following function, which fits several regressors and prints the test-set mean squared error (MSE) of each.

In [15]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

def model_test(X_train, y_train, X_test, y_test, random_state=42):
    """Fit several baseline regressors and print each one's test-set MSE.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to each model's ``fit``.
    X_test, y_test
        Held-out features and target used to compute the mean squared error.
    random_state : int or None, default 42
        Seed forwarded to the tree-based estimators, which are stochastic.
        Fixing it makes the printed scores repeatable between runs; pass
        ``None`` to leave them unseeded.

    Returns
    -------
    None
        Results are printed, one line per model.
    """
    models = {
        "linear_regression": LinearRegression(),
        "ridge_regression": Ridge(),
        "lasso_regression": Lasso(),
        "decision_tree_regressor": DecisionTreeRegressor(random_state=random_state),
        "random_forest_regressor": RandomForestRegressor(random_state=random_state),
        "gradient_boosting_regressor": GradientBoostingRegressor(random_state=random_state),
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        error = mean_squared_error(y_test, y_pred)
        print(f"Modelo: {name} - MSE: {error}")

model_test(X_train, y_train, X_test, y_test)

Modelo: linear_regression - MSE: 0.22293335714334173
Modelo: ridge_regression - MSE: 0.22291936252693648
Modelo: lasso_regression - MSE: 0.5149474813532444
Modelo: decision_tree_regressor - MSE: 0.36530360683494784
Modelo: random_forest_regressor - MSE: 0.22361767298758298
Modelo: gradient_boosting_regressor - MSE: 0.20707807931834685


For this particular case and with the models used, the gradient boosting regressor has the lowest test MSE, so we will be using that.

In [17]:
# Fit the selected model on the training split. The seed is fixed because
# gradient boosting is stochastic and would otherwise give a slightly
# different model on every run.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

# Predictions

Now that the model has been trained, we will use it to predict prices for the test set. Because the model was trained on `log_price`, its predictions are log prices.

In [18]:
# read the test data (this rebinds df, replacing the training frame)
df = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [19]:
# Apply to the test set the same preprocessing that was applied to the
# training data.
df = df.drop(
    columns=[
        'id', 'zipcode', 'thumbnail_url', 'amenities', 'description',
        'host_since', 'name', 'longitude', 'latitude',
    ]
)

# Keep only the listed labels; everything else (missing values included)
# becomes 'Other'.
top_5 = ['Apartment', 'House', 'Condominium', 'Townhouse', 'Loft']
df['property_type'] = df['property_type'].where(df['property_type'].isin(top_5), 'Other')

top_5 = ['Williamsburg', 'Bedford-Stuyvesant', 'Bushwick', 'Upper West Side', 'Harlem']
df['neighbourhood'] = df['neighbourhood'].where(df['neighbourhood'].isin(top_5), 'Other')

# True/False -> 1/0
df['cleaning_fee'] = df['cleaning_fee'].astype(int)

# Review dates -> 0/1 "has a review" flag (NaN means no review).
# Results are assigned back to the frame; calling fillna(..., inplace=True)
# on a selected column is a deprecated chained assignment that does not
# update the DataFrame under pandas Copy-on-Write.
for review_col in ('first_review', 'last_review'):
    df[review_col] = df[review_col].notna().astype(int)

# 't' -> 1; 'f' or missing -> 0
for flag_col in ('host_has_profile_pic', 'host_identity_verified', 'instant_bookable'):
    df[flag_col] = df[flag_col].eq('t').astype(int)

# '80%' -> 0.8; a missing rate is treated as no response (0)
df['host_response_rate'] = (
    df['host_response_rate']
    .str.strip('%')
    .astype(float)
    .div(100)
    .fillna(0)
)

# Encode every category level here (no drop_first): which level counts as
# "first" depends on the values present in this file, so dropping it
# independently could remove a different level than was removed from the
# training data.
df_dummy = pd.get_dummies(
    df,
    columns=[
        'property_type', 'room_type', 'bed_type',
        'cancellation_policy', 'city', 'neighbourhood',
    ],
)

# Align with the feature matrix the model was fitted on: same columns, same
# order. Columns absent from the test data are added as 0, columns the model
# never saw are discarded, and any remaining missing values become 0.
df_dummy = df_dummy.reindex(columns=X_train.columns, fill_value=0).fillna(0)

In [21]:
# run the fitted model on the preprocessed test rows and preview the first five results
price_pred = gbr.predict(X=df_dummy)

price_pred[0:5]

array([5.67998801, 4.96167803, 4.03776292, 4.92694724, 4.87741408])