# Transformation pipeline

In [1]:
# libraries
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('../data/train.csv')

In [4]:
# drop columns with too many unique values

df.drop(['id', 'zipcode', 'thumbnail_url', 'amenities', 'description', 'host_since', 'name','longitude', 'latitude'],
        axis=1, inplace=True)

First, the categorical columns with too many values are going to be categorized according to their top 5 values.

In [6]:
df.property_type.value_counts()[:5]

Apartment      39158
House          13295
Condominium     2103
Townhouse       1341
Loft             992
Name: property_type, dtype: int64

In [7]:
df.neighbourhood.value_counts()[:5]

Williamsburg          2276
Bedford-Stuyvesant    1686
Bushwick              1281
Upper West Side       1151
Harlem                1112
Name: neighbourhood, dtype: int64

In [8]:
top_5 = ['Apartment', 'House', 'Condominium', 'Townhouse', 'Loft']

df.property_type = df.property_type.apply(lambda x: x if x in top_5 else 'Other')

top_5 = ['Williamsburg', 'Bedford-Stuyvesant', 'Bushwick', 'Upper West Side', 'Harlem']

df.neighbourhood = df.neighbourhood.apply(lambda x: x if x in top_5 else 'Other')

Now, bool columns will be normalized.

In [9]:
# cleaning_fee has trues and falses, changing the dtype change it to 0 and 1
df.cleaning_fee = df.cleaning_fee.astype(int)

In [10]:
# a nan in the reviews will be treated as no comments
df.first_review.fillna(0, inplace=True)
df.last_review.fillna(0, inplace=True)

df.first_review = df.first_review.apply(lambda x: x if x == 0 else 1)
df.last_review = df.last_review.apply(lambda x: x if x == 0 else 1)

In [11]:
# changing t and f to 1/0, while also treating nan as 0
df.host_has_profile_pic = df.host_has_profile_pic.apply(lambda x: 1 if x == 't' else 0)
df.host_identity_verified = df.host_identity_verified.apply(lambda x: 1 if x == 't' else 0)
df.instant_bookable = df.instant_bookable.apply(lambda x: 1 if x == 't' else 0)

The column host_response_rate will be changed to float and nan is going to be treated as no response.

In [12]:
df.host_response_rate.fillna(0, inplace=True)

df.host_response_rate = df.host_response_rate.apply(lambda x: float(x.strip('%')) / 100 if x != 0 else x)

Finally, we will create dummy columns for all categorical columns so that we can process it through our machine learning model.

In [13]:
df_dummy = pd.get_dummies(df, columns=['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city', 'neighbourhood'], drop_first=True)
df_dummy.fillna(0, inplace=True) # fill na with 0 just in case

df_dummy.head()

Unnamed: 0,log_price,accommodates,bathrooms,cleaning_fee,first_review,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,last_review,...,city_Chicago,city_DC,city_LA,city_NYC,city_SF,neighbourhood_Bushwick,neighbourhood_Harlem,neighbourhood_Other,neighbourhood_Upper West Side,neighbourhood_Williamsburg
0,4.49981,2,1.0,1,1,1,1,0.8,0,1,...,0,0,0,1,0,0,0,1,0,0
1,3.970292,2,1.0,1,1,1,1,0.0,0,1,...,0,0,0,1,0,0,0,1,0,0
2,4.787492,4,1.0,1,1,1,1,1.0,0,1,...,0,0,0,1,0,0,0,0,0,0
3,4.70048,3,,0,1,1,0,0.0,0,1,...,0,0,0,1,0,0,0,1,0,0
4,5.192957,3,1.0,1,1,1,0,0.0,0,1,...,0,0,0,1,0,0,0,1,0,0


# Training

In [14]:
# libraries
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression

In [15]:
# we select our X and y (y being the log_price and X everything else)

X = df_dummy.drop(['log_price'], axis=1)

y = df_dummy.log_price

In [16]:
# then we split X and y for our train and test
X_train, X_test, y_train, y_test = tts(X, y, 
                                       train_size=0.8, 
                                       test_size=0.2,
                                       random_state=42)

In [17]:
# we fit the model with the train dfs and make the y_pred
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values