# Transformation pipeline

In [1]:
# libraries
import pandas as pd
import numpy as np

In [3]:
# Load the training set; all following cells operate on this frame.
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [4]:
# Drop the columns with too many unique values (done in place on df).
df.drop(
    columns=[
        'id',
        'zipcode',
        'thumbnail_url',
        'amenities',
        'description',
        'host_since',
        'name',
        'longitude',
        'latitude',
    ],
    inplace=True,
)

First, the categorical columns with too many distinct values are reduced to their five most frequent values; every other value is grouped under `Other`.

In [6]:
# Five most frequent property types and their counts.
df['property_type'].value_counts().head(5)

Apartment      39158
House          13295
Condominium     2103
Townhouse       1341
Loft             992
Name: property_type, dtype: int64

In [7]:
# Five most frequent neighbourhoods and their counts.
df['neighbourhood'].value_counts().head(5)

Williamsburg          2276
Bedford-Stuyvesant    1686
Bushwick              1281
Upper West Side       1151
Harlem                1112
Name: neighbourhood, dtype: int64

In [8]:
# Keep only the listed values of each column and collapse everything else
# (NaN included, since it is not in the list) into a single 'Other' bucket.
top_property_types = ['Apartment', 'House', 'Condominium', 'Townhouse', 'Loft']

df['property_type'] = df['property_type'].where(
    df['property_type'].isin(top_property_types), 'Other'
)

top_neighbourhoods = ['Williamsburg', 'Bedford-Stuyvesant', 'Bushwick', 'Upper West Side', 'Harlem']

df['neighbourhood'] = df['neighbourhood'].where(
    df['neighbourhood'].isin(top_neighbourhoods), 'Other'
)

Next, the boolean-like columns are encoded as 0/1 integers.

In [9]:
# cleaning_fee holds True/False values; casting to int turns them into 1/0.
df['cleaning_fee'] = df['cleaning_fee'].astype(int)

In [10]:
# A NaN in the review columns is treated as "no reviews", so both columns are
# reduced to a 0/1 flag: 0 where the value is missing, 1 otherwise.
#
# The result is assigned back to the column rather than calling
# `df.col.fillna(0, inplace=True)`: that form is chained assignment through an
# inplace method, which under pandas Copy-on-Write only modifies a temporary
# Series. The NaNs would then stay in df and every row would end up as 1.
df['first_review'] = df['first_review'].notna().astype(int)
df['last_review'] = df['last_review'].notna().astype(int)

In [11]:
# Encode the 't'/'f' columns as 1/0; any value other than 't' (NaN included)
# becomes 0.
flag_columns = ['host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
df[flag_columns] = df[flag_columns].eq('t').astype(int)

The `host_response_rate` column is converted from a percentage string (e.g. `90%`) to a float fraction, and NaN is treated as no response (0).

In [12]:
# Convert percentage strings such as '90%' into fractions (0.9) and treat a
# missing rate as 0.
#
# The conversion is done first and the NaNs are filled afterwards, with the
# result assigned back to the column. The previous
# `df.host_response_rate.fillna(0, inplace=True)` is chained assignment through
# an inplace method: under pandas Copy-on-Write it leaves df untouched, so the
# remaining NaN floats would then fail on `.strip`.
df['host_response_rate'] = (
    df['host_response_rate']
    .str.strip('%')   # NaN entries stay NaN through the .str accessor
    .astype(float)
    .div(100)
    .fillna(0)
)

Finally,

# Training

In [None]:
# 