In [1]:
import csv
import pandas as pd
import numpy as np
df = pd.read_csv("train.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
id                        74111 non-null int64
log_price                 74111 non-null float64
property_type             74111 non-null object
room_type                 74111 non-null object
amenities                 74111 non-null object
accommodates              74111 non-null int64
bathrooms                 73911 non-null float64
bed_type                  74111 non-null object
cancellation_policy       74111 non-null object
cleaning_fee              74111 non-null bool
city                      74111 non-null object
description               74111 non-null object
first_review              58247 non-null object
host_has_profile_pic      73923 non-null object
host_identity_verified    73923 non-null object
host_response_rate        55812 non-null object
host_since                73923 non-null object
instant_bookable          74111 non-null object
last_review               582

In [2]:
# parsing the amenities variable
df['amenities'] = df['amenities'].str.replace('{', '')
df['amenities'] = df['amenities'].str.replace('}', '')
df['amenities'] = df['amenities'].str.replace('"', '')

# make a tuple to store all the amenities
amenities_all = np.unique(np.concatenate(df['amenities'].map(lambda x: x.split(","))))

# match each record's amenities
for amenity in amenities_all:
    df[amenity] = df['amenities'].map(lambda x: True if x.find(amenity) > 0 else False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Columns: 160 entries, id to translation missing: en.hosting_amenity_50
dtypes: bool(132), float64(7), int64(3), object(18)
memory usage: 25.2+ MB


In [4]:
# exclude variables that are meaningless or are out of this project's scope
df = df.drop(['id', 'thumbnail_url', 'amenities', 'latitude', 'longitude', 'name', 'description', 'neighbourhood'], axis=1)

In [5]:
# exclude variables that do not exist at the initial listing
df = df.drop(['first_review', 'last_review', 'number_of_reviews', 'review_scores_rating', 'host_response_rate', 'host_since'], axis=1)

In [6]:
# clean zipcode
df['zipcode'] = df['zipcode'].astype(str)
df['zipcode']=df['zipcode'].map(lambda x: x.strip('.0'))
df['zipcode']=df['zipcode'].map(lambda x: x.strip('lm'))
df['zipcode']=df['zipcode'].map(lambda x: x.strip('Near'))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("\n", ""))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("\r", ""))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("-", ""))
df['zipcode']=df['zipcode'].map(lambda x: x.replace(" ", ""))

df['zipcode']=df['zipcode'].map(lambda x: x[:5])
df['zipcode']=df['zipcode'].map(lambda x: x.zfill(5))
df['zipcode']=df['zipcode'].map(lambda x: x.replace("00nan", "NA"))

In [7]:
# transform true/false features to binary
df['host_has_profile_pic'] = df['host_has_profile_pic'].map(lambda x: 0 if x == "f" else 1)
df['host_identity_verified'] = df['host_identity_verified'].map(lambda x: 0 if x == "f" else 1)
df['instant_bookable'] = df['instant_bookable'].map(lambda x: 0 if x == "f" else 1)
df['cleaning_fee'] = df['cleaning_fee'].map(lambda x: 0 if x == False else 1)

In [8]:
# transform categorical variables to dummies
df['property_type'] = df['property_type'].astype('category')
df['room_type'] = df['room_type'].astype('category')
df['bed_type'] = df['bed_type'].astype('category')
df['cancellation_policy'] = df['cancellation_policy'].astype('category')

for i in ['cancellation_policy', 'property_type', 'room_type', 'bed_type', 'zipcode', 'city']:
    df = pd.concat([df, pd.get_dummies(df[i])], axis=1)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Columns: 852 entries, log_price to SF
dtypes: bool(131), category(4), float64(4), int64(5), object(2), uint8(706)
memory usage: 65.7+ MB


In [10]:
df.head()

Unnamed: 0,log_price,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,host_has_profile_pic,...,94401,95202,99135,NA,Boston,Chicago,DC,LA,NYC,SF
0,5.010635,Apartment,Entire home/apt,3,1.0,Real Bed,strict,1,NYC,1,...,0,0,0,0,0,0,0,0,1,0
1,5.129899,Apartment,Entire home/apt,7,1.0,Real Bed,strict,1,NYC,1,...,0,0,0,0,0,0,0,0,1,0
2,4.976734,Apartment,Entire home/apt,5,1.0,Real Bed,moderate,1,NYC,1,...,0,0,0,0,0,0,0,0,1,0
3,6.620073,House,Entire home/apt,4,1.0,Real Bed,flexible,1,SF,1,...,0,0,0,0,0,0,0,0,0,1
4,4.744932,Apartment,Entire home/apt,2,1.0,Real Bed,moderate,1,DC,1,...,0,0,0,0,0,0,1,0,0,0


In [11]:
df1 = df.dropna()

In [12]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73766 entries, 0 to 74110
Columns: 852 entries, log_price to SF
dtypes: bool(131), category(4), float64(4), int64(5), object(2), uint8(706)
memory usage: 65.9+ MB


In [13]:
df1 = df1.drop(['cancellation_policy', 'property_type', 'room_type', 'bed_type', 'zipcode', 'city', 'NA'], axis=1)

In [14]:
# view column names
df1.columns.values

array(['log_price', 'accommodates', 'bathrooms', 'cleaning_fee',
       'host_has_profile_pic', 'host_identity_verified',
       'instant_bookable', 'bedrooms', 'beds', '',
       ' smooth pathway to front door', '24-hour check-in',
       'Accessible-height bed', 'Accessible-height toilet',
       'Air conditioning', 'Air purifier', 'BBQ grill', 'Baby bath',
       'Baby monitor', 'Babysitter recommendations', 'Bath towel',
       'Bathtub', 'Bathtub with shower chair', 'Beach essentials',
       'Beachfront', 'Bed linens', 'Body soap', 'Breakfast',
       'Buzzer/wireless intercom', 'Cable TV', 'Carbon monoxide detector',
       'Cat(s)', 'Changing table', 'Children’s books and toys',
       'Children’s dinnerware', 'Cleaning before checkout',
       'Coffee maker', 'Cooking basics', 'Crib', 'Disabled parking spot',
       'Dishes and silverware', 'Dishwasher', 'Dog(s)', 'Doorman',
       'Doorman Entry', 'Dryer', 'EV charger', 'Elevator',
       'Elevator in building', 'Essentials',

### Linear regression code

Given the large number of features in this dataset, lasso regression is used to enable dimension reduction. Cross-nested valdiation is used to evalute the appropriate value of alpha, the constant that multiplies the L1 term and hence controls regularization strength.

In [15]:
# set up X and y attributes
y = df1.iloc[:,0]
X = df1.iloc[:,1:]

X.describe()

Unnamed: 0,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,instant_bookable,bedrooms,beds,flexible,moderate,...,94158,94401,95202,99135,Boston,Chicago,DC,LA,NYC,SF
count,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,...,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0,73766.0
mean,3.159518,1.23563,0.734891,0.996963,0.674091,0.26294,1.266369,1.71232,0.304137,0.257449,...,0.000515,1.4e-05,1.4e-05,1.4e-05,0.046878,0.050362,0.076797,0.302944,0.436149,0.086869
std,2.155023,0.582462,0.441394,0.055022,0.468717,0.440233,0.853104,1.255326,0.460044,0.437232,...,0.022691,0.003682,0.003682,0.003682,0.211379,0.218692,0.266271,0.459535,0.49591,0.281645
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,16.0,8.0,1.0,1.0,1.0,1.0,10.0,18.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
# nested cross-validation for lasso regression, derived from professor's code in model evaluation notebook
# Warning: this cell takes a long time to compute

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

gs_lasso = GridSearchCV(estimator=Lasso(random_state=75),
                  param_grid= [{'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 500]}],
                  scoring='neg_mean_squared_error',
                  cv=10, iid=False,
                  n_jobs=4)
gs_lasso = gs_lasso.fit(X, y)

# nested cross-validation with parameter optimization for alpha values via Grid Search
#gs_lasso = GridSearchCV(estimator=Lasso(random_state=75),
#                  param_grid= [{'alpha': [0.0001, 0.005, 0.001, 0.05, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]}],
#                  scoring='neg_mean_squared_error',
#                  cv=10, iid=False,
#                  n_jobs=4)
#gs_lasso = gs_lasso.fit(X, y)

# outer cross validation layer for performance estimate
mse = cross_val_score(gs_lasso, X, y, scoring='neg_mean_squared_error', cv=10)

In [20]:
# get best value of alpha
print(gs_lasso.best_params_)

# get overall model performance. Adjusted sign of MSE because 
print('Lasso MSE: %.3f +/- %.3f' % (-np.mean(mse), np.std(mse)))

{'alpha': 0.0001}
Lasso MSE: 0.175 +/- 0.005


In [24]:
# look at coefficients
gs_lasso.best_estimator_

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=75,
   selection='cyclic', tol=0.0001, warm_start=False)

In [36]:
# Fit and look at coefficients for all data using new model.
test = Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=75,
   selection='cyclic', tol=0.0001, warm_start=False)
test = test.fit(X, y)

#test.coef_

array([ 7.77442896e-02,  1.11813969e-01, -5.56587463e-02, -6.39894232e-02,
       -2.89521961e-02, -3.05074593e-02,  1.56453570e-01, -3.70016490e-02,
        0.00000000e+00,  0.00000000e+00, -2.58219253e-02, -5.64440364e-03,
       -0.00000000e+00,  2.94507391e-02, -0.00000000e+00,  1.16506714e-02,
       -1.17420625e-02,  0.00000000e+00,  2.15685881e-02,  0.00000000e+00,
       -3.45251364e-03,  0.00000000e+00, -0.00000000e+00, -6.70531489e-02,
       -0.00000000e+00,  0.00000000e+00,  3.20976372e-02,  1.76942563e-02,
        4.70553487e-02,  1.43917991e-02, -3.92482247e-02, -0.00000000e+00,
       -1.46683153e-02, -0.00000000e+00,  1.21189304e-02,  2.96421673e-02,
        3.77091047e-02,  2.14094721e-02, -0.00000000e+00,  9.72576193e-03,
        7.70518944e-02,  9.14417138e-03,  6.38821287e-02, -4.46843631e-02,
        3.60665504e-02, -0.00000000e+00,  6.88979448e-02, -2.18954163e-02,
       -2.43931916e-02,  0.00000000e+00,  0.00000000e+00,  1.69003928e-02,
        1.77293671e-02, -