In [4]:
pip install --upgrade pip

Requirement already up-to-date: pip in /Users/veohntiafokpa/opt/anaconda3/lib/python3.7/site-packages (20.1.1)
Note: you may need to restart the kernel to use updated packages.


In [68]:
from sklearn import tree
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from datetime import datetime
import time
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score


## Data Manipulation 

In [28]:
df = pd.read_csv(os.path.join("listings_clean.csv"))
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,name,summary,description,host_id,host_since,host_location,host_is_superhost,...,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews
0,0,0,2060,Modern NYC,,"Lovely, spacious, sunny 1 BR apartment in 6th ...",2259,2008-08-18,US,f,...,"{Internet,Wifi}",100.0,,,1,0.0,1,730,365,1
1,1,2,3831,"Whole flr w/private bdrm, bath & kitchen(pls r...","Enjoy 500 s.f. top floor in 1899 brownstone, w...","Enjoy 500 s.f. top floor in 1899 brownstone, w...",4869,2008-12-07,"New York, New York, United States",f,...,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",89.0,$500.00,,1,0.0,1,730,276,322
2,2,5,5136,"Spacious Brooklyn Duplex, Patio + Garden",We welcome you to stay in our lovely 2 br dupl...,We welcome you to stay in our lovely 2 br dupl...,7378,2009-02-03,"New York, New York, United States",f,...,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",253.0,,,4,25.0,14,1125,358,1
3,3,6,5178,Large Furnished Room Near B'way,Please don’t expect the luxury here just a bas...,Please don’t expect the luxury here just a bas...,8967,2009-03-03,"New York, New York, United States",f,...,"{TV,Wifi,""Air conditioning"",""Paid parking off ...",79.0,,$15.00,1,12.0,2,14,294,473
4,4,7,5203,Cozy Clean Guest Room - Family Apt,"Our best guests are seeking a safe, clean, spa...","Our best guests are seeking a safe, clean, spa...",7490,2009-02-05,"New York, New York, United States",f,...,"{Internet,Wifi,""Air conditioning"",""Paid parkin...",79.0,,,1,0.0,2,14,0,118


In [29]:
# Creating a dataframe with just our potential features to analyze their level of importance

features_df = df.loc[:,["id","accommodates","bathrooms","bedrooms","beds","bed_type","neighbourhood_group_cleansed","room_type", "zipcode","price","property_type"]]
features_df.head()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,bed_type,neighbourhood_group_cleansed,room_type,zipcode,price,property_type
0,2060,2,1.0,1.0,2.0,Real Bed,Manhattan,Private room,10040,100.0,Other
1,3831,3,1.0,1.0,3.0,Real Bed,Brooklyn,Entire home/apt,11238,89.0,Guest suite
2,5136,4,1.5,2.0,2.0,Real Bed,Brooklyn,Entire home/apt,11232,253.0,Apartment
3,5178,2,1.0,1.0,1.0,Real Bed,Manhattan,Private room,10019,79.0,Apartment
4,5203,1,1.0,1.0,1.0,Real Bed,Manhattan,Private room,10025,79.0,Apartment


In [30]:
# Turning my categorical values (Bed Type) into dummy variables with binary values for each possible value

bed_type_dummy = pd.get_dummies(features_df['bed_type'])
print(bed_type_dummy)

       Airbed  Couch  Futon  Pull-out Sofa  Real Bed
0           0      0      0              0         1
1           0      0      0              0         1
2           0      0      0              0         1
3           0      0      0              0         1
4           0      0      0              0         1
...       ...    ...    ...            ...       ...
46917       0      0      0              0         1
46918       0      0      0              0         1
46919       0      0      0              0         1
46920       0      0      0              0         1
46921       0      0      0              0         1

[46922 rows x 5 columns]


In [31]:
# Turning my categorical values (Neighborhood) into dummy variables with binary values for each possible value. 
# I'm doing Borough and not neighborhood in order to not over-fit the data

borough_dummy = pd.get_dummies(features_df['neighbourhood_group_cleansed'])
print(borough_dummy)

       Bronx  Brooklyn  Manhattan  Queens  Staten Island
0          0         0          1       0              0
1          0         1          0       0              0
2          0         1          0       0              0
3          0         0          1       0              0
4          0         0          1       0              0
...      ...       ...        ...     ...            ...
46917      0         0          1       0              0
46918      0         1          0       0              0
46919      0         0          1       0              0
46920      0         0          1       0              0
46921      0         0          1       0              0

[46922 rows x 5 columns]


In [32]:
# Turning my categorical values (Room Type) into dummy variables with binary values for each possible value

room_type_dummy = pd.get_dummies(features_df['room_type'])
print(room_type_dummy)

       Entire home/apt  Hotel room  Private room  Shared room
0                    0           0             1            0
1                    1           0             0            0
2                    1           0             0            0
3                    0           0             1            0
4                    0           0             1            0
...                ...         ...           ...          ...
46917                1           0             0            0
46918                1           0             0            0
46919                0           0             1            0
46920                0           0             1            0
46921                1           0             0            0

[46922 rows x 4 columns]


In [33]:
# Turning my categorical values (Property Type) into dummy variables with binary values for each possible value

property_type_dummy = pd.get_dummies(features_df['property_type'])
print(property_type_dummy)

       Aparthotel  Apartment  Barn  Bed and breakfast  Boat  Boutique hotel  \
0               0          0     0                  0     0               0   
1               0          0     0                  0     0               0   
2               0          1     0                  0     0               0   
3               0          1     0                  0     0               0   
4               0          1     0                  0     0               0   
...           ...        ...   ...                ...   ...             ...   
46917           0          1     0                  0     0               0   
46918           0          1     0                  0     0               0   
46919           0          1     0                  0     0               0   
46920           0          1     0                  0     0               0   
46921           0          1     0                  0     0               0   

       Bungalow  Bus  Cabin  Camper/RV  ...  Other 

In [34]:
# Turning my categorical values (Zipcode Type) into dummy variables with binary values for each possible value

zipcode_dummy = pd.get_dummies(features_df['zipcode'])
print(zipcode_dummy)

        10036   11374  07302  10001  10002  10003  10004  10005  10006  10007  \
0           0       0      0      0      0      0      0      0      0      0   
1           0       0      0      0      0      0      0      0      0      0   
2           0       0      0      0      0      0      0      0      0      0   
3           0       0      0      0      0      0      0      0      0      0   
4           0       0      0      0      0      0      0      0      0      0   
...       ...     ...    ...    ...    ...    ...    ...    ...    ...    ...   
46917       0       0      0      0      0      1      0      0      0      0   
46918       0       0      0      0      0      0      0      0      0      0   
46919       0       0      0      0      0      0      0      0      0      0   
46920       0       0      0      0      0      0      0      0      0      0   
46921       0       0      0      1      0      0      0      0      0      0   

       ...  NY 10024  NY 10

In [35]:
try1 = pd.concat([features_df, bed_type_dummy], axis =1)
try2 = pd.concat([try1, borough_dummy], axis =1)
try3 = pd.concat([try2, property_type_dummy], axis=1)
try4 = pd.concat([try3, zipcode_dummy], axis=1)
features_df_updated = pd.concat([try4, room_type_dummy], axis =1)
features_df_updated.head()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,bed_type,neighbourhood_group_cleansed,room_type,zipcode,price,...,NY 10065,NY 10075,NY 10128,NY 10280,NY 11201,NY 11249,Entire home/apt,Hotel room,Private room,Shared room
0,2060,2,1.0,1.0,2.0,Real Bed,Manhattan,Private room,10040,100.0,...,0,0,0,0,0,0,0,0,1,0
1,3831,3,1.0,1.0,3.0,Real Bed,Brooklyn,Entire home/apt,11238,89.0,...,0,0,0,0,0,0,1,0,0,0
2,5136,4,1.5,2.0,2.0,Real Bed,Brooklyn,Entire home/apt,11232,253.0,...,0,0,0,0,0,0,1,0,0,0
3,5178,2,1.0,1.0,1.0,Real Bed,Manhattan,Private room,10019,79.0,...,0,0,0,0,0,0,0,0,1,0
4,5203,1,1.0,1.0,1.0,Real Bed,Manhattan,Private room,10025,79.0,...,0,0,0,0,0,0,0,0,1,0


In [37]:
# turn price into float 
target = features_df_updated["price"]
target_names = ["negative", "positive"]


In [38]:
data= features_df_updated.drop(["price","id","bed_type","neighbourhood_group_cleansed","room_type","zipcode","property_type"],axis=1)
feature_names = data.columns
data.head()
data.dtypes

accommodates         int64
bathrooms          float64
bedrooms           float64
beds               float64
Airbed               uint8
                    ...   
NY 11249             uint8
Entire home/apt      uint8
Hotel room           uint8
Private room         uint8
Shared room          uint8
Length: 279, dtype: object

In [39]:
data = data.astype(float)
data.dtypes

accommodates       float64
bathrooms          float64
bedrooms           float64
beds               float64
Airbed             float64
                    ...   
NY 11249           float64
Entire home/apt    float64
Hotel room         float64
Private room       float64
Shared room        float64
Length: 279, dtype: object

# Using the Trees Methodology

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [41]:
from sklearn.model_selection import GridSearchCV
import numpy as np
param_grid = {'max_depth': np.arange(1,10),
              'min_samples_split': [2, 3,4,5],
              'min_samples_leaf': [2, 3,4,5]
             }    

In [42]:
clf = tree.DecisionTreeClassifier()
grid = GridSearchCV(clf, param_grid)
grid.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                         'min_samples_leaf': [2, 3, 4, 5]

In [43]:
print(grid.best_params_)

{'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 2}


In [44]:
print(grid.best_score_)

0.08547640021596431


In [45]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.08498849202966499

In [46]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.17642262347915721, 'accommodates'),
 (0.14848704073776592, 'beds'),
 (0.0842077463303333, 'bathrooms'),
 (0.08304142353981675, 'bedrooms'),
 (0.01986659277839092, 'Apartment'),
 (0.011854206570298202, 'Private room'),
 (0.011762637796099167, 'Entire home/apt'),
 (0.011238081411240658, 'House'),
 (0.010813810428607122, 'Condominium'),
 (0.009118068304109122, 'Townhouse'),
 (0.00815911181716236, 'Loft'),
 (0.007345362126922776, 'Real Bed'),
 (0.006051855548766031, '11216'),
 (0.005720187646682745, '11221'),
 (0.005553380423754211, '11206'),
 (0.005511858477911441, '10019'),
 (0.005087449756893998, '11211'),
 (0.005011671507504626, 'Guest suite'),
 (0.0049743411950520415, '11238'),
 (0.004868242196194728, '11215'),
 (0.00484135261569292, '11233'),
 (0.004743419737666892, '10002'),
 (0.004742832795273794, '11205'),
 (0.004649313482913421, '11222'),
 (0.004633440651353408, '10027'),
 (0.004611402402874899, 'Manhattan'),
 (0.004585110911313129, '10025'),
 (0.0045011301558594745, '10003')

## Additional Try with Updated Cleaned CSV 

In [47]:
cleaned_df = pd.read_csv(os.path.join("listings_clean_ver4.csv"))
cleaned_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,name,summary,description,host_id,host_since,host_location,host_is_superhost,...,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews
0,0,0,2060,Modern NYC,,"Lovely, spacious, sunny 1 BR apartment in 6th ...",2259,2008-08-18,US,f,...,"{Internet,Wifi}",100.0,0,0,1,0.0,1,730,365,1
1,1,2,3831,"Whole flr w/private bdrm, bath & kitchen(pls r...","Enjoy 500 s.f. top floor in 1899 brownstone, w...","Enjoy 500 s.f. top floor in 1899 brownstone, w...",4869,2008-12-07,"New York, New York, United States",f,...,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",89.0,500,0,1,0.0,1,730,276,322
2,2,5,5136,"Spacious Brooklyn Duplex, Patio + Garden",We welcome you to stay in our lovely 2 br dupl...,We welcome you to stay in our lovely 2 br dupl...,7378,2009-02-03,"New York, New York, United States",f,...,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",253.0,0,0,4,25.0,14,1125,358,1
3,3,6,5178,Large Furnished Room Near B'way,Please don’t expect the luxury here just a bas...,Please don’t expect the luxury here just a bas...,8967,2009-03-03,"New York, New York, United States",f,...,"{TV,Wifi,""Air conditioning"",""Paid parking off ...",79.0,0,15,1,12.0,2,14,294,473
4,4,7,5203,Cozy Clean Guest Room - Family Apt,"Our best guests are seeking a safe, clean, spa...","Our best guests are seeking a safe, clean, spa...",7490,2009-02-05,"New York, New York, United States",f,...,"{Internet,Wifi,""Air conditioning"",""Paid parkin...",79.0,0,0,1,0.0,2,14,0,118


In [48]:
new_df = cleaned_df.loc[:,["id","accommodates","bathrooms","bedrooms","beds","bed_type","neighbourhood_group_cleansed","room_type","price","property_type","host_listings_count","cleaning_fee","security_deposit","guests_included","extra_people","minimum_nights", "maximum_nights","availability_365","number_of_reviews"]]
new_df.head()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,bed_type,neighbourhood_group_cleansed,room_type,price,property_type,host_listings_count,cleaning_fee,security_deposit,guests_included,extra_people,minimum_nights,maximum_nights,availability_365,number_of_reviews
0,2060,2,1.0,1.0,2.0,Real Bed,Manhattan,Private room,100.0,Other,0.0,0,0,1,0.0,1,730,365,1
1,3831,3,1.0,1.0,3.0,Real Bed,Brooklyn,Entire home/apt,89.0,Other,1.0,0,500,1,0.0,1,730,276,322
2,5136,4,1.5,2.0,2.0,Real Bed,Brooklyn,Entire home/apt,253.0,Apartment,1.0,0,0,4,25.0,14,1125,358,1
3,5178,2,1.0,1.0,1.0,Real Bed,Manhattan,Private room,79.0,Apartment,1.0,15,0,1,12.0,2,14,294,473
4,5203,1,1.0,1.0,1.0,Real Bed,Manhattan,Private room,79.0,Apartment,1.0,0,0,1,0.0,2,14,0,118


In [49]:
transformed_df = pd.get_dummies(new_df)
transformed_df.head()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,price,host_listings_count,cleaning_fee,security_deposit,guests_included,...,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,property_type_Apartment,property_type_House,property_type_Other
0,2060,2,1.0,1.0,2.0,100.0,0.0,0,0,1,...,1,0,0,0,0,1,0,0,0,1
1,3831,3,1.0,1.0,3.0,89.0,1.0,0,500,1,...,0,0,0,1,0,0,0,0,0,1
2,5136,4,1.5,2.0,2.0,253.0,1.0,0,0,4,...,0,0,0,1,0,0,0,1,0,0
3,5178,2,1.0,1.0,1.0,79.0,1.0,15,0,1,...,1,0,0,0,0,1,0,1,0,0
4,5203,1,1.0,1.0,1.0,79.0,1.0,0,0,1,...,1,0,0,0,0,1,0,1,0,0


In [51]:
transformed_df = transformed_df.fillna(0)
transformed_df
transformed_df.isnull().sum(axis = 0)

id                                            0
accommodates                                  0
bathrooms                                     0
bedrooms                                      0
beds                                          0
price                                         0
host_listings_count                           0
cleaning_fee                                  0
security_deposit                              0
guests_included                               0
extra_people                                  0
minimum_nights                                0
maximum_nights                                0
availability_365                              0
number_of_reviews                             0
bed_type_Airbed                               0
bed_type_Couch                                0
bed_type_Futon                                0
bed_type_Pull-out Sofa                        0
bed_type_Real Bed                             0
neighbourhood_group_cleansed_Bronx      

In [53]:
transformed_df.drop(['bedrooms','beds'], axis = 1, inplace = True)

In [55]:
for col in num_cols:
    transformed_df[col] = transformed_df[col].astype('float64').replace(0.0, 0.01) # Replacing 0s with 0.01
    transformed_df[col] = np.log(transformed_df[col])

In [57]:
X = transformed_df.drop('price', axis =1)
y = transformed_df.price

# Scaling
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=list(X.columns))

In [58]:
# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Ridge Method


In [59]:
from sklearn import linear_model
reg = linear_model.Ridge(alpha=.7)
reg.fit(X_train, y_train)

Ridge(alpha=0.7, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [60]:
reg.score(X_test, y_test)

0.0653173521370638

In [62]:
print("Model Score:", round(reg.score(X_test, y_test),4))

Model Score: 0.0653


# SVR Method

In [67]:
from sklearn import svm
reg = svm.SVR(kernel='rbf')
reg.fit(X_train, y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [64]:
print("Model Score:", round(reg.score(X_test, y_test),4))

Model Score: 0.0267
