# RentHop - Modeling - Level 1 - Linear Models

In [1]:
"""
Author - Lily Elizabeth John
Date - 04/04/2017
Project - Kaggle - Renthop - Women Who Code - Workshop
Project Description : To predict interest level (Low, Medium, high) for listings. The dataset was provided by Two Sigma and Renthop.
"""

'\nAuthor - Lily Elizabeth John\nDate - 04/04/2017\nProject - Kaggle - Renthop - Women Who Code - Workshop\nProject Description : To predict interest level (Low, Medium, high) for listings. The dataset was provided by Two Sigma and Renthop.\n'

In [17]:
#Import libraries
import numpy as np
import pandas as pd

#Import CV libraries
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer sklearn.model_selection and fall back to the old module only on
# installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

#Import model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#Import metrics libraries
from sklearn.metrics import accuracy_score
# Same module move as train_test_split above.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss

In [18]:
#Read Data
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
TRAIN_JSON_PATH = "C:/Users/ljohn/Documents/2017/Personal/WWC - Kaggle/Exploring Train.JSON/train.json"
train_df = pd.read_json(TRAIN_JSON_PATH)

In [41]:
# Convert the 'created' column to pandas datetimes (replaces the column's values).
train_df = train_df.assign(created=pd.to_datetime(train_df['created']))

In [42]:
#Exclude price outliers
# The cut-off is the 99th percentile of price; only listings strictly below it
# are kept.
# NOTE: train_df is rebound here, so re-running this cell recomputes the
# percentile on the already-filtered data and trims it again.
ulimit = np.percentile(train_df['price'].values, 99)
train_df = train_df.loc[train_df['price'].lt(ulimit)]

In [43]:
#Exclude Latitude longitude outliers
# A listing is an outlier when it falls outside the bounding box
# 40.6 <= latitude <= 40.9 and -74.1 <= longitude <= -73.8.
outlier_mask = (
    train_df['latitude'].lt(40.6) | train_df['latitude'].gt(40.9)
    | train_df['longitude'].lt(-74.1) | train_df['longitude'].gt(-73.8)
)
# drop() returns a new frame, so train_df itself is left unchanged.
outlier_labels = train_df.index[outlier_mask]
lat_long_price_train_df = train_df.drop(outlier_labels, axis=0)

In [44]:
#Define New Features
# For each listing, store the length of its 'photos' and 'features' entries
# as new numeric columns.
for source_col, count_col in (("photos", "num_photos"), ("features", "num_features")):
    lat_long_price_train_df[count_col] = lat_long_price_train_df[source_col].map(len)

In [45]:
# Print the column dtypes and non-null counts of the filtered frame,
# including the two count columns added above.
lat_long_price_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48117 entries, 10 to 99994
Data columns (total 17 columns):
bathrooms          48117 non-null float64
bedrooms           48117 non-null int64
building_id        48117 non-null object
created            48117 non-null datetime64[ns]
description        48117 non-null object
display_address    48117 non-null object
features           48117 non-null object
interest_level     48117 non-null object
latitude           48117 non-null float64
listing_id         48117 non-null int64
longitude          48117 non-null float64
manager_id         48117 non-null object
photos             48117 non-null object
price              48117 non-null float64
street_address     48117 non-null object
num_photos         48117 non-null int64
num_features       48117 non-null int64
dtypes: datetime64[ns](1), float64(4), int64(4), object(8)
memory usage: 6.6+ MB


In [46]:
#Encode target variable
# np.unique returns the distinct labels in sorted order, so this mapping is
# alphabetical (high, low, medium). The next cell replaces it with an
# ordinal low/medium/high mapping.
class_mapping = dict(
    (label, idx)
    for idx, label in enumerate(np.unique(lat_long_price_train_df['interest_level']))
)
class_mapping

{'high': 0, 'low': 1, 'medium': 2}

In [47]:
class_mapping={label:idx for idx,label in enumerate(['low', 'medium','high'])}
class_mapping

{'high': 2, 'low': 0, 'medium': 1}

In [48]:
# Replace the string labels with their integer codes from class_mapping.
# NOTE: Series.map yields NaN for values that are not keys of the mapping, so
# re-running this cell on the already-encoded column would blank it out.
encoded_interest = lat_long_price_train_df['interest_level'].map(class_mapping)
lat_long_price_train_df['interest_level'] = encoded_interest

In [55]:
#Build the feature matrix (x) and the target vector (y)
# train_test_split is already imported in the imports cell at the top of the
# notebook, so the duplicate mid-notebook import from the deprecated
# sklearn.cross_validation module is not repeated here.
# Columns removed from the model inputs: the target itself plus every column
# that is not one of the numeric inputs kept for modeling.
non_feature_columns = ['interest_level','created','features','building_id','description',
                       'display_address','listing_id','manager_id','photos','street_address']
x=lat_long_price_train_df.drop(non_feature_columns,axis=1)
y=lat_long_price_train_df['interest_level']

In [56]:
# Print the dtypes and non-null counts of the feature matrix to confirm
# which columns remain after the drop.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48117 entries, 10 to 99994
Data columns (total 7 columns):
bathrooms       48117 non-null float64
bedrooms        48117 non-null int64
latitude        48117 non-null float64
longitude       48117 non-null float64
price           48117 non-null float64
num_photos      48117 non-null int64
num_features    48117 non-null int64
dtypes: float64(4), int64(3)
memory usage: 2.9 MB


In [71]:
#Train Test Split
# Hold out 20% of the rows for evaluation; random_state=0 fixes the shuffle.
split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts
# Print the four shapes on one line, in the same order as the unpacking above.
print(*(part.shape for part in split_parts))

(38493, 7) (9624, 7) (38493,) (9624,)


In [86]:
#Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(random_state=0).fit(X_train, Y_train)
# Hard class predictions for the held-out rows.
y_pred = lr.predict(X_test)

In [87]:
# accuracy_score's signature is (y_true, y_pred): pass the held-out labels
# first. Accuracy is symmetric, so the printed value is unchanged.
print("LR: Accuracy Score:",accuracy_score(Y_test,y_pred))
# Log loss is computed from class probabilities, not hard labels, so the
# disabled line below uses predict_proba rather than y_pred.
#print("LR: Log Loss:", log_loss(Y_test,lr.predict_proba(X_test)))

LR: Accuracy Score: 0.691604322527


In [90]:
#Decision Tree
# Depth-3 tree using the entropy split criterion; random_state=0 keeps
# the fit repeatable.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
).fit(X_train, Y_train)
# Overwrites y_pred with the tree's predictions for the held-out rows.
y_pred = tree.predict(X_test)

In [98]:
#Plot Decision Tree
from os import system
# Take the feature names from the training frame so the node labels always
# match the columns the tree was fitted on, instead of a hand-copied list.
export_graphviz(tree,out_file='tree.dot',
                feature_names=list(X_train.columns))
# Render the .dot file to PNG; this needs the Graphviz `dot` executable.
dot_status = system("dot -Tpng tree.dot -o tree.png")
if dot_status != 0:
    # A non-zero status means the shell command failed; report it instead of
    # leaving a bare number as the only hint.
    print("Warning: 'dot' exited with status %s; tree.png may not have been "
          "created. Check that Graphviz is installed and on PATH." % dot_status)
# Keep the exit status as the cell's displayed value.
dot_status

1

In [99]:
# accuracy_score's signature is (y_true, y_pred): pass the held-out labels
# first. y_pred is a shared name; it holds whichever model's predictions were
# assigned most recently, so this cell must follow the decision-tree cell.
print("Tree: Accuracy Score:",accuracy_score(Y_test,y_pred))

Tree: Accuracy Score: 0.698462177889


In [102]:
#Random Forest
# Ten entropy-criterion trees trained with two parallel jobs; random_state=0
# keeps the ensemble repeatable.
forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=0,
    n_jobs=2,
).fit(X_train, Y_train)
# Overwrites y_pred with the forest's predictions for the held-out rows.
y_pred = forest.predict(X_test)

In [103]:
# accuracy_score's signature is (y_true, y_pred): pass the held-out labels
# first. y_pred is a shared name; it holds whichever model's predictions were
# assigned most recently, so this cell must follow the random-forest cell.
print("Forest: Accuracy Score:",accuracy_score(Y_test,y_pred))

Forest: Accuracy Score: 0.710099750623
