# RentHop - Modeling - Level 2 - Linear Models - Feature Engineering

In [75]:
"""
Author - Lily Elizabeth John
Date - 04/05/2017
Project - Kaggle - Renthop - Women Who Code - Workshop
Project Description : To predict interest level (Low, Medium, high) for listings. The dataset was provided by Two Sigma and Renthop.
"""

'\nAuthor - Lily Elizabeth John\nDate - 04/05/2017\nProject - Kaggle - Renthop - Women Who Code - Workshop\nProject Description : To predict interest level (Low, Medium, high) for listings. The dataset was provided by Two Sigma and Renthop.\n'

In [76]:
#Import libraries
import numpy as np
import pandas as pd

#Import CV libraries
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

#Import model libraries
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#Import metrics libraries
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss

In [77]:
#Read Data
train_df=pd.read_json("C:/Users/ljohn/Documents/2017/Personal/WWC - Kaggle/Exploring Train.JSON/train.json")
test_df=pd.read_json("C:/Users/ljohn/Documents/2017/Personal/WWC - Kaggle/Exploring Train.JSON/test.json")

In [78]:
#Exclude price outliers from training data
ulimit = np.percentile(train_df.price.values, 99)
train_df=train_df[train_df['price']<ulimit]

In [79]:
#Exclude Latitude longitude outliers from training data
outlier_mask = ( (train_df.latitude < 40.6) | (train_df.latitude > 40.9) | 
                 (train_df.longitude < -74.1) | (train_df.longitude > -73.8) )
train_df = train_df.drop(train_df.index[outlier_mask], axis=0)

In [80]:
#Combine test and train dataset for feature engineering
train_df['test']=False
test_df['test']=True

cdf=train_df.append(test_df).reset_index(drop=True)
cdf.shape

(123265, 16)

In [81]:
#Define New Features
cdf['created']=pd.to_datetime(cdf['created'])
cdf['month']=cdf['created'].dt.month
cdf['weekday']=cdf['created'].dt.weekday
cdf["num_photos"] = cdf["photos"].apply(len)
cdf["num_features"] = cdf["features"].apply(len)
cdf["words_in_description"] = cdf["description"].apply(len)

In [82]:
#Recode variables Month, Weekday
cdf['month']=cdf['month'].astype('str')
cdf['weekday']=cdf['weekday'].astype('str')
df=pd.get_dummies(cdf[['month','weekday']])
df.head()

Unnamed: 0,month_4,month_5,month_6,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [83]:
cdf = cdf.merge(df,left_index=True, right_index=True)

In [84]:
cdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123265 entries, 0 to 123264
Data columns (total 31 columns):
bathrooms               123265 non-null float64
bedrooms                123265 non-null int64
building_id             123265 non-null object
created                 123265 non-null datetime64[ns]
description             123265 non-null object
display_address         123265 non-null object
features                123265 non-null object
interest_level          48606 non-null object
latitude                123265 non-null float64
listing_id              123265 non-null int64
longitude               123265 non-null float64
manager_id              123265 non-null object
photos                  123265 non-null object
price                   123265 non-null int64
street_address          123265 non-null object
test                    123265 non-null bool
month                   123265 non-null object
weekday                 123265 non-null object
num_photos              123265 non-nul

In [85]:
#Encode target variable
class_mapping={label:idx for idx,label in enumerate(['high', 'medium','low'])}
class_mapping

{'high': 0, 'low': 2, 'medium': 1}

In [86]:
cdf['interest_level']=cdf['interest_level'].map(class_mapping)

In [87]:
#Split train and test dataset
df = cdf[cdf.test == False]
tdf = cdf[cdf.test == True]

In [88]:
#Split training dataset into train and test for cross validation
from sklearn.cross_validation import train_test_split
x=df.drop(['interest_level','created','features','building_id','description',\
                               'display_address','listing_id','manager_id',\
           'photos','street_address','test','month','weekday','month_4','weekday_0'],axis=1)
y=df['interest_level']

In [89]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48606 entries, 0 to 48605
Data columns (total 16 columns):
bathrooms               48606 non-null float64
bedrooms                48606 non-null int64
latitude                48606 non-null float64
longitude               48606 non-null float64
price                   48606 non-null int64
num_photos              48606 non-null int64
num_features            48606 non-null int64
words_in_description    48606 non-null int64
month_5                 48606 non-null float64
month_6                 48606 non-null float64
weekday_1               48606 non-null float64
weekday_2               48606 non-null float64
weekday_3               48606 non-null float64
weekday_4               48606 non-null float64
weekday_5               48606 non-null float64
weekday_6               48606 non-null float64
dtypes: float64(11), int64(5)
memory usage: 6.3 MB


In [90]:
#Train Test Split
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.2,random_state=0)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(38884, 16) (9722, 16) (38884,) (9722,)


In [98]:
#Setup Y_test for Log Loss scoring
Y_test_onehot = pd.get_dummies(Y_test)
Y_test_onehot.head()

Unnamed: 0,0.0,1.0,2.0
31895,0.0,0.0,1.0
6582,0.0,0.0,1.0
2508,0.0,0.0,1.0
6730,0.0,0.0,1.0
20986,0.0,0.0,1.0


In [99]:
#Logistic Regression
lr=LogisticRegressionCV(cv=3, multi_class='multinomial', verbose=2, scoring='accuracy', max_iter=5000)
lr.fit(X_train,Y_train)
y_pred=lr.predict_proba(X_test)

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   30.8s finished


In [100]:
y_pred.shape

(9722, 3)

In [101]:
#print("LR: Accuracy Score:",accuracy_score(y_pred,Y_test))
print("LR: Log Loss:", log_loss(Y_test_onehot,y_pred))

LR: Log Loss: 0.731636710802


In [102]:
#Decision Tree
tree=DecisionTreeClassifier(criterion='entropy',max_depth=5,random_state=0)
tree.fit(X_train,Y_train)
y_pred=tree.predict_proba(X_test)

In [103]:
y_pred

array([[ 0.        ,  0.00307692,  0.99692308],
       [ 0.17311609,  0.35709437,  0.46978955],
       [ 0.11329532,  0.32998199,  0.55672269],
       ..., 
       [ 0.14400879,  0.37999267,  0.47599853],
       [ 0.06965799,  0.26074678,  0.66959523],
       [ 0.17311609,  0.35709437,  0.46978955]])

In [104]:
#Plot Decision Tree
from os import system
export_graphviz(tree,out_file='tree.dot',
                feature_names=['bathrooms','bedrooms','latitude','longitude','price','num_photos','num_features'])
system("dot -Tpng tree.dot -o tree.png")

IndexError: list index out of range

In [105]:
print("Tree: Log Loss:", log_loss(Y_test_onehot,y_pred))

Tree: Log Loss: 0.695163223937


In [106]:
#Random Forest
forest=RandomForestClassifier(criterion='entropy',n_estimators=500,verbose=False,max_depth=18)
forest.fit(X_train,Y_train)
y_pred=forest.predict_proba(X_test)

In [107]:
print("Forest: Log Loss:", log_loss(Y_test_onehot,y_pred))

Forest: Log Loss: 0.628146701812


In [None]:
#Any feature engineering required?
#Any other outliers to be removed?
#How to add date features?
#Pipeline function programming
#Deep Learning
#Testing?
#One Hot Encoding
#Multinomial
#Grid Search
#XGBoost
