# Importing stuff

In [1]:
import numpy as np
import sklearn
import pandas as pd
import csv
import math
import matplotlib.pyplot as plt
import datetime
%matplotlib
from sklearn import datasets, linear_model
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics 
from scipy.stats.stats import pearsonr 
pd.options.display.max_columns = 999
from sklearn.metrics import confusion_matrix as cm
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import pickle
from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_iris, load_digits, load_boston

Using matplotlib backend: MacOSX


# Loading data

In [2]:
X_train = pd.read_json('data/train.json')
X_test = pd.read_json('data/test.json') 

Y_train = X_train['interest_level']

## mapping interest level str to int

In [3]:
num_Y_map = {'high':0, 'medium':1, 'low':2}
Y_train = np.array(X_train['interest_level'].apply(lambda x: num_Y_map[x]))

# looking at the data

In [4]:
X_train.pivot_table(index='bedrooms', columns='interest_level', values='price')

interest_level,high,low,medium
bedrooms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2069.439197,2649.370206,2251.518957
1,2152.919406,3418.025437,2636.015191
2,2714.034456,4745.92562,3194.77901
3,3777.537162,5650.445526,4196.131323
4,5305.711409,7776.179238,5639.886654
5,6012.5,10502.658436,5700.0
6,,11002.704545,7625.0
7,,14500.0,6923.0
8,,8247.5,


# Feature engeneering

### Number of photos

In [5]:
X_train['num_photos'] = X_train['photos'].apply(len)
X_test['num_photos'] = X_test['photos'].apply(len)

### Date of creation

In [6]:
X_train['created'] = pd.to_datetime(X_train["created"])
X_test['created'] = pd.to_datetime(X_test["created"])

In [7]:
X_train["created_year"] = X_train["created"].dt.year
X_test["created_year"] = X_test["created"].dt.year
X_train["created_month"] = X_train["created"].dt.month
X_test["created_month"] = X_test["created"].dt.month
X_train["created_day"] = X_train["created"].dt.day
X_test["created_day"] = X_test["created"].dt.day
X_train["created_hour"] = X_train["created"].dt.hour
X_test["created_hour"] = X_test["created"].dt.hour

### Dummies from bedrooms

In [8]:
X_train['bedrooms'] = X_train['bedrooms'].replace(8,7)

In [9]:
X_train = X_train.join(pd.get_dummies(X_train['bedrooms'], prefix='bedrooms'))
del X_train['bedrooms']

X_test = X_test.join(pd.get_dummies(X_test['bedrooms'], prefix='bedrooms'))
del X_test['bedrooms']

### Dealing with managers and buildings

In [10]:
X_train['manager_freq'] = 0
X_test['manager_freq'] = 0

X_train['manager_freq'] = X_train.groupby(['manager_id'])['manager_freq'].transform('count')
X_test['manager_freq'] = X_test.groupby(['manager_id'])['manager_freq'].transform('count')

In [11]:
X_train['building_freq'] = 0
X_test['building_freq'] = 0

X_train['building_freq'] = X_train.groupby(['building_id'])['building_freq'].transform('count')
X_test['building_freq'] = X_test.groupby(['building_id'])['building_freq'].transform('count')

### Dealing with description

In [12]:
X_test['description_feature'] = 0
X_train['description_feature'] = 0

X_test['description_feature'] = X_test['description'].apply(len)
X_train['description_feature'] = X_train['description'].apply(len)

### Dealing with features

In [None]:
FEATURES_LIST = []
for x in range(len(X_train['features'])):
    for y in range(len(X_train['features'][x])):
        FEATURES_LIST.append(X_train['features'][x][y])

In [None]:
FEATURES_LIST = list(set(FEATURES_LIST))

### Distance from mean

In [13]:
from geopy.distance import vincenty

In [14]:
X_test['dist_from_mean'] = 0
X_train['dist_from_mean'] = 0

In [15]:
lat = ((X_train['latitude'].mean()) + (X_test['latitude'].mean()))/2
long = ((X_train['longitude'].mean()) + (X_test['longitude'].mean()))/2

mean_coor = (lat, long)

In [16]:
X_test['act_coor'] = 0
X_train['act_coor'] = 0

In [23]:
X_test['dist_from_mean'] = vincenty((X_test['latitude'], X_test['longitude']), mean_coor)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:

for x in range(len(X_train['act_coor'])):
    act = (X_train['latitude'][x],X_train['latitude'][x])
    X_train['act_coor'][x] = vincenty(mean_coor, act).kilometers

In [None]:
for x in range(len(X_test['act_coor'])):
    act = (X_test['latitude'][x],X_test['latitude'][x])
    X_test['act_coor'][x] = vincenty(mean_coor, act).kilometers

In [21]:
X_test.head()

Unnamed: 0,bathrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,num_photos,created_year,created_month,created_day,created_hour,bedrooms_0,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,bedrooms_6,bedrooms_7,manager_freq,building_freq,description_feature,dist_from_mean,act_coor
0,1.0,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street,8,2016,6,11,5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,38,5,587,0,-33.268
1,1.0,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street,3,2016,6,24,6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,17,12378,245,0,-33.2722
100,1.0,3dbbb69fd52e0d25131aa1cd459c87eb,2016-06-03 04:29:40,New York chic has reached a new level ...,101 East 10th Street,"[Doorman, Elevator, No Fee]",40.7306,7103890,-73.989,9ca6f3baa475c37a3b3521a394d65467,[https://photos.renthop.com/2/7103890_85b33077...,3758,101 East 10th Street,6,2016,6,3,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4,6,1211,0,-33.2584
1000,1.0,783d21d013a7e655bddc4ed0d461cc5e,2016-06-11 06:17:35,Step into this fantastic new Construction in t...,South Third Street\r,"[Roof Deck, Balcony, Elevator, Laundry in Buil...",40.7109,7143442,-73.9571,0b9d5db96db8472d7aeb67c67338c4d2,[https://photos.renthop.com/2/7143442_0879e9e0...,3300,251 South Third Street\r,6,2016,6,11,6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,117,2,1327,0,-33.2462
100000,2.0,6134e7c4dd1a98d9aee36623c9872b49,2016-04-12 05:24:17,"~Take a stroll in Central Park, enjoy the ente...","Midtown West, 8th Ave","[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.765,6860601,-73.9845,b5eda0eb31b042ce2124fd9e9fcfce2f,[https://photos.renthop.com/2/6860601_c96164d8...,4900,260 West 54th Street,7,2016,4,12,5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,96,161,1162,0,-33.2195


## Getting ready to GO

In [74]:
X_train = X_train.reset_index()
X_test = X_test.reset_index()

In [77]:
X_train.columns

Index(['level_0', 'index', 'bathrooms', 'building_id', 'created',
       'description', 'display_address', 'features', 'interest_level',
       'latitude', 'listing_id', 'longitude', 'manager_id', 'photos', 'price',
       'street_address', 'num_photos', 'created_year', 'created_month',
       'created_day', 'created_hour', 'bedrooms_0', 'bedrooms_1', 'bedrooms_2',
       'bedrooms_3', 'bedrooms_4', 'bedrooms_5', 'bedrooms_6', 'bedrooms_7',
       'KM', 'manager_freq'],
      dtype='object')

In [97]:
features_to_use = ['bathrooms',
       'latitude','listing_id', 'longitude', 'price',
       'num_photos', 'created_year', 'created_month',
       'created_day', 'created_hour', 'bedrooms_0', 'bedrooms_1', 'bedrooms_2',
       'bedrooms_3', 'bedrooms_4', 'bedrooms_5', 'bedrooms_6', 'bedrooms_7', 'manager_freq', 
        'building_freq', 'description_feature', 'KM']

# Kmeans clustering

In [88]:
from sklearn.cluster import KMeans

In [89]:
kmeans = KMeans(n_clusters=30, random_state=42).fit(X_train[features_to_use])
X_train = X_train.join(pd.DataFrame(kmeans.labels_))
X_train['KM'] = X_train[0]
del X_train[0]

kmeans2 = KMeans(n_clusters=30, random_state=42).fit(X_test[features_to_use])
X_test = X_test.join(pd.DataFrame(kmeans2.labels_))
X_test['KM'] = X_test[0]
del X_test[0]

In [20]:
X_train[features_to_use].head()

Unnamed: 0,bathrooms,latitude,listing_id,longitude,price,num_photos,created_year,created_month,created_day,created_hour,bedrooms_0,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,bedrooms_6,bedrooms_7,KM
0,1.5,40.7145,7211212,-73.9425,3000,5,2016,6,24,7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8
1,1.0,40.7947,7150865,-73.9667,5465,11,2016,6,12,12,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,16
2,1.0,40.7388,6887163,-74.0018,2850,8,2016,4,17,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,23
3,1.0,40.7539,6888711,-73.9677,3275,3,2016,4,18,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,23
4,1.0,40.8241,6934781,-73.9493,3350,3,2016,4,28,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,14


# Implementing XGBoost

In [79]:
Y_train = pd.Series(Y_train)

In [80]:
rng = np.random.RandomState(3)
kf = KFold(len(Y_train), n_folds=5, shuffle=True, random_state=rng)

In [98]:
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='multi:softprob', reg_alpha=1, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1).fit(X_train[features_to_use].iloc[train_index],Y_train.iloc[train_index])
    predictions = xgb_model.predict_proba(X_train[features_to_use].iloc[test_index])
    actuals = Y_train.iloc[test_index]
    print(sklearn.metrics.log_loss(actuals, predictions))

0.562620341121
0.57851392097
0.56808782298
0.581703368169
0.576483635706


learning_rate=0.05
n_estimators=700

0.599808973253
0.612607784257
0.606060899577
0.619159023335
0.611381953856

------
+ building and manager freq
0.563473640155
0.581334070483
0.571320501188
0.586356014218
0.577214906643

------
+ description feature
0.57851392097
0.56808782298
0.581703368169
0.576483635706

------
+ description feature
0.562620341121
0.57851392097
0.56808782298
0.581703368169
0.576483635706

------
+ distance from mean

In [37]:
xgb.plot_importance(xgb_model)

<matplotlib.axes._subplots.AxesSubplot at 0x13a54acc0>

In [27]:
xgb_model_gs = xgb.XGBClassifier()
clf = GridSearchCV(xgb_model_gs,
                   {'max_depth': [6],
                    'learning_rate': [0.05],
                    'reg_alpha':[1],
                    'reg_lambda':[1],
                    'objective':['multi:softprob'],
                    'n_estimators':[700]},
                   verbose=1)
clf.fit(X_train[features_to_use],Y_train)
print(clf.best_estimator_)
print(clf.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  7.3min finished


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='multi:softprob', reg_alpha=1, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
{'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 1000, 'objective': 'multi:softprob', 'max_depth': 6, 'learning_rate': 0.02}


# Generating result

In [31]:
xgb_model = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='multi:softprob', reg_alpha=1, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1).fit(X_train[features_to_use],Y_train)

In [33]:
predictions = xgb_model.predict_proba(X_test[features_to_use])

In [34]:
predictions = pd.DataFrame(predictions).join(X_test['listing_id'])
predictions.columns = ['high', 'medium', 'low', 'listing_id']
predictions = predictions[['listing_id', 'high', 'medium', 'low']]

In [35]:
predictions.head()

Unnamed: 0,listing_id,high,medium,low
0,7142618,0.061105,0.365431,0.573464
1,7210040,0.595864,0.237663,0.166472
2,7103890,0.018516,0.127029,0.854455
3,7143442,0.02146,0.248896,0.729643
4,6860601,0.014135,0.196159,0.789706


In [36]:
predictions.to_csv('results/result2.csv', index = False)