## Preparing the data set

The first step before running the classification algorithm is to prepare the training and the testing data sets.


In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Seaborn styling, followed by a preview of a 7-colour "RdBu" palette.
sns.set(style="whitegrid", color_codes=True)
sns.palplot(sns.color_palette("RdBu", n_colors=7))

# Locations of the training and testing JSON files.
fileNameTrain = "C:\\Users\\sevda\\Documents\\Data Lab\\Six sigma rental property\\train.json\\train.json"
fileNameTest = "C:\\Users\\sevda\\Documents\\Data Lab\\Six sigma rental property\\test.json\\test.json"

# Read each file into its own DataFrame (training file first, then testing).
train_df, test_df = (pd.read_json(path) for path in (fileNameTrain, fileNameTest))



As a next step, we will extract the key words from the description variable. We define key words as the words that appear in the description of the unit but are not stop words as defined by the nltk.corpus package.

In [83]:
from nltk.corpus import stopwords

cachedStopWords = stopwords.words("english")


def _extract_key_words(description, stop_words):
    """Return the words of a listing description that are not stop words.

    The text is lower-cased, any run of ',', '?', '!' or '.' at the very end
    of the description is stripped, and the remainder is split on whitespace.

    Parameters
    ----------
    description : str
        Raw description text of one listing.
    stop_words : container of str
        Words to discard (membership is tested with ``in``).

    Returns
    -------
    list of str
        The remaining words in their original order.  A description that is
        empty or consists only of stop words yields an empty list, so its
        key-word count is 0 (joining and re-splitting on " " would instead
        produce [''] and a count of 1).
    """
    cleaned = description.lower().rstrip(',?!.')
    return [word for word in cleaned.split() if word not in stop_words]


# Set membership is a hash lookup; testing against the list scans it per word.
_stop_word_set = frozenset(cachedStopWords)

# Same extraction for both frames; the new column keeps each frame's index.
for _frame in (train_df, test_df):
    _frame['description_key_words'] = pd.Series(
        [_extract_key_words(text, _stop_word_set) for text in _frame['description']],
        index=_frame.index)

We will create two numeric variables which describe the number of features and the number of key words in the description section.

In [84]:
# For the training and the testing frame alike: length of the raw feature
# list and length of the extracted key-word list of every listing.
for frame in (train_df, test_df):
    frame['num_features'] = frame['features'].apply(len)
    frame['num_key_words_description'] = frame['description_key_words'].apply(len)

From the `created` variable, we will extract into new variables the exact date when the listing was created, the day of year, week of year, weekday and hour.

In [85]:
def _week_of_year(timestamps):
    """Return the ISO week number of every timestamp in a datetime Series.

    ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in 2.0,
    so ``Series.dt.isocalendar().week`` is used whenever it is available; on
    older pandas the original accessor is used.  The ``isocalendar`` result
    is a nullable unsigned integer, so it is cast back to a plain NumPy dtype:
    ``int64`` when every value is present, ``float64`` (NaN for missing
    timestamps) otherwise.
    """
    accessor = timestamps.dt
    if hasattr(accessor, "isocalendar"):
        week = accessor.isocalendar().week
        return week.astype("float64" if week.isna().any() else "int64")
    return accessor.weekofyear


# Identical calendar features for the training and the testing frame.
for frame in (train_df, test_df):
    frame["created"] = pd.to_datetime(frame["created"])
    created = frame["created"]

    frame["date"] = created.dt.date
    frame["dayofyear"] = created.dt.dayofyear
    frame["weekofyear"] = _week_of_year(created)
    frame["weekday"] = created.dt.weekday
    frame["hour"] = created.dt.hour

We will also add the number of photos of each listing as a new variable in the training and testing dataframe.

In [86]:
# Number of entries in each listing's photo list, for both frames.
for frame in (train_df, test_df):
    frame["num_photos"] = frame["photos"].apply(len)


Another step is to create two variables which describe the price per bathroom and price per bedroom. 

In [87]:
# Price divided by (room count + 1), computed the same way for both frames.
for frame in (train_df, test_df):
    price = frame["price"]
    frame["price_per_bathroom"] = price.div(frame["bathrooms"] + 1)
    frame["price_per_bedroom"] = price.div(frame["bedrooms"] + 1)

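We will also label-encode an identifier column so that it can serve as an indicator of how much interest there may be in a listing. Note that the cell below encodes the building id (`building_id`); the manager id (`manager_id`) is not transformed here.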

In [88]:
from sklearn import preprocessing

# Collect the ids of both frames first: the encoder is fitted on their
# concatenation, so train and test share one mapping from id to integer code.
train_building_ids = list(train_df['building_id'].values)
test_building_ids = list(test_df['building_id'].values)

lbl = preprocessing.LabelEncoder()
lbl.fit(train_building_ids + test_building_ids)

# Overwrite the original column with the integer codes.
train_df['building_id'] = lbl.transform(train_building_ids)
test_df['building_id'] = lbl.transform(test_building_ids)

Some buildings are seen more often in rental ads than others. We will introduce a few variables which describe how often a building has been seen in a rental post.

As a next step in the data preparation, we will add the neighbourhood as another variable.

In [89]:
import shapefile

sf = shapefile.Reader("C:\\Users\\sevda\\Documents\\Data Lab\\Six sigma rental property\\ZillowNeighborhoods-NY\\ZillowNeighborhoods-NY.shp")

shapes = sf.shapes()
records = sf.records()

# Record fields 2 and 3 are used as town and neighbourhood name; each shape's
# bbox supplies the four edges in the order west, south, east, north.
towns_values = [records[i][2] for i in range(len(records))]
neighb_values = [records[i][3] for i in range(len(records))]
west_values = [shapes[i].bbox[0] for i in range(len(records))]
south_values = [shapes[i].bbox[1] for i in range(len(records))]
east_values = [shapes[i].bbox[2] for i in range(len(records))]
north_values = [shapes[i].bbox[3] for i in range(len(records))]

# Only neighbourhoods whose bounding box lies entirely inside this window are kept.
west, south, east, north = -74.02, 40.64, -73.85, 40.86

neighbourhood_pd = pd.DataFrame({'Town' : towns_values,
                                 'Neighbourhood' : neighb_values,
                                 'West' : west_values,
                                 'South' : south_values,
                                 'East' : east_values,
                                 'North' : north_values})

neighbourhood_pd = neighbourhood_pd[neighbourhood_pd.Town == "New York"]
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
neighbourhood_pd = neighbourhood_pd.loc[(neighbourhood_pd.West >= west) &
                                        (neighbourhood_pd.East <= east) &
                                        (neighbourhood_pd.South >= south) &
                                        (neighbourhood_pd.North <= north)]

neighbourhood_sorted_pd = neighbourhood_pd.sort_values(['West'])


def _lookup_neighbourhoods(frame, boxes):
    """Name the neighbourhood whose bounding box contains each listing.

    Parameters
    ----------
    frame : DataFrame
        Listings with ``longitude`` and ``latitude`` columns.
    boxes : DataFrame
        One row per neighbourhood with ``West``, ``East``, ``South``,
        ``North`` and ``Neighbourhood`` columns.

    Returns
    -------
    Series
        Indexed like ``frame``.  For every listing, the name of the first row
        of ``boxes`` (in its current row order) whose box strictly contains
        the point, or "Other" when no box does.
    """
    # Pull the columns out once instead of filtering the DataFrame per listing.
    box_west = boxes.West.values
    box_east = boxes.East.values
    box_south = boxes.South.values
    box_north = boxes.North.values
    box_names = boxes.Neighbourhood.values

    names = []
    for lon, lat in zip(frame.longitude.values, frame.latitude.values):
        hits = np.flatnonzero((box_west < lon) & (box_east > lon) &
                              (box_south < lat) & (box_north > lat))
        names.append(box_names[hits[0]] if hits.size else "Other")
    return pd.Series(names, index=frame.index)


train_df['neighbourhood'] = _lookup_neighbourhoods(train_df, neighbourhood_sorted_pd)
test_df['neighbourhood'] = _lookup_neighbourhoods(test_df, neighbourhood_sorted_pd)

In [95]:
# Remove price columns left behind by an earlier run of this or a similar
# cell.  errors='ignore' keeps the cell runnable on a fresh kernel (where none
# of them exist), and dropping "price_median" as well prevents the merge below
# from producing "price_median_x" / "price_median_y" on a re-run.
stale_price_columns = ["price_median", "price_median_x", "price_median_y", "price_diff"]
train_df = train_df.drop(stale_price_columns, axis=1, errors='ignore')
test_df = test_df.drop(stale_price_columns, axis=1, errors='ignore')

# Median training price for every (bedrooms, neighbourhood) combination.
group_keys = ["bedrooms", "neighbourhood"]
train_group_by = train_df[group_keys + ["price"]].groupby(group_keys).median()
train_group_by = train_group_by.add_suffix('_median').reset_index()

# A left merge keeps every listing.  An inner merge would silently drop test
# listings whose combination never occurs in the training data.
train_df = train_df.merge(train_group_by, on=group_keys, how='left')
test_df = test_df.merge(train_group_by, on=group_keys, how='left')

# Listings without a matching group fall back to the training median for
# their bedroom count, then to the overall training median.
bedroom_price_median = train_df.groupby("bedrooms")["price"].median()
overall_price_median = train_df["price"].median()

for frame in (train_df, test_df):
    fallback = frame["bedrooms"].map(bedroom_price_median).fillna(overall_price_median)
    frame["price_median"] = frame["price_median"].fillna(fallback)
    frame["price_diff"] = frame["price"] - frame["price_median"]


In [96]:
# Print the array of column names currently present in the training frame.
print(train_df.columns.values)

['bathrooms' 'bedrooms' 'building_id' 'created' 'description'
 'display_address' 'features' 'interest_level' 'latitude' 'listing_id'
 'longitude' 'manager_id' 'photos' 'price' 'street_address'
 'description_key_words' 'num_features' 'num_key_words_description' 'date'
 'dayofyear' 'weekofyear' 'weekday' 'hour' 'num_photos'
 'price_per_bathroom' 'price_per_bedroom' 'neighbourhood' 'price_median'
 'price_diff']


In [90]:
# NOTE(review): this cell writes the same "price_median" / "price_diff" columns
# as the (bedrooms, neighbourhood) cell above and, when the notebook is run top
# to bottom, replaces them with the bedroom-only version.  Its execution counter
# suggests it was originally run earlier - confirm which version should feed
# the model.

# Remove price columns from any earlier run so that the merge below cannot
# collide on "price_median"; errors='ignore' tolerates columns that are absent.
stale_price_columns = ["price_median", "price_median_x", "price_median_y", "price_diff"]
train_df = train_df.drop(stale_price_columns, axis=1, errors='ignore')
test_df = test_df.drop(stale_price_columns, axis=1, errors='ignore')

# Median training price per bedroom count.
train_group_by = train_df[["bedrooms", "price"]].groupby(["bedrooms"]).median()
train_group_by = train_group_by.add_suffix('_median').reset_index()

# A left merge keeps every listing.  An inner merge would silently drop test
# listings whose bedroom count never occurs in the training data.
train_df = train_df.merge(train_group_by, on=['bedrooms'], how='left')
test_df = test_df.merge(train_group_by, on=['bedrooms'], how='left')

# Listings without a matching bedroom count fall back to the overall
# training median.
overall_price_median = train_df["price"].median()

for frame in (train_df, test_df):
    frame["price_median"] = frame["price_median"].fillna(overall_price_median)
    frame["price_diff"] = frame["price"] - frame["price_median"]


We will also use a variable describing which neighbourhood the unit is in.

After all the variable transformations, we will now apply the Random Forest algorithm. The algorithm is copied from https://www.kaggle.com/den3b81/two-sigma-connect-rental-listing-inquiries/improve-perfomances-using-manager-features

In [102]:
from sklearn.ensemble import RandomForestClassifier
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # scikit-learn < 0.18 only ships the old module, which was removed in 0.20.
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
import random

# One seed for everything stochastic in this cell.  scikit-learn does not read
# the state of Python's `random` module, so the seed is passed explicitly to
# the split and to the forest through `random_state`.
SEED = 123
random.seed(SEED)

selected_vars  = ["bathrooms", "bedrooms", "price", "num_features", "num_key_words_description",
                   "dayofyear", "weekofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
                    "building_id", "price_diff"]


X = train_df[selected_vars]
y = train_df["interest_level"]
# Hold out 33% of the training listings for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=SEED)

clf = RandomForestClassifier(n_estimators=1000, random_state=SEED)
# X_train / X_val already contain exactly the selected columns.
result  = clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
# Validation log loss; as the last expression it is shown as the cell output.
log_loss(y_val, y_val_pred)

0.59070397469461389

We will explore which are the most important variables used for the random forest algorithm.

In [103]:
# Turn the variable names into an array so they can be filtered with a mask.
selected_vars = np.array(selected_vars)
importances = result.feature_importances_

# Keep the variables whose importance is above the mean importance.
above_average = importances > importances.mean()
important_names = selected_vars[above_average]
print(important_names)

['price' 'num_key_words_description' 'latitude' 'longitude' 'building_id'
 'price_diff']


In [104]:
# Class probabilities for every test listing; columns follow clf.classes_.
test_predprob = clf.predict_proba(test_df[selected_vars])
labels2idx = dict((label, position) for position, label in enumerate(clf.classes_))

# Start from the listing ids, then add one probability column per label in
# the fixed order high, medium, low.
out_df = pd.DataFrame()
out_df["listing_id"] = test_df["listing_id"]
for label in ("high", "medium", "low"):
    column = labels2idx[label]
    out_df[label] = test_predprob[:, column]

out_df.to_csv("random_forest_results_v5.csv", index=False)