## Preparing the data set

The first step before running the clustering algorithm is to prepare the training and the testing data set. 


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss


sns.set(style="whitegrid", color_codes=True)
sns.palplot(sns.color_palette("RdBu", n_colors=7))

fileNameTrain = "C:\\Users\\sevda\\Documents\\Data Lab\\Six sigma rental property\\train.json\\train.json"
train_df = pd.read_json(fileNameTrain)

fileNameTest = "C:\\Users\\sevda\\Documents\\Data Lab\\Six sigma rental property\\test.json\\test.json"
test_df = pd.read_json(fileNameTest)



As a next step, we will extract the key words from the description variable - by key words, we define words that are in the description of the unit but are not stop words as defined by the ntlk.corpus package.

In [2]:
from nltk.corpus import stopwords

cachedStopWords = stopwords.words("english")

description_key_words_ls = []

for ind, row in train_df.iterrows():
        #print(row['features'])
        #description = description.lower()
        description = row['description'].lower().rstrip(',?!.')
        description = ' '.join([word for word in description.split() if word not in cachedStopWords])
        description_ls = description.split(" ")
        description_key_words_ls += [description_ls]

train_df['description_key_words'] = pd.Series(description_key_words_ls, index=train_df.index)

description_key_words_ls = []

for ind, row in test_df.iterrows():
        #print(row['features'])
        #description = description.lower()
        description = row['description'].lower().rstrip(',?!.')
        description = ' '.join([word for word in description.split() if word not in cachedStopWords])
        description_ls = description.split(" ")
        description_key_words_ls += [description_ls]

test_df['description_key_words'] = pd.Series(description_key_words_ls, index=test_df.index)

We will create two numeric variables which describe the number of features and number of key words in the description section

In [3]:
train_df['num_features'] = train_df.features.apply(len)
train_df['num_key_words_description'] = train_df.description_key_words.apply(len)

test_df['num_features'] = test_df.features.apply(len)
test_df['num_key_words_description'] = test_df.description_key_words.apply(len)

From the Created variable, we will extract into new variables the exact data when the listing was created, the day of year, week of year, weekday and hour.

In [4]:
train_df["created"] = pd.to_datetime(train_df["created"])
train_df["date"]= train_df["created"].dt.date

train_df["dayofyear"] = train_df["created"].dt.dayofyear
train_df["weekofyear"] = train_df["created"].dt.weekofyear
train_df["weekday"] = train_df["created"].dt.weekday
train_df["hour"] = train_df["created"].dt.hour

test_df["created"] = pd.to_datetime(test_df["created"])
test_df["date"]= test_df["created"].dt.date

test_df["dayofyear"] = test_df["created"].dt.dayofyear
test_df["weekofyear"] = test_df["created"].dt.weekofyear
test_df["weekday"] = test_df["created"].dt.weekday
test_df["hour"] = test_df["created"].dt.hour

We will also add the number of photos of each listing as a new variable in the training and testing dataframe.

In [5]:
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)


Another step is to create two variables which describe the price per bathroom and price per bedroom. 

In [6]:
train_df["price_per_bathroom"] = train_df["price"]/(train_df["bathrooms"] + 1)
train_df["price_per_bedroom"] = train_df["price"]/(train_df["bedrooms"] + 1)

test_df["price_per_bathroom"] = test_df["price"]/(test_df["bathrooms"] + 1)
test_df["price_per_bedroom"] = test_df["price"]/(test_df["bedrooms"] + 1)

We will also use the manager id as an indicator of how much interest there may be in a listing based on who is managing the property.

In [7]:
from sklearn import preprocessing

lbl = preprocessing.LabelEncoder()
lbl.fit(list(train_df['manager_id'].values) + list(test_df['manager_id'].values))
train_df['manager_id'] = lbl.transform(list(train_df['manager_id'].values))

test_df['manager_id'] = lbl.transform(list(test_df['manager_id'].values))


lbl = preprocessing.LabelEncoder()
lbl.fit(list(train_df['building_id'].values) + list(test_df['building_id'].values))
train_df['building_id'] = lbl.transform(list(train_df['building_id'].values))

test_df['building_id'] = lbl.transform(list(test_df['building_id'].values))

Some buildings are more often seen in rental ads than others. We will introduce a number of few variables which describe how often the building is been seen in a rental post. https://www.kaggle.com/visnaga/two-sigma-connect-rental-listing-inquiries/xgboost-for-the-millionth-time-0-54724-lb

In [9]:
buildings_count = train_df['building_id'].value_counts()

train_df['top_10_building'] = train_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 90)] else 0)
train_df['top_25_building'] = train_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 75)] else 0)
train_df['top_5_building'] = train_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 95)] else 0)
train_df['top_50_building'] = train_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 50)] else 0)
train_df['top_1_building'] = train_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 99)] else 0)
train_df['top_2_building'] = train_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 98)] else 0)
train_df['top_15_building'] = train_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 85)] else 0)
train_df['top_20_building'] = train_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 80)] else 0)
train_df['top_30_building'] = train_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 70)] else 0)

test_df['top_10_building'] = test_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 90)] else 0)
test_df['top_25_building'] = test_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 75)] else 0)
test_df['top_5_building'] = test_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 95)] else 0)
test_df['top_50_building'] = test_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 50)] else 0)
test_df['top_1_building'] = test_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 99)] else 0)
test_df['top_2_building'] = test_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 98)] else 0)
test_df['top_15_building'] = test_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 85)] else 0)
test_df['top_20_building'] = test_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 80)] else 0)
test_df['top_30_building'] = test_df['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 70)] else 0)

In [10]:
print(train_df.columns.values)

['bathrooms' 'bedrooms' 'building_id' 'created' 'description'
 'display_address' 'features' 'interest_level' 'latitude' 'listing_id'
 'longitude' 'manager_id' 'photos' 'price' 'street_address'
 'description_key_words' 'num_features' 'num_key_words_description' 'date'
 'dayofyear' 'weekofyear' 'weekday' 'hour' 'num_photos'
 'price_per_bathroom' 'price_per_bedroom' 'top_10_building'
 'top_25_building' 'top_5_building' 'top_50_building' 'top_1_building'
 'top_2_building' 'top_15_building' 'top_20_building' 'top_30_building']


After all the variable transformations, we will apply now the Random Forest Algorithm. The algorithm is copied from https://www.kaggle.com/den3b81/two-sigma-connect-rental-listing-inquiries/improve-perfomances-using-manager-features

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss


selected_vars  = ["bathrooms", "bedrooms", "price", "num_features", "num_key_words_description",
                   "dayofyear", "weekofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
                    "building_id", "manager_id", "price_per_bathroom", "price_per_bedroom", "top_1_building", 
                    "top_5_building", "top_10_building", "top_25_building", "top_50_building"]


X = train_df[selected_vars]
y = train_df["interest_level"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)

clf = RandomForestClassifier(n_estimators=1000)
result  = clf.fit(X_train[selected_vars], y_train)
y_val_pred = clf.predict_proba(X_val[selected_vars])
log_loss(y_val, y_val_pred)

0.59364727636485259

In [27]:
selected_vars = np.array(selected_vars)
importances = result.feature_importances_
important_names = selected_vars[importances > np.mean(importances)]
print (important_names)

['price' 'num_features' 'num_key_words_description' 'dayofyear' 'hour'
 'num_photos' 'latitude' 'longitude' 'building_id' 'manager_id'
 'price_per_bathroom' 'price_per_bedroom']


In [23]:
test_predprob = clf.predict_proba(test_df[selected_vars])
labels2idx = {label: i for i, label in enumerate(clf.classes_)}
labels2idx


{'high': 0, 'low': 1, 'medium': 2}

In [24]:
out_df = pd.DataFrame()
out_df["listing_id"] = test_df["listing_id"]
for label in ["high", "medium", "low"]:
    out_df[label] = test_predprob[:, labels2idx[label]]
out_df.to_csv("random_forest_results_v4.csv", index=False)