In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import  preprocessing
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold,StratifiedKFold



In [26]:
import re

In [2]:
#A lot of fucntions, mostly of which for etl, are saved in mochi.py
from mochi import *
import pickle
import datetime

In [3]:
"""
Loading data.
The might-be0leakage feature is generously shared in 
https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/discussion/31870
"""
data_path = "/data/kaggleData/2sigma/"

pic_file = data_path + "listing_image_time.csv"
pic_df = pd.read_csv(pic_file).set_index('Listing_Id')

train_df = pd.read_json(data_path + 'train.json')
test_df = pd.read_json(data_path+'test.json')

train_df=train_df.join(pic_df,on='listing_id',how='left')
test_df=test_df.join(pic_df,on='listing_id',how='left')

train_df.to_json(data_path + "train_t.json")
test_df.to_json(data_path + "test_t.json")

In [4]:
"""
use the dictionary to manage the feature sets,
which will be added to and validate in the model sequentially later
"""

feature_set_dict = {}
feature_set_dict['basic_numerical']=["bathrooms", "bedrooms", "latitude", "longitude", "price",'time_stamp']
feature_set_dict['basic_categorical'] = ["display_address", "street_address",'building_id','manager_id']

In [5]:
"""features constructed by useing the non structral part size"""

train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

feature_set_dict['unstructured_derived_numerical']=['num_photos','num_features','num_description_words']

In [6]:
"""
feature constructed from the "created"
"""

#convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

#for latter use
train_df["dayofyear"] = train_df["created"].dt.dayofyear
test_df["dayofyear"] = test_df["created"].dt.dayofyear

feature_set_dict['created_time_derived']=['created_month','created_day','created_hour']

In [11]:
"""
Features constucted from price
"""
#some new numerical features related to the price
train_df["price_per_bed"] = ((train_df["price"]+1e-7)/(train_df["bedrooms"]+1e-7))
train_df["bath_per_bed"] = ((train_df["bathrooms"]+1e-7)/(train_df["bedrooms"]+1e-7))
train_df["price_per_room"] = ((train_df["price"]+1e-7)/(train_df["bedrooms"]+train_df["bathrooms"]+1e-7))

test_df["price_per_bed"] = ((test_df["price"]+1e-7)/(test_df["bedrooms"]+1e-7))
test_df["bath_per_bed"] = ((test_df["bathrooms"]+1e-7)/(test_df["bedrooms"]+1e-7))
test_df["price_per_room"] = ((test_df["price"]+1e-7)/(test_df["bedrooms"]+test_df["bathrooms"]+1e-7))

feature_set_dict['price_and_room_related']=["price_per_bed","bath_per_bed","price_per_room"]

In [7]:
"""
new categorical data generated from the old ones
"""

#new feature for the street_address, use them instead of the original one
train_df["street_name"] = train_df["street_address"].apply(proecessStreet)
test_df["street_name"] = test_df["street_address"].apply(proecessStreet)

train_df['building0']=map(lambda x:1 if x== '0' else 0,train_df['building_id'])
test_df['building0']=map(lambda x:1 if x== '0' else 0,test_df['building_id'])

#the house type
train_df['house_type']=map(lambda x,y:(x,y),train_df['bedrooms'],train_df['bathrooms'])
train_df['house_type'] = train_df['house_type'].apply(str)
test_df['house_type']=map(lambda x,y:(x,y),test_df['bedrooms'],test_df['bathrooms'])
test_df['house_type'] = test_df['house_type'].apply(str)

feature_set_dict['new_categoricals'] = ['street_name','building0','house_type']

In [8]:
"""
feature of the house,
selecting the those appeared in the data set for at least 0.001
"""

#preprocessing for features
train_df["features"] = train_df["features"].apply(lambda x:["_".join(i.split(" ")).lower().strip().replace('-','_') 
                                                            for i in x])
test_df["features"] = test_df["features"].apply(lambda x:["_".join(i.split(" ")).lower().strip().replace('-','_') 
                                                          for i in x])

#create the accept list
accept_list = list(featureList(train_df,test_df,limit = 0.001))

#map the feature to dummy slots
featureMapping(train_df,test_df,accept_list)

feature_set_dict['featured_derived'] = map(lambda x : 'with_'+x,accept_list)

In [9]:
"""
preprocess for the spatial data and generate the spatial cluster id 
"""

processMap(train_df)
processMap(test_df)

train_df = train_df.fillna(-1)
test_df=test_df.fillna(-1)

getCluster(train_df,test_df,30)
getCluster(train_df,test_df,10)

feature_set_dict['cluseter_id'] = ['cluster_id_10','cluster_id_30']

In [10]:
"""
The manager perfomace based features
K-FOLD construction of the statistic features, for avoiding overfitting
"""

skf=KFold(len(train_df['interest_level']),5,shuffle=True,random_state = 42)

#dev set adding manager skill
for train,test in skf:
        performance_eval(train_df.iloc[train,:],train_df.iloc[test,:],feature='manager_id',
                       update_df = train_df,smoothing=False)
        temporalManagerPerf_f(train_df.iloc[train,:],train_df.iloc[test,:],update_df = train_df)
                
performance_eval(train_df,test_df,feature='manager_id',smoothing=False)
temporalManagerPerf_f(train_df,test_df)

feature_set_dict['manager_performance'] = ['manager_id_perf','m3perf','m7perf','m14perf','m30perf',
                                          'm3perf_f','m7perf_f','m14perf_f','m30perf_f','manager_id_nrank']

In [12]:
"""
Constructing some statistics features to better discribe some categorical features
"""

#features that will be used to construct some statistical features
main_st_nf = ["bathrooms", "bedrooms","price_per_bed","bath_per_bed","price_per_room","num_photos", "num_features", "num_description_words",'price']
main_statistics =['mean','max','min','median']

for f in main_st_nf:
    #print f
    categorical_statistics(train_df,test_df,'manager_id',f)
    categorical_statistics(train_df,test_df,'house_type',f)
    categorical_size(train_df,test_df,'manager_id')
    categorical_size(train_df,test_df,'house_type')
    
    categorical_statistics(train_df,test_df,'cluster_id_10',f)
    categorical_statistics(train_df,test_df,'cluster_id_30',f)

manager_lon_lat(train_df,test_df)

temp_feature_set = []
for st in main_statistics:
    temp_feature_set.extend(map(lambda x : 'manager_id_'+x+'_'+st,main_st_nf))
    temp_feature_set.extend(map(lambda x : 'house_type_'+x+'_'+st,main_st_nf)) 
temp_feature_set.extend(['m_m_distance','mlon','mlat'])
temp_feature_set.extend(['manager_id_size','house_type_size'])
feature_set_dict['manager_and_house_sts'] = temp_feature_set 

temp_feature_set = []    
temp_feature_set.extend(map(lambda x : 'cluster_id_10_'+x+'_'+'mean',main_st_nf))
temp_feature_set.extend(map(lambda x : 'cluster_id_30_'+x+'_'+'mean',main_st_nf))
feature_set_dict['spatial_sts'] = temp_feature_set 

In [13]:
"""
Encoding the categorical futures to numbers for xgb/lgbm usage
"""
#encoding the categorical data into numerical type
for f in feature_set_dict['basic_categorical'] + feature_set_dict['new_categoricals'] :
    if train_df[f].dtype=='object':
        #print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f])+list(test_df[f]))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))

In [15]:
print train_df.shape
print test_df.shape

(49352, 292)
(74659, 291)


In [20]:
"""
save the train_set and the test set for latter use for tree based models
also save the feature_set_dict
"""
train_df.to_json(data_path+'processed_train.json',force_ascii = False)
test_df.to_json(data_path+'processed_test.json',force_ascii = False)

pd.to_pickle(feature_set_dict,data_path+'feature_set_dict.pkl')

In [34]:
"""
generating normalized feature set for linear regression and neural network
"""

#setting the feature sets for latter processing

numericals = [u'bath_per_bed',u'bathrooms',u'bedrooms',u'building0',u'cluster_id_10_d',u'cluster_id_30_d',u'dayofyear',
 u'latitude',u'listing_id',u'longitude',u'm14perf',u'm14perf_f',u'm30perf',u'm30perf_f',u'm3perf',u'm3perf_f',
 u'm7perf',u'm7perf_f',u'm_c_distance',u'm_m_distance',u'manager_id_nrank',u'manager_id_perf',u'mlat',
 u'mlon',u'num_description_words',u'num_features', u'num_photos',
 u'price',u'price_per_bed',u'price_per_room',]

numerical_may_processed = [ u'created_day',u'created_hour',u'created_month',u'time_stamp'] 

new_hcc_categoricals=[ u'building_id',u'cluster_id_10',u'cluster_id_30',u'street_name']
hcc_categoricals = ['manager_id','house_type']

#statiscals
statistical = []
for feature in test_df.columns:
    if re.match('((manager_id)|(house_type))\S+((mean)|(median)|(min)|(max))',feature) !=None:
        statistical.append(feature)

#features
with_feat = []
for feature in test_df.columns:
    if re.match('with_\S+',feature) !=None:
        with_feat.append(feature)



In [24]:
#adding some features for the properties of time circulation
transferred=['another_day','another_hour']

train_df["created"] = pd.to_datetime(train_df["created"],unit='ms')
test_df["created"] = pd.to_datetime(test_df["created"],unit='ms')

train_df['another_day']=(train_df['created']+pd.tseries.offsets.DateOffset(days=15)).dt.day
train_df['another_hour']=(train_df['created']+pd.tseries.offsets.DateOffset(hours=12)).dt.hour
test_df['another_day']=(test_df['created']+pd.tseries.offsets.DateOffset(days=15)).dt.day
test_df['another_hour']=(test_df['created']+pd.tseries.offsets.DateOffset(hours=12)).dt.hour

#creating some more statistical features as it is harder for linear models to get overfitted
new_hcc_feature = []

skf=KFold(len(train_df['interest_level']),5,shuffle=True,random_state = 42)
#hcc encoding for the old hcc features
for feature in hcc_categoricals:    
    for train,test in skf:
        hcc_scoring(train_df.iloc[train,:],train_df.iloc[test,:],feature,'high',\
                   update_df = train_df)
        hcc_scoring(train_df.iloc[train,:],train_df.iloc[test,:],feature,'medium',\
                   update_df = train_df)

    hcc_scoring(train_df,test_df,feature,'high')
    hcc_scoring(train_df,test_df,feature,'medium')
    new_hcc_feature.append('hcc_'+feature+'_high')
    new_hcc_feature.append('hcc_'+feature+'_medium')

#The second part for statistical computing for construncting new features
new_new_hcc_features=[]
for feature in new_hcc_categoricals:
    for train,test in skf:
        hcc_scoring(train_df.iloc[train,:],train_df.iloc[test,:],feature,'high',\
                   update_df = train_df)
        hcc_scoring(train_df.iloc[train,:],train_df.iloc[test,:],feature,'medium',\
                   update_df = train_df)
        performance_eval(train_df.iloc[train,:],train_df.iloc[test,:],feature,\
                   update_df = train_df,random=0.01)

    hcc_scoring(train_df,test_df,feature,'high')
    hcc_scoring(train_df,test_df,feature,'medium')
    performance_eval(train_df,test_df,feature,random=0.01)
    new_new_hcc_features.append('hcc_'+feature+'_high')
    new_new_hcc_features.append('hcc_'+feature+'_medium')
    new_new_hcc_features.append(feature+'_nrank_s_r')
    new_new_hcc_features.append(feature+'_perf_s_r')

In [28]:
price_columns = []
for each_col in train_df.columns:
    if re.match(r'price',each_col):
        print each_col
        price_columns.append(each_col)

price
price_per_bed
price_per_room


In [31]:
from scipy.stats import boxcox

In [55]:
#box-cox tranforming for the price related features
box_cox_trans = []
for each_feature in price_columns:
    train_df[each_feature+'_box_cox'],_=boxcox(train_df[each_feature])
    test_df[each_feature+'_box_cox'],_ = boxcox(test_df[each_feature])
    box_cox_trans.append(each_feature+'_box_cox')

In [62]:
processing_features = numericals+numerical_may_processed\
                      +new_hcc_feature+transferred+\
                      statistical+new_new_hcc_features+box_cox_trans
        
#refilling the -1 to be nan and use median to fill later, for this feature set is for linear model
for f in processing_features:
    train_df[f]=train_df[f].replace(np.Inf,np.nan)
    test_df[f]=test_df[f].replace(np.Inf,np.nan)
    train_df.loc[train_df[f]==-1,f]=np.nan
    test_df.loc[test_df[f]==-1,f]=np.nan

In [63]:
normalized_train = train_df.copy()
normalized_test = test_df.copy()

train_test = pd.concat([train_df.drop('interest_level',axis=1),test_df])

for f in processing_features:
    normalized_train[f]=normalized_train[f].fillna(train_test[f].median())
    normalized_test[f]=normalized_test[f].fillna(train_test[f].median())
        
for f in processing_features:
    normalized_train[f]=(normalized_train[f]-train_test[f].mean())/train_test[f].std()
    normalized_test[f]=(normalized_test[f]-train_test[f].mean())/train_test[f].std()


In [64]:
normalized_train[processing_features+['interest_level']].to_json(data_path+"normalized_train.json")
normalized_test[processing_features].to_json(data_path+"normalized_test.json")


In [65]:
normalized_train

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,hcc_street_name_medium,street_name_perf_s_r,street_name_rank_s_r,street_name_nrank_s_r,pricebox_cox,price_per_bedbox_cox,price_per_roombox_cox,price_box_cox,price_per_bed_box_cox,price_per_room_box_cox
10,0.482818,1.311959,3797,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,12282,[],medium,-0.980981,1.481173,...,-1.466682,0.085756,407.5,-0.209763,3000.0,1.000000e+03,666.666652,-1.228628,-1.008704,-1.287335
10000,-0.357223,0.411222,8986,2016-06-12 12:19:27,,9080,"[doorman, elevator, fitness_center, cats_allow...",low,1.173818,1.003667,...,0.253284,-0.155874,368.0,-0.249103,5465.0,2.732500e+03,1821.666606,-1.189220,-0.287173,-1.199532
100004,-0.357223,-0.489514,8889,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",13719,"[laundry_in_building, dishwasher, hardwood_flo...",high,-0.328093,-1.082922,...,0.556391,0.245367,458.0,-0.244136,2850.0,2.850000e+03,1424.999929,-1.232413,-0.261094,-1.217346
100007,-0.357223,-0.489514,1848,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,10866,"[hardwood_floors, no_fee]",low,0.077611,-1.070673,...,-0.723717,-0.958276,167.0,-0.249859,3275.0,3.275000e+03,1637.499918,-1.222318,-0.177112,-1.207013
100013,-0.357223,2.212695,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,15072,[pre_war],low,1.963732,-0.706136,...,2.039060,0.111260,429.5,-0.192188,3350.0,8.375000e+02,669.999987,-1.220721,-1.158039,-1.286790
100014,1.322860,2.212695,2544,2016-04-19 04:24:47,,15194,[],medium,-0.217935,-1.024756,...,1.108041,0.683708,577.0,-0.230556,7995.0,1.998750e+03,1332.499978,-1.168308,-0.490620,-1.222580
100016,-0.357223,0.411222,2711,2016-04-27 03:19:56,Stunning unit with a great location and lots o...,14964,"[prewar, elevator, dogs_allowed, cats_allowed,...",low,1.348459,-0.737866,...,-0.773452,-0.309053,321.5,-0.244770,3600.0,1.800000e+03,1199.999960,-1.215732,-0.562775,-1.231082
100020,1.322860,-0.489514,169,2016-04-13 06:01:42,"This huge sunny ,plenty of lights 1 bed/2 bath...",15223,"[doorman, elevator, pre_war, terrace, laundry_...",low,-0.223308,-1.239363,...,-0.466242,-0.298532,332.0,-0.243702,5645.0,5.644999e+03,1881.666604,-1.187324,0.122560,-1.197330
100026,-0.357223,-0.489514,7635,2016-04-20 02:36:35,<p><a website_redacted,11646,"[cats_allowed, dogs_allowed, elevator, laundry...",medium,1.944924,-0.990850,...,2.206437,1.553974,2716.0,0.156967,1725.0,1.725000e+03,862.499957,-1.273413,-0.592696,-1.260781
100027,1.322860,2.212695,0,2016-04-02 02:58:15,This is a spacious four bedroom with every bed...,6459,"[dishwasher, hardwood_floors]",low,-0.623639,-1.659210,...,3.620783,1.468560,2650.0,1.351896,5800.0,1.450000e+03,966.666651,-1.185757,-0.718486,-1.250006
