In [1]:
import pandas as pd
from sklearn.cross_validation import KFold,StratifiedKFold
from mochi import *



In [2]:
# Directory holding the raw JSON files; later cells also write their outputs here.
data_path = "../../kaggleData/2sigma/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"

# Read both splits, then report each frame's (rows, columns) shape.
train_df, test_df = pd.read_json(train_file), pd.read_json(test_file)
for loaded_frame in (train_df, test_df):
    print(loaded_frame.shape)

(49352, 15)
(74659, 14)


In [3]:
# Basic feature engineering for the training frame.
# The same columns are built for test_df in the following cell; keep both in sync.

# Size features from the list/text columns.
train_df["num_photos"] = train_df["photos"].apply(len)
train_df["num_features"] = train_df["features"].apply(len)
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))

# Time-related features taken from the creation timestamp.
train_df["created"] = pd.to_datetime(train_df["created"])

train_df["created_year"] = train_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
train_df["created_weekday"] = train_df["created"].dt.dayofweek

train_df["dayofyear"] = train_df["created"].dt.dayofyear

# Cyclical (sine/cosine) encodings of the time components.
train_df["sine_hour"] = np.sin(2*np.pi*train_df["created_hour"]/24)
train_df["cos_hour"] = np.cos(2*np.pi*train_df["created_hour"]/24)

train_df["sine_weekday"] = np.sin(2*np.pi*train_df["created_weekday"]/7)
train_df["cos_weekday"] = np.cos(2*np.pi*train_df["created_weekday"]/7)

# NOTE(review): day-of-month is encoded with a period of 7 rather than the
# month length -- confirm this is intended (test_df uses the same period).
train_df["sine_day"] = np.sin(2*np.pi*train_df["created_day"]/7)
train_df["cos_day"] = np.cos(2*np.pi*train_df["created_day"]/7)

# Price ratios. The small constant in each denominator avoids division by zero.
train_df["price_per_bath"] =  (train_df["price"]*1.0/(train_df["bathrooms"]+0.01))
# The 0.01 belongs inside the denominator (as in the test-set formula);
# outside it, listings with 0 bedrooms divide by zero and yield inf.
train_df["price_per_bed"] = (train_df["price"]*1.0/(train_df["bedrooms"]+0.01))
train_df["bath_per_bed"] = (train_df["bathrooms"]*1.0/(train_df["bedrooms"]+1))
train_df["price_per_room"] = (train_df["price"]*1.0/(train_df["bedrooms"]+train_df["bathrooms"]+0.01))

# Categorical "house type": the string form of the (bedrooms, bathrooms) pair.
# list(zip(...)) builds a concrete list of tuples on both Python 2 and 3.
train_df['house_type'] = list(zip(train_df['bedrooms'], train_df['bathrooms']))
train_df['house_type'] = train_df['house_type'].apply(str)

# Euclidean distance, in raw latitude/longitude degrees, from a fixed
# reference point. ny_lat / ny_lon are reused by the test-set cell.
ny_lat = 40.785091
ny_lon = -73.968285
train_df['central_distance']= np.sqrt((train_df['latitude']-ny_lat)**2 + (train_df['longitude']-ny_lon)**2)

In [4]:
# Feature engineering for the test frame; mirrors the training-set cell.

# Size features from the unstructured columns.
test_df["num_description_words"] = test_df["description"].apply(lambda text: len(text.split(" ")))
test_df["num_features"] = test_df["features"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# Calendar components of the creation timestamp.
test_df["created"] = pd.to_datetime(test_df["created"])
created_parts = test_df["created"].dt
test_df["created_year"] = created_parts.year
test_df["created_month"] = created_parts.month
test_df["created_day"] = created_parts.day
test_df["created_hour"] = created_parts.hour
test_df["created_weekday"] = created_parts.dayofweek
test_df["dayofyear"] = created_parts.dayofyear

# Cyclical encodings: map each component to an angle, then take sine and cosine.
hour_angle = 2*np.pi*test_df["created_hour"]/24
test_df["sine_hour"] = np.sin(hour_angle)
test_df["cos_hour"] = np.cos(hour_angle)

weekday_angle = 2*np.pi*test_df["created_weekday"]/7
test_df["sine_weekday"] = np.sin(weekday_angle)
test_df["cos_weekday"] = np.cos(weekday_angle)

# Day-of-month uses a period of 7 here, matching the training cell.
day_angle = 2*np.pi*test_df["created_day"]/7
test_df["sine_day"] = np.sin(day_angle)
test_df["cos_day"] = np.cos(day_angle)

# Price ratios; the small constants keep the denominators away from zero.
price_as_float = test_df["price"]*1.0
test_df["price_per_bath"] = price_as_float/(test_df["bathrooms"]+0.01)
test_df["price_per_bed"] = price_as_float/(test_df["bedrooms"]+0.01)
test_df["bath_per_bed"] = test_df["bathrooms"]*1.0/(test_df["bedrooms"]+1)
test_df["price_per_room"] = price_as_float/(test_df["bedrooms"]+test_df["bathrooms"]+0.01)

# Categorical "house type": string form of the (bedrooms, bathrooms) pair.
test_df['house_type'] = [
    str(room_pair)
    for room_pair in zip(test_df['bedrooms'], test_df['bathrooms'])
]

# Euclidean distance in raw degrees from the (ny_lat, ny_lon) reference point
# defined in the training cell.
lat_offset = test_df['latitude']-ny_lat
lon_offset = test_df['longitude']-ny_lon
test_df['central_distance'] = np.sqrt(lat_offset**2 + lon_offset**2)

In [5]:
"""
new categorical data generated from the old ones
"""
# street_name: processed street_address, meant to replace the raw column.
# proecessStreet is not defined in this notebook (presumably provided by
# `from mochi import *` -- confirm).
for listing_frame in (train_df, test_df):
    listing_frame["street_name"] = listing_frame["street_address"].apply(proecessStreet)

# building0: 1 when building_id is exactly the string '0', otherwise 0.
for listing_frame in (train_df, test_df):
    listing_frame['building0'] = [
        1 if building == '0' else 0
        for building in listing_frame['building_id']
    ]

In [6]:
# Normalise the free-text feature tags, then expand the frequent ones into
# dummy columns.

def _normalize_feature_tag(raw_tag):
    """Return raw_tag trimmed, lower-cased, with spaces and hyphens as underscores.

    Whitespace is stripped before spaces are replaced. Stripping afterwards is
    a no-op for padding spaces (they have already become underscores), so
    " Doorman" and "Doorman" would end up as two different tags.
    """
    return "_".join(raw_tag.strip().split(" ")).lower().replace('-', '_')


train_df["features"] = train_df["features"].apply(
    lambda tags: [_normalize_feature_tag(tag) for tag in tags])
test_df["features"] = test_df["features"].apply(
    lambda tags: [_normalize_feature_tag(tag) for tag in tags])

# Build the accept list. featureList is an external helper (not defined in
# this notebook); presumably `limit` is a minimum-frequency threshold -- confirm.
accept_list = list(featureList(train_df,test_df,limit = 0.001))

# Map the accepted tags to dummy columns on both frames (external helper).
# NOTE(review): earlier commented-out code suggests the generated columns are
# named 'with_' + tag -- verify against featureMapping.
featureMapping(train_df,test_df,accept_list)

In [7]:
# processMap is an external helper (not defined in this notebook); it is
# called on the training frame first, then on the test frame.
for listing_frame in (train_df, test_df):
    processMap(listing_frame)

# Replace missing training coordinates with the sentinel value -1.
for coord_column in ('latitude', 'longitude'):
    train_df[coord_column] = train_df[coord_column].fillna(-1)

In [12]:
# Replace missing test coordinates with the sentinel value -1
# (same treatment as the training frame).
for coord_column in ('latitude', 'longitude'):
    test_df[coord_column] = test_df[coord_column].fillna(-1)

In [13]:
# Cluster-derived features. getCluster is an external helper (not defined in
# this notebook); it is run with 30 and then with 10 as its third argument
# (presumably the number of clusters -- confirm).
for cluster_count in (30, 10):
    getCluster(train_df, test_df, cluster_count)

In [14]:
# Persist the engineered (not yet normalised) train and test frames.
basic_outputs = (
    (train_df, 'basic_train_df.json'),
    (test_df, 'basic_test_df.json'),
)
for listing_frame, file_name in basic_outputs:
    listing_frame.to_json(data_path+file_name)

In [15]:
# Stratified 5-fold split on interest_level (shuffled, fixed seed). KF is
# re-iterated by later cells to write the other dataset variants.
KF = StratifiedKFold(train_df['interest_level'], 5, shuffle=True, random_state=1983)

# Write the dev/validation part of every fold of the basic frame.
for fold_id, (dev_index, val_index) in enumerate(KF):
    dev_set = train_df.iloc[dev_index, :]
    val_set = train_df.iloc[val_index, :]
    dev_set.to_json(data_path+'b_dev_set_'+str(fold_id)+'.json')
    val_set.to_json(data_path+'b_val_set_'+str(fold_id)+'.json')

In [16]:
# Numeric columns that the normalisation and log-transform cells operate on.
processing_features = [
    # raw listing attributes
    'bathrooms', 'bedrooms', 'price', 'latitude', 'longitude',
    # count features
    'num_photos', 'num_features', 'num_description_words',
    # time and identifier
    'dayofyear', 'listing_id',
    # price / room ratios
    'price_per_bed', 'bath_per_bed', 'price_per_room', 'price_per_bath',
    # location-derived
    'central_distance', 'cluster_id_30_d', 'cluster_id_10_d',
]

In [32]:
# Second splitter built with the same arguments as KF, so that the folds the
# two objects yield can be compared in the cells below.
skF = StratifiedKFold(
    train_df['interest_level'],
    5,
    shuffle=True,
    random_state=1983
)

In [43]:
# Accumulators for the dev-index arrays yielded by KF and skF respectively.
kf_l, skf_l = [], []

In [44]:
# Collect the dev-index array of every KF fold (validation indices unused).
# Appends to kf_l, so re-running this cell without resetting the list
# duplicates the entries.
for dev_indices, _unused_val in KF:
    kf_l.append(dev_indices)

In [45]:
# Collect the dev-index array of every skF fold (validation indices unused).
# Appends to skf_l, so re-running this cell without resetting the list
# duplicates the entries.
for dev_indices, _unused_val in skF:
    skf_l.append(dev_indices)

In [47]:
# Per fold, print the sum of element-wise differences between the skF and KF
# dev indices; 0 is expected when both splitters produce the same folds.
# NOTE(review): a zero sum is not a strict equality proof (differences could
# cancel out); np.array_equal would be a stricter check.
for fold_id in range(5):
    print(sum(skf_l[fold_id] - kf_l[fold_id]))

0
0
0
0
0


In [48]:
# Display the listing_id column (the full Series, exactly as stored in train_df).
train_df.loc[:, 'listing_id']

10        7211212
10000     7150865
100004    6887163
100007    6888711
100013    6934781
100014    6894514
100016    6930771
100020    6867392
100026    6898799
100027    6814332
100030    6869199
10004     7102986
100044    6895442
100048    6846213
10005     7089402
100051    6889043
100052    6913348
100053    6894111
100055    6900220
100058    6848536
100062    6858062
100063    6836760
100065    6866830
100066    6885927
10007     7120132
100071    6933499
100075    6921632
100076    6913084
100079    6907079
100081    6925264
           ...   
99915     6921019
99917     6926146
99919     6844805
99921     6943991
99923     6822618
99924     6918969
99931     6921162
99933     6819357
99935     6893263
99937     6873182
9994      7114685
99953     6924210
99956     6884807
99960     6825168
99961     6911061
99964     6942494
99965     6819478
99966     6878391
99979     6871559
99980     6933865
99982     6837242
99984     6815109
99986     6871681
99987     6856001
99988     

In [17]:
# Combined train+test frame (target column removed from train). It supplies
# the median/mean/std statistics used by the normalisation cell.
train_test = pd.concat(
    [train_df.drop('interest_level', axis=1), test_df]
)

In [18]:
# Standardised copies of the engineered frames.
normalized_train = train_df.copy()
normalized_test = test_df.copy()

for f in processing_features:
    # All statistics come from the combined train+test column, so both splits
    # are filled and scaled with the same values.
    pooled_column = train_test[f]
    fill_value = pooled_column.median()
    center = pooled_column.mean()
    spread = pooled_column.std()

    # Fill gaps with the pooled median, then subtract the mean and divide by the std.
    normalized_train[f] = (normalized_train[f].fillna(fill_value)-center)/spread
    normalized_test[f] = (normalized_test[f].fillna(fill_value)-center)/spread

# Persist the normalised train and test frames.
normalized_train.to_json(data_path+'normal_train_df.json')
normalized_test.to_json(data_path+'normal_test_df.json')

In [19]:
# Write the dev/validation part of every KF fold of the normalised frame.
for fold_id, (dev_index, val_index) in enumerate(KF):
    dev_set = normalized_train.iloc[dev_index, :]
    val_set = normalized_train.iloc[val_index, :]
    dev_set.to_json(data_path+'n_dev_set_'+str(fold_id)+'.json')
    val_set.to_json(data_path+'n_val_set_'+str(fold_id)+'.json')

In [20]:
# Log-transformed copies of the engineered frames.
log_train = train_df.copy()
log_test = test_df.copy()

for f in processing_features:
    # Signed log1p: sign(x) * log(1 + |x|).
    # For x >= 0 this equals log(x + 1). Unlike plain np.log(x + 1) it stays
    # finite for negative inputs: np.log(x + 1) is NaN for every x < -1 and
    # -inf at x == -1, which destroys columns such as 'longitude' (values
    # around ny_lon = -73.97) and the -1 sentinel used for missing coordinates.
    # Values in (-1, 0) also change (they become -log(1 + |x|)); the transform
    # remains monotonic.
    log_train[f] = np.sign(log_train[f]) * np.log1p(np.abs(log_train[f]))
    log_test[f] = np.sign(log_test[f]) * np.log1p(np.abs(log_test[f]))


# Persist the log-transformed train and test frames.
log_train.to_json(data_path+'log_train_df.json')
log_test.to_json(data_path+'log_test_df.json')



In [21]:
# Write the dev/validation part of every KF fold of the log-transformed frame.
for fold_id, (dev_index, val_index) in enumerate(KF):
    dev_set = log_train.iloc[dev_index, :]
    val_set = log_train.iloc[val_index, :]
    dev_set.to_json(data_path+'l_dev_set_'+str(fold_id)+'.json')
    val_set.to_json(data_path+'l_val_set_'+str(fold_id)+'.json')

In [22]:
# Combined log-transformed train+test frame (target column removed from
# train). It supplies the statistics for the log-normalised variant.
log_train_test = pd.concat(
    [log_train.drop('interest_level', axis=1), log_test]
)

In [23]:
# Log-transformed AND standardised copies of the frames.
normalized_log_train = log_train.copy()
normalized_log_test = log_test.copy()

for f in processing_features:
    # Statistics come from the combined log-transformed train+test column,
    # so both splits are filled and scaled with the same values.
    pooled_column = log_train_test[f]
    fill_value = pooled_column.median()
    center = pooled_column.mean()
    spread = pooled_column.std()

    # Fill gaps with the pooled median, then subtract the mean and divide by the std.
    normalized_log_train[f] = (normalized_log_train[f].fillna(fill_value)-center)/spread
    normalized_log_test[f] = (normalized_log_test[f].fillna(fill_value)-center)/spread

# Persist the log-normalised train and test frames.
normalized_log_train.to_json(data_path+'lognor_train_df.json')
normalized_log_test.to_json(data_path+'lognor_test_df.json')

In [24]:
# Write the dev/validation part of every KF fold of the log-normalised frame.
for fold_id, (dev_index, val_index) in enumerate(KF):
    dev_set = normalized_log_train.iloc[dev_index, :]
    val_set = normalized_log_train.iloc[val_index, :]
    dev_set.to_json(data_path+'ln_dev_set_'+str(fold_id)+'.json')
    val_set.to_json(data_path+'ln_val_set_'+str(fold_id)+'.json')