In [6]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold
import re
from sklearn.linear_model import LogisticRegression

In [31]:
from mochi import *

In [18]:
#lodaing data
data_path = "../../kaggleData/2sigma/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)


(49352, 15)
(74659, 14)


In [19]:
features_to_use  = ["bathrooms", "bedrooms", "price"]

In [20]:
#some transfromed features
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
#test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
#test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
#test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
#test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
#test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
#test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
#test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
#test_df["created_hour"] = test_df["created"].dt.hour
train_df["created_weekday"] = train_df["created"].dt.dayofweek
#test_df["created_weekday"] = test_df["created"].dt.dayofweek


#some new numerical features related to the price
train_df["price_per_bath"] =  (train_df["price"]*1.0/train_df["bathrooms"]).replace(np.Inf,np.nan)
train_df["price_per_bed"] = (train_df["price"]*1.0/train_df["bedrooms"]).replace(np.Inf,np.nan)
train_df["bath_per_bed"] = (train_df["bathrooms"]*1.0/train_df["bedrooms"]).replace(np.Inf,np.nan)
train_df["price_per_room"] = (train_df["price"]*1.0/(train_df["bedrooms"]+train_df["bathrooms"])).replace(np.Inf,np.nan)

#test_df["price_per_bath"] =  (test_df["price"]*1.0/test_df["bathrooms"]).replace(np.Inf,np.nan)
#test_df["price_per_bed"] = (test_df["price"]*1.0/test_df["bedrooms"]).replace(np.Inf,np.nan)
#test_df["bath_per_bed"] = (test_df["bathrooms"]*1.0/test_df["bedrooms"]).replace(np.Inf,np.nan)
#test_df["price_per_room"] = (test_df["price"]*1.0/(test_df["bedrooms"]+test_df["bathrooms"])).replace(np.Inf,np.nan)

train_df["sine_hour"] = np.sin(2*np.pi*train_df["created_hour"]/24)
train_df["cos_hour"] = np.cos(2*np.pi*train_df["created_hour"]/24)

train_df["sine_weekday"] = np.sin(2*np.pi*train_df["created_weekday"]/7)
train_df["cos_weekday"] = np.cos(2*np.pi*train_df["created_weekday"]/7)

train_df["sine_day"] = np.sin(2*np.pi*train_df["created_day"]/7)
train_df["cos_day"] = np.cos(2*np.pi*train_df["created_day"]/7)

#for latter use
train_df["dayofyear"] = train_df["created"].dt.dayofyear
#test_df["dayofyear"] = test_df["created"].dt.dayofyear

ny_lat = 40.785091
ny_lon = -73.968285
train_df['central_distance']= np.sqrt((train_df['latitude']-ny_lat)**2 + (train_df['longitude']-ny_lon)**2) 
#test_df['central_distance']= np.sqrt((test_df['latitude']-ny_lat)**2 + (test_df['longitude']-ny_lon)**2) 

In [21]:
# adding all these new features to use list # "listing_id",
features_to_use.extend(["num_photos", "num_features", "num_description_words",\
                        "sine_weekday", "cos_weekday","sine_day","cos_day", \
                        "sine_hour","cos_hour",'dayofyear'])
#price new features
features_to_use.extend(["price_per_bed","bath_per_bed","price_per_room",'central_distance'])

In [22]:
train_df['house_type']=map(lambda x,y:(x,y),train_df['bedrooms'],train_df['bathrooms'])
train_df['house_type'] = train_df['house_type'].apply(str)

In [23]:
#dealing with features

#preprocessing for features
train_df["features"] = train_df["features"].apply(lambda x:["_".join(i.split(" ")).lower().strip().replace('-','_') \
                                                            for i in x])
#test_df["features"] = test_df["features"].apply(lambda x:["_".join(i.split(" ")).lower().strip().replace('-','_')\
#                                                          for i in x])
#create the accept list
accept_list = list(featureList(train_df,test_df,limit = 0.001))

#map the feature to dummy slots
featureMapping(train_df,test_df,accept_list)
features_to_use.extend(map(lambda x : 'with_'+x,accept_list))

In [24]:
features = list(features_to_use)
features.extend(['manager_id_perf_s_r','house_type_perf_s_r'])

In [None]:
#standardization and nan filling for logestic regression


In [28]:
#prepare for training
target_num_map = {'high':0, 'medium':1, 'low':2}

#train_X = train_df[features]
#test_X = test_df[features]

train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

KF=KFold(len(train_df),5,shuffle=True,random_state = 42)

In [35]:
#cross-validation
cv_scores = []

#using entropy
for dev_index, val_index in KF:
    
        dev_set, val_set = train_df.iloc[dev_index,:] , train_df.iloc[val_index,:] 
        
        skf=KFold(len(dev_set['interest_level']),5,shuffle=True,random_state = 42)
        for train,test in skf:
            performance_eval(dev_set.iloc[train,:],dev_set.iloc[test,:],'manager_id',\
                           update_df = dev_set,random = 0.01,k=5)
            performance_eval(dev_set.iloc[train,:],dev_set.iloc[test,:],'house_type',\
                           update_df = dev_set,random = 0.01,k=5)
            temporalManagerPerf(dev_set.iloc[train,:],dev_set.iloc[test,:],update_df = dev_set)
        
        performance_eval(dev_set,val_set,feature='manager_id',smoothing=True,random=0.01,k=5)
        performance_eval(dev_set,val_set,feature='house_type',smoothing=True,random=0.01,k=5)
        temporalManagerPerf(dev_set,val_set)
        
        manager_lon_lat(dev_set,val_set)
        
        dev_X, val_X = dev_set[features].as_matrix(), val_set[features].as_matrix()
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        
        #random forest us
        lr = LogisticRegression(multi_class = 'multinomial')
        lr.fit(dev_X,dev_y)
        preds = lr.predict_proba(val_X)
        
        cv_scores.append(log_loss(val_y, preds))
        

        print(cv_scores)
        print np.mean(cv_scores)
        break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  updateDTrain = train_df[['manager_id']].join(std_value, on='manager_id', how="left")['m_m_distance'].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  for f in ['mlat','mlon','m_m_distance']:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train_df.update(updateMTrain)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [20]:
len(features)

216