In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import  preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold,StratifiedKFold
import matplotlib.pyplot as plt
from mochi import *

In [2]:
#lodaing data
data_path = "../../kaggleData/2sigma/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)


(49352, 15)
(74659, 14)


In [3]:
#try xgboost
#fucntion from SRK
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, \
     seed_val=0, early_stop = 20,num_rounds=10000, eta = 0.1,\
     max_depth = 6,cv_dict = None,verbose_eval=True):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = eta
    param['max_depth'] = max_depth
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y,feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y,feature_names=feature_names)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,\
        early_stopping_rounds=early_stop,evals_result = cv_dict,verbose_eval = verbose_eval)
    else:
        xgtest = xgb.DMatrix(test_X,feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [4]:
#basic numerical features
features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]

In [5]:
#some transfromed features
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour

#some new numerical features related to the price
train_df["price_per_bath"] =  (train_df["price"]*1.0/train_df["bathrooms"]).replace(np.Inf,-1)
train_df["price_per_bed"] = (train_df["price"]*1.0/train_df["bedrooms"]).replace(np.Inf,-1)
train_df["bath_per_bed"] = (train_df["bathrooms"]*1.0/train_df["bedrooms"]).replace(np.Inf,-1)
train_df["price_per_room"] = (train_df["price"]*1.0/(train_df["bedrooms"]+train_df["bathrooms"])).replace(np.Inf,-1)

# adding all these new features to use list # "listing_id",
features_to_use.extend(["num_photos", "num_features", "num_description_words",\
                        "created_year","listing_id", "created_month", "created_day", "created_hour"])
#price new features
features_to_use.extend(["price_per_bed","bath_per_bed","price_per_room"])


In [6]:
#for latter use
train_df['house_type']=map(lambda x,y:(x,y),train_df['bedrooms'],train_df['bathrooms'])
train_df['house_type']=train_df['house_type'].apply(str)

In [53]:
features = list(features_to_use)
categorical = ["display_address", "street_address",'building_id','manager_id','house_type']
features.extend(categorical)
features.extend(['manager_id_perf','manager_id_nrank'])
features.extend(['cluster_id_10','cluster_id_30'])

features.extend(['building_id_perf','building_id_size'])

In [8]:
#prepare for training
target_num_map = {'high':0, 'medium':1, 'low':2}

train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

KF=KFold(len(train_df),5,shuffle=True,random_state = 42)

In [54]:
#running and getting the cv from xgboost
cv_scores = []
cv_result = []

#K-FOLD already defined.If not ,use
#KF=KFold(len(train_X),5,shuffle=True,random_state = 42)
i=0
for dev_index, val_index in KF:
        result_dict = {}
                
        #split the orginal train set into dev_set and val_set
        dev_set, val_set = train_df.iloc[dev_index,:] , train_df.iloc[val_index,:] 
        
        getCluster(dev_set,val_set,30)
        getCluster(dev_set,val_set,10)
        
        """some preprocessing like feature constructed in cv manners"""
        skf=KFold(len(dev_set['interest_level']),5,shuffle=True,random_state = 42)
        #dev set adding manager skill
        for train,test in skf:
            performance_eval(dev_set.iloc[train,:],dev_set.iloc[test,:],feature='manager_id',k=5,g=1,
                           update_df = dev_set,smoothing=False)
            performance_eval(dev_set.iloc[train,:],dev_set.iloc[test,:],feature='building_id',k=5,g=1,
                           update_df = dev_set,smoothing=False)
        performance_eval(dev_set,val_set,feature='manager_id',k=5,g=1,smoothing=False)
        performance_eval(dev_set,val_set,feature='building_id',k=5,g=1,smoothing=False)
        
        categorical_size(dev_set,val_set,'building_id')

        
        for f in categorical:
            if dev_set[f].dtype=='object':
                #print(f)
                lbl = preprocessing.LabelEncoder()
                lbl.fit(list(dev_set[f])+list(val_set[f]))
                dev_set[f] = lbl.transform(list(dev_set[f].values))
                val_set[f] = lbl.transform(list(val_set[f].values))
        
        dev_X, val_X = dev_set[features].as_matrix(), val_set[features].as_matrix()
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        
        """ 
         runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, \
         seed_val=0, early_stop = 20,num_rounds=10000, eta = 0.1,\
         max_depth = 6,cv_dict = None):
         """
        
        preds, model = runXGB(dev_X, dev_y, val_X, val_y,early_stop  = 20,\
                              feature_names = features,cv_dict = result_dict,verbose_eval=100)
        loss = log_loss(val_y, preds)
        cv_scores.append(loss)
        cv_result.append(result_dict)
        i+=1
        print 'loss for the turn '+str(i)+' is '+str(loss)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[0]	train-mlogloss:1.03305	test-mlogloss:1.03483
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.471335	test-mlogloss:0.55186
[200]	train-mlogloss:0.400502	test-mlogloss:0.544027
Stopping. Best iteration:
[215]	train-mlogloss:0.392105	test-mlogloss:0.54358

loss for the turn 1 is 0.5438707966
[0]	train-mlogloss:1.03283	test-mlogloss:1.03423
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.469883	test-mlogloss:0.549173
[200]	train-mlogloss:0.399794	test-mlogloss:0.541183
Stopping. Best iteration:
[201]	train-mlogloss:0.399337	test-mlogloss:0.54112

loss for the turn 2 is 0.541572847557
[0]	train-mlogloss:1.03195	test-mlogloss:1.03312
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until t

In [12]:
#plot the validation curv
cvResult = CVstatistics(cv_result,'mlogloss')
cvResult.turns

test0    251
test1    268
test2    279
test3    194
test4    246
Name: 278, dtype: int32

In [55]:
#cvResult.validCurve(stop=False)
#some errors at certain turn to see the descending
cv_scores
np.mean(cv_scores)

0.54251683483278379

In [49]:
#show the importance of the features
showImportance(model,'gain')

[('manager_id_perf', 0.1779163150019739),
 ('price', 0.06993089086594802),
 ('price_per_bed', 0.05930946660133056),
 ('bathrooms', 0.05805780091792404),
 ('bedrooms', 0.05747880996695548),
 ('bath_per_bed', 0.05145312060575195),
 ('building_id', 0.05121131946896568),
 ('price_per_room', 0.050984592547528934),
 ('num_features', 0.04036526912959051),
 ('longitude', 0.04023480203071151),
 ('num_photos', 0.04019668503967685),
 ('latitude', 0.039253473311456615),
 ('created_hour', 0.03561258903317282),
 ('manager_id_nrank', 0.03128699074697629),
 ('listing_id', 0.029991333326323196),
 ('street_address', 0.029884237209319976),
 ('num_description_words', 0.029670880986026715),
 ('display_address', 0.02911134725264291),
 ('manager_id', 0.02745103521961843),
 ('created_month', 0.026534532982693905),
 ('created_day', 0.02406450775541194)]

In [38]:
temp=pd.concat([dev_set[['building_id','building_id_perf']],pd.get_dummies(dev_set.interest_level)], axis = 1)\
     .groupby('building_id').mean()
temp['count'] = dev_set.groupby('building_id').count().iloc[:,1]


In [39]:
temp2 = temp.sort_values(by = 'building_id_perf')[temp['count']>5]

  if __name__ == '__main__':


In [41]:
plt.plot(temp2['building_id_perf'],temp2['low'],\
        temp2['building_id_perf'],temp2['medium'],\
        temp2['building_id_perf'],temp2['high'])
plt.show()