In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import  preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt

In [2]:
#try xgboost
#original function from SRK
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, \
     seed_val=0, early_stop = 20,num_rounds=10000, eta = 0.1,\
     max_depth = 6,cv_dict = None,verbose_eval=True):
    """Train a 3-class 'multi:softprob' XGBoost model and predict on test_X.

    When test_y is given, training watches both the train and the test
    matrices, uses early stopping with `early_stop` rounds, and records the
    evaluation history into `cv_dict`. Without test_y the model is simply
    trained for `num_rounds` rounds.

    Returns a tuple (predictions for test_X, trained model).
    """
    param = {
        'objective': 'multi:softprob',
        'eta': eta,
        'max_depth': max_depth,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plst = list(param.items())
    dtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is None:
        # No labels for the test rows: plain training, no watchlist.
        dtest = xgb.DMatrix(test_X, feature_names=feature_names)
        booster = xgb.train(plst, dtrain, num_rounds)
    else:
        dtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        booster = xgb.train(
            plst,
            dtrain,
            num_rounds,
            [(dtrain, 'train'), (dtest, 'test')],
            early_stopping_rounds=early_stop,
            evals_result=cv_dict,
            verbose_eval=verbose_eval,
        )

    return booster.predict(dtest), booster

class CVstatistics(object):

    """
    Summary statistics and plots for the evaluation histories of a k-fold CV run.

    self.metric : name of the metric the histories were recorded for
    self.result : the result dataframe storing the cv results, one row per
                  boosting round and one 'train<i>' / 'test<i>' column per fold
                  (shorter folds are padded with NaN)
    self.endpoint : the first ending point for the validations, i.e. the number
                    of rounds for which every fold has a train value
    self.turns: the turns for each validation (non-NaN count of each test column)

    validCurve : plot the validation curve,stop at the first endpoint
    eoutCurve : plot only the average test curve
    minAvgEout : return the minimum of the fold-averaged test error
    errorsAt: return the average errors at a certain turn
    """
    def __init__(self,result_dict,metric,k=5):
        """
        result_dict : either a ready-made result DataFrame, or a sequence of
                      k dicts shaped like {'train': {metric: [...]}, 'test': {metric: [...]}}
        metric : key of the metric inside each history dict
        k : number of folds to read from result_dict
        """
        self.metric = metric
        if isinstance(result_dict, pd.DataFrame):
            self.result = result_dict
        else:
            # Read from the argument (not from a notebook global) and build the
            # frame once; Series of different lengths are aligned with NaN.
            columns = {}
            for phase in ['train','test']:
                for turn in range(k):
                    columns[phase+str(turn)] = pd.Series(result_dict[turn][phase][metric])
            self.result = pd.DataFrame(columns)

        self.endpoint = len(self.result.filter(like = 'train').dropna())

        # Number of recorded (non-NaN) rounds per test column.
        self.turns = self.result.filter(like = 'test').count()

    def _meanCurve(self, phase, start, stop_at_first):
        """Fold-averaged curve of one phase ('train' or 'test') from row `start` on."""
        part = self.result.iloc[start:,:].filter(like = phase)
        if stop_at_first:
            # keep only the rounds that every fold reached
            part = part.dropna()
        return part.mean(axis=1)

    def validCurve(self,start = 0, stop_at_first = True):
        """Plot the fold-averaged test and train curves starting at row `start`."""
        eout = self._meanCurve('test', start, stop_at_first)
        ein = self._meanCurve('train', start, stop_at_first)
        # explicit lists: a lazy map object is not a valid x argument on Python 3
        plt.plot(list(range(start, start + len(eout))), eout,
                 list(range(start, start + len(ein))), ein)
        plt.xlabel("turn")
        plt.ylabel(self.metric)
        plt.title('Validation Curve')

        plt.show()

    def eoutCurve(self,stop_at_first = True, start = 0):
        """Plot only the fold-averaged test curve starting at row `start`."""
        eout = self._meanCurve('test', start, stop_at_first)
        plt.plot(list(range(start, start + len(eout))), eout)
        plt.xlabel("turn")
        plt.ylabel(self.metric)
        plt.title('Eout Curve')

        plt.show()

    def minAvgEout(self):
        """Return the row(s) of the fold-averaged test error that attain its minimum."""
        meanTestError = self.result.filter(like='test').mean(axis=1)
        return meanTestError[meanTestError==np.min(meanTestError)]

    def errorsAt(self,turn):
        """Return (average test error, average train error) at row label `turn`."""
        eout = self.result.filter(like = 'test').loc[turn].mean()
        ein = self.result.filter(like = 'train').loc[turn].mean()
        return eout,ein
    
def xgbImportance(model,factor_name):
    """Return (feature, share) pairs sorted by decreasing share.

    The scores come from model.get_score(importance_type=factor_name) and are
    rescaled in place so that they sum to one.
    """
    scores = model.get_score(importance_type=factor_name)
    score_sum = sum(scores.values())
    for feature in scores:
        scores[feature] = scores[feature]*1.0/score_sum
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
    


In [None]:
"""
new features:
['ht_mean_bathrooms','ht_mean_bedrooms','ht_mean_price','ht_mean_price_per_bed','ht_mean_bath_per_bed',\
'ht_mean_price_per_room','ht_mean_num_photos','ht_mean_num_features','ht_mean_num_description_words']
"""

def house_type_statistics(train_df,test_df,update_df =None,random = None):
    """Write per-house_type means learned on train_df into update_df, in place.

    For every column in the fixed feature list the mean per 'house_type' is
    computed on train_df, looked up for each row of test_df and stored under
    'ht_mean_<column>'. Values are aligned on the index of test_df; house
    types that do not occur in train_df leave the target cells untouched (NaN
    for freshly created columns).

    train_df : frame the means are computed from
    test_df : frame whose 'house_type' values are looked up
    update_df : frame that receives the new columns; defaults to test_df
    random : unused, kept for interface compatibility

    Returns None; update_df is modified in place.
    """
    #adding the features about other things
    other_feature = ['bathrooms','bedrooms','price',"price_per_bed","bath_per_bed",\
                     "price_per_room",'num_photos','num_features','num_description_words']

    mean_value = train_df.groupby('house_type')[other_feature].mean()
    mean_value = mean_value.add_prefix('ht_mean_')

    new_mean_feature = list(mean_value.columns)

    updateM = test_df[['house_type']].join(mean_value, on = 'house_type', how="left")[new_mean_feature]

    # With the default update_df=None the original crashed on update_df.columns;
    # writing into test_df is the natural target when no other frame is given.
    if update_df is None:
        update_df = test_df

    for f in new_mean_feature:
        if f not in update_df.columns:
            update_df[f] = np.nan

    update_df.update(updateM)

In [37]:
    # NOTE(review): this cell repeats the first statements of house_type_statistics
    # and is still indented as if it were inside a function body -- it looks like a
    # scratch copy; confirm whether it should be dedented or removed.
    #adding the features about other things
    other_feature = ['bathrooms','bedrooms','price',"price_per_bed","bath_per_bed",\
                     "price_per_room",'num_photos','num_features','num_description_words']
    
    # per-house_type mean of every listed column, computed on train_df
    mean_value = train_df.groupby('house_type')[other_feature].mean()
    # prefix the column names so they do not clash with the raw columns
    mean_value = mean_value.add_prefix('ht_mean_')
    
    new_mean_feature = list(mean_value.columns)

In [38]:
# Display the 'house_type' column of the training frame.
# NOTE(review): no visible cell creates 'house_type', and the saved output below
# is a list of 'ht_mean_*' names rather than a column -- the output looks stale; confirm.
train_df['house_type']

['ht_mean_bathrooms',
 'ht_mean_bedrooms',
 'ht_mean_price',
 'ht_mean_price_per_bed',
 'ht_mean_bath_per_bed',
 'ht_mean_price_per_room',
 'ht_mean_num_photos',
 'ht_mean_num_features',
 'ht_mean_num_description_words']

In [3]:
#loading data: read the train/test JSON files and report their shapes
data_path = "../../kaggleData/2sigma/"
train_file = "".join([data_path, "train.json"])
test_file = "".join([data_path, "test.json"])
train_df, test_df = pd.read_json(train_file), pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)



(49352, 15)
(74659, 14)


In [4]:
#basic numerical features
# Seed list of model input columns; the feature-engineering cell extends it in place.
features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]

In [34]:
# Columns whose per-house_type means are computed in the following cell.
other_feature = ['price',"price_per_bed",\
                     "price_per_room",'num_photos','num_features','num_description_words']

In [35]:
# Per-house_type means of the selected columns (missing means become -1),
# with every resulting column renamed to carry the 'ht_mean_' prefix.
mean_value = (
    train_df.groupby('house_type')[other_feature]
    .mean()
    .fillna(-1)
    .add_prefix('ht_mean_')
)
new_mean_feature = mean_value.columns.tolist()

In [5]:
#some transformed features
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features
train_df["created"] = pd.to_datetime(train_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour

#some new numerical features related to the price
# A zero denominator yields inf, which is replaced by the sentinel -1.
# np.inf is used because the np.Inf alias no longer exists in NumPy 2.0.
train_df["price_per_bath"] =  (train_df["price"]*1.0/train_df["bathrooms"]).replace(np.inf,-1)
train_df["price_per_bed"] = (train_df["price"]*1.0/train_df["bedrooms"]).replace(np.inf,-1)
train_df["bath_per_bed"] = (train_df["bathrooms"]*1.0/train_df["bedrooms"]).replace(np.inf,-1)
train_df["price_per_room"] = (train_df["price"]*1.0/(train_df["bedrooms"]+train_df["bathrooms"])).replace(np.inf,-1)

# adding all these new features to use list # "listing_id",
# followed by the new price features.
# NOTE: price_per_bath is computed above but is not added to the list.
new_features = ["num_photos", "num_features", "num_description_words",\
                "created_year","listing_id", "created_month", "created_day", "created_hour",\
                "price_per_bed","bath_per_bed","price_per_room"]
# Append only names that are not listed yet, so re-running this cell does not
# duplicate columns in features_to_use.
for feature in new_features:
    if feature not in features_to_use:
        features_to_use.append(feature)



In [6]:
#prepare for training
# Encode interest_level as integer class ids: high -> 0, medium -> 1, low -> 2.
target_num_map = dict(high=0, medium=1, low=2)

train_y = np.array(train_df['interest_level'].apply(target_num_map.__getitem__))

# 5 shuffled folds over the training rows with a fixed seed.
KF = KFold(len(train_df), 5, random_state=42, shuffle=True)

In [7]:
#running and getting the cv from xgboost
cv_scores = []
cv_result = []

#K-FOLD already defined. If not, use
#KF=KFold(len(train_X),5,shuffle=True,random_state = 42)
for i, (dev_index, val_index) in enumerate(KF, 1):
    # passed to runXGB as cv_dict so that the per-round metrics of this fold are recorded
    result_dict = {}

    # some preprocessing like features constructed in a cv manner

    #split the original train set into dev_set and val_set
    dev_set, val_set = train_df.iloc[dev_index,:] , train_df.iloc[val_index,:]

    # NOTE(review): skf, performance_eval and manager_statistics are not defined in
    # the visible cells -- presumably they come from another cell or notebook;
    # confirm before running on a fresh kernel.
    for train,test in skf:
        performance_eval(dev_set.iloc[train,:],dev_set.iloc[test,:],feature='manager_id',k=5,g=10,
                       update_df = dev_set)
        manager_statistics(dev_set.iloc[train,:],dev_set.iloc[test,:],\
                          update_df = dev_set)

    performance_eval(dev_set,val_set,feature='manager_id',k=5,g=10)
    manager_statistics(dev_set,val_set)

    # .values replaces .as_matrix(), which was removed in pandas 1.0
    dev_X, val_X = dev_set[features_to_use].values, val_set[features_to_use].values
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    # runXGB(train_X, train_y, test_X, test_y=None, feature_names=None,
    #        seed_val=0, early_stop = 20,num_rounds=10000, eta = 0.1,
    #        max_depth = 6,cv_dict = None)
    preds, model = runXGB(dev_X, dev_y, val_X, val_y,early_stop  = 20,\
                          feature_names = features_to_use,cv_dict = result_dict,verbose_eval=100)
    loss = log_loss(val_y, preds)
    cv_scores.append(loss)
    cv_result.append(result_dict)
    # a single parenthesised argument prints the same text on Python 2 and Python 3
    print('loss for the turn '+str(i)+' is '+str(loss))

[0]	train-mlogloss:1.04113	test-mlogloss:1.04253
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.530516	test-mlogloss:0.596574
[200]	train-mlogloss:0.471687	test-mlogloss:0.586205
Stopping. Best iteration:
[234]	train-mlogloss:0.454878	test-mlogloss:0.584909

loss for the turn 1 is 0.585006479598
[0]	train-mlogloss:1.04152	test-mlogloss:1.04238
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[100]	train-mlogloss:0.532199	test-mlogloss:0.592885
[200]	train-mlogloss:0.47349	test-mlogloss:0.583251
Stopping. Best iteration:
[240]	train-mlogloss:0.454853	test-mlogloss:0.581729

loss for the turn 2 is 0.582005291003
[0]	train-mlogloss:1.03854	test-mlogloss:1.03907
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train unt

In [8]:
#plot the validation curve
# Collect the per-fold evaluation histories into a CVstatistics object and
# display the number of recorded test rounds of every fold.
cvResult = CVstatistics(cv_result,'mlogloss')
cvResult.turns

test0    254
test1    260
test2    342
test3    296
test4    273
Name: 341, dtype: int32

In [11]:
# Fold-averaged (test, train) metric at row label 250 of the result frame.
cvResult.errorsAt(250)

(0.5850856, 0.44855280000000003)

In [9]:
# NOTE(review): validCurve takes `start` and `stop_at_first`; the commented-out
# call below passes `stop`, which is not one of its parameters.
#cvResult.validCurve(stop=False)
#some errors at certain turn to see the descending
# Only the last expression of a cell is displayed, so the bare cv_scores line
# shows nothing; the saved output is the mean of the per-fold losses.
cv_scores
np.mean(cv_scores)

0.58505953557896273

In [15]:
#show the importance of the features
# xgbImportance is the helper defined in this notebook; no visible cell defines
# showImportance. `model` is the booster left over from the last CV fold.
xgbImportance(model,'gain')

[('bathrooms', 0.09725565800916634), ('price', 0.09499099365937312), ('price_per_room', 0.09205531264056366), ('created_hour', 0.09194288123746672), ('price_per_bed', 0.08666569315447636), ('num_photos', 0.07428368001559033), ('bedrooms', 0.07012971325635668), ('bath_per_bed', 0.061415666612366956), ('num_features', 0.06043546567125035), ('latitude', 0.058875420604080596), ('longitude', 0.05741437403240373), ('num_description_words', 0.04480527126354566), ('listing_id', 0.042255845208890516), ('created_day', 0.03503384908900436), ('created_month', 0.03244017554546452)]
