In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import  preprocessing, ensemble
from sklearn.metrics import log_loss,accuracy_score
from sklearn.cross_validation import KFold,StratifiedKFold
import re
import string
from collections import defaultdict, Counter


In [2]:
#try xgboost
#fucntion from SRK
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, \
     seed_val=0, early_stop = 20,num_rounds=10000, eta = 0.1, max_depth = 6):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = eta
    param['max_depth'] = max_depth
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y,feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y,feature_names=feature_names)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,\
        early_stopping_rounds=early_stop)
    else:
        xgtest = xgb.DMatrix(test_X,feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [3]:
#feature processing functions
#define punctutaion filter
def removePunctuation(x):
    #filter the head or tail blanks
    x = re.sub(r'^\s+',r' ',x)
    x = re.sub(r'\s+$',r' ',x)
    
    # Lowercasing all words
    x = x.lower()
    # Removing non ASCII chars, warning if you are dealing with other languages!!!!!!!!!!!!!!!
    x = re.sub(r'[^\x00-\x7f]',r' ',x)
    #change all the blank to space
    x = re.sub(r'\s',r' ',x)
    # Removing (replacing with empty spaces actually) all the punctuations
    removing = string.punctuation#.replace('-','')# except '-'
    removed = re.sub("["+removing+"]", "", x)
    #removing the line-changing
    #removed = re.sub('\\n'," ",removed)    
    return removed

#feature processing functions
def proecessStreet(address):
    #remove the building number
    pattern = re.compile('^[\d-]*[\s]+')
    street = removePunctuation(pattern.sub('',address))
    
    #sub the st to street
    pattern = re.compile('( st)$')
    street = pattern.sub(' street',street)
    
    #sub the ave to avenue
    pattern = re.compile('( ave)$')
    street = pattern.sub(' avenue',street)
    
    pattern = re.compile('(\d+)((th)|(st)|(rd)|(nd))')
    street = pattern.sub('\g<1>',street)
    
    #deal with the w 14 street => west 14 street
    pattern = re.compile('(w)(\s+)(\d+)')    
    street = pattern.sub('west \g<3>',street)
    
    #deal with the e....
    pattern = re.compile('(e)(\s+)(\d+)')    
    street = pattern.sub('east \g<3>',street)
    
    return street
    
#from "this is a lit"s python version by rakhlin
def singleValueConvert(df1,df2,column,minimum_size=5):
    ps = df1[column].append(df2[column])
    grouped = ps.groupby(ps).size().to_frame().rename(columns={0: "size"})
    df1.loc[df1.join(grouped, on=column, how="left")["size"] <= minimum_size, column] = -1
    df2.loc[df2.join(grouped, on=column, how="left")["size"] <= minimum_size, column] = -1
    return df1, df2



In [4]:
def performance_eval(train_df,test_df,feature,k,g=1,f=1,update_df =None,random = None):
    target_num_map = {'High':2, 'Medium':1, 'Low':0}
    temp=pd.concat([train_df[feature],pd.get_dummies(train_df.interest_level)], axis = 1)\
         .groupby(feature).mean()
     
    new_feature = feature+'_perf'
    
    temp.columns = ['tempHigh','tempLow', 'tempMed']
    
    temp['count'] = train_df.groupby(feature).count().iloc[:,1]
    temp["lambda"] = g / (g + np.exp((k - temp["count"] )/f))
    temp[feature+'_origin'] = temp['tempHigh']*2 + temp['tempMed']
    mean_values = temp.loc[:, feature+'_origin'].mean()
    
    temp[new_feature] = temp["lambda"]*temp[feature+'_origin']+(1-temp["lambda"])*mean_values    
    
    # Add uniform noise. Not mentioned in original paper.adding to each manager
    if random:
        temp[new_feature] *= np.random.uniform(1 - random, 1 + random, len(temp))     

    value = test_df[[feature]].join(temp, on=feature, how="left")[new_feature].fillna(mean_values)
    
    if update_df is None: update_df = test_df
    if new_feature not in update_df.columns: update_df[new_feature] = np.nan
    update_df.update(value)

In [5]:
#hcc encoding for building id instead of the performance eval
def hcc_scoring(train_df,test_df,feature,labelValue,randomize=0.01,k=5,f=1,g=1,unrank_threshold =5,update_df =None):    
    #input is the train dataframe with its labels mapped to dummies
    #such as:
    tempTrain = train_df.join(pd.get_dummies(train_df[u'interest_level']).astype(int))
    
    new_feature = '_'.join(['hcc',feature,labelValue])
    
    #take the mean  for the feature on the given featureValue which is mapped to dummies
    prob = tempTrain[labelValue].mean()
    
    #take the mean and count for each feature value
    grouped = tempTrain.groupby(feature)[labelValue].agg({'count':'size','mean':'mean'})
    
    #perform the transform for lambda and the final score
    grouped["lambda"] = g / (g + np.exp((k - grouped["count"]) / f))
    grouped[new_feature] = grouped['lambda']*grouped['mean']+(1-grouped['lambda'])*prob
    
    #get the average score for the unrank features and reset them to this average
    unrankedMean = grouped.ix[grouped['count']<unrank_threshold,new_feature].mean()
    grouped.ix[grouped['count']<unrank_threshold,new_feature] = unrankedMean
    grouped = grouped.reset_index()
    
    #adding to the test_df
    update_value  = test_df[[feature]].merge(grouped,on = feature,how='left')[new_feature].fillna(prob)
    
    if randomize : update_value *= np.random.uniform(1 - randomize, 1 + randomize, len(test_df))
        
    #adding some noise to the new 
    #print 'New feature added:'+new_feature

    if update_df is None:
        update_df = test_df
    if new_feature not in update_df.columns: 
        update_df[new_feature] = np.nan
        
    update_df.update(update_value)
    return

In [6]:
#functions for features
def featureList(train_df,test_df,limit = 0.001):
    #acquiring the feature lists
    features_in_train = train_df["features"].apply(pd.Series).unstack().reset_index(drop = True).dropna().value_counts()
    features_in_test = test_df["features"].apply(pd.Series).unstack().reset_index(drop = True).dropna().value_counts()
    
    filtered_features_in_train = features_in_train[features_in_train > limit*len(train_df)]
    filtered_features_in_test = features_in_test[features_in_test > limit*len(test_df)]
    accept_list = set(filtered_features_in_train.index).union(set(filtered_features_in_test.index))
    return accept_list

def featureMapping(train_df,test_df,feature_list):
    for feature in feature_list:
        #add the feature column for both
        #if feature in the row, then set the value for (row,feature) to 1
        train_df['with_'+feature]=train_df['features'].apply(lambda x : 1 if feature in x else 0)
        test_df['with_'+feature]=test_df['features'].apply(lambda x : 1 if feature in x else 0)
    return


In [7]:
#lodaing data
data_path = "../../kaggleData/2sigma/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)


(49352, 15)
(74659, 14)


In [8]:
#basic numerical features
features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]

In [9]:
#some new numerical features related to the price
train_df["price_per_bath"] =  (train_df["price"]*1.0/train_df["bathrooms"]).replace(np.Inf,-1)
train_df["price_per_bed"] = (train_df["price"]*1.0/train_df["bedrooms"]).replace(np.Inf,-1)
train_df["bath_per_bed"] = (train_df["bathrooms"]*1.0/train_df["bedrooms"]).replace(np.Inf,-1)
train_df["price_per_room"] = (train_df["price"]*1.0/(train_df["bedrooms"]+train_df["bathrooms"])).replace(np.Inf,-1)

test_df["price_per_bath"] =  (test_df["price"]*1.0/test_df["bathrooms"]).replace(np.Inf,-1)
test_df["price_per_bed"] = (test_df["price"]*1.0/test_df["bedrooms"]).replace(np.Inf,-1)
test_df["bath_per_bed"] = (test_df["bathrooms"]*1.0/test_df["bedrooms"]).replace(np.Inf,-1)
test_df["price_per_room"] = (test_df["price"]*1.0/(test_df["bedrooms"]+test_df["bathrooms"])).replace(np.Inf,-1)

features_to_use.extend(["price_per_bed","bath_per_bed","price_per_room"])
#features_to_use.append('price_per_bed')

In [10]:
#some transfromed features
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# adding all these new features to use list # "listing_id",
features_to_use.extend(["num_photos", "num_features", "num_description_words","created_year","listing_id", "created_month", "created_day", "created_hour"])

In [11]:
"""
new categorical data append and converting label dummies for future use
"""
#new feature for the street_address, use them instead of the original one
train_df["street_name"] = train_df["street_address"].apply(proecessStreet)
test_df["street_name"] = test_df["street_address"].apply(proecessStreet)


In [12]:
#dealing with features

#preprocessing for features
train_df["features"] = train_df["features"].apply(lambda x:["_".join(i.split(" ")).lower().strip().replace('-','_') \
                                                            for i in x])
test_df["features"] = test_df["features"].apply(lambda x:["_".join(i.split(" ")).lower().strip().replace('-','_')\
                                                          for i in x])
#create the accept list
accept_list = list(featureList(train_df,test_df,limit = 0.001))

#map the feature to dummy slots
featureMapping(train_df,test_df,accept_list)
features_to_use.extend(map(lambda x : 'with_'+x,accept_list))

In [13]:
#prepare for training
target_num_map = {'high':0, 'medium':1, 'low':2}

train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

KF=StratifiedKFold(train_y,5,shuffle=True,random_state = 42)

train_df = train_df.fillna(-1)
test_df = test_df.fillna(-1)

In [14]:
features_to_use.extend(['manager_id_perf'])
features_to_use.extend(['hcc_building_id_medium'])
features_to_use.extend(['hcc_building_id_high'])

#using these categorical features for direct labeling
categorical = ["display_address", "street_address","street_name",'manager_id']
features_to_use.extend(categorical)

In [17]:
features_to_use.remove('hcc_building_id_high')
features_to_use.remove('hcc_building_id_medium')
features_to_use.remove('street_name')

In [22]:
#features_to_use.append('building_id')
categorical.append('building_id')

In [20]:
features_to_use

['bathrooms',
 'bedrooms',
 'latitude',
 'longitude',
 'price',
 'price_per_bed',
 'bath_per_bed',
 'price_per_room',
 'num_photos',
 'num_features',
 'num_description_words',
 'created_year',
 'listing_id',
 'created_month',
 'created_day',
 'created_hour',
 u'with_exclusive',
 u'with_furnished',
 u'with_lowrise',
 u'with_common_parking/garage',
 u'with_pets_on_approval',
 u'with_terrace',
 u'with_live_in_superintendent',
 u'with_newly_renovated',
 u'with_full_time_doorman',
 u'with_duplex',
 u'with_dryer_in_unit',
 u'with_multi_level',
 u'with_garden',
 u'with_hardwood_floors',
 u'with_on_site_garage',
 u'with_fireplace',
 u'with_eat_in_kitchen',
 u'with_wifi_access',
 u'with_garage',
 u'with_subway',
 u'with_dining_room',
 u'with_view',
 u'with_publicoutdoor',
 u'with_hardwood',
 u'with_fitness_center',
 u'with_high_speed_internet',
 u'with_laundry_in_building',
 u'with_parking',
 u'with_garden/patio',
 u'with_prewar',
 u'with_on_site_laundry',
 u'with_valet',
 u'with_green_building

In [23]:
cv_scores = []

#mini_ranking = 15

for dev_index, val_index in KF:
        #split the orginal train set into dev_set and val_set
        dev_set, val_set = train_df.iloc[dev_index,:] , train_df.iloc[val_index,:] 
        
#====================================================================        
        """feature engineerings for the categorical features"""
        #fill substitute the small size values by their mean
        for f in ['display_address','manager_id','building_id','street_name']:
            dev_set,val_set  = singleValueConvert(dev_set,val_set,f,1)
        
        
        #K-FOLD evaluation for the manager skill
        
        skf=StratifiedKFold(dev_set['interest_level'],5,shuffle=True,random_state = 42)
        #dev set adding manager skill
        for feature in ['manager_id']:
            for train,test in skf:
                performance_eval(dev_set.iloc[train,:],dev_set.iloc[test,:],feature=feature,k=5,g=10,
                               update_df = dev_set,random = 0.01)
                
                
                hcc_scoring(dev_set.iloc[train,:],dev_set.iloc[test,:],'building_id','high',randomize=0.01,\
                             unrank_threshold =1,update_df =dev_set)
                hcc_scoring(dev_set.iloc[train,:],dev_set.iloc[test,:],'building_id','medium',randomize=0.01,\
                             unrank_threshold =1,update_df =dev_set)
                
            
            performance_eval(dev_set,val_set,feature=feature,k=5,g=10,random = 0.01)
            
            hcc_scoring(dev_set,val_set,'building_id','high',randomize=0.01,\
                             unrank_threshold =1)
            hcc_scoring(dev_set,val_set,'building_id','medium',randomize=0.01,\
                            unrank_threshold =1)
            
        for f in categorical:

            if dev_set[f].dtype=='object':
                #print(f)
                lbl = preprocessing.LabelEncoder()
                lbl.fit(list(dev_set[f])+list(val_set[f]))
                dev_set[f] = lbl.transform(list(dev_set[f].values))
                val_set[f] = lbl.transform(list(val_set[f].values))
        
#===================================================================
                
        #filter the features
        dev_X, val_X = dev_set[features_to_use].as_matrix(), val_set[features_to_use].as_matrix()
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        
        """
        runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, \
        seed_val=0, early_stop = 20,num_rounds=10000, eta = 0.1, max_depth = 6)
        """        
        
        preds, model = runXGB(dev_X, dev_y, val_X, val_y,\
        feature_names=features_to_use,early_stop=64,
        num_rounds = 20000, eta = 0.1,max_depth = 4)
    
        #using rf for feature choosing
        #model = ensemble.RandomForestClassifier(500,random_state = 42,class_weight='balanced')
        #model.fit(dev_X,dev_y)
        #pred_prob = model.predict_proba(val_X)
        #pred = model.predict(val_X)
            
        cv_scores.append(log_loss(val_y, preds))
        break
print cv_scores
#print accuracy_score(val_y,pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[0]	train-mlogloss:1.04187	test-mlogloss:1.04194
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 64 rounds.
[1]	train-mlogloss:0.987843	test-mlogloss:0.988401
[2]	train-mlogloss:0.943556	test-mlogloss:0.944462
[3]	train-mlogloss:0.903352	test-mlogloss:0.904668
[4]	train-mlogloss:0.869269	test-mlogloss:0.871077
[5]	train-mlogloss:0.839196	test-mlogloss:0.841235
[6]	train-mlogloss:0.812991	test-mlogloss:0.81528
[7]	train-mlogloss:0.790213	test-mlogloss:0.792859
[8]	train-mlogloss:0.770565	test-mlogloss:0.773343
[9]	train-mlogloss:0.752718	test-mlogloss:0.755686
[10]	train-mlogloss:0.737359	test-mlogloss:0.740574
[11]	train-mlogloss:0.722879	test-mlogloss:0.726218
[12]	train-mlogloss:0.709878	test-mlogloss:0.71336
[13]	train-mlogloss:0.699288	test-mlogloss:0.702913
[14]	train-mlogloss:0.689615	test-mlogloss:0.693401
[15]	train-mlogloss:0.680105	test-mlogloss:0.684221
[16]	train-mlogloss:0.671702	te

In [16]:
#features_to_use.append('manager_skill')
#categorical = ["display_address", "manager_id", "building_id", "street_address","street_name"]
#features_to_use.extend(categorical)
#features_to_use.extend(['diff_price','diff_price_per_bed','diff_price_per_bath','diff_price_per_room'])

#====================================================================        
"""feature engineerings for the categorical features"""

train_set, test_set =manager_skill_eval(train_df,test_df,\
unrank_threshold = mini_ranking)


#fill substitute the small size values by their mean
for f in categorical:
    train_set,test_set  = singleValueConvert(train_set,test_set,f,mini_ranking)

    if train_set[f].dtype=='object':
        #print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f])+list(test_df[f]))
        train_set[f] = lbl.transform(list(train_set[f].values))
        test_set[f] = lbl.transform(list(test_set[f].values))

addAvgDiff(train_set,test_set,nn=15)

#===================================================================

train_X = train_set[features_to_use]
test_X = test_set[features_to_use]

train_X_m = train_X.as_matrix()
test_X_m = test_X.as_matrix()

preds, model = runXGB(train_X_m, train_y, test_X_m, num_rounds=243)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_beta1point251-nndiff.csv", index=False)

Index([u'ManHigh', u'ManLow', u'ManMedium'], dtype='object')




In [16]:
dev_set.columns

Index([            u'bathrooms',              u'bedrooms',
                 u'building_id',               u'created',
                 u'description',       u'display_address',
                    u'features',        u'interest_level',
                    u'latitude',            u'listing_id',
                   u'longitude',            u'manager_id',
                      u'photos',                 u'price',
              u'street_address',            u'num_photos',
                u'num_features', u'num_description_words',
                u'created_year',         u'created_month',
                 u'created_day',          u'created_hour'],
      dtype='object')

In [9]:
#ananlysis by the feature importance by weight
weight = model.get_score()
total = sum(weight.values())
for key in weight:
    weight[key] = weight[key]*1.0/total
weight

{'bathrooms': 0.010285346346753424,
 'bedrooms': 0.030466314219872576,
 'building_id': 0.06857462383082553,
 'created_day': 0.046309475396502646,
 'created_hour': 0.04346278975193168,
 'created_month': 0.006015317879896977,
 'display_address': 0.0806899823776603,
 'latitude': 0.09202589128371967,
 'listing_id': 0.09887149247661652,
 'longitude': 0.07911413853870138,
 'manager_id': 0.09904093805069812,
 'num_description_words': 0.0829605530703538,
 'num_features': 0.04493696624644164,
 'num_photos': 0.04109055171478921,
 'price': 0.0970753693913515,
 'street_address': 0.07908024942388504}

In [24]:
#ananlysis by the feature importance by gain
gain = model.get_score(importance_type='gain')
gain_list = []
total = sum(gain.values())
for key in gain:
    gain[key] = gain[key]*1.0/total
    gain_list.append((key,gain[key]))
sorted(gain_list,key = lambda x:x[1],reverse = True)

[('manager_id_perf', 0.04709964072683501),
 ('with_simplex', 0.02968914818799002),
 ('with_no_fee', 0.029298257528266352),
 ('with_lowrise', 0.023912050387596282),
 ('price', 0.02202479254158485),
 ('with_furnished', 0.020903440252948775),
 ('bathrooms', 0.01792761554434289),
 ('price_per_room', 0.016951330036144553),
 ('price_per_bed', 0.015700963312388724),
 ('with_short_term_allowed', 0.014658801839222645),
 ('with_hardwood', 0.014121620081753267),
 ('with_reduced_fee', 0.014043652268649731),
 ('num_photos', 0.01368353075338423),
 ('with_hardwood_floors', 0.013151664252601943),
 ('created_hour', 0.013146475100738351),
 ('bedrooms', 0.012474472186877427),
 ('building_id', 0.012098465299834478),
 ('longitude', 0.011937894265548778),
 ('with_central_a/c', 0.011430144867299144),
 ('with_laundry_in_unit', 0.011405272794044864),
 ('latitude', 0.011275234019783935),
 ('with_high_ceiling', 0.01102655510295255),
 ('with_laundry_in_building', 0.010939565474292718),
 ('with_parking_space', 0.0

In [11]:
#ananlysis by the feature importance by coverage
cover = model.get_score(importance_type='cover')
total = sum(cover.values())
for key in cover:
    cover[key] = cover[key]*1.0/total
cover

{'bathrooms': 0.15003324661763429,
 'bedrooms': 0.11847222747849985,
 'building_id': 0.05966144646752775,
 'created_day': 0.027908091350767217,
 'created_hour': 0.04913703475375256,
 'created_month': 0.015463921187964249,
 'display_address': 0.051917534421511584,
 'latitude': 0.062329192852910546,
 'listing_id': 0.05823796559748455,
 'longitude': 0.05796867229011468,
 'manager_id': 0.0658834209429622,
 'num_description_words': 0.04385875263322271,
 'num_features': 0.05493240649113651,
 'num_photos': 0.053803480057786596,
 'price': 0.07955324745771991,
 'street_address': 0.050839359399004566}