In [10]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import  preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold
import re
import string

In [2]:
#try xgboost
#fucntion from SRK
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=10000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.01
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y,feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y,feature_names=feature_names)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100)
    else:
        xgtest = xgb.DMatrix(test_X,feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [11]:
#feature processing functions
#define punctutaion filter
def removePunctuation(x):
    #filter the head or tail blanks
    x = re.sub(r'^\s+',r' ',x)
    x = re.sub(r'\s+$',r' ',x)
    
    # Lowercasing all words
    x = x.lower()
    # Removing non ASCII chars, warning if you are dealing with other languages!!!!!!!!!!!!!!!
    x = re.sub(r'[^\x00-\x7f]',r' ',x)
    #change all the blank to space
    x = re.sub(r'\s',r' ',x)
    # Removing (replacing with empty spaces actually) all the punctuations
    removing = string.punctuation#.replace('-','')# except '-'
    removed = re.sub("["+removing+"]", "", x)
    #removing the line-changing
    #removed = re.sub('\\n'," ",removed)    
    return removed

#feature processing functions
def proecessStreet(address):
    #remove the building number
    pattern = re.compile('^[\d-]*[\s]+')
    street = removePunctuation(pattern.sub('',address))
    #sub the st to street
    pattern = re.compile('( st)$')
    street = pattern.sub(' street',street)
    #sub the ave to avenue
    pattern = re.compile('( ave)$')
    street = pattern.sub(' avenue',street)
    #nth -> n
    #nst -> n
    #nrd -> n
    #nnd -> n
    pattern = re.compile('(\d+)((th)|(st)|(rd)|(nd))')
    street = pattern.sub('\g<1>',street)
    #deal with the w 14 street => west 14 street
    pattern = re.compile('(w)(\s+)(\d+)')    
    street = pattern.sub('west \g<3>',street)
    #deal with the e....
    pattern = re.compile('(e)(\s+)(\d+)')    
    street = pattern.sub('east \g<3>',street)
    

    return street
    
def getStreetNumber(address):
    #get building id in the front, return -1 if their isn't
    pattern = re.compile('^([\d-]*)([\s]+)')
    try:
        number = pattern.search(address).group(1)
        return int(number)
    except:
        return -1

#from "this is a lit"s python version by rakhlin
def singleValueConvert(df1,df2,column):
    ps = df1[column].append(df2[column])
    grouped = ps.groupby(ps).size().to_frame().rename(columns={0: "size"})
    df1.loc[df1.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    df2.loc[df2.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    return df1, df2

In [4]:
def manager_skill_eval(train_df,test_df,unrank_threshold = 10):

    target_num_map = {'High':2, 'Medium':1, 'Low':0}
    temp=pd.concat([train_df.manager_id,pd.get_dummies(train_df.interest_level)], axis = 1).groupby('manager_id').mean()
     
    temp.columns = ['ManHigh','ManLow', 'ManMedium']
    
    print temp.columns
    temp['count'] = train_df.groupby('manager_id').count().iloc[:,1]
    
    temp['manager_skill'] = temp['ManHigh']*2 + temp['ManMedium']
    
    #ixes of the managers with to few sample
    unranked_managers_ixes = temp['count']<unrank_threshold
    ranked_managers_ixes = ~unranked_managers_ixes
    
    #test for using rank or unrank part for the filling values
    mean_values = temp.loc[unranked_managers_ixes, ['ManHigh','ManLow', 'ManMedium','manager_skill']].mean()
    mean_values_total = temp.loc[:, ['ManHigh','ManLow', 'ManMedium','manager_skill']].mean()
    
    #reset their values to their average
    temp.loc[unranked_managers_ixes,['ManHigh','ManLow', 'ManMedium','manager_skill']] = mean_values.values
    
    #assign the features for the train set
    new_train_df = train_df.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
    
    #assign the features for the test/val set
    new_test_df = test_df.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
    new_manager_ixes = new_test_df['ManHigh'].isnull()
    new_test_df.loc[new_manager_ixes,['ManHigh','ManLow', 'ManMedium','manager_skill']] = mean_values_total.values           
    
    return new_train_df,new_test_df

In [5]:
#lodaing data
data_path = "../../kaggleData/2sigma/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)


(49352, 15)
(74659, 14)


In [6]:
#basic numerical features
features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]

In [7]:
#some new numerical features related to the price
train_df["price_per_bath"] =  train_df["price"]*1.0/train_df["bathrooms"]
train_df["price_per_bed"] = train_df["price"]*1.0/train_df["bedrooms"]
train_df["bath_per_bed"] = train_df["bathrooms"]*1.0/train_df["bedrooms"]
train_df["price_per_room"] = train_df["price"]*1.0/(train_df["bedrooms"]+train_df["bathrooms"])

test_df["price_per_bath"] =  test_df["price"]*1.0/test_df["bathrooms"]
test_df["price_per_bed"] = test_df["price"]*1.0/test_df["bedrooms"]
test_df["bath_per_bed"] = test_df["bathrooms"]*1.0/test_df["bedrooms"]
test_df["price_per_room"] = test_df["price"]*1.0/(test_df["bedrooms"]+test_df["bathrooms"])

features_to_use.extend(["price_per_bed","bath_per_bed","price_per_room"])
#features_to_use.append('price_per_bed')

In [8]:
#some transfromed features
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# adding all these new features to use list # "listing_id",
features_to_use.extend(["num_photos", "num_features", "num_description_words","created_year","listing_id", "created_month", "created_day", "created_hour"])

In [12]:
#new feature for the street_address, use them instead of the original one
train_df["street_name"] = train_df["street_address"].apply(proecessStreet)
test_df["street_name"] = test_df["street_address"].apply(proecessStreet)

train_df["street_number"] = train_df["street_address"].apply(getStreetNumber)
test_df["street_number"] = test_df["street_address"].apply(getStreetNumber)

#features_to_use.append("street_number")

In [13]:
#dealing feature with categorical features 
"""
display_address 8826    
building_id        7585   =》many zeros in this feature
manager_id   3481
street_address 15358 =》will be 3800 if no numbers in it 
"""
categorical = ["display_address", "manager_id", "building_id", "street_address","street_name"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [14]:
#prepare for training
target_num_map = {'high':0, 'medium':1, 'low':2}

train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

KF=KFold(len(train_df),5,shuffle=True,random_state = 42)

In [15]:
#running and getting the cv from xgboost
cv_scores = []
#K-FOLD already defined.If not ,use
#KF=KFold(len(train_X),5,shuffle=True,random_state = 42)
for dev_index, val_index in KF:
        #split the orginal train set into dev_set and val_set
        dev_set, val_set = train_df.iloc[dev_index,:] , train_df.iloc[val_index,:] 
        
        #apply the function for createing some featues
        dev_set, val_set =manager_skill_eval(dev_set,val_set)
        #features_to_use.extend(['ManHigh','ManLow', 'ManMedium','manager_skill'])
        features_to_use.append('manager_skill')
        
        #filter the features
        dev_X, val_X = dev_set[features_to_use].as_matrix(), val_set[features_to_use].as_matrix()
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        
        preds, model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=features_to_use)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

Index([u'ManHigh', u'ManLow', u'ManMedium'], dtype='object')
[0]	train-mlogloss:1.03253	test-mlogloss:1.03549
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 100 rounds.
[1]	train-mlogloss:0.978559	test-mlogloss:0.983166
[2]	train-mlogloss:0.929212	test-mlogloss:0.936377
[3]	train-mlogloss:0.88844	test-mlogloss:0.897597
[4]	train-mlogloss:0.854287	test-mlogloss:0.864752
[5]	train-mlogloss:0.823735	test-mlogloss:0.836217
[6]	train-mlogloss:0.795339	test-mlogloss:0.809205
[7]	train-mlogloss:0.770389	test-mlogloss:0.78617
[8]	train-mlogloss:0.747566	test-mlogloss:0.764999
[9]	train-mlogloss:0.728947	test-mlogloss:0.747569
[10]	train-mlogloss:0.711398	test-mlogloss:0.731564
[11]	train-mlogloss:0.695337	test-mlogloss:0.71674
[12]	train-mlogloss:0.681654	test-mlogloss:0.704101
[13]	train-mlogloss:0.668332	test-mlogloss:0.692376
[14]	train-mlogloss:0.656822	test-mlogloss:0.682102
[15]	train-mlogloss:0.

In [16]:
dev_set.columns

Index([            u'bathrooms',              u'bedrooms',
                 u'building_id',               u'created',
                 u'description',       u'display_address',
                    u'features',        u'interest_level',
                    u'latitude',            u'listing_id',
                   u'longitude',            u'manager_id',
                      u'photos',                 u'price',
              u'street_address',            u'num_photos',
                u'num_features', u'num_description_words',
                u'created_year',         u'created_month',
                 u'created_day',          u'created_hour'],
      dtype='object')

In [9]:
#ananlysis by the feature importance by weight
weight = model.get_score()
total = sum(weight.values())
for key in weight:
    weight[key] = weight[key]*1.0/total
weight

{'bathrooms': 0.010285346346753424,
 'bedrooms': 0.030466314219872576,
 'building_id': 0.06857462383082553,
 'created_day': 0.046309475396502646,
 'created_hour': 0.04346278975193168,
 'created_month': 0.006015317879896977,
 'display_address': 0.0806899823776603,
 'latitude': 0.09202589128371967,
 'listing_id': 0.09887149247661652,
 'longitude': 0.07911413853870138,
 'manager_id': 0.09904093805069812,
 'num_description_words': 0.0829605530703538,
 'num_features': 0.04493696624644164,
 'num_photos': 0.04109055171478921,
 'price': 0.0970753693913515,
 'street_address': 0.07908024942388504}

In [None]:
#ananlysis by the feature importance by gain
gain = model.get_score(importance_type='gain')
total = sum(gain.values())
#for key in gain:
#    gain[key] = gain[key]*1.0/total
gain

In [11]:
#ananlysis by the feature importance by coverage
cover = model.get_score(importance_type='cover')
total = sum(cover.values())
for key in cover:
    cover[key] = cover[key]*1.0/total
cover

{'bathrooms': 0.15003324661763429,
 'bedrooms': 0.11847222747849985,
 'building_id': 0.05966144646752775,
 'created_day': 0.027908091350767217,
 'created_hour': 0.04913703475375256,
 'created_month': 0.015463921187964249,
 'display_address': 0.051917534421511584,
 'latitude': 0.062329192852910546,
 'listing_id': 0.05823796559748455,
 'longitude': 0.05796867229011468,
 'manager_id': 0.0658834209429622,
 'num_description_words': 0.04385875263322271,
 'num_features': 0.05493240649113651,
 'num_photos': 0.053803480057786596,
 'price': 0.07955324745771991,
 'street_address': 0.050839359399004566}