In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import  preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold

import re
import string

In [2]:
#try xgboost
#fucntion from SRK
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=10000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y,feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y,feature_names=feature_names)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100)
    else:
        xgtest = xgb.DMatrix(test_X,feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [3]:
#lodaing data
data_path = "../../kaggleData/2sigma/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)

(49352, 15)
(74659, 14)


In [4]:
#feature processing functions
#define punctutaion filter
def removePunctuation(x):
    #filter the head or tail blanks
    x = re.sub(r'^\s+',r' ',x)
    x = re.sub(r'\s+$',r' ',x)
    
    # Lowercasing all words
    x = x.lower()
    # Removing non ASCII chars, warning if you are dealing with other languages!!!!!!!!!!!!!!!
    x = re.sub(r'[^\x00-\x7f]',r' ',x)
    #change all the blank to space
    x = re.sub(r'\s',r' ',x)
    # Removing (replacing with empty spaces actually) all the punctuations
    removing = string.punctuation#.replace('-','')# except '-'
    removed = re.sub("["+removing+"]", "", x)
    #removing the line-changing
    #removed = re.sub('\\n'," ",removed)    
    return removed

#feature processing functions
def proecessStreet(address):
    #remove the building number
    pattern = re.compile('^[\d-]*[\s]+')
    street = removePunctuation(pattern.sub('',address))
    #sub the st to street
    pattern = re.compile('( st)$')
    street = pattern.sub(' street',street)
    #sub the ave to avenue
    pattern = re.compile('( ave)$')
    street = pattern.sub(' avenue',street)
    #nth -> n
    #nst -> n
    #nrd -> n
    #nnd -> n
    pattern = re.compile('(\d+)((th)|(st)|(rd)|(nd))')
    street = pattern.sub('\g<1>',street)
    #deal with the w 14 street => west 14 street
    pattern = re.compile('(w)(\s+)(\d+)')    
    street = pattern.sub('west \g<3>',street)
    #deal with the e....
    pattern = re.compile('(e)(\s+)(\d+)')    
    street = pattern.sub('east \g<3>',street)
    

    return street
    
def getStreetNumber(address):
    #get building id in the front, return -1 if their isn't
    pattern = re.compile('^([\d-]*)([\s]+)')
    try:
        number = pattern.search(address).group(1)
        return int(number)
    except:
        return -1

#from "this is a lit"s python version by rakhlin
def singleValueConvert(df1,df2,column):
    ps = df1[column].append(df2[column])
    grouped = ps.groupby(ps).size().to_frame().rename(columns={0: "size"})
    df1.loc[df1.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    df2.loc[df2.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    return df1, df2

In [5]:
#basic numerical features
features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]

In [6]:
#some transfromed features
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# adding all these new features to use list # "listing_id",
features_to_use.extend(["num_photos", "num_features", "num_description_words","created_year","listing_id", "created_month", "created_day", "created_hour"])

In [7]:
#new feature for the street_address, use them instead of the original one
train_df["street_name"] = train_df["street_address"].apply(proecessStreet)
test_df["street_name"] = test_df["street_address"].apply(proecessStreet)

train_df["street_number"] = train_df["street_address"].apply(getStreetNumber)
test_df["street_number"] = test_df["street_address"].apply(getStreetNumber)

#features_to_use.append("street_number")

In [8]:
#dealing feature with categorical features 
#used street name instead of the original one 
"""
display_address 8826    
building_id        7585   =》many zeros in this feature
manager_id   3481
street_address 15358 =》will be 3800 if no numbers in it 
"""

"""
notice the encoding here, the latter using of the mse might have some leakage here
"""
categorical = ["display_address", "manager_id", "building_id", "street_name","street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f) singleValueConvert
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [9]:
#prepare for training
target_num_map = {'high':0, 'medium':1, 'low':2}

train_X = train_df[features_to_use]
test_X = test_df[features_to_use]

train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

KF=KFold(len(train_X),5,shuffle=True,random_state = 42)

In [10]:
#running and getting the cv from xgboost
cv_scores = []
#K-FOLD already defined.If not ,use
#KF=KFold(len(train_X),5,shuffle=True,random_state = 42)
for dev_index, val_index in KF:
        #apply the 
        dev_X, val_X = singleValueConvert(train_X.iloc[dev_index,:], train_X.iloc[val_index,:],"street_name")
        dev_X, val_X = dev_X.as_matrix(), val_X.as_matrix()
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=list(list(train_X.columns)))
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


[0]	train-mlogloss:1.04023	test-mlogloss:1.04166
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 100 rounds.
[1]	train-mlogloss:0.989301	test-mlogloss:0.992057
[2]	train-mlogloss:0.948296	test-mlogloss:0.952305
[3]	train-mlogloss:0.912601	test-mlogloss:0.91761
[4]	train-mlogloss:0.881667	test-mlogloss:0.887873
[5]	train-mlogloss:0.854977	test-mlogloss:0.862252
[6]	train-mlogloss:0.830246	test-mlogloss:0.838474
[7]	train-mlogloss:0.807561	test-mlogloss:0.816734
[8]	train-mlogloss:0.787538	test-mlogloss:0.797688
[9]	train-mlogloss:0.769792	test-mlogloss:0.780715
[10]	train-mlogloss:0.7532	test-mlogloss:0.765174
[11]	train-mlogloss:0.738002	test-mlogloss:0.750977
[12]	train-mlogloss:0.723967	test-mlogloss:0.737607
[13]	train-mlogloss:0.712004	test-mlogloss:0.72644
[14]	train-mlogloss:0.702339	test-mlogloss:0.717478
[15]	train-mlogloss:0.694389	test-mlogloss:0.710228
[16]	train-mlogloss:0.686341	tes

In [11]:
#ananlysis by the feature importance by weight
weight = model.get_score()
total = sum(weight.values())
for key in weight:
    weight[key] = weight[key]*1.0/total
weight

{'bathrooms': 0.009726033007084647,
 'bedrooms': 0.03134854007125599,
 'building_id': 0.0631680248986445,
 'created_day': 0.04344977271796552,
 'created_hour': 0.04060362832220812,
 'created_month': 0.004402309676890945,
 'display_address': 0.0681231827675171,
 'latitude': 0.08544575944960892,
 'listing_id': 0.09801793685245096,
 'longitude': 0.07838158810762111,
 'manager_id': 0.09347229616282403,
 'num_description_words': 0.07682542282648758,
 'num_features': 0.04471927597362709,
 'num_photos': 0.0371636840165445,
 'price': 0.09797698513452639,
 'street_address': 0.0709693271632745,
 'street_name': 0.05620623285146812}

In [21]:
#ananlysis by the feature importance by gain(totalgain)
gain = model.get_score(importance_type='gain')
for key in gain:
    gain[key] = gain[key]*1.0*weight[key]
#nomalization
total = sum(gain.values())
for key in gain:
    gain[key] = gain[key]/total


{'bathrooms': 0.01561773881543633,
 'bedrooms': 0.06306273987282299,
 'building_id': 0.08936735374733595,
 'created_day': 0.02479431084778622,
 'created_hour': 0.04818489496080039,
 'created_month': 0.0022583931002279605,
 'display_address': 0.04904422976286214,
 'latitude': 0.08126442059297162,
 'listing_id': 0.06747384894889424,
 'longitude': 0.0737795193043773,
 'manager_id': 0.07419525000016156,
 'num_description_words': 0.05830242075268398,
 'num_features': 0.042606638430017205,
 'num_photos': 0.04099400630463816,
 'price': 0.1786693356528965,
 'street_address': 0.050279983782891294,
 'street_name': 0.0401049151231962}

In [25]:
sorted_gain = []
for key in gain:
    sorted_gain.append((key,gain[key]))
sorted_gain = sorted(sorted_gain,key = lambda x:x[1])

In [26]:
sorted_gain

[('created_month', 0.0022583931002279605),
 ('bathrooms', 0.01561773881543633),
 ('created_day', 0.02479431084778622),
 ('street_name', 0.0401049151231962),
 ('num_photos', 0.04099400630463816),
 ('num_features', 0.042606638430017205),
 ('created_hour', 0.04818489496080039),
 ('display_address', 0.04904422976286214),
 ('street_address', 0.050279983782891294),
 ('num_description_words', 0.05830242075268398),
 ('bedrooms', 0.06306273987282299),
 ('listing_id', 0.06747384894889424),
 ('longitude', 0.0737795193043773),
 ('manager_id', 0.07419525000016156),
 ('latitude', 0.08126442059297162),
 ('building_id', 0.08936735374733595),
 ('price', 0.1786693356528965)]