In [2]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import  preprocessing, ensemble
#from sklearn import model_selection,
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

**XGB runner**

In [8]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels; wrapped in an ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When provided, train and test are passed to
        ``xgb.train`` as an evaluation watchlist and early stopping is enabled
        with a patience of 20 rounds. When ``None``, training runs for exactly
        ``num_rounds`` rounds with no evaluation set.
    feature_names : optional
        Accepted for call-site compatibility; not used by this function.
    seed_val : int
        Value of the XGBoost ``seed`` parameter.
    num_rounds : int
        Maximum number of boosting rounds.

    Returns
    -------
    tuple
        ``(pred_test_y, model)``: the result of ``model.predict`` on ``test_X``
        (objective is ``multi:softprob`` with ``num_class=3``) and the trained
        booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        # NOTE(review): newer XGBoost releases replaced `silent` with
        # `verbosity` -- confirm against the installed version.
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        # Validation mode: monitor both sets and stop early.
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        # `evals` is passed by keyword: newer XGBoost versions declare it
        # keyword-only, and the keyword form works on older versions too.
        model = xgb.train(plst, xgtrain, num_rounds, evals=watchlist, early_stopping_rounds=20)
    else:
        # Prediction mode: no labels available, train for the full round count.
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

**The form of `list(param.items())` is:**
[('num_class', 3),
 ('silent', 1),
 ('eval_metric', 'mlogloss'),
 ('min_child_weight', 1),
 ('subsample', 0.7),
 ('eta', 0.1),
 ('objective', 'multi:softprob'),
 ('colsample_bytree', 0.7),
 ('max_depth', 6)]
**TODO: check what input format `xgb.train` expects for its parameters.**

In [3]:
# Read the train/test listings from JSON files located under `data_path`.
data_path = "../"
train_file, test_file = data_path + "train.json", data_path + "test.json"
train_df, test_df = pd.read_json(train_file), pd.read_json(test_file)
# Show (rows, columns) for each frame, one per line.
print(train_df.shape, test_df.shape, sep="\n")

(49352, 15)
(74659, 14)


In [9]:
# Peek at the first 20 raw building_id values.
# NOTE(review): the output shows several rows with building_id "0" --
# presumably a placeholder for an unknown building; confirm.
train_df["building_id"].head(20)

10        53a5b119ba8f7b61d4e010512e0dfc85
10000     c5c8a357cba207596b04d1afd1e4f130
100004    c3ba40552e2120b0acfc3cb5730bb2aa
100007    28d9ad350afeaab8027513a3e52ac8d5
100013                                   0
100014    38a913e46c94a7f46ddf19b756a9640c
100016    3ba49a93260ca5df92fde024cb4ca61f
100020    0372927bcb6a0949613ef5bf893bbac7
100026    a7efbeb58190aa267b4a9121cd0c88c0
100027                                   0
100030                                   0
10004                                    0
100044    67c9b420da4a365bc26a6cd0ef4a5320
100048                                   0
10005                                    0
100051    bfb9405149bfff42a92980b594c28234
100052    642cc2c920512ffe2a74c28122f8b47f
100053                                   0
100055    cc4c6ae9225df6d2395c4e16c235f7ab
100058    dc3cae15729b48fec3394f9295671991
Name: building_id, dtype: object

In [10]:
# Columns fed to the model; starts with the raw numeric columns and is
# extended by later cells as engineered features are created.
features_to_use = [
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
]

In [11]:
# Derive the same engineered columns on both frames so train and test stay
# aligned. NOTE: this must run before the cell that joins the `features`
# lists into strings; after that, len() would count characters instead.
for frame in (train_df, test_df):
    # count of photos
    frame["num_photos"] = frame["photos"].apply(len)

    # count of entries in the "features" list
    frame["num_features"] = frame["features"].apply(len)

    # count of space-separated tokens in the description column
    frame["num_description_words"] = frame["description"].apply(lambda x: len(x.split(" ")))

    # convert the created column to datetime so date parts can be extracted
    frame["created"] = pd.to_datetime(frame["created"])

    # year, month, day and hour of the listing's creation timestamp
    frame["created_year"] = frame["created"].dt.year
    frame["created_month"] = frame["created"].dt.month
    frame["created_day"] = frame["created"].dt.day
    frame["created_hour"] = frame["created"].dt.hour

# Register the new columns (plus the existing listing_id column) as model
# features. Names already present are skipped so that re-running this cell
# does not add duplicate entries to features_to_use.
new_features = [
    "num_photos", "num_features", "num_description_words",
    "created_year", "created_month", "created_day",
    "listing_id", "created_hour",
]
features_to_use.extend([name for name in new_features if name not in features_to_use])

Use `preprocessing.LabelEncoder()` to integer-encode the categorical columns (this is label encoding, not one-hot encoding).

In [12]:
# High-cardinality string columns to convert into integer codes.
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for col in categorical:
    # Only object-dtype columns are encoded; anything else is skipped.
    if train_df[col].dtype != 'object':
        continue

    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)

    # Fit on train and test values together so that every value occurring
    # in either frame is known to the encoder before transforming.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)

    train_df[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)
    features_to_use.append(col)

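**Dealing with the `features` column using a bag-of-words count vectorizer**
`tr_sparse` is the sparse term-count matrix for the training data, and `te_sparse` is the testing data transformed with the vocabulary learned from the training data. Note that although the vectorizer variable is named `tfidf`, the code uses `CountVectorizer`, so no idf weighting is applied.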

In [13]:
# Flatten each listing's list of feature tags into one space-separated string;
# spaces inside a tag are replaced with underscores first.
# NOTE: not re-runnable as-is -- once `features` holds strings, iterating a
# value yields single characters rather than tags.
for frame in (train_df, test_df):
    frame['features'] = frame["features"].apply(
        lambda tags: " ".join(tag.replace(" ", "_") for tag in tags)
    )
print(train_df["features"].head())

# Despite the variable name, this is a CountVectorizer (raw term counts),
# limited to 200 vocabulary entries. The vocabulary is learned from the
# training frame only and then reused to transform the test frame.
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

10                                                         
10000     Doorman Elevator Fitness_Center Cats_Allowed D...
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100013                                              Pre-War
Name: features, dtype: object


In [17]:
# Sanity check: display the type of train_df.
type(train_df)

pandas.core.frame.DataFrame

In [19]:
# NOTE(review): `train` is not defined in any cell visible here. The recorded
# output is a Series named display_address, so this cell's source looks
# truncated or depends on leftover kernel state -- confirm and fix.
train

10        12282
10000      9080
100004    13719
100007    10866
100013    15072
100014    15194
100016    14964
100020    15223
100026    11646
100027     6459
100030    15694
10004     14685
100044     9660
100048    15380
10005     10903
100051    10791
100052     2750
100053    13340
100055    14962
100058     2752
100062    12188
100063     9584
100065    10797
100066    12060
10007     13956
100071    10901
100075    10107
100076    15369
100079     1446
100081    10066
          ...  
99915     12579
99917     14366
99919     11023
99921     12888
99923     15464
99924      8048
99931     13391
99933     14899
99935     10635
99937     15694
9994      15523
99953     15371
99956     12499
99960      8636
99961     10839
99964     10028
99965     11088
99966     10989
99979     11228
99980     11841
99982     14310
99984     10020
99986      8953
99987     11106
99988     10579
9999       9577
99991     12864
99992     15306
99993     14850
99994     12707
Name: display_address, d