In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import  preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold




In [37]:
#lodaing data
data_path = "../"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)


(49352, 15)
(74659, 14)


In [35]:
print len(train_df['listing_id'].unique())

49352


In [38]:
#basic numerical features
features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]

In [86]:
#some transfromed features
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# adding all these new features to use list # "listing_id",
features_to_use.extend(["num_photos", "num_features", "num_description_words","created_year", "created_month", "created_day", "created_hour"])

In [103]:
#prepare for training
target_num_map = {'high':0, 'medium':1, 'low':2}

train_X = train_df[features_to_use]
test_X = test_df[features_to_use]

train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

KF=KFold(len(train_X),5,shuffle=True,random_state = 42)

In [91]:
#cross-validation
cv_scores = []

#using entropy
for dev_index, val_index in KF:
        dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        
        #random forest us
        rf30 = ensemble.RandomForestClassifier(40,'entropy',random_state = 42,class_weight='balanced',n_jobs =4)
        rf30.fit(dev_X,dev_y)
        preds = rf30.predict_proba(val_X)
        
        cv_scores.append(log_loss(val_y, preds))

print(cv_scores)
print np.mean(cv_scores)

[0.74837249223604518, 0.79366237125772254, 0.79842061493694294, 0.77195708244100802, 0.78614071073619329]
0.779710654322


In [92]:
#cross-validation
cv_scores = []

#usning gini
for dev_index, val_index in KF:
        dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        
        #random forest us
        rf30 = ensemble.RandomForestClassifier(40,random_state = 42,class_weight='balanced',n_jobs =4)
        rf30.fit(dev_X,dev_y)
        preds = rf30.predict_proba(val_X)
        
        cv_scores.append(log_loss(val_y, preds))

print(cv_scores)
print np.mean(cv_scores)

[0.77877718302774535, 0.81746625031163511, 0.8264890405946187, 0.76887150006849192, 0.79372930664408248]
0.797066656129


In [63]:
rf30 = ensemble.RandomForestClassifier(40,random_state = 42,class_weight='balanced')
y_train_prediceted=rf30.fit_transform(train_df[features_to_use],train_df[u'interest_level'])
#using random forest testing the importance
for i in range(len(features_to_use)):
    print "The importance of "+features_to_use[i]+":"+str(rf30.feature_importances_[i])



The importance of bathrooms:0.00956833183615
The importance of bedrooms:0.0340186837628
The importance of latitude:0.069102380505
The importance of longitude:0.0655026445719
The importance of price:0.12589901976
The importance of num_photos:0.0253333787294
The importance of num_features:0.0226569119691
The importance of num_description_words:0.0442462638534
The importance of created_year:0.0
The importance of created_month:0.00639664716156
The importance of created_day:0.0276721388561
The importance of created_hour:0.0218762746722
The importance of num_photos:0.02580651389
The importance of num_features:0.0228404045006
The importance of num_description_words:0.0427328686161
The importance of created_year:0.0
The importance of created_month:0.00610253560645
The importance of created_day:0.0284776448726
The importance of listing_id:0.0515252657779
The importance of created_hour:0.019956686499
The importance of num_photos:0.025644014719
The importance of num_features:0.022731499198
The im

In [93]:
#try xgboost
#fucntion from SRK
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [101]:
#running and getting the cv from xgboost
cv_scores = []
#K-FOLD already defined.If not ,use
#KF=KFold(len(train_X),5,shuffle=True,random_state = 42)
for dev_index, val_index in KF:
        dev_X, val_X = train_X.iloc[dev_index,:].as_matrix(), train_X.iloc[val_index,:].as_matrix()
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

[0]	train-mlogloss:1.03929	test-mlogloss:1.04042
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:0.989238	test-mlogloss:0.991651
[2]	train-mlogloss:0.949522	test-mlogloss:0.952915
[3]	train-mlogloss:0.911372	test-mlogloss:0.915763
[4]	train-mlogloss:0.88116	test-mlogloss:0.886343
[5]	train-mlogloss:0.85213	test-mlogloss:0.858297
[6]	train-mlogloss:0.83001	test-mlogloss:0.837036
[7]	train-mlogloss:0.807276	test-mlogloss:0.815203
[8]	train-mlogloss:0.788864	test-mlogloss:0.797492
[9]	train-mlogloss:0.77175	test-mlogloss:0.781211
[10]	train-mlogloss:0.756505	test-mlogloss:0.766717
[11]	train-mlogloss:0.742759	test-mlogloss:0.75366
[12]	train-mlogloss:0.730125	test-mlogloss:0.741775
[13]	train-mlogloss:0.719421	test-mlogloss:0.731629
[14]	train-mlogloss:0.709882	test-mlogloss:0.722846
[15]	train-mlogloss:0.701253	test-mlogloss:0.714845
[16]	train-mlogloss:0.693011	test-

In [104]:
#output the outcome - using xgboost
train_X_m = train_X.as_matrix()
test_X_m = test_X.as_matrix()

preds, model = runXGB(train_X_m, train_y, test_X_m, num_rounds=400)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_alpha.csv", index=False)