In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from math import exp, log, sqrt
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [7]:
X = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [8]:
X.drop('ID', axis=1, inplace=True)
X['target'] = X['target'].apply(lambda x: log(x + 1))

In [9]:
y = X.pop('target')

In [10]:
cols_to_remove = []
for col in X.columns:
    if X[col].std() == 0:
        cols_to_remove.append(col)
        
# remove constant columns in the training set
X.drop(cols_to_remove, axis=1, inplace=True)
# remove constant columns in the test set
test.drop(cols_to_remove, axis=1, inplace=True) 

print("Removed `{}` Constant Columns\n".format(len(cols_to_remove)))

Removed `256` Constant Columns



In [12]:
cols_to_remove = []
cols_scaned = []
dups = {}

columns = X.columns
for i in range(len(columns) - 1):
    v = X[columns[i]].values
    dup_cols = []
    for j in range(i + 1, len(columns)):
        if np.array_equal(v, X[columns[j]].values):
            cols_to_remove.append(columns[j])
            if columns[j] not in cols_scaned:
                dup_cols.append(columns[j]) 
                cols_scaned.append(columns[j])
                dups[columns[i]] = dup_cols
                
# remove duplicate columns in the training set
X.drop(cols_to_remove, axis=1, inplace=True) 
# remove duplicate columns in the testing set
test.drop(cols_to_remove, axis=1, inplace=True)

print("Removed `{}` Duplicate Columns\n".format(len(dups)))

Removed `4` Duplicate Columns



In [14]:
test.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
xgb_model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=8,
    min_child_weight=5,
    gama=0.4,
    subsample=0.6,
    colsample_bytree=0.7,
    reg_alpha=1e-05,
    nthread=4,
    seed=27)
clf = GridSearchCV(xgb_model,
                   {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]},
                   scoring='neg_mean_squared_error',
                   verbose=1)
clf.fit(X, y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gama=0.4, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=8, min_child_weight=5, missing=None,
       n_estimators=100, n_jobs=1, nthread=4, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.6),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'reg_alpha': [1e-05, 0.01, 0.1, 1, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [22]:
print(clf.best_score_)
print(clf.best_params_)

-2.1868631286050007
{'reg_alpha': 1e-05}


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [29]:
xgb_model = xgb.XGBRegressor(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=5,
    gama=0.4,
    subsample=0.6,
    colsample_bytree=0.7,
    reg_alpha=1e-05,
    nthread=4)
xgb_model.fit(X_train, y_train, eval_metric='rmse')
predictions = xgb_model.predict(X_test)
print("RMSE : {0}".format(mean_squared_error(y_test, predictions)))

RMSE : 2.1367545588217056


In [15]:
submit = pd.DataFrame()
submit['ID'] = test.pop('ID')

In [16]:
xgb_model = xgb.XGBRegressor(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=5,
    gama=0.4,
    subsample=0.6,
    colsample_bytree=0.7,
    reg_alpha=1e-05,
    nthread=4)
xgb_model.fit(X, y, eval_metric='rmse')
predictions = xgb_model.predict(test)

In [17]:
submit['target'] = [exp(x) - 1 for x in predictions]
submit.to_csv('my_XGB_prediction.csv', index=False)