In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

## Import

In [12]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

lowest_scored_thirty = lowest_scored_thirty = ['X344', 'X20','X117','X109','X378','X45','X362','X161','X164','X61',
 'X65','X380','X154', 'X300','X77', 'X114', 'X85', 'X321', 'X195','X209', 'X206', 'X283', 'X343', 'X340', 'X376',
 'X36', 'X375', 'X264', 'X250', 'X329'
                                              ,'X3', 'X0', 'X314', 'X350', 'X315', 'X180', 'X27', 'X261', 
                             'X220', 'X321', 'X355', 'X29', 'X136']

train.drop(lowest_scored_thirty, axis=1)
test.drop(lowest_scored_thirty, axis=1)

train.head(2)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0


In [13]:
# process columns,
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values)) 
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))
        
# shape        
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

train.head(2)

Shape train: (4209, 378)
Shape test: (4209, 377)


Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,37,23,20,0,3,27,9,14,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,37,21,22,4,3,31,11,14,...,1,0,0,0,0,0,0,0,0,0


## Add decomposed components: PCA/ICA etc.

In [14]:
from sklearn.decomposition import PCA, FastICA
n_comp = 10

# PCA
pca = PCA(n_components=n_comp)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]
    
    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
    
y_train = train["y"]
y_mean = np.mean(y_train)

## Prepare regressor

In [15]:
import xgboost as xgb

# prepare dict of params for xgboost to run with
xgb_params = {
    'n_trees': 500, 
    'eta': 0.005,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

# xgboost, cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=750, # increase to have better results (~700)
                   early_stopping_rounds=50,
                   verbose_eval=50, 
                   show_stdv=False
                  )

num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)



[0]	train-rmse:12.6399	test-rmse:12.6383
[50]	train-rmse:11.0911	test-rmse:11.1536
[100]	train-rmse:10.0182	test-rmse:10.1466
[150]	train-rmse:9.29075	test-rmse:9.48826
[200]	train-rmse:8.80315	test-rmse:9.06666
[250]	train-rmse:8.47888	test-rmse:8.80221
[300]	train-rmse:8.25923	test-rmse:8.63693
[350]	train-rmse:8.09325	test-rmse:8.53666
[400]	train-rmse:7.95239	test-rmse:8.47786
[450]	train-rmse:7.83531	test-rmse:8.44204
[500]	train-rmse:7.73198	test-rmse:8.42229
[550]	train-rmse:7.64238	test-rmse:8.41071
[600]	train-rmse:7.56819	test-rmse:8.40413
[650]	train-rmse:7.49057	test-rmse:8.40174
[700]	train-rmse:7.41396	test-rmse:8.40215
667


## check f2 score

In [16]:
# check f2-score (to get higher score - increase num_boost_round in previous cell)
from sklearn.metrics import r2_score

# now fixed, correct calculation
r2_score = r2_score(dtrain.get_label(), model.predict(dtrain))
print(r2_score)

0.631978411977


In [17]:
# make predictions and save results
y_pred = model.predict(dtest)
output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': y_pred})
output.to_csv('./result/xgboost-depth{}-pca-r2={}.csv'.format(xgb_params['max_depth'], r2_score), index=False)
