In [1]:
# Python and data manipulation stuff
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
from sklearn.metrics import r2_score

from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import TruncatedSVD



In [2]:
# K-Fold helper
def kfold_validate(clf, X, y, k_folds = 5, random_states = (420, 123, 456, 678, 666)):
    """Run repeated, shuffled K-fold cross-validation and report R2 scores.

    For every seed in ``random_states`` the data is split into ``k_folds``
    shuffled folds; ``clf`` is fitted on the training part of each split and
    scored with ``r2_score`` on the held-out part. Per-fold scores and the
    overall mean / standard deviation are printed.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is re-fitted on every fold, so it is left fitted on the last
        training split when the function returns.
    X : array-like, indexed by row. Converted with ``np.asarray`` so that
        DataFrames can be passed as well as arrays.
    y : array-like of targets, row-aligned with ``X``.
    k_folds : int, number of folds per repetition.
    random_states : sequence of int, one shuffle seed per repetition. The
        default reproduces the five seeds that used to be hard-coded.

    Returns
    -------
    list of float
        One R2 score per (repetition, fold), in evaluation order.
    """
    # Positional indexing below needs array semantics; ndarrays pass through unchanged.
    X = np.asarray(X)
    y = np.asarray(y)

    acc = []
    for it, seed in enumerate(random_states):
        kf = KFold(n_splits=k_folds, shuffle = True, random_state=seed)
        for count, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
            # Separate training and hold-out rows for this fold
            fold_Xtrain, fold_Xtest = X[train_idx], X[test_idx]
            fold_ytrain, fold_ytest = y[train_idx], y[test_idx]

            # Train on the fold's training rows, score on its hold-out rows
            clf.fit(fold_Xtrain, fold_ytrain)
            pred = clf.predict(fold_Xtest)
            accuracy = r2_score(fold_ytest, pred)
            acc.append(accuracy)
            print("Fold: %s of % s | iter: %s => r2_score = %s" %(count, k_folds, it, accuracy))

    # NOTE: the values are raw R2 scores; the trailing '%' is kept only so the
    # printed report stays identical to previously recorded runs.
    print("\nR2_score statistics:")
    print("Mean = %s%%" % '{0:.5f}'.format(np.mean(acc)))
    print("STD = %s%%" % '{0:.5f}'.format(np.std(acc)))
    return acc

In [3]:
# K-Fold helper
def kfold_validate_keras(clf, X, y, k_folds = 5, epochs = 250, random_states = (420, 123, 456, 678, 666)):
    """Repeated, shuffled K-fold validation for models whose ``fit`` takes
    ``epochs`` and ``validation_data`` keyword arguments.

    Works like ``kfold_validate`` but forwards ``epochs`` to ``clf.fit`` and
    passes the fold's hold-out rows as ``validation_data``.

    Parameters
    ----------
    clf : model exposing ``fit(X, y, epochs=..., validation_data=...)`` and
        ``predict(X)``.
    X : array-like, indexed by row (converted with ``np.asarray``).
    y : array-like of targets, row-aligned with ``X``.
    k_folds : int, number of folds per repetition.
    epochs : int, forwarded to ``clf.fit`` (default 250, the value that used
        to be hard-coded).
    random_states : sequence of int, one shuffle seed per repetition.

    Returns
    -------
    list of float
        One R2 score per (repetition, fold), in evaluation order.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    acc = []
    for it, seed in enumerate(random_states):
        kf = KFold(n_splits=k_folds, shuffle = True, random_state=seed)
        for count, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
            # Separate training and hold-out rows for this fold
            fold_Xtrain, fold_Xtest = X[train_idx], X[test_idx]
            fold_ytrain, fold_ytest = y[train_idx], y[test_idx]

            # NOTE(review): the same clf object is fitted on every fold. If its
            # fit() continues from the current weights rather than starting
            # over, later folds begin from a model that has already seen their
            # hold-out rows -- TODO confirm, and rebuild the model per fold if so.
            clf.fit(fold_Xtrain,fold_ytrain, epochs=epochs,
                    validation_data=(fold_Xtest, fold_ytest))
            pred = clf.predict(fold_Xtest)
            accuracy = r2_score(fold_ytest, pred)
            acc.append(accuracy)
            print("Fold: %s of % s | iter: %s => r2_score = %s" %(count, k_folds, it, accuracy))

    # NOTE: the values are raw R2 scores; the trailing '%' is kept only so the
    # printed report stays identical to previously recorded runs.
    print("\nR2_score statistics:")
    print("Mean = %s%%" % '{0:.5f}'.format(np.mean(acc)))
    print("STD = %s%%" % '{0:.5f}'.format(np.std(acc)))
    return acc

In [4]:
# https://www.kaggle.com/eikedehling/stack-of-svm-elasticnet-xgboost-rf-0-55
class Ensemble(object):
    """Two-level stacking ensemble.

    Each base model produces out-of-fold predictions on the training rows and
    fold-averaged predictions on the test rows; a second-level ``stacker`` is
    then trained on those predictions.
    """

    def __init__(self, n_splits, stacker, base_models, random_state=420):
        """
        Parameters
        ----------
        n_splits : int, number of folds used to build out-of-fold predictions.
        stacker : estimator with ``fit``/``predict`` trained on the base
            models' predictions.
        base_models : sequence of estimators with ``fit``/``predict``.
        random_state : int, seed for the shuffled fold split. Defaults to 420,
            the value that used to be hard-coded.
        """
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        self.random_state = random_state

    def fit_predict(self, X, y, T):
        """Fit the stack on ``(X, y)`` and return stacker predictions for ``T``.

        Parameters
        ----------
        X : array-like, training features (one row per sample).
        y : array-like, training targets; flattened to 1-D.
        T : array-like, test features with the same columns as ``X``.

        Returns
        -------
        Predictions of the stacker for the rows of ``T``.

        Side effects: prints per-fold R2 scores, runs ``kfold_validate`` on the
        stacker, and leaves every base model fitted on the last fold's
        training rows and the stacker fitted on all out-of-fold predictions.
        """
        X = np.array(X)
        y = np.array(y).ravel()
        T = np.array(T)

        # The same folds are reused for every base model so that column i of
        # S_train holds out-of-fold predictions over identical splits.
        splitter = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        folds = list(splitter.split(X, y))

        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))   # out-of-fold predictions
        S_test = np.zeros((T.shape[0], n_models))    # fold-averaged test predictions
        acc = []
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]

                clf.fit(X_train, y_train)
                # Flatten so that models returning an (n, 1) column can still
                # be written into a single column of S_train / S_test_i.
                y_pred = np.asarray(clf.predict(X_holdout)).ravel()
                accuracy = r2_score(y_holdout, y_pred)
                print("Model %d fold %d score %f" % (i, j, accuracy))
                acc.append(accuracy)

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = np.asarray(clf.predict(T)).ravel()
            # Average the per-fold test predictions of this base model
            S_test[:, i] = S_test_i.mean(axis=1)

        print("\nR2_score statistics for Models:")
        print("Mean = %s%%" % '{0:.5f}'.format(np.mean(acc)))
        print("STD = %s%%" % '{0:.5f}'.format(np.std(acc)))
        print("\nStarting kFold for stacked models")
        kfold_validate(self.stacker, S_train, y)

        # Train on all data
        self.stacker.fit(S_train, y)
        res = self.stacker.predict(S_test)
        return res

In [5]:
# Test validation helper
def lb_probing(pred):
    """Print the R2 score of ``pred`` against the probed leaderboard labels.

    Reads ``data/lb_probing.csv`` (columns ``id`` and ``y`` are used) and, for
    every probed id, takes the first ``y`` in ``pred`` whose ``ID`` matches.
    ``pred`` must therefore expose ``ID`` and ``y`` columns and contain every
    probed id; a missing id raises IndexError.
    """
    probing = pd.read_csv('data/lb_probing.csv')
    values = np.array([pred.y[pred.ID == idp].values[0] for idp in probing.id])
    print('lb probing score = %s' % r2_score(probing.y, values))

In [6]:
# Load the raw training and test tables
df_train, df_test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

In [7]:
# Insert probed data
# Each row of the probing file gives a test-set `id` and a known `y` for it.
probing = pd.read_csv('data/lb_probing.csv')

# For every probed id, take the matching test row(s) and attach the probed
# target. `assign` returns a new frame, so nothing is written into a slice of
# df_test (the in-place write used before triggered SettingWithCopyWarning).
probed_rows = [
    df_test[df_test.ID == idp].assign(y=list(probing[probing.id == idp]['y']))
    for idp in probing.id
]

# Append all labelled test rows to the training set with a single concat,
# in probing-file order, instead of re-copying df_train once per probed id.
df_train = pd.concat([df_train] + probed_rows, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
# Get the mean for y across repeated rows
# Feature columns = every column except the row ID and the target.
filter_col = df_train.columns.tolist()
for non_feature in ('ID', 'y'):
    filter_col.remove(non_feature)

# Rows whose feature values occur more than once (all occurrences are kept)
is_repeated = df_train.duplicated(subset=filter_col, keep=False)
repeated = df_train[is_repeated]

# One row per distinct feature combination; the remaining columns (ID and y)
# are replaced by their group mean.
mean_repeated = repeated.groupby(filter_col, as_index=False).mean()

In [9]:
# Remove repeated rows
print(df_train.shape)
filter_col = df_train.columns.tolist()
for non_feature in ('ID', 'y'):
    filter_col.remove(non_feature)
# keep=False drops every member of a duplicated group, not just the extras
df_train = df_train.drop_duplicates(subset=filter_col, keep=False)
print(df_train.shape)

# Merge with filtered rows: add back one averaged row per duplicated group
df_train = pd.concat([df_train, mean_repeated], axis=0)
print(df_train.shape)

(4241, 378)
(3720, 378)
(3939, 378)


In [10]:
# Magic feature from Cro-Magnon
# meanx0 = mean training target of the row's X0 category.
uniquex0 = list(df_train['X0'].unique())
dict_meanx0 = {}
# Starting values: train rows start from their own y, test rows from the
# overall training mean (which stays in place for X0 values unseen in train).
df_train['meanx0'] = df_train.y
df_test['meanx0'] = np.repeat(np.mean(df_train.y), len(df_test))

for x in uniquex0:
    # Plain boolean arrays select rows by position, which stays well-defined
    # even though df_train carries repeated index labels after the concats.
    train_mask = (df_train['X0'] == x).values
    test_mask = (df_test['X0'] == x).values

    meanx0 = np.mean(df_train.loc[train_mask, 'y'])
    dict_meanx0[x] = meanx0
    # Single .loc assignment instead of df[col][mask] = value: the chained form
    # raises SettingWithCopyWarning and is not guaranteed to write to the frame.
    df_train.loc[train_mask, 'meanx0'] = meanx0
    df_test.loc[test_mask, 'meanx0'] = meanx0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [11]:
# Remove y outlier (currently disabled)
#df_train = df_train[df_train.y < 170]
#print(df_train.shape)

# Remove columns with 0 variance in train
# describe() is transposed so each summarised column becomes a row with a 'std' entry.
desc = df_train.describe().T
zerovar = desc.index[desc['std'] == 0].tolist()
# The same columns are dropped from test so both frames keep matching features
df_train = df_train.drop(zerovar, axis=1)
df_test = df_test.drop(zerovar, axis=1)
print(df_train.shape)

(3939, 367)


In [12]:
# group train and test
# num_train marks where the training rows end inside the stacked frame
num_train = df_train.shape[0]
df_all = pd.concat([df_train, df_test], axis=0)
print(df_all.shape)

(8148, 367)


In [13]:
# Drop the separate frames; df_all now holds both
del df_train, df_test

In [14]:
# Get the object features
# Columns with dtype 'object' go to obj_features, every other column to int_features.
obj_features = [c for c in df_all.columns if df_all[c].dtype == 'object']
int_features = [c for c in df_all.columns if not (df_all[c].dtype == 'object')]

In [15]:
# Label encoder
# Replace every object column with integer codes. The encoder is fitted on the
# stacked train+test frame, so both parts share one mapping per column.
for c in df_all.columns:
    if df_all[c].dtype == 'object':
        lbl = LabelEncoder()
        df_all[c] = lbl.fit_transform(list(df_all[c].values))

# Split back into train and test at the boundary recorded in num_train.
# Explicit copies: new columns are added to both frames afterwards, and writing
# into a bare slice of df_all raises SettingWithCopyWarning.
df_train = df_all[:num_train].copy()
df_test = df_all[num_train:].drop(['y'], axis=1).copy()

In [16]:
# Insert PCA/ICA/tSVD/GRP/SRP
n_comp = 12
rstate = 420

# Every projection is fitted on the training frame without the target and then
# applied to the test frame.
# NOTE(review): only 'y' is dropped here, so the ID column is part of the
# projected input -- confirm that this is intended.
train_features = df_train.drop(["y"], axis=1)

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=rstate)
tsvd_results_train = tsvd.fit_transform(train_features)
tsvd_results_test = tsvd.transform(df_test)

# PCA
pca = PCA(n_components=n_comp, random_state=rstate)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(df_test)

# ICA
ica = FastICA(n_components=n_comp, random_state=rstate)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(df_test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=rstate)
grp_results_train = grp.fit_transform(train_features)
grp_results_test = grp.transform(df_test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=rstate)
srp_results_train = srp.fit_transform(train_features)
srp_results_test = srp.transform(df_test)

# Append decomposition components to datasets
# (column prefix, train projection, test projection); the tuple order fixes the
# order in which the new columns are appended for each component number.
projections = (
    ('pca_', pca2_results_train, pca2_results_test),
    ('ica_', ica2_results_train, ica2_results_test),
    ('tsvd_', tsvd_results_train, tsvd_results_test),
    ('grp_', grp_results_train, grp_results_test),
    ('srp_', srp_results_train, srp_results_test),
)
for i in range(1, n_comp+1):
    for prefix, train_proj, test_proj in projections:
        # Column names are 1-based, array columns 0-based
        df_train[prefix + str(i)] = train_proj[:, i-1]
        df_test[prefix + str(i)] = test_proj[:, i-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [17]:
# Build the arrays fed to the models. `.values` replaces DataFrame.as_matrix(),
# which was deprecated in pandas 0.23 and removed in pandas 1.0.
x_train = df_train.drop(['ID', 'y'], axis=1).values
# Selecting [['y']] keeps the target as a single-column 2-D array, the same
# shape the previous filter('y') call produced.
y_train = df_train[['y']].values
x_test = df_test.drop(['ID'], axis=1).values
# Test IDs converted to Python ints; used as the ID column of the result frames
id_test = df_test['ID'].apply(int)

In [18]:
# XGBoost
xgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 2000,
    'max_depth':3,
    'subsample': 1,
    'colsample_bytree':0.5,
    'reg_alpha':0.5,
    'reg_lambda':0.5
}
xgb_clf = xgb.XGBRegressor(**xgb_params)

# Repeated K-fold R2 on the training data
kfold_validate(xgb_clf, x_train, y_train)

# Fit in all data and use on test
pred_xgb = xgb_clf.fit(x_train, y_train).predict(x_test)
result = pd.DataFrame({'ID': id_test, 'y': pred_xgb})

# R2 against the probed leaderboard labels
lb_probing(result)

Fold: 1 of 5 | iter: 0 => r2_score = 0.594437661011
Fold: 2 of 5 | iter: 0 => r2_score = 0.605623625926
Fold: 3 of 5 | iter: 0 => r2_score = 0.61134325288
Fold: 4 of 5 | iter: 0 => r2_score = 0.427625708461
Fold: 5 of 5 | iter: 0 => r2_score = 0.56992572574
Fold: 1 of 5 | iter: 1 => r2_score = 0.56647830996
Fold: 2 of 5 | iter: 1 => r2_score = 0.59951911192
Fold: 3 of 5 | iter: 1 => r2_score = 0.56801595912
Fold: 4 of 5 | iter: 1 => r2_score = 0.477892400675
Fold: 5 of 5 | iter: 1 => r2_score = 0.614418919698
Fold: 1 of 5 | iter: 2 => r2_score = 0.466636893721
Fold: 2 of 5 | iter: 2 => r2_score = 0.57475744934
Fold: 3 of 5 | iter: 2 => r2_score = 0.642564744861
Fold: 4 of 5 | iter: 2 => r2_score = 0.601846932282
Fold: 5 of 5 | iter: 2 => r2_score = 0.546386240523
Fold: 1 of 5 | iter: 3 => r2_score = 0.572963398528
Fold: 2 of 5 | iter: 3 => r2_score = 0.637794243616
Fold: 3 of 5 | iter: 3 => r2_score = 0.55470900886
Fold: 4 of 5 | iter: 3 => r2_score = 0.506570561497
Fold: 5 of 5 | iter

In [19]:
# RandomForest
rf_params = {
    'n_estimators': 200,
    'n_jobs': -1,
    # Fixed seed so the CV scores and test predictions are reproducible on a
    # re-run; 420 is the seed already used for the projections and the
    # stacking folds.
    'random_state': 420
}

rf_clf = RandomForestRegressor(**rf_params)
# Repeated K-fold R2 on the training data (target flattened to 1-D)
kfold_validate(rf_clf, x_train, y_train.ravel())

# Fit in all data and use on test
rf_clf.fit(x_train, y_train.ravel())
pred_rf = rf_clf.predict(x_test)
result = pd.DataFrame({'ID': id_test, 'y': pred_rf})

# R2 against the probed leaderboard labels
lb_probing(result)

Fold: 1 of 5 | iter: 0 => r2_score = 0.550573155865
Fold: 2 of 5 | iter: 0 => r2_score = 0.579084048823
Fold: 3 of 5 | iter: 0 => r2_score = 0.55632780785
Fold: 4 of 5 | iter: 0 => r2_score = 0.441539996482
Fold: 5 of 5 | iter: 0 => r2_score = 0.555697415821
Fold: 1 of 5 | iter: 1 => r2_score = 0.521310789633
Fold: 2 of 5 | iter: 1 => r2_score = 0.58309477478
Fold: 3 of 5 | iter: 1 => r2_score = 0.536598545498
Fold: 4 of 5 | iter: 1 => r2_score = 0.455628773711
Fold: 5 of 5 | iter: 1 => r2_score = 0.538301498614
Fold: 1 of 5 | iter: 2 => r2_score = 0.459418685414
Fold: 2 of 5 | iter: 2 => r2_score = 0.518987287198
Fold: 3 of 5 | iter: 2 => r2_score = 0.596795201914
Fold: 4 of 5 | iter: 2 => r2_score = 0.575626962025
Fold: 5 of 5 | iter: 2 => r2_score = 0.496385641613
Fold: 1 of 5 | iter: 3 => r2_score = 0.544170753711
Fold: 2 of 5 | iter: 3 => r2_score = 0.600708825195
Fold: 3 of 5 | iter: 3 => r2_score = 0.492672583627
Fold: 4 of 5 | iter: 3 => r2_score = 0.462890959554
Fold: 5 of 5 |

In [20]:
# Ridge
# The former 'normalize': False entry is omitted: False was the default, and
# the keyword was deprecated in scikit-learn 1.0 and removed in 1.2, so
# passing it makes the constructor fail on current releases.
ridge_params = {
    'alpha': 100,
    'solver':'auto'
}

ridge_clf = Ridge(**ridge_params)
# Repeated K-fold R2 on the training data (target flattened to 1-D)
kfold_validate(ridge_clf, x_train, y_train.ravel())

# Fit in all data and use on test
ridge_clf.fit(x_train, y_train.ravel())
pred = ridge_clf.predict(x_test)
result = pd.DataFrame({'ID': id_test, 'y': pred})
# R2 against the probed leaderboard labels
lb_probing(result)

Fold: 1 of 5 | iter: 0 => r2_score = 0.609448641918
Fold: 2 of 5 | iter: 0 => r2_score = 0.622929795048
Fold: 3 of 5 | iter: 0 => r2_score = 0.651448715867
Fold: 4 of 5 | iter: 0 => r2_score = 0.437819334988
Fold: 5 of 5 | iter: 0 => r2_score = 0.581308449489
Fold: 1 of 5 | iter: 1 => r2_score = 0.570727233685
Fold: 2 of 5 | iter: 1 => r2_score = 0.607540783572
Fold: 3 of 5 | iter: 1 => r2_score = 0.581886587623
Fold: 4 of 5 | iter: 1 => r2_score = 0.492140927373
Fold: 5 of 5 | iter: 1 => r2_score = 0.629790104207
Fold: 1 of 5 | iter: 2 => r2_score = 0.471711563798
Fold: 2 of 5 | iter: 2 => r2_score = 0.586805989708
Fold: 3 of 5 | iter: 2 => r2_score = 0.654805066067
Fold: 4 of 5 | iter: 2 => r2_score = 0.606983670013
Fold: 5 of 5 | iter: 2 => r2_score = 0.56777269049
Fold: 1 of 5 | iter: 3 => r2_score = 0.582219452361
Fold: 2 of 5 | iter: 3 => r2_score = 0.650541051933
Fold: 3 of 5 | iter: 3 => r2_score = 0.565735560383
Fold: 4 of 5 | iter: 3 => r2_score = 0.516645470793
Fold: 5 of 5 

In [21]:
# Stack the XGBoost, Ridge and RandomForest models under an ElasticNet stacker
meta_model = ElasticNet(l1_ratio=0.1, alpha=1.4)
stack = Ensemble(
    n_splits=5,
    stacker=meta_model,
    base_models=(xgb_clf, ridge_clf, rf_clf),
)

# Stacker predictions for the test rows
pred = stack.fit_predict(x_train, y_train, x_test)

sub = pd.DataFrame({'ID': id_test, 'y': pred})
sub.head()

Model 0 fold 0 score 0.594438
Model 0 fold 1 score 0.605624
Model 0 fold 2 score 0.611343
Model 0 fold 3 score 0.427626
Model 0 fold 4 score 0.569926
Model 1 fold 0 score 0.609449
Model 1 fold 1 score 0.622930
Model 1 fold 2 score 0.651449
Model 1 fold 3 score 0.437819
Model 1 fold 4 score 0.581308
Model 2 fold 0 score 0.545980
Model 2 fold 1 score 0.585385
Model 2 fold 2 score 0.560410
Model 2 fold 3 score 0.434529
Model 2 fold 4 score 0.561122

R2_score statistics for Models:
Mean = 0.55996%
STD = 0.06844%

Starting kFold for stacked models
Fold: 1 of 5 | iter: 0 => r2_score = 0.608411800466
Fold: 2 of 5 | iter: 0 => r2_score = 0.629022694318
Fold: 3 of 5 | iter: 0 => r2_score = 0.644581919768
Fold: 4 of 5 | iter: 0 => r2_score = 0.444911903254
Fold: 5 of 5 | iter: 0 => r2_score = 0.585462363659
Fold: 1 of 5 | iter: 1 => r2_score = 0.587909900804
Fold: 2 of 5 | iter: 1 => r2_score = 0.610606010052
Fold: 3 of 5 | iter: 1 => r2_score = 0.583581368679
Fold: 4 of 5 | iter: 1 => r2_score 

Unnamed: 0,ID,y
0,1,78.668003
1,2,96.120871
2,3,80.110741
3,4,79.37181
4,5,116.591175


In [22]:
# R2 of the stacked predictions on the probed leaderboard rows.
# NOTE: those rows were appended to the training data in the "Insert probed
# data" cell, so this is not a held-out score.
lb_probing(sub)

lb probing score = 0.701796264916


In [25]:
# Weighted blend of the stacked, RandomForest and XGBoost test predictions
# (weights 0.5 / 0.2 / 0.3), written out as the submission file.
blended_y = .5*pred + .2*pred_rf + .3*pred_xgb
final_result = pd.DataFrame({'ID': id_test, 'y': blended_y})
final_result.to_csv('ensembleHguimaresProbAveraged.csv', index=False)

In [26]:
# R2 of the blended predictions on the probed leaderboard rows.
# NOTE: those rows were appended to the training data in the "Insert probed
# data" cell, so this is not a held-out score.
lb_probing(final_result)

lb probing score = 0.817777531276
