313549 ~ Mateusz Zacharecki \
335719 ~ Patrycja Kielan

---

# Solution for wrapper variable selection
___

# Importing libraries and data

In [8]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import sklearn.model_selection as skm
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
import xgboost
from sklearn.ensemble import VotingRegressor

In [9]:
X_train = pd.read_csv("Xtrain.csv", index_col = 0)
X_test = pd.read_csv("Xtest.csv", index_col = 0)
y_train = pd.read_csv("ytrain.csv", index_col = 0)

# Gradient Boosting

In [3]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_regression, k = 10))
])

## Fitting the whole training set

In [4]:
pipeline_GradientBoosting = Pipeline([
    ('pipeline', pipeline),
    ('rf', GradientBoostingRegressor(random_state = 313549))
])

In [5]:
params = {
    'rf__max_depth': [10, 20],
    'rf__min_samples_split': [10, 20],
    'rf__n_estimators': [100, 500]
}

grid_pipeline_GradientBoosting = GridSearchCV(pipeline_GradientBoosting,
                             cv = 3,
                             param_grid = params,
                             scoring = 'neg_mean_squared_error',
                             return_train_score = True)

In [6]:
grid_pipeline_GradientBoosting.fit(X_train, y_train.values.ravel())

In [7]:
results_GradientBoosting = pd.DataFrame({
                     'rf__max_depth': grid_pipeline_GradientBoosting.cv_results_["param_rf__max_depth"],
                     'rf__min_samples_split': grid_pipeline_GradientBoosting.cv_results_["param_rf__min_samples_split"],
                     'rf__n_estimators': grid_pipeline_GradientBoosting.cv_results_["param_rf__n_estimators"],
                     'mean_test_neg_mean_squared_error': grid_pipeline_GradientBoosting.cv_results_["mean_test_score"],
                     'mean_train_neg_mean_squared_error': grid_pipeline_GradientBoosting.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_GradientBoosting.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_GradientBoosting.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_GradientBoosting.cv_results_["rank_test_score"]})
results_GradientBoosting.sort_values(by = 'rank_test')

Unnamed: 0,rf__max_depth,rf__min_samples_split,rf__n_estimators,mean_test_neg_mean_squared_error,mean_train_neg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
2,10,20,100,-96.139989,-70.428201,0.723603,0.829315,1
0,10,10,100,-96.348517,-68.074239,0.557877,0.64349,2
3,10,20,500,-98.04721,-48.470106,0.672436,1.300583,3
1,10,10,500,-98.696663,-43.085563,0.531264,0.973318,4
6,20,20,100,-102.099766,-9.5325,0.554837,0.658234,5
7,20,20,500,-102.965802,-1.571226,0.560051,0.200897,6
4,20,10,100,-103.247474,-3.887118,0.58354,0.457258,7
5,20,10,500,-103.725165,-0.281209,0.634849,0.044699,8


In [8]:
y_pred = pd.DataFrame(grid_pipeline_GradientBoosting.predict(X_test))

In [16]:
pd.DataFrame(y_pred).to_csv("GradientBoosting_wrapper.txt", sep='\t', index=False, header=False)

# SVM

In [9]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_regression, k = 10))
])

## Fitting the whole training set

In [10]:
pipeline_SVM = Pipeline([
    ('pipeline', pipeline),
    ('svm', SVR(C = 10, gamma = 0.1, kernel = 'rbf'))
])

In [11]:
params = {}

grid_pipeline_SVM = GridSearchCV(pipeline_SVM,
                             cv = skm.KFold(3, random_state = 313549, shuffle=True),
                             param_grid = params,
                             scoring = 'neg_mean_squared_error',
                             return_train_score = True)

In [12]:
grid_pipeline_SVM.fit(X_train, y_train.values.ravel())

In [21]:
results_SVM = pd.DataFrame({
                     'mean_test_neg_mean_squared_error': grid_pipeline_SVM.cv_results_["mean_test_score"],
                     'mean_train_neg_mean_squared_error': grid_pipeline_SVM.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_SVM.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_SVM.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_SVM.cv_results_["rank_test_score"]})
results_SVM.sort_values(by = 'rank_test')

Unnamed: 0,mean_test_neg_mean_squared_error,mean_train_neg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
0,-100.317654,-96.007635,1.050979,0.834881,1


In [15]:
y_pred = pd.DataFrame(grid_pipeline_SVM.predict(X_test))

In [16]:
pd.DataFrame(y_pred).to_csv("SVM_wrapper.txt", sep='\t', index=False, header=False)

# Neural network

In [3]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_regression, k = 10))
])

## Fitting the whole training set

In [4]:
pipeline_NN = Pipeline([
    ('pipeline', pipeline),
    ('nn', MLPRegressor(max_iter = 10000, random_state = 313549))
])

In [5]:
params = {
    "nn__hidden_layer_sizes": [(4), (16), (4, 4)],
    "nn__solver": ['sgd', 'adam']
}

In [6]:
grid_pipeline_NN = GridSearchCV(pipeline_NN,
                      cv = skm.KFold(3, random_state = 313549, shuffle=True),
                      param_grid = params,
                      scoring = 'neg_mean_squared_error',
                      return_train_score = True)

In [7]:
grid_pipeline_NN.fit(X_train, y_train.values.ravel())

  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  activations[i + 1] += self.intercepts_[i]
6 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework

In [8]:
results_NN = pd.DataFrame({'nn__hidden_layer_sizes': grid_pipeline_NN.cv_results_["param_nn__hidden_layer_sizes"],
                           'nn__solver': grid_pipeline_NN.cv_results_["param_nn__solver"],
                     'mean_test_reg_mean_squared_error': grid_pipeline_NN.cv_results_["mean_test_score"],
                     'mean_train_reg_mean_squared_error': grid_pipeline_NN.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_NN.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_NN.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_NN.cv_results_["rank_test_score"]})
results_NN.sort_values(by = 'rank_test')

Unnamed: 0,nn__hidden_layer_sizes,nn__solver,mean_test_reg_mean_squared_error,mean_train_reg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
3,16,adam,-107.367121,-106.9055,1.640403,0.446652,1
1,4,adam,-107.5074,-107.268924,0.982421,0.497676,2
5,"(4, 4)",adam,-107.999863,-107.483372,1.525122,1.267789,3
4,"(4, 4)",sgd,-120.205519,-120.203947,1.291545,0.643717,4
0,4,sgd,,,,,5
2,16,sgd,,,,,5


In [9]:
y_pred = pd.DataFrame(grid_pipeline_NN.predict(X_test))

In [11]:
pd.DataFrame(y_pred).to_csv("neural_network_wrapper.txt", sep='\t', index=False, header=False)

# XGBoost

In [44]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_regression, k = 10))
])

## Fitting the whole training set

In [45]:
pipeline_xgb = Pipeline([
    ('pipeline', pipeline),
    ('xgb', xgboost.XGBRegressor(objective='reg:squarederror', n_estimators = 400, random_state = 313549))
])

In [46]:
params = {
    "xgb__eta": [0.3, 0.1, 0.03],
    "xgb__gamma": [0, 0.1, 1],
    "xgb__max_depth": [10, 20]
}

In [47]:
grid_pipeline_xgb = GridSearchCV(pipeline_xgb,
                      cv = skm.KFold(3, random_state = 313549, shuffle=True),
                      param_grid = params,
                      scoring = 'neg_mean_squared_error',
                      return_train_score = True)

In [48]:
grid_pipeline_xgb.fit(X_train, y_train.values.ravel())

In [49]:
results_xgb = pd.DataFrame({'xgb__eta': grid_pipeline_xgb.cv_results_["param_xgb__eta"],
                            'xgb__gamma': grid_pipeline_xgb.cv_results_["param_xgb__gamma"],
                            'xgb__max_depth': grid_pipeline_xgb.cv_results_["param_xgb__gamma"],
                     'mean_test_neg_mean_squared_error': grid_pipeline_xgb.cv_results_["mean_test_score"],
                     'mean_train_neg_mean_squared_error': grid_pipeline_xgb.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_xgb.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_xgb.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_xgb.cv_results_["rank_test_score"]})
results_xgb.sort_values(by = 'rank_test')

Unnamed: 0,xgb__eta,xgb__gamma,xgb__max_depth,mean_test_neg_mean_squared_error,mean_train_neg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
12,0.03,0.0,0.0,-95.492134,-57.27743,0.768948,0.2716224,1
14,0.03,0.1,0.1,-95.515884,-56.38812,0.760737,0.2163447,2
16,0.03,1.0,1.0,-95.522867,-56.68517,0.747911,0.3430517,3
8,0.1,0.1,0.1,-99.306513,-21.53591,0.785009,0.1423646,4
10,0.1,1.0,1.0,-99.423151,-21.86765,0.956674,0.2985198,5
6,0.1,0.0,0.0,-99.45636,-22.02728,0.861136,0.2936143,6
17,0.03,1.0,1.0,-101.567039,-0.3535385,1.360038,0.0118194,7
13,0.03,0.0,0.0,-101.682125,-0.0263562,1.510452,0.001629226,8
15,0.03,0.1,0.1,-101.781368,-0.06914997,1.309725,0.007365,9
11,0.1,1.0,1.0,-103.006277,-0.3603421,1.529827,0.01478811,10


In [50]:
y_pred = pd.DataFrame(grid_pipeline_xgb.predict(X_test))

In [51]:
pd.DataFrame(y_pred).to_csv("xgboost_wrapper.txt", sep='\t', index=False, header=False)

# Linear regression

In [22]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_regression, k = 10))
])

## Fitting the whole training set

In [23]:
pipeline_lm = Pipeline([
    ('pipeline', pipeline),
    ('lm', LinearRegression())
])

In [24]:
params = {}

In [25]:
grid_pipeline_lm = GridSearchCV(pipeline_lm,
                      cv = skm.KFold(3, random_state = 313549, shuffle=True),
                      param_grid = params,
                      scoring = 'neg_mean_squared_error',
                      return_train_score = True)

In [26]:
grid_pipeline_lm.fit(X_train, y_train.values.ravel())

In [27]:
results_lm = pd.DataFrame({
                     'mean_test_neg_mean_squared_error': grid_pipeline_lm.cv_results_["mean_test_score"],
                     'mean_train_neg_mean_squared_error': grid_pipeline_lm.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_lm.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_lm.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_lm.cv_results_["rank_test_score"]})
results_lm.sort_values(by = 'rank_test')

Unnamed: 0,mean_test_neg_mean_squared_error,mean_train_neg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
0,-107.057824,-107.008256,1.109383,0.556895,1


In [28]:
y_pred = pd.DataFrame(grid_pipeline_lm.predict(X_test))

In [29]:
pd.DataFrame(y_pred).to_csv("linear_regression_wrapper.txt", sep='\t', index=False, header=False)

# Voting

In [15]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_regression, k = 10))
])

## Fitting the whole data set

In [16]:
reg1 = SVR(C = 10, gamma = 0.1, kernel = 'rbf')
reg2 = MLPRegressor(hidden_layer_sizes = 16, solver = 'adam', max_iter = 10000, random_state = 313549)
reg3 = xgboost.XGBRegressor(eta = 0.1, gamma = 0, max_depth = 10, objective='reg:squarederror', n_estimators = 400, random_state = 313549)

pipeline_voting = Pipeline([
    ('pipeline', pipeline),
    ('eclf', VotingRegressor(estimators=[('svm', reg1),
                             ('nn', reg2),
                             ('xgb', reg3)]))
])

In [17]:
reg1 = SVR(C = 10, gamma = 0.1, kernel = 'rbf')
reg2 = MLPRegressor(hidden_layer_sizes = 16, solver = 'adam', max_iter = 10000, random_state = 313549)
reg3 = xgboost.XGBRegressor(eta = 0.03, gamma = 0, max_depth = 10, objective='reg:squarederror', n_estimators = 400, random_state = 313549)

pipeline_voting = Pipeline([
    ('pipeline', pipeline),
    ('eclf', VotingRegressor(estimators=[('svm', reg1),
                             ('nn', reg2),
                             ('xgb', reg3)]))
])

In [18]:
params = {}

grid_pipeline_voting = GridSearchCV(pipeline_voting,
                      cv = skm.KFold(3, random_state = 313549, shuffle=True),
                      param_grid = params,
                      scoring = 'neg_mean_squared_error',
                      return_train_score = True)

In [19]:
grid_pipeline_voting.fit(X_train, y_train.values.ravel())

In [20]:
results_voting = pd.DataFrame({
                     'mean_test_neg_mean_squared_error': grid_pipeline_voting.cv_results_["mean_test_score"],
                     'mean_train_neg_mean_squared_error': grid_pipeline_voting.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_voting.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_voting.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_voting.cv_results_["rank_test_score"]})
results_voting.sort_values(by = 'rank_test')

Unnamed: 0,mean_test_neg_mean_squared_error,mean_train_neg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
0,-96.185113,-80.506998,1.200426,0.248202,1


In [21]:
y_pred = pd.DataFrame(grid_pipeline_voting.predict(X_test))

In [22]:
pd.DataFrame(y_pred).to_csv("voting_wrapper.txt", sep='\t', index=False, header=False)