313549 ~ Mateusz Zacharecki \
335719 ~ Patrycja Kielan

---

# Solution for filter variable selection
___

# Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import sklearn.model_selection as skm
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import f_regression
import xgboost
from sklearn.ensemble import VotingRegressor

In [2]:
X_train = pd.read_csv("Xtrain.csv", index_col=0)
X_test = pd.read_csv("Xtest.csv", index_col=0)
y_train = pd.read_csv("ytrain.csv", index_col=0)

# SVM

In [3]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', RFE(DecisionTreeRegressor(), n_features_to_select = 10))
])

## Fitting the whole training set

In [4]:
pipeline_SVM = Pipeline([
    ('pipeline', pipeline),
    ('svm', SVR(C = 10, gamma = 0.1, kernel = 'rbf'))
])

In [5]:
params = {}

grid_pipeline_SVM = GridSearchCV(pipeline_SVM,
                             cv = skm.KFold(3, random_state = 313549, shuffle=True),
                             param_grid = params,
                             scoring = 'neg_mean_squared_error',
                             return_train_score = True)

In [6]:
grid_pipeline_SVM.fit(X_train, y_train.values.ravel())

In [7]:
results_SVM = pd.DataFrame({
                     'mean_test_neg_mean_squared_error': grid_pipeline_SVM.cv_results_["mean_test_score"],
                     'mean_train_neg_mean_squared_error': grid_pipeline_SVM.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_SVM.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_SVM.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_SVM.cv_results_["rank_test_score"]})
results_SVM.sort_values(by = 'rank_test')

Unnamed: 0,mean_test_neg_mean_squared_error,mean_train_neg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
0,-91.965795,-88.401561,1.360694,0.536007,1


In [8]:
y_pred = pd.DataFrame(grid_pipeline_SVM.predict(X_test))

In [9]:
pd.DataFrame(y_pred).to_csv("SVM_filter.txt", sep='\t', index=False, header=False)

# Neural network

In [10]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', RFE(DecisionTreeRegressor(), n_features_to_select = 10))
])

## Fitting the whole training set

In [11]:
pipeline_NN = Pipeline([
    ('pipeline', pipeline),
    ('nn', MLPRegressor(max_iter = 10000, random_state = 313549))
])

In [12]:
params = {
    "nn__hidden_layer_sizes": [(4), (16), (4, 4)],
    "nn__solver": ['sgd', 'adam']
}

In [13]:
grid_pipeline_NN = GridSearchCV(pipeline_NN,
                      cv = skm.KFold(3, random_state = 313549, shuffle=True),
                      param_grid = params,
                      scoring = 'neg_mean_squared_error',
                      return_train_score = True)

In [14]:
grid_pipeline_NN.fit(X_train, y_train.values.ravel())

  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
6 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_s

In [15]:
results_NN = pd.DataFrame({'nn__hidden_layer_sizes': grid_pipeline_NN.cv_results_["param_nn__hidden_layer_sizes"],
                           'nn__solver': grid_pipeline_NN.cv_results_["param_nn__solver"],
                     'mean_test_neg_mean_squared_error': grid_pipeline_NN.cv_results_["mean_test_score"],
                     'mean_train_neg_mean_squared_error': grid_pipeline_NN.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_NN.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_NN.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_NN.cv_results_["rank_test_score"]})
results_NN.sort_values(by = 'rank_test')

Unnamed: 0,nn__hidden_layer_sizes,nn__solver,mean_test_neg_mean_squared_error,mean_train_neg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
3,16,adam,-97.075715,-95.801997,0.704792,0.828179,1
5,"(4, 4)",adam,-97.582967,-96.913593,0.910608,1.600758,2
1,4,adam,-98.810745,-98.005208,0.667451,0.558792,3
4,"(4, 4)",sgd,-120.205519,-120.203947,1.291545,0.643717,4
0,4,sgd,,,,,5
2,16,sgd,,,,,5


In [16]:
y_pred = pd.DataFrame(grid_pipeline_NN.predict(X_test))

In [17]:
pd.DataFrame(y_pred).to_csv("neural_network_filter.txt", sep='\t', index=False, header=False)

# XGBoost

In [18]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', RFE(DecisionTreeRegressor(), n_features_to_select = 10))
])

## Fitting the whole training set

In [19]:
pipeline_xgb = Pipeline([
    ('pipeline', pipeline),
    ('xgb', xgboost.XGBRegressor(objective='reg:squarederror', n_estimators = 400, random_state = 313549))
])

In [20]:
params = {
    "xgb__eta": [0.3, 0.1, 0.03],
    "xgb__gamma": [0, 0.1, 1],
    "xgb__max_depth": [10, 20]
}

In [21]:
grid_pipeline_xgb = GridSearchCV(pipeline_xgb,
                      cv = skm.KFold(3, random_state = 313549, shuffle=True),
                      param_grid = params,
                      scoring = 'neg_mean_squared_error',
                      return_train_score = True)

In [22]:
grid_pipeline_xgb.fit(X_train, y_train.values.ravel())

In [23]:
results_xgb = pd.DataFrame({'xgb__eta': grid_pipeline_xgb.cv_results_["param_xgb__eta"],
                            'xgb__gamma': grid_pipeline_xgb.cv_results_["param_xgb__gamma"],
                            'xgb__max_depth': grid_pipeline_xgb.cv_results_["param_xgb__gamma"],
                     'mean_test_neg_mean_squared_error': grid_pipeline_xgb.cv_results_["mean_test_score"],
                     'mean_train_neg_mean_squared_error': grid_pipeline_xgb.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_xgb.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_xgb.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_xgb.cv_results_["rank_test_score"]})
results_xgb.sort_values(by = 'rank_test')

Unnamed: 0,xgb__eta,xgb__gamma,xgb__max_depth,mean_test_neg_mean_squared_error,mean_train_neg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
14,0.03,0.1,0.1,-87.838587,-50.366537,0.716135,0.6284519,1
12,0.03,0.0,0.0,-87.905485,-49.922177,1.348044,1.232092,2
16,0.03,1.0,1.0,-88.044292,-49.998999,1.150912,0.5073095,3
10,0.1,1.0,1.0,-91.270553,-20.186102,0.908666,0.3300522,4
8,0.1,0.1,0.1,-91.298745,-19.98143,0.612752,0.3089159,5
6,0.1,0.0,0.0,-91.540592,-19.629446,1.177897,0.6785791,6
15,0.03,0.1,0.1,-94.932879,-0.0442,1.478876,0.0004680466,7
17,0.03,1.0,1.0,-95.185724,-0.26428,1.858987,0.0007946987,8
13,0.03,0.0,0.0,-95.356176,-0.009596,1.758309,0.001318463,9
11,0.1,1.0,1.0,-95.714377,-0.283231,1.499643,0.01089431,10


In [24]:
y_pred = pd.DataFrame(grid_pipeline_xgb.predict(X_test))

In [25]:
pd.DataFrame(y_pred).to_csv("xgboost_filter.txt", sep='\t', index=False, header=False)

# Linear regression

In [26]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', RFE(DecisionTreeRegressor(), n_features_to_select = 10))
])

## Fitting the whole training set

In [27]:
pipeline_lm = Pipeline([
    ('pipeline', pipeline),
    ('lm', LinearRegression())
])

In [28]:
params = {}

In [29]:
grid_pipeline_lm = GridSearchCV(pipeline_lm,
                      cv = skm.KFold(3, random_state = 313549, shuffle=True),
                      param_grid = params,
                      scoring = 'neg_mean_squared_error',
                      return_train_score = True)

In [30]:
grid_pipeline_lm.fit(X_train, y_train.values.ravel())

In [31]:
results_lm = pd.DataFrame({
                     'mean_test_neg_mean_squared_error': grid_pipeline_lm.cv_results_["mean_test_score"],
                     'mean_train_neg_mean_squared_error': grid_pipeline_lm.cv_results_["mean_train_score"],
                     'std_test_neg_mean_squared_error': grid_pipeline_lm.cv_results_["std_test_score"],
                     'std_train_neg_mean_squared_error': grid_pipeline_lm.cv_results_["std_train_score"],
                     'rank_test': grid_pipeline_lm.cv_results_["rank_test_score"]})
results_lm.sort_values(by = 'rank_test')

Unnamed: 0,mean_test_neg_mean_squared_error,mean_train_neg_mean_squared_error,std_test_neg_mean_squared_error,std_train_neg_mean_squared_error,rank_test
0,-97.819109,-97.806987,0.700217,0.782308,1


In [32]:
y_pred = pd.DataFrame(grid_pipeline_lm.predict(X_test))

In [33]:
pd.DataFrame(y_pred).to_csv("linear_regression_filter.txt", sep='\t', index=False, header=False)