##### PARTICIPANTES
- PABLO GARCIA MOLINA
- ANDRES MARTINEZ FUENTES
- JAVIER RIOS MONTES

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

from sklearn import svm, neighbors, tree, linear_model
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
# DATA EXTRACTION
train_df = pd.read_csv('data/TRAIN_DATA.csv', sep=';', encoding='utf_8')
final_test_df = pd.read_csv('data/TEST_INPUT.csv', sep=';', encoding='utf_8')

In [3]:
# SPLIT DATA
# Data is already sepparated between train and test sets
# but we are not provided with the testing results thus we wont use them for testing the models, only to provide final results 
# that will later be checked. Therefore, split in the train data is still needed.

features = list(train_df.columns)
target = 'WG'
features.remove(target)

X = train_df[features]
y = train_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,  
                                                    random_state=0,)

In [4]:
# PREPROCESSING
# We only work with numerical features this time

numerical_features = X_train.select_dtypes(include=['float64', 'int']).columns.to_list()
numerical_transformer = Pipeline(
    steps = [
        ("scaler", StandardScaler()),
        ("missing_values", SimpleImputer(strategy="mean")),
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
],
    remainder='passthrough'
)

In [5]:
# REGRESSION MODELS WITH PIPELINE UNION
SLR = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", linear_model.LinearRegression())]
)

SV = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", svm.SVR())]
)

Kneighbors = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", neighbors.KNeighborsRegressor())]
)

Tree = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", tree.DecisionTreeRegressor())]
)

BR = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", BaggingRegressor())]
)

In [6]:
# PARAM GRID FOR TWO HYPERPARAMETERS

SLR_param_grid = {
    'regressor__positive': [True, False]
}

SV_param_grid = {
    'regressor__kernel' : ['linear', 'rbf', 'sigmoid'],
    'regressor__C' : [1,5,10],
    'regressor__gamma': [0.001, 0.01, 1]
}

KN_param_grid = {
    'regressor__n_neighbors': [2,3,5,7,10],
}

Tree_param_grid = {
    'regressor__splitter': ['best','random'],
    'regressor__max_depth': [3,5,7,8],
    'regressor__min_samples_leaf': [1,5,10]
}

BR_param_grid = {
    'regressor__n_estimators': [5, 10, 15],
    'regressor__max_samples': [0.4, 0.5, 0.6],
    'regressor__max_features': [0.3, 0.4, 0.5],
    'regressor__bootstrap': [True, False],
    'regressor__bootstrap_features': [True, False]
}

In [7]:
# CREATTING DE GRIDS FOR EVERY MODEL

SLR_search = GridSearchCV(estimator = SLR, 
                      param_grid = SLR_param_grid, 
                      n_jobs=1, 
                      verbose=4,
                      scoring = "neg_mean_squared_error",
                      cv = 3,
                      return_train_score=True)

SV_search = GridSearchCV(estimator = SV, 
                      param_grid = SV_param_grid, 
                      n_jobs=1, 
                      verbose=4,
                      scoring = "neg_mean_squared_error",
                      cv = 3,
                      return_train_score=True)
KN_search = GridSearchCV(Kneighbors, 
                      KN_param_grid, 
                      n_jobs=1, 
                      verbose=4,
                      scoring = "neg_mean_squared_error",
                      cv = 3,
                      return_train_score=True)
Tree_search = GridSearchCV(Tree, 
                      Tree_param_grid, 
                      n_jobs=1, 
                      verbose=4,
                      scoring = "neg_mean_squared_error",
                      cv = 3,
                      return_train_score=True)
BR_search = GridSearchCV(BR,
                        BR_param_grid,
                        n_jobs=1,
                        verbose=4,
                        scoring = "neg_mean_squared_error",
                        cv = 3,
                        return_train_score=True)

In [8]:
# TRAINING THE MODELS

SLR_search.fit(X_train, y_train)
SV_search.fit(X_train,y_train)
KN_search.fit(X_train,y_train)
Tree_search.fit(X_train,y_train)
BR_search.fit(X_train,y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3] END regressor__positive=True;, score=(train=-515.184, test=-427.768) total time=   0.0s
[CV 2/3] END regressor__positive=True;, score=(train=-510.925, test=-421.645) total time=   0.0s
[CV 3/3] END regressor__positive=True;, score=(train=-393.974, test=-677.083) total time=   0.0s
[CV 1/3] END regressor__positive=False;, score=(train=-490.450, test=-415.587) total time=   0.0s
[CV 2/3] END regressor__positive=False;, score=(train=-486.214, test=-408.419) total time=   0.0s
[CV 3/3] END regressor__positive=False;, score=(train=-364.732, test=-675.910) total time=   0.0s
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV 1/3] END regressor__C=1, regressor__gamma=0.001, regressor__kernel=linear;, score=(train=-519.533, test=-441.418) total time=   0.0s
[CV 2/3] END regressor__C=1, regressor__gamma=0.001, regressor__kernel=linear;, score=(train=-513.278, test=-430.884) total time=   0.0s
[CV 3/3] END regresso

[CV 3/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=linear;, score=(train=-377.966, test=-680.052) total time=   0.1s
[CV 1/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=rbf;, score=(train=-820.381, test=-781.847) total time=   0.0s
[CV 2/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=rbf;, score=(train=-855.793, test=-815.403) total time=   0.0s
[CV 3/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=rbf;, score=(train=-796.900, test=-920.343) total time=   0.0s
[CV 1/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=sigmoid;, score=(train=-1277.773, test=-1219.237) total time=   0.0s
[CV 2/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=sigmoid;, score=(train=-1305.305, test=-1326.274) total time=   0.0s
[CV 3/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=sigmoid;, score=(train=-1331.824, test=-1397.907) total time=   0.0s
[CV 1/3] END regressor__C=10, regr

[CV 2/3] END regressor__max_depth=5, regressor__min_samples_leaf=5, regressor__splitter=best;, score=(train=-285.535, test=-520.008) total time=   0.0s
[CV 3/3] END regressor__max_depth=5, regressor__min_samples_leaf=5, regressor__splitter=best;, score=(train=-238.169, test=-794.213) total time=   0.0s
[CV 1/3] END regressor__max_depth=5, regressor__min_samples_leaf=5, regressor__splitter=random;, score=(train=-480.396, test=-511.226) total time=   0.0s
[CV 2/3] END regressor__max_depth=5, regressor__min_samples_leaf=5, regressor__splitter=random;, score=(train=-588.847, test=-549.543) total time=   0.0s
[CV 3/3] END regressor__max_depth=5, regressor__min_samples_leaf=5, regressor__splitter=random;, score=(train=-479.477, test=-985.918) total time=   0.0s
[CV 1/3] END regressor__max_depth=5, regressor__min_samples_leaf=10, regressor__splitter=best;, score=(train=-356.710, test=-500.597) total time=   0.0s
[CV 2/3] END regressor__max_depth=5, regressor__min_samples_leaf=10, regressor__s

[CV 1/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=15;, score=(train=-226.210, test=-416.338) total time=   0.0s
[CV 2/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=15;, score=(train=-257.173, test=-374.378) total time=   0.0s
[CV 3/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=15;, score=(train=-166.889, test=-609.447) total time=   0.0s
[CV 1/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_samples=0.5, regressor__n_estimators=5;, score=(train=-295.975, test=-632.111) total time=   0.0s
[CV 2/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_sampl

[CV 3/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=15;, score=(train=-155.339, test=-549.171) total time=   0.0s
[CV 1/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max_samples=0.6, regressor__n_estimators=5;, score=(train=-254.824, test=-455.999) total time=   0.0s
[CV 2/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max_samples=0.6, regressor__n_estimators=5;, score=(train=-187.964, test=-400.605) total time=   0.0s
[CV 3/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max_samples=0.6, regressor__n_estimators=5;, score=(train=-238.096, test=-798.584) total time=   0.0s
[CV 1/3] END regressor__bootstrap=True, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max_samples

[CV 1/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=10;, score=(train=-245.969, test=-405.662) total time=   0.0s
[CV 2/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=10;, score=(train=-255.699, test=-443.540) total time=   0.0s
[CV 3/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=10;, score=(train=-229.920, test=-685.556) total time=   0.0s
[CV 1/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=15;, score=(train=-224.905, test=-447.810) total time=   0.0s
[CV 2/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.3, regressor__max

[CV 1/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=15;, score=(train=-166.950, test=-349.536) total time=   0.0s
[CV 2/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=15;, score=(train=-181.524, test=-345.286) total time=   0.0s
[CV 3/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=15;, score=(train=-126.431, test=-559.147) total time=   0.0s
[CV 1/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__max_samples=0.6, regressor__n_estimators=5;, score=(train=-267.854, test=-592.015) total time=   0.0s
[CV 2/3] END regressor__bootstrap=True, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__max_

[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=5;, score=(train=-228.345, test=-465.000) total time=   0.0s
[CV 2/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=5;, score=(train=-290.765, test=-557.457) total time=   0.0s
[CV 3/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=5;, score=(train=-237.036, test=-705.843) total time=   0.0s
[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=10;, score=(train=-182.078, test=-393.398) total time=   0.0s
[CV 2/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.3, regressor__max_sa

[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=10;, score=(train=-133.672, test=-392.240) total time=   0.0s
[CV 2/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=10;, score=(train=-149.720, test=-358.218) total time=   0.0s
[CV 3/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=10;, score=(train=-110.358, test=-593.049) total time=   0.0s
[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=15;, score=(train=-126.494, test=-483.116) total time=   0.1s
[CV 2/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.4, regressor__max

[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.5, regressor__max_samples=0.6, regressor__n_estimators=15;, score=(train=-74.557, test=-312.578) total time=   0.1s
[CV 2/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.5, regressor__max_samples=0.6, regressor__n_estimators=15;, score=(train=-70.878, test=-286.263) total time=   0.1s
[CV 3/3] END regressor__bootstrap=False, regressor__bootstrap_features=True, regressor__max_features=0.5, regressor__max_samples=0.6, regressor__n_estimators=15;, score=(train=-64.379, test=-580.714) total time=   0.1s
[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.3, regressor__max_samples=0.4, regressor__n_estimators=5;, score=(train=-308.299, test=-712.388) total time=   0.0s
[CV 2/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.3, regressor__max_s

[CV 3/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__max_samples=0.4, regressor__n_estimators=15;, score=(train=-141.751, test=-566.520) total time=   0.0s
[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=5;, score=(train=-155.192, test=-455.097) total time=   0.0s
[CV 2/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=5;, score=(train=-185.221, test=-577.584) total time=   0.0s
[CV 3/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__max_samples=0.5, regressor__n_estimators=5;, score=(train=-124.502, test=-596.534) total time=   0.0s
[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.4, regressor__m

[CV 3/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.5, regressor__max_samples=0.6, regressor__n_estimators=5;, score=(train=-80.372, test=-551.148) total time=   0.0s
[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.5, regressor__max_samples=0.6, regressor__n_estimators=10;, score=(train=-72.488, test=-275.579) total time=   0.1s
[CV 2/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.5, regressor__max_samples=0.6, regressor__n_estimators=10;, score=(train=-76.183, test=-276.226) total time=   0.1s
[CV 3/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.5, regressor__max_samples=0.6, regressor__n_estimators=10;, score=(train=-68.875, test=-548.113) total time=   0.1s
[CV 1/3] END regressor__bootstrap=False, regressor__bootstrap_features=False, regressor__max_features=0.5, regressor__max

In [9]:
# BEST MODELS

print('SLR:',SLR_search.best_params_,'(best)\n',
      SLR_search.best_estimator_.score(X_train, y_train),'(train score)\n',
      SLR_search.best_estimator_.score(X_test, y_test), '(test score)\n')
print('SV:',SV_search.best_params_,'(best)\n',
      SV_search.best_estimator_.score(X_train, y_train),'(train score)\n',
      SV_search.best_estimator_.score(X_test, y_test), '(test score)\n')
print('KNN:',KN_search.best_params_, '(best)\n',
      KN_search.best_estimator_.score(X_train, y_train),'(train score)\n',
      KN_search.best_estimator_.score(X_test, y_test), '(test score)\n',)
print('Tree:',Tree_search.best_params_, '(best)\n',
      Tree_search.best_estimator_.score(X_train, y_train), '(train score)\n',
      Tree_search.best_estimator_.score(X_test, y_test), '(test score)\n',)
print('BR:',BR_search.best_params_, '(best)\n',
      BR_search.best_estimator_.score(X_train, y_train), '(train score)\n',
     BR_search.best_estimator_.score(X_test, y_test), '(test score)\n')

SLR: {'regressor__positive': False} (best)
 0.8727177201064644 (train score)
 0.8868712204466241 (test score)

SV: {'regressor__C': 10, 'regressor__gamma': 0.01, 'regressor__kernel': 'rbf'} (best)
 0.879649872542506 (train score)
 0.9053624539727857 (test score)

KNN: {'regressor__n_neighbors': 2} (best)
 0.9832214990775999 (train score)
 0.950945674867764 (test score)

Tree: {'regressor__max_depth': 7, 'regressor__min_samples_leaf': 10, 'regressor__splitter': 'best'} (best)
 0.924526414533582 (train score)
 0.9029092738847903 (test score)

BR: {'regressor__bootstrap': False, 'regressor__bootstrap_features': False, 'regressor__max_features': 0.5, 'regressor__max_samples': 0.6, 'regressor__n_estimators': 10} (best)
 0.9837812435406276 (train score)
 0.9330628073743868 (test score)



In [10]:
def root_mean_squared_error(pred, true) -> float:
    
    return np.sqrt(mean_squared_error(true,pred))

In [11]:
# MEASSURING PERFORMANCE

# RMSE

SLR_RMSE_train = root_mean_squared_error(SLR_search.best_estimator_.predict(X_train), y_train)
SLR_RMSE_test = root_mean_squared_error(SLR_search.best_estimator_.predict(X_test), y_test)
print('RMSE Simple Linear Regression-model:\n',
      SLR_RMSE_train, '(train)\n',
      SLR_RMSE_test, '(test)\n')

SV_RMSE_train = root_mean_squared_error(SV_search.best_estimator_.predict(X_train), y_train)
SV_RMSE_test = root_mean_squared_error(SV_search.best_estimator_.predict(X_test), y_test)
print('RMSE SVR-model:\n',
      SV_RMSE_train, '(train)\n',
      SV_RMSE_test, '(test)\n')

KN_RMSE_train = root_mean_squared_error(KN_search.best_estimator_.predict(X_train), y_train)
KN_RMSE_test = root_mean_squared_error(KN_search.best_estimator_.predict(X_test), y_test)
print('RMSE KNN-model:\n',
      KN_RMSE_train, '(train)\n',
      KN_RMSE_test, '(test)\n')

Tree_RMSE_train = root_mean_squared_error(Tree_search.best_estimator_.predict(X_train), y_train)
Tree_RMSE_test = root_mean_squared_error(Tree_search.best_estimator_.predict(X_test), y_test)
print('RMSE Tree-model:\n',
      Tree_RMSE_train, '(train)\n',
      Tree_RMSE_test, '(test)\n')

BR_RMSE_train = root_mean_squared_error(BR_search.best_estimator_.predict(X_train), y_train)
BR_RMSE_test = root_mean_squared_error(BR_search.best_estimator_.predict(X_test), y_test)
print('RMSE BaggingRegression-model:\n',
      BR_RMSE_train, '(train)\n',
      BR_RMSE_test, '(test)\n')

# CV error with RMSE

SLR_cv_error = np.sqrt(-SLR_search.cv_results_['mean_test_score'].mean())
print('SVR CV error:', SLR_cv_error)

SV_cv_error = np.sqrt(-SV_search.cv_results_['mean_test_score'].mean())
print('SVR CV error:', SV_cv_error)

KN_cv_error = np.sqrt(-KN_search.cv_results_['mean_test_score'].mean())
print('KNN CV error:', KN_cv_error)

Tree_cv_error = np.sqrt(-Tree_search.cv_results_['mean_test_score'].mean())
print('Tree CV error:', Tree_cv_error)

BR_cv_error = np.sqrt(-BR_search.cv_results_['mean_test_score'].mean())
print('BR CV error:', BR_cv_error)

RMSE Simple Linear Regression-model:
 21.344225369461938 (train)
 19.994443226058717 (test)

RMSE SVR-model:
 20.75485493513342 (train)
 18.287506745494547 (test)

RMSE KNN-model:
 7.7494878738374355 (train)
 13.166229371402794 (test)

RMSE Tree-model:
 16.435916115088112 (train)
 18.52301330400415 (test)

RMSE BaggingRegression-model:
 7.619126798504163 (train)
 15.380006218966706 (test)

SVR CV error: 22.458892445712923
SVR CV error: 221.86378791838032
KNN CV error: 21.294535246390435
Tree CV error: 25.490772739266113
BR CV error: 21.87923055808892


In [12]:
# BEST MODEL PREDICTIONS for TEST_INPUT

#!! We consider the best model to be KNN for Regression

model = KN_search.best_estimator_

prediction = pd.DataFrame(model.predict(final_test_df))
prediction.to_csv("AndresJavierPablo.csv" , sep=" ", header=False, index=False)