In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

from sklearn import svm, neighbors, tree
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor # ???

from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
# DATA EXTRACTION
train_df = pd.read_csv('data/TRAIN_DATA.csv', sep=';', encoding='utf_8')
final_test_df = pd.read_csv('data/TEST_INPUT.csv', sep=';', encoding='utf_8')

In [3]:
# SPLIT DATA
# Data is already sepparated between train and test sets
# but we are not provided with the testing results thus we wont use them for testing the models, only to provide final results 
# that will later be checked. Therefore, split in the train data is still needed.

features = list(train_df.columns)
target = 'WG'
features.remove(target)

X = train_df[features]
y = train_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,  
                                                    random_state=0,)

In [4]:
# PREPROCESSING
# We only work with numerical features this time

numerical_features = X_train.select_dtypes(include=['float64', 'int']).columns.to_list()
numerical_transformer = Pipeline(
    steps = [
        ("scaler", StandardScaler()),
        ("missing_values", SimpleImputer(strategy="mean")),
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
],
    remainder='passthrough'
)

In [5]:
# REGRESSION MODELS WITH PIPELINE UNION
SV = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", svm.SVR())]
)

Kneighbors = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", neighbors.KNeighborsRegressor())]
)

Tree = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", tree.DecisionTreeRegressor())]
)


In [6]:
# PARAM GRID FOR TWO HYPERPARAMETERS

SV_param_grid = {
    'regressor__kernel' : ['linear', 'rbf', 'sigmoid'],
    'regressor__C' : [1,5,10],
    'regressor__gamma': [0.001, 0.01, 1]
}

KN_param_grid = {
    'regressor__n_neighbors': [2,3,5,7,10],
}

Tree_param_grid = {
    'regressor__splitter': ['best','random'],
    'regressor__max_depth': [3,5,7,8],
    'regressor__min_samples_leaf': [1,5,10]
}

In [56]:
# CREATTING DE GRIDS FOR EVERY MODEL

SV_search = GridSearchCV(estimator = SV, 
                      param_grid = SV_param_grid, 
                      n_jobs=1, 
                      verbose=4,
                      scoring = "neg_mean_squared_error",
                      cv = 3,
                      return_train_score=True)
KN_search = GridSearchCV(Kneighbors, 
                      KN_param_grid, 
                      n_jobs=1, 
                      verbose=4,
                      scoring = "neg_mean_squared_error",
                      cv = 3,
                      return_train_score=True)
Tree_search = GridSearchCV(Tree, 
                      Tree_param_grid, 
                      n_jobs=1, 
                      verbose=4,
                      scoring = "neg_mean_squared_error",
                      cv = 3,
                      return_train_score=True)

In [57]:
# TRAINING THE MODELS

SV_search.fit(X_train,y_train)
KN_search.fit(X_train,y_train)
Tree_search.fit(X_train,y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV 1/3] END regressor__C=1, regressor__gamma=0.001, regressor__kernel=linear;, score=(train=-519.533, test=-441.418) total time=   0.0s
[CV 2/3] END regressor__C=1, regressor__gamma=0.001, regressor__kernel=linear;, score=(train=-513.278, test=-430.884) total time=   0.0s
[CV 3/3] END regressor__C=1, regressor__gamma=0.001, regressor__kernel=linear;, score=(train=-388.368, test=-649.823) total time=   0.0s
[CV 1/3] END regressor__C=1, regressor__gamma=0.001, regressor__kernel=rbf;, score=(train=-3072.714, test=-2917.879) total time=   0.0s
[CV 2/3] END regressor__C=1, regressor__gamma=0.001, regressor__kernel=rbf;, score=(train=-3042.228, test=-3261.698) total time=   0.0s
[CV 3/3] END regressor__C=1, regressor__gamma=0.001, regressor__kernel=rbf;, score=(train=-3131.418, test=-3088.450) total time=   0.0s
[CV 1/3] END regressor__C=1, regressor__gamma=0.001, regressor__kernel=sigmoid;, score=(train=-3403.324, test=-3234.738)

[CV 1/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=sigmoid;, score=(train=-1277.773, test=-1219.237) total time=   0.0s
[CV 2/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=sigmoid;, score=(train=-1305.305, test=-1326.274) total time=   0.0s
[CV 3/3] END regressor__C=10, regressor__gamma=0.001, regressor__kernel=sigmoid;, score=(train=-1331.824, test=-1397.907) total time=   0.0s
[CV 1/3] END regressor__C=10, regressor__gamma=0.01, regressor__kernel=linear;, score=(train=-514.846, test=-427.007) total time=   0.1s
[CV 2/3] END regressor__C=10, regressor__gamma=0.01, regressor__kernel=linear;, score=(train=-513.191, test=-433.161) total time=   0.1s
[CV 3/3] END regressor__C=10, regressor__gamma=0.01, regressor__kernel=linear;, score=(train=-377.966, test=-680.052) total time=   0.0s
[CV 1/3] END regressor__C=10, regressor__gamma=0.01, regressor__kernel=rbf;, score=(train=-484.133, test=-453.573) total time=   0.0s
[CV 2/3] END regressor__C=10, re

[CV 1/3] END regressor__max_depth=5, regressor__min_samples_leaf=10, regressor__splitter=random;, score=(train=-589.110, test=-559.328) total time=   0.0s
[CV 2/3] END regressor__max_depth=5, regressor__min_samples_leaf=10, regressor__splitter=random;, score=(train=-525.799, test=-540.149) total time=   0.0s
[CV 3/3] END regressor__max_depth=5, regressor__min_samples_leaf=10, regressor__splitter=random;, score=(train=-413.779, test=-877.115) total time=   0.0s
[CV 1/3] END regressor__max_depth=7, regressor__min_samples_leaf=1, regressor__splitter=best;, score=(train=-93.096, test=-457.464) total time=   0.0s
[CV 2/3] END regressor__max_depth=7, regressor__min_samples_leaf=1, regressor__splitter=best;, score=(train=-101.738, test=-575.431) total time=   0.0s
[CV 3/3] END regressor__max_depth=7, regressor__min_samples_leaf=1, regressor__splitter=best;, score=(train=-85.987, test=-846.413) total time=   0.0s
[CV 1/3] END regressor__max_depth=7, regressor__min_samples_leaf=1, regressor__sp

In [58]:
# BEST MODELS

print('SV:',SV_search.best_params_,'(best)\n',
      SV_search.best_estimator_.score(X_train, y_train),'(train score)\n',
      SV_search.best_estimator_.score(X_test, y_test), '(test score)\n')
print('KNN:',KN_search.best_params_, '(best)\n',
      KN_search.best_estimator_.score(X_train, y_train),'(train score)\n',
      KN_search.best_estimator_.score(X_test, y_test), '(test score)\n',)
print('Tree:',Tree_search.best_params_, '(best)\n',
      Tree_search.best_estimator_.score(X_train, y_train), '(train score)\n',
      Tree_search.best_estimator_.score(X_test, y_test), '(test score)\n',)

SV: {'regressor__C': 10, 'regressor__gamma': 0.01, 'regressor__kernel': 'rbf'} (best)
 0.879649872542506 (train score)
 0.9053624539727857 (test score)

KNN: {'regressor__n_neighbors': 2} (best)
 0.9832214990775999 (train score)
 0.950945674867764 (test score)

Tree: {'regressor__max_depth': 7, 'regressor__min_samples_leaf': 5, 'regressor__splitter': 'random'} (best)
 0.9112151277107325 (train score)
 0.871149436709918 (test score)



In [25]:
def root_mean_squared_error(pred, true) -> float:
    
    return np.sqrt(mean_squared_error(true,pred))

In [66]:
# MEASSURING PERFORMANCE

# RMSE

SV_RMSE_train = root_mean_squared_error(SV_search.best_estimator_.predict(X_train), y_train)
SV_RMSE_test = root_mean_squared_error(SV_search.best_estimator_.predict(X_test), y_test)
print('RMSE SVR-model:\n',
      SV_RMSE_train, '(train)\n',
      SV_RMSE_test, '(test)\n')

KN_RMSE_train = root_mean_squared_error(KN_search.best_estimator_.predict(X_train), y_train)
KN_RMSE_test = root_mean_squared_error(KN_search.best_estimator_.predict(X_test), y_test)
print('RMSE KNN-model:\n',
      KN_RMSE_train, '(train)\n',
      KN_RMSE_test, '(test)\n')

Tree_RMSE_train = root_mean_squared_error(Tree_search.best_estimator_.predict(X_train), y_train)
Tree_RMSE_test = root_mean_squared_error(Tree_search.best_estimator_.predict(X_test), y_test)
print('RMSE Tree-model:\n',
      Tree_RMSE_train, '(train)\n',
      Tree_RMSE_test, '(test)\n')

# CV error with RMSE

SV_cv_error = np.sqrt(-SV_search.cv_results_['mean_test_score'].mean())
print('SVR CV error:', SV_cv_error)

KN_cv_error = np.sqrt(-KN_search.cv_results_['mean_test_score'].mean())
print('KNN CV error:', KN_cv_error)

Tree_cv_error = np.sqrt(-Tree_search.cv_results_['mean_test_score'].mean())
print('Tree CV error:', Tree_cv_error)


RMSE SVR-model:
 20.75485493513342 (train)
 18.287506745494547 (test)

RMSE KNN-model:
 7.7494878738374355 (train)
 13.166229371402794 (test)

RMSE Tree-model:
 17.826492965720153 (train)
 21.338600116751042 (test)

SVR CV error: 221.86378791838032
KNN CV error: 21.294535246390435
Tree CV error: 25.256172683292885


In [79]:
# BEST MODEL PREDICTIONS for TEST_INPUT

model = KN_search.best_estimator_

prediction = pd.DataFrame(model.predict(final_test_df))
# prediction.to_csv("AndresJavierPablo.csv" , sep=" ", header=False, index=False)