# Hyperparameter Tuning Pipeline
### This pipeline enumerates the candidate preprocessing steps and models (the hyperparameters) and cross-validates every combination

## Imports

In [None]:
# General
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ML
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LassoLarsCV, LassoCV, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

#from sklearn.preprocessing import StandardScaler

# Custom
import sys,os
# Put the project root on sys.path so the `Components` package below can be imported.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# import `Components` on another machine unless the same directory layout exists.
sys.path.append('/home/jovyan/work/AML/Task_1/Solution') # I would like a cleaner solution but works for now
import Components.Imputation as Imputation
import Components.Outlier_Detection_1D as Outlier_Detection_1D
import Components.Outlier_Detection_ND as Outlier_Detection_ND
import Components.Feature_Selection as Feature_Selection
import Components.Normalisation as Normalisation
import Components.data_fetching as data_fetching

# CAREFUL:
# Edits to a custom module are only picked up after the module is reloaded,
# i.e. after rerunning this cell. The reloads below do that for every module
# imported above.
import importlib
importlib.reload(Imputation)
importlib.reload(Outlier_Detection_1D)
importlib.reload(Outlier_Detection_ND)
importlib.reload(Feature_Selection)
importlib.reload(Normalisation)
importlib.reload(data_fetching)

<module 'Components.data_fetching' from '/home/jovyan/work/AML/Task_1/Solution/Components/data_fetching.py'>

## Data Cleaning

### Data import

In [None]:
# Load the training features/targets and the test features through the project's
# data_fetching helper. Later cells call pandas DataFrame methods on x_train
# (.isnull(), .mask()), so it is presumably a DataFrame; TODO confirm the return
# types of both helpers.
x_train, y_train = data_fetching.get_train_data()
x_test = data_fetching.get_test_data()

### First (simple) imputation

In [None]:
print("Missing values before simple median imputation: ")
print(x_train.isnull().sum().sum())
# Capture where values are missing BEFORE imputing: the tuning loop passes this mask
# to the outlier step and uses the returned mask to blank these cells again so the
# real imputation algorithms can fill them.
# NOTE(review): this cell is not idempotent. x_train is overwritten below, so
# re-running the cell presumably recomputes the mask on already-imputed data and
# loses the original missing-value positions. Re-run the data import cell first.
missing_values = Imputation.missing_values_mask(x_train)

# NOTE(review): x_test is passed in but only x_train is reassigned; whether x_test
# is imputed (e.g. in place) is not visible from here. TODO confirm.
x_train = Imputation.median(x_train, x_test)

print("Missing values after simple median imputation: ")
print(x_train.isnull().sum().sum())

Missing values before simple median imputation: 
76910
Missing values after simple median imputation: 
0


## BIG LOOP

In [None]:
# Search space for the tuning loop: one list of candidates per pipeline stage.

# Normalisation functions. TODO: implement yeo-johnson.
normalization_algos = [
    Normalisation.gaussian,
]

# One-dimensional outlier detection functions.
one_dim_outlier_algos = [
    Outlier_Detection_1D.z_score,
]

# Values tried for the n_outliers argument of the multi-dimensional outlier step.
magic_indices_outlier_numbers = list(range(0, 100, 25))

# Imputation functions.
impute_algos = [
    Imputation.mean,
    Imputation.iterative_regression2,
]

# Feature-selection strategies, referenced by name. TODO: implement correlation.
feature_selections = [
    'LassoLarsCV',
    'RFE',
]

# Regressors, referenced by name. TODO: decide what other models to test.
models = [
    'xgbr',
    'GradientBoostingRegressor',
    'LassoCV',
]

In [None]:
def _algo_name(algo):
    """Return a stable, JSON-friendly label for a callable or a plain string.

    Functions are labelled by their ``__name__`` (``str(function)`` would embed a
    memory address that changes on every kernel restart); anything without a
    ``__name__`` falls back to ``str``.
    """
    return getattr(algo, '__name__', str(algo))


def make_feature_selector(name):
    """Build a fresh, unfitted feature-selection step for the given strategy name.

    Raises:
        ValueError: if ``name`` is not a known strategy. Without this check an
            unknown name would silently reuse the pipeline of a previous iteration.
    """
    if name == 'LassoLarsCV':
        return SelectFromModel(LassoLarsCV())
    if name == 'RFE':
        # RFE is a selector in its own right (it implements transform), so it is
        # used directly as the pipeline step instead of being wrapped in
        # SelectFromModel, which needs an estimator exposing coef_ or
        # feature_importances_.
        return RFE(Ridge(), n_features_to_select=50)
    raise ValueError('Unknown feature selection: {!r}'.format(name))


def make_regressor(name):
    """Build a fresh, unfitted regressor for the given model name.

    Raises:
        ValueError: if ``name`` is not a known model.
    """
    if name == 'GradientBoostingRegressor':
        # Seeded so repeated runs are comparable (LassoCV below is seeded too).
        return GradientBoostingRegressor(random_state=0)
    if name == 'LassoCV':
        # dont know if alpha needs more fitting
        return LassoCV(random_state=0)
    if name == 'xgbr':
        return xgb.XGBRegressor(max_depth=3, n_estimators=75, learning_rate=0.05,
                                alpha=1, objective='reg:squarederror')
    raise ValueError('Unknown model: {!r}'.format(name))


def build_pipeline(feature_selection, model):
    """Combine the named feature selector and regressor into one sklearn Pipeline."""
    return Pipeline([
        ('feature_selection', make_feature_selector(feature_selection)),
        ('regressor', make_regressor(model)),
    ])


all_scores = list()
all_scores_mean = list()
config = list()
# need to rename the x train to get a consistent loop
for norm in normalization_algos:
    ### normalization ###
    print('normalisation')
    print(norm)
    x_train_norm = norm(x_train)
    # NOTE(review): train and test are normalised by two independent calls; confirm
    # that the normalisation function does not need statistics fitted on the training set.
    x_test_norm = norm(x_test)
    for one_dim_outlier_alg in one_dim_outlier_algos:
        ### 1d outlier ###
        print('1d outlier')
        print(one_dim_outlier_alg)
        x_train_1d = one_dim_outlier_alg(x_train_norm)
        for magic_indices_outlier_number in magic_indices_outlier_numbers:
            ### md outlier ###
            print('md outlier n outliers')
            print(magic_indices_outlier_number)
            x_train_md, y_train_md, missing_values_md = Outlier_Detection_ND.magic_indices(
                x_train_1d, y_train, n_outliers=magic_indices_outlier_number, mask=missing_values)
            for impute_alg in impute_algos:
                ### imputation ###
                print('imputation')
                print(impute_alg)
                # Blank the originally-missing cells again so the imputation
                # algorithm under test fills them instead of keeping the median values.
                x_train_impute = x_train_md.mask(missing_values_md)
                x_train_impute, x_test_impute = impute_alg(x_train_impute, x_test_norm)
                for feature_selection in feature_selections:
                    for model in models:
                        print('pipeline')
                        print(feature_selection)
                        print(model)
                        ### pipeline ###
                        pipe = build_pipeline(feature_selection, model)

                        scores = cross_val_score(pipe, x_train_impute, np.ravel(y_train_md), cv=5, scoring='r2')
                        mean = np.mean(scores)
                        print(scores)
                        print(mean)
                        all_scores.append(scores)
                        all_scores_mean.append(mean)
                        config.append({
                            'normalization' : _algo_name(norm),
                            'one_dim_outlier' : _algo_name(one_dim_outlier_alg),
                            'magic_indices_n_outlier' : magic_indices_outlier_number,
                            'imputation' : _algo_name(impute_alg),
                            'feature_selection' : str(feature_selection),
                            'model' : str(model),
                            # numpy arrays are not JSON serialisable; store plain Python values
                            'scores' : scores.tolist(),
                            'mean_score' : float(mean)
                        })
                        # Rewrite the results file after every configuration so an
                        # interrupted run still leaves all finished results on disk.
                        with open('../../Predictions/hyperparameter_tuning.json', 'w') as fout:
                            json.dump(config, fout)




normalisation
<function gaussian at 0x7f7163634bf8>
1d outlier
<function z_score at 0x7f715d360950>
md outlier n outliers
10
imputation
<function knn2 at 0x7f715d360a60>
pipeline
<class 'sklearn.linear_model._least_angle.LassoLarsCV'>
<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>
[0.62984637 0.56484081 0.51662229 0.52947056 0.61813774]
0.5717835561408494
pipeline
<class 'sklearn.linear_model._least_angle.LassoLarsCV'>
<class 'sklearn.linear_model._coordinate_descent.Lasso'>


[0.52484922 0.49012091 0.40378024 0.48554683 0.53890757]
0.4886409504382453
imputation
<function iterative_regression2 at 0x7f7163634400>


KeyboardInterrupt: 

## Final Prediction

In [None]:
# Fit on the same preprocessed data the pipeline was cross-validated on, and predict
# on the test set that went through the matching normalisation + imputation.
# (Fitting on x_train and predicting on the raw x_test would feed the model
# differently scaled training and test data.)
# NOTE(review): pipe, x_train_impute, y_train_md and x_test_impute are whatever the
# tuning loop left behind, i.e. the LAST configuration it visited, not necessarily
# the best-scoring one. If the loop was interrupted they may not even belong to the
# same configuration. Re-create the chosen configuration explicitly before submitting.
pipe.fit(x_train_impute, np.ravel(y_train_md))

# Note: They said no outliers were introduced in x_test so no need to perform outlier detection
y_pred = pipe.predict(x_test_impute)

# Sanity check: distribution of the predicted targets.
fig, ax = plt.subplots()
ax.hist(y_pred)
ax.set(title='Distribution of predictions', xlabel='predicted y', ylabel='count');

y_pred_pd = pd.DataFrame(data=y_pred, columns=["y"])
y_pred_pd.to_csv('../../Predictions/TODO_Give_new_name.csv', index_label='id')