# Template Pipeline
Normalisation&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
**-**  
Imputation 1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
**Median**  
Outlier Detection&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
**Z Score**  
Imputation 2&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
**Median**  
Feature Selection&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
**Implicit to model**  
Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
**Lasso Regression**


## Imports

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [13]:
# General
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# ML
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFE
import xgboost as xgb

# Custom
import sys,os
# %cd /content/gdrive/My Drive/ETHZ/Autumn2020/AML/Git/AML/Task_1/Solution
# sys.path.append('/content/gdrive/My Drive/ETHZ/Autumn2020/AML/Git/AML/Task_1/Solution')
sys.path.append('/home/jovyan/work/AML/Task_1/Solution') # I would like a cleaner solution but works for now
# import Components.Imputation as Imputation
# import Components.Outlier_Detection_1D as Outlier_Detection_1D
# import Components.Outlier_Detection_ND as Outlier_Detection_ND
# import Components.Feature_Selection as Feature_Selection
# import Components.Normalisation as Normalisation
import Components.data_fetching as data_fetching

# # CAREFUL:
# # If you make changes to a custom module, you have to reload it, i.e rerun this cell
import importlib
# importlib.reload(Imputation)
# importlib.reload(Outlier_Detection_1D)
# importlib.reload(Outlier_Detection_ND)
# importlib.reload(Feature_Selection)
# importlib.reload(Normalisation)
importlib.reload(data_fetching)

<module 'Components.data_fetching' from '/home/jovyan/work/AML/Task_1/Solution/Components/data_fetching.py'>

## Pipeline Optimisatoin

In [10]:
X,y = data_fetching.get_train_data()
x_test_final = data_fetching.get_test_data()

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [14]:
nrm = QuantileTransformer(output_distribution='normal')
imp = IterativeImputer(missing_values=np.nan, max_iter=10, initial_strategy='median' ,random_state=0)
rfe = RFE(Ridge())
boost = xgb.XGBRegressor()

pipe = Pipeline(steps=[('normalise', nrm),
                        ('impute', imp),
                        ('feature_select', rfe),
                        ('regress', boost)])

# Normalisation
n_quants = [50,75,100]
# Imputer
n_nearest_features = [5,10,15]
# Feature Selection
n_features = [30,50,70]
# Model
# max_depth = [2,3,4]
learning_rate = [0.01, 0.05, 0.1]
n_estimators = [100, 150, 200]

parameters = dict(normalise__n_quantiles=n_quants,
                    impute__n_nearest_features=n_nearest_features,
                    feature_select__n_features_to_select=n_features,
                    regress__learning_rate=learning_rate,
                    regress__n_estimators=n_estimators)

clf = GridSearchCV(pipe, parameters)
clf.fit(X, y)

# View The Best Parameters
print('n_quantiles=', clf.best_estimator_.get_params()['normalise__n_quantiles'])
print('n_nearest_features=', clf.best_estimator_.get_params()['impute__n_nearest_features'])
print('n_features_to_select=', clf.best_estimator_.get_params()['feature_select__n_features_to_select'])
print('learning_rate=', clf.best_estimator_.get_params()['regress__learning_rate'])
print('n_estimators=', clf.best_estimator_.get_params()['regress__n_estimators'])
best_n_quantiles= clf.best_estimator_.get_params()['normalise__n_quantiles']
best_n_nearest_features= clf.best_estimator_.get_params()['impute__n_nearest_features']
best_n_features_to_select= clf.best_estimator_.get_params()['feature_select__n_features_to_select']
best_learning_rate= clf.best_estimator_.get_params()['regress__learning_rate']
best_n_estimators= clf.best_estimator_.get_params()['regress__n_estimators']

KeyboardInterrupt: 

## Testing

In [None]:
kfold = KFold(n_splits=10)
results = cross_val_score(clf, x_train, y_train, cv=kfold)
print(results)
print(results.mean())

print('Train Score')
y_pred_train = clf.predict(x_train)
test_score = r2_score(y_train, y_pred_train)
print(test_score)

print('Test Score')
y_pred_test = clf.predict(x_test)
test_score = r2_score(y_test, y_pred_test)
print(test_score)

[0.52196672 0.52751118 0.52042321 0.44937897 0.45876363 0.58604072
 0.52967402 0.49833196 0.6046143  0.51960716]
0.521631187746819
Train Score
0.7553048082770955
Test Score
0.5988531138208393


## Final Prediction

In [None]:
pipe = Pipeline(steps=[('normalise', nrm),
                        ('impute', imp),
                        ('feature_select', rfe),
                        ('regress', boost)])

parameters = dict(normalise__n_quantiles=best_n_quantiles,
                    impute__n_nearest_features=best_n_nearest_features,
                    feature_select__n_features_to_select=best_n_features,
                    regress__learning_rate=best_learning_rate,
                    regress__n_estimators=best_n_estimators)

clf = GridSearchCV(pipe, parameters)
clf.fit(X, y)

y_pred = clf.predict(x_test_final)
plt.hist(y_pred)
print("Train Score:", r2_score(y, clf.predict(X)))

y_pred_pd = pd.DataFrame(data=y_pred, columns=["y"])
y_pred_pd.to_csv('../Predictions/XGBoost_Optimised.csv', index_label='id')