# Regression Problem - Example
In this script we demonstrate what a Machine Learning workflow can look like when you use cross validation on the train set to choose a model and then evaluate the chosen model's generalization error on the test set.

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.datasets import load_diabetes

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

# Loading Data

In [3]:
# Loaded here only so that the dataset description (data.DESCR) can be printed
# conveniently in the next cell; the modelling below loads X and y separately.
data = load_diabetes()

In [None]:
# Print the description text that ships with the dataset.
print(data.DESCR)

## Storing/Loading the data in the way it will be used

In [5]:
# Load the features (X) and the target (y) as separate pandas objects
# (return_X_y=True splits them, as_frame=True requests pandas containers).
X, y = load_diabetes(return_X_y=True, as_frame=True)

In [None]:
# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each summary.
X.info()
print()
y.info()

In [7]:
# Hold out 20% of the rows as a test set (test_size=0.2). The fixed random_state
# makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# EDA

In [None]:
# Combine the features and the target in one frame ("df") so that a single
# correlation heatmap covers every pairwise relationship, target included.
df = X.assign(target=y)

correlation_matrix = df.corr()
sns.heatmap(data=correlation_matrix, annot=True)

In [None]:
# Intuitively (and judging from the correlation plot), there should be a positive
# relationship between the target (disease progression) and BMI.
plt.scatter(x=df.bmi, y=df.target)
plt.xlabel("bmi")
plt.ylabel("Disease Progression")

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Preview the first values of the training target.
y_train.head()

In [None]:
# Summary statistics of the training target.
y_train.describe()

# Creating 3 different models and choosing the best one with Cross Validation. 

In [None]:
lin_reg = LinearRegression()

# Scikit-learn scorers follow the convention "higher is better". A low MSE is
# good, so the scorer reports the *negative* MSE; the sign is flipped back
# below before taking the square root.
scores_lr = cross_validate(
    estimator=lin_reg,
    X=X_train,
    y=y_train,
    cv=3,
    scoring='neg_mean_squared_error',
)["test_score"]
print('RMSE for each iteration:', np.sqrt(np.negative(scores_lr)))
print('RMSE:', np.sqrt((-scores_lr).mean()))

In [None]:
lasso = Lasso()
hyper_param_lasso = {'alpha': (0.01, 1, 2, 5, 10)}
# Pick alpha with the same metric that is used to compare the models below.
# Without an explicit `scoring`, GridSearchCV falls back to the estimator's
# default score (R^2 for regressors), which is not the metric we report.
lasso_reg = GridSearchCV(lasso, hyper_param_lasso, cv=5, scoring='neg_mean_squared_error')
# This fit only serves to show which alpha wins on the full training set.
lasso_reg.fit(X_train, y_train)

print(lasso_reg.best_params_)

# cross_validate works on clones of lasso_reg, so the fit above is not reused:
# the grid search is re-run inside each of the 3 outer folds (nested cross
# validation), which keeps the hyperparameter choice out of the validation data.
scores_lasso = cross_validate(lasso_reg, X_train, y_train, cv=3, scoring='neg_mean_squared_error')["test_score"]
print('RMSE for each iteration:', np.sqrt(-scores_lasso))
print('RMSE:', np.sqrt(np.mean(-scores_lasso)))

In [None]:
# A fixed random_state makes the forest (and therefore the model comparison
# below) reproducible between runs; 42 matches the seed used for the data split.
forest = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [10, 100, 120],
    'max_depth': [None, 10],
}
# Pick the hyperparameters with the same metric that is used to compare the
# models. Without an explicit `scoring`, GridSearchCV falls back to the
# estimator's default score (R^2 for regressors).
forest_reg = GridSearchCV(forest, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
# This fit only serves to show which combination wins on the full training set.
forest_reg.fit(X_train, y_train)

print(forest_reg.best_params_)

# cross_validate works on clones of forest_reg, so the fit above is not reused:
# the grid search is re-run inside each of the 3 outer folds (nested cross
# validation).
scores_forest = cross_validate(forest_reg, X_train, y_train, cv=3, scoring='neg_mean_squared_error')["test_score"]
print('RMSE for each iteration:', np.sqrt(-scores_forest))
print('RMSE:', np.sqrt(np.mean(-scores_forest)))

The Linear Regression model had the lowest RMSE, and hence we choose it as our model.

In [None]:
# Fit the selected model on the complete training set before evaluating it on
# the test set.
lin_reg.fit(X_train, y_train)

# Evaluating the chosen model on the test set

In [None]:
# Mean of the test targets. The test RMSE computed further down is put in
# relation to this value to judge how large the error is.
print(y_test.mean())

In [None]:
# Box plot of the test targets to see how they are spread.
y_test.plot.box()

In [None]:
# Predict on the held-out test set and measure the generalization error as RMSE.
y_test_pred_lr = lin_reg.predict(X_test)
RMSE_test_data = root_mean_squared_error(
    y_true=y_test,
    y_pred=y_test_pred_lr,
)
print(RMSE_test_data)

In [None]:
# Relative error: the test RMSE divided by the mean of the test targets
# (about 37% according to the original run).
RMSE_test_data / y_test.mean()