# Języki Programowania Python i R


## dr inż. Patryk Jasik
### Division of Theoretical Physics and Quantum Information
### Institute of Physics and Computer Science
### Faculty of Applied Physics and Mathematics
### Gdansk University of Technology

# scikit-learn docs
## https://scikit-learn.org/stable/

In [2]:
#%config Completer.use_jedi = False

**Regression** is an approach for modelling the relationship between a scalar response (the dependent variable) and one or more explanatory variables (the independent variables).

2D problem\
$$
y_i = a*x_i + b
$$


Multidimensional problem\
$$
y_i = a_1*x_{i1} + a_2*x_{i2} + \dots + a_p*x_{ip} + \text{intercept}
$$

In [3]:
#loading the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn.model_selection import KFold

In [4]:
# measurements of physical and chemical properties of Portuguese Vinho Verde wines (white and red);
# only the white-wine file is loaded here
white_wine = pd.read_csv("data/white_wine.csv")
white_wine.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,response
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,4
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,4
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,4
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,4
4,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,4


### goal - we will check whether alcohol is a function of the remaining 10 variables and what the relationship is.
### Thanks to this, we will be able to explain which set of factors determines a given alcohol content, as well as predict the alcohol content of a newly produced batch of wine.

In [5]:
white_wine.describe()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,response
count,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0
mean,6.839346,0.280538,0.334332,5.914819,0.045905,34.889169,137.193512,0.99379,3.195458,0.490351,10.589358,3.854835
std,0.86686,0.103437,0.122446,4.861646,0.023103,17.210021,43.129065,0.002905,0.151546,0.113523,1.217076,0.890683
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,1.0
25%,6.3,0.21,0.27,1.6,0.035,23.0,106.0,0.99162,3.09,0.41,9.5,3.0
50%,6.8,0.26,0.32,4.7,0.042,33.0,133.0,0.9935,3.18,0.48,10.4,4.0
75%,7.3,0.33,0.39,8.9,0.05,45.0,166.0,0.99571,3.29,0.55,11.4,4.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,7.0


In [6]:
white_wine.columns

Index(['fixed.acidity', 'volatile.acidity', 'citric.acid', 'residual.sugar',
       'chlorides', 'free.sulfur.dioxide', 'total.sulfur.dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'response'],
      dtype='object')

In [7]:
# predictors: every column except the last two (`alcohol` and `response`)
X = white_wine.iloc[:, :-2]
X.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4
4,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47


In [8]:
# the target variable: the second-to-last column, i.e. `alcohol`
y = white_wine.iloc[:, -2]
y.head(10)

0     8.8
1     9.5
2    10.1
3     9.9
4     9.6
5    11.0
6    12.0
7     9.7
8    10.8
9    12.4
Name: alcohol, dtype: float64

In [9]:
y.tail(10)

3951     9.2
3952     9.4
3953    11.8
3954    10.6
3955     9.7
3956    11.2
3957     9.6
3958     9.4
3959    12.8
3960    11.8
Name: alcohol, dtype: float64

In [10]:
#we will create a function that fits the linear regression model to a given sample
#and computes errors of prediction
def fit_regression(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and score it on both splits.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : array-like
        Predictors of the training and of the test split.
    y_train, y_test : array-like
        Target values of the training and of the test split (single target).

    Returns
    -------
    dict
        R^2 (``r_score_*``), root mean squared error (``RMSE_*``) and mean
        absolute error (``MAE_*``), each for the training (``*_tr``) and the
        test (``*_te``) split.
    """
    # Local import: a bare `import sklearn` does not by itself guarantee that
    # the `sklearn.metrics` submodule has been loaded.
    from sklearn import metrics

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    def rmse(y_true, y_pred):
        # sqrt(MSE) works on every scikit-learn version; the `squared=False`
        # keyword was deprecated in 1.4 and removed in 1.6.
        return np.sqrt(metrics.mean_squared_error(y_true, y_pred))

    return {
        "r_score_tr": metrics.r2_score(y_train, y_train_pred),
        "r_score_te": metrics.r2_score(y_test, y_test_pred),
        "RMSE_tr": rmse(y_train, y_train_pred),
        "RMSE_te": rmse(y_test, y_test_pred),
        "MAE_tr": metrics.mean_absolute_error(y_train, y_train_pred),
        "MAE_te": metrics.mean_absolute_error(y_test, y_test_pred)
    }

In [13]:
# load a previously saved results table (later cells add new rows to it);
# the first CSV column is used as the index
results_final = pd.read_csv("data/results_final.csv", index_col=0)

In [12]:
results_final

NameError: name 'results_final' is not defined

In [None]:
# load the version of the wine dataset from which outliers were removed (per the file name)
white_wine_wout_outl = pd.read_csv("data/white_wine_wout_outl.csv")

In [None]:
#the dataframe without outliers
white_wine_wout_outl

In [None]:
# now, we will create the new reg model based on dataset without outliers
# predictors: every column except the last two
# (assumes the same column layout as `white_wine`, i.e. `alcohol` and `response` last — TODO confirm)
X_wout_out = white_wine_wout_outl.iloc[:,:-2]

In [None]:
X_wout_out

In [None]:
# target: the second-to-last column (presumably `alcohol`, as in `white_wine` — verify)
y_wout_out = white_wine_wout_outl.iloc[:,-2]

In [None]:
y_wout_out

In [None]:
# 80/20 train/test split of the outlier-free data, with a fixed seed
X_train_wo, X_test_wo, y_train_wo, y_test_wo = sklearn.model_selection.train_test_split(
    X_wout_out,
    y_wout_out,
    test_size=0.2,
    random_state=12345,
)

## Standardization and normalization of data

In [None]:
# we will work with dataset without outliers
X_wout_out.describe()

In [None]:
y_wout_out.describe()

In [None]:
# let's calculate the mean value of every predictor column
X_wout_out_mean = X_wout_out.mean()
X_wout_out_mean

In [None]:
# and the standard deviation of every predictor column
# (pandas computes the sample standard deviation, ddof=1, by default)
X_wout_out_std = X_wout_out.std()

In [None]:
# standardization of the predictors: subtract the column mean, divide by the column std
# NOTE(review): mean and std come from the full dataset, i.e. before any train/test split,
# so test rows influence the scaling — consider computing them on the training split only
X_wo_std = (X_wout_out - X_wout_out_mean)/X_wout_out_std

In [None]:
X_wo_std.describe()

In [None]:
# and the same we will do with target variable: its mean and standard deviation
y_wo_m = y_wout_out.mean()
y_wo_sd = y_wout_out.std()

In [None]:
# standardized target: (value - mean) / std
y_wo_std = (y_wout_out-y_wo_m)/y_wo_sd

In [None]:
y_wo_std.describe()

In [None]:
# standardized predictors, but the ORIGINAL (unstandardized) target `y_wout_out`
X_train_wo_std, X_test_wo_std, y_train_wo_std, y_test_wo_std = model_selection.train_test_split(
    X_wo_std,
    y_wout_out,
    test_size=0.2,
    random_state=12345,
)

In [None]:
# linear regression on the standardized predictors; one results row labelled by `params`
params = ["Lin. Reg. wout outl std"]
res = [
    fit_regression(
        linear_model.LinearRegression(),
        X_train_wo_std,
        X_test_wo_std,
        y_train_wo_std,
        y_test_wo_std,
    )
]
results = pd.DataFrame(res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row(s) the same way
results_final = pd.concat([results_final, results])

In [None]:
results_final

In [None]:
# the same model with standardized target variable
X_train_wo_std_y, X_test_wo_std_y, y_train_wo_std_y, y_test_wo_std_y = model_selection.train_test_split(
    X_wo_std,
    y_wo_std,
    test_size=0.2,
    random_state=12345,
)

In [None]:
# linear regression on standardized predictors and standardized target
params = ["Lin. Reg. wout outl std y"]
res = [
    fit_regression(
        sklearn.linear_model.LinearRegression(),
        X_train_wo_std_y,
        X_test_wo_std_y,
        y_train_wo_std_y,
        y_test_wo_std_y,
    )
]
results = pd.DataFrame(res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row(s) the same way
results_final = pd.concat([results_final, results])

In [None]:
results_final

In [None]:
# MinMax normalization of the dataset: (value - min) / (max - min), column by column
# NOTE(review): min and max are taken over the full dataset, i.e. before the train/test split
X_wo_norm = (X_wout_out - X_wout_out.min())/(X_wout_out.max() - X_wout_out.min())
y_wo_norm = (y_wout_out - y_wout_out.min())/(y_wout_out.max() - y_wout_out.min())

In [None]:
X_wo_norm.describe()

In [None]:
# normalized predictors, but the ORIGINAL (unnormalized) target `y_wout_out`
X_train_wo_norm, X_test_wo_norm, y_train_wo_norm, y_test_wo_norm = model_selection.train_test_split(
    X_wo_norm,
    y_wout_out,
    test_size=0.2,
    random_state=12345,
)

In [None]:
# linear regression on the min-max normalized predictors
params = ["Lin. Reg. wout outl norm"]
res = [
    fit_regression(
        sklearn.linear_model.LinearRegression(),
        X_train_wo_norm,
        X_test_wo_norm,
        y_train_wo_norm,
        y_test_wo_norm,
    )
]
results = pd.DataFrame(res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row(s) the same way
results_final = pd.concat([results_final, results])

In [None]:
results_final

In [None]:
# the same model with normalized target variable
# THE BIG TRAP!!!
X_train_wo_norm_y, X_test_wo_norm_y, y_train_wo_norm_y, y_test_wo_norm_y = model_selection.train_test_split(
    X_wo_norm,
    y_wo_norm,
    test_size=0.2,
    random_state=12345,
)

In [None]:
# linear regression on normalized predictors and normalized target
params = ["Lin. Reg. wout outl norm y"]
res = [
    fit_regression(
        sklearn.linear_model.LinearRegression(),
        X_train_wo_norm_y,
        X_test_wo_norm_y,
        y_train_wo_norm_y,
        y_test_wo_norm_y,
    )
]
results = pd.DataFrame(res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row(s) the same way
results_final = pd.concat([results_final, results])

In [None]:
results_final

## Reduction of the problem dimension

In [None]:
# let's have a look at variable importance in the dataset without outliers:
# the model is fitted on the standardized predictors and target (all rows, no split)
lm_wo_std = linear_model.LinearRegression()
lm_wo_std.fit(X_wo_std, y_wo_std)

In [None]:
# absolute coefficient of every predictor, rounded to 4 decimals, largest first
(
    pd.Series(np.abs(lm_wo_std.coef_), index=X_wo_std.columns.to_list())
    .round(4)
    .sort_values(ascending=False)
)

In [None]:
# chlorides is the least important feature, so we will remove it
# (`drop` already returns a new frame; the extra `.copy()` is redundant but harmless)
X_wout_out_chl = X_wout_out.drop(columns=['chlorides']).copy()

In [None]:
X_wout_out_chl

In [None]:
# same split settings as before, on the predictors without `chlorides`
X_train_wo_chl, X_test_wo_chl, y_train_wo_chl, y_test_wo_chl = sklearn.model_selection.train_test_split(
    X_wout_out_chl,
    y_wout_out,
    test_size=0.2,
    random_state=12345,
)

In [None]:
# linear regression without the `chlorides` predictor
params = ["Lin. Reg. wout outl chl"]
res = [
    fit_regression(
        sklearn.linear_model.LinearRegression(),
        X_train_wo_chl,
        X_test_wo_chl,
        y_train_wo_chl,
        y_test_wo_chl,
    )
]
results = pd.DataFrame(res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row(s) the same way
results_final = pd.concat([results_final, results])

In [None]:
results_final

In [None]:
# the residual.sugar and density are strongly correlated
# but the correlation of the residual.sugar with the target value (alcohol) is smaller
# so let's remove the residual.sugar

In [None]:
# predictors without the `residual.sugar` column
X_wout_out_res = X_wout_out.drop(columns=['residual.sugar']).copy()

In [None]:
X_wout_out_res

In [None]:
# same split settings as before, on the predictors without `residual.sugar`
X_train_wo_res, X_test_wo_res, y_train_wo_res, y_test_wo_res = sklearn.model_selection.train_test_split(
    X_wout_out_res,
    y_wout_out,
    test_size=0.2,
    random_state=12345,
)

In [None]:
# linear regression without the `residual.sugar` predictor
params = ["Lin. Reg. wout outl res"]
res = [
    fit_regression(
        sklearn.linear_model.LinearRegression(),
        X_train_wo_res,
        X_test_wo_res,
        y_train_wo_res,
        y_test_wo_res,
    )
]
results = pd.DataFrame(res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row(s) the same way
results_final = pd.concat([results_final, results])

In [None]:
# ANOTHER BIG TRAP!!!
results_final

## The regression model based on polynomials

In [None]:
# we use the PolynomialFeatures transformer with degree 2
# to generate new features that are the squares and pairwise products of the base features,
# for example [x1,x2,x3] -> [x1, x2, x3, x1^2, x1x2, x1x3, x2^2, x2x3, x3^2]

import sklearn.preprocessing
polynomial2_feature = sklearn.preprocessing.PolynomialFeatures(degree=2, include_bias=False)
polynomial2_feature.fit_transform(np.array([[2,3,5],[1,2,3]]))

In [None]:
# we can check the powers of individual variables: after transposing, every column
# describes one generated feature and every row one input variable
polynomial2_feature.powers_.T

In [None]:
# we build a polynomial model transforming the training dataset of predictors X_train_wo
# and the test dataset of predictors X_test_wo
polynomial2 = sklearn.preprocessing.PolynomialFeatures(degree=2, include_bias=False)
X2_wo_train = polynomial2.fit_transform(X_train_wo)
# the transformer is fitted on the training data only; the test data is only transformed
X2_wo_test = polynomial2.transform(X_test_wo)

In [None]:
#now we have 65 columns
X2_wo_train.shape

In [None]:
#we check the new model
params = ["Lin. Reg. wout outl Poly"]
res = [
    fit_regression(
        sklearn.linear_model.LinearRegression(),
        X2_wo_train,
        X2_wo_test,
        y_train_wo,
        y_test_wo,
    )
]
results = pd.DataFrame(res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row(s) the same way
results_final = pd.concat([results_final, results])

In [None]:
results_final

#### we obtained smaller prediction errors, but the number of model parameters increased significantly

#### we are looking for a balance between the complexity of the model and its quality

In [None]:
# the choice of variables for the model can be made using the Schwarz criterion (BIC - Bayesian Information Criterion)
# we choose a regression model that minimizes the function
# BIC (MSE_p, p, n) = n * log (MSE_p) + p * log (n)
# MSE_p is calculated for the model built on the basis of p <= d variables
# p * log (n) is a penalty for the complexity of the model

In [None]:
def BIC(mse, p, n):
    """Schwarz criterion: n*log(mse) + p*log(n).

    `mse` is the mean squared error of the model, `p` the number of model
    parameters and `n` the number of observations; the second term is the
    penalty for model complexity.
    """
    goodness_of_fit = n * np.log(mse)
    complexity_penalty = p * np.log(n)
    return goodness_of_fit + complexity_penalty

In [None]:
# the number of all possible cases to be considered is in the order of 2^d
# 1. we start with the empty model, BIC = +infinity
# 2. we extend the model with the variable for which BIC is the smallest and
# simultaneously decreases the current BIC value
# if there is no such value, we return the current model
# 3. we repeat step 2 until no unused variables are left

In [None]:
def forward_selection(X, y):
    """Greedy forward selection of the columns of `X` by minimal BIC.

    Starting from the empty model, every round fits one linear regression per
    still-unused column (added to the columns chosen so far) and keeps the
    column that gives the smallest BIC. The search stops as soon as no
    candidate lowers the best BIC found so far, or when all columns are used.

    Parameters
    ----------
    X : array-like of shape (n, m)
        Predictors; NumPy arrays and pandas DataFrames are both accepted.
    y : array-like of length n
        Target values.

    Returns
    -------
    list of (int, float)
        ``(column index, BIC after adding that column)`` in selection order.
    """
    # positional slicing `X[:, idx]` below needs an ndarray (it fails on a DataFrame)
    X = np.asarray(X)
    n, m = X.shape
    best_idx = []
    best_free = set(range(m))
    best_fit = np.inf
    res = []

    for i in range(0, m):
        cur_idx = -1
        cur_fit = np.inf
        for e in best_free:
            r = sklearn.linear_model.LinearRegression()
            test_idx = best_idx + [e]
            X_sub = X[:, test_idx]
            r.fit(X_sub, y)
            # i+2 parameters: the i+1 columns in the candidate model plus the intercept
            test_fit = BIC(sklearn.metrics.mean_squared_error(y, r.predict(X_sub)), i+2, n)
            if test_fit < cur_fit: cur_idx, cur_fit = e, test_fit
        # Stop unless the best candidate strictly lowers the BIC. This also
        # covers the case where no candidate was picked at all (cur_idx == -1,
        # e.g. every BIC was NaN), so -1 can never end up in the result.
        if cur_fit >= best_fit: break

        best_idx, best_fit = best_idx + [cur_idx], cur_fit
        best_free.discard(cur_idx)
        res.append((cur_idx, cur_fit))
    return res

In [None]:
#we apply the variable selection algorithm to the polynomial transformed dataset

chosen_df = pd.DataFrame(forward_selection(X2_wo_train, y_train_wo), columns=["variable", "BIC"])

# column indices into the polynomial feature matrix, in the order they were selected
chosen_variables = chosen_df["variable"].tolist()

# Readable name of every selected polynomial term, built from its row `w` of
# `polynomial2.powers_`: each base feature with power >= 1 is listed once and
# features with power 2 are listed a second time, so a square reads e.g. "pH*pH".
chosen_df["name_of_variable"] =\
[X_wout_out.columns[w>=1].append(X_wout_out.columns[w==2]).str.cat(sep="*") for w in polynomial2.powers_[chosen_variables]]

chosen_df

In [None]:
chosen_variables

In [None]:
#and the results
params = ["Lin. Reg. BIC"]

# only the polynomial features picked by the forward selection are used
res = [
    fit_regression(
        sklearn.linear_model.LinearRegression(),
        X2_wo_train[:, chosen_variables],
        X2_wo_test[:, chosen_variables],
        y_train_wo,
        y_test_wo,
    )
]

results = pd.DataFrame(res, index=params)


In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row(s) the same way
results_final = pd.concat([results_final, results])

In [None]:
results_final

## Please check the BIC chosen features with cross validation

In [None]:
# and summary with some figures
plt.figure(figsize=(12,6))

# Plot against integer positions so that the ticks set below always line up
# with the rows, even when `results_final` contains repeated model names.
model_pos = np.arange(len(results_final))
plt.plot(model_pos, results_final['RMSE_tr'].to_numpy(), label='training')
plt.plot(model_pos, results_final['RMSE_te'].to_numpy(), label='test')
plt.ylabel('RMSE')
plt.legend()
plt.xticks(model_pos, results_final.index.tolist(), rotation=75)

plt.show()

In [None]:
plt.figure(figsize=(12,6))

# Plot against integer positions so that the ticks set below always line up
# with the rows, even when `results_final` contains repeated model names.
model_pos = np.arange(len(results_final))
plt.plot(model_pos, results_final['MAE_tr'].to_numpy(), label='training')
plt.plot(model_pos, results_final['MAE_te'].to_numpy(), label='test')
plt.ylabel('MAE')
plt.legend()
plt.xticks(model_pos, results_final.index.tolist(), rotation=75)

plt.show()

In [None]:
plt.figure(figsize=(12,6))

# Plot against integer positions so that the ticks set below always line up
# with the rows, even when `results_final` contains repeated model names.
model_pos = np.arange(len(results_final))
plt.plot(model_pos, results_final['r_score_tr'].to_numpy(), label='training')
plt.plot(model_pos, results_final['r_score_te'].to_numpy(), label='test')
plt.ylabel('R^2')
plt.legend()
plt.xticks(model_pos, results_final.index.tolist(), rotation=75)

plt.show()

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
# cross validation for several models based on dataset without outliers
# All estimators use their default hyper-parameters. The random forest is seeded
# (same seed as the train/test splits) so that re-running gives the same scores.
methods = pd.Series({
    "LinearRegression": sklearn.linear_model.LinearRegression(),
    "ElasticNet": sklearn.linear_model.ElasticNet(),
    "Ridge": sklearn.linear_model.Ridge(),
    "KNN": KNeighborsRegressor(),
    "GP": GaussianProcessRegressor(),
    "RF": RandomForestRegressor(random_state=12345),
    "SVR": SVR()
})

#evaluation function
def eval_function(X_train, X_test, y_train, y_test):
    """Score every estimator of the global `methods` on one train/test split.

    Returns a DataFrame with one row per entry of `methods` (indexed like
    `methods`) and one column per metric returned by `fit_regression`.
    """
    per_model_scores = []
    for alg in methods:
        scores = fit_regression(alg, X_train, X_test, y_train, y_test)
        per_model_scores.append(pd.Series(scores))

    # one column per model -> transpose so that every model becomes a row
    cv_models = pd.concat(per_model_scores, axis=1).T
    cv_models.index = methods.index
    return cv_models

# Single source of truth for the number of folds: it drives both the splitter
# and the averaging below, so the two can never get out of sync.
n_folds = 5
# NOTE(review): KFold does not shuffle by default, so every fold is a contiguous
# block of rows; if the rows of the CSV are ordered, consider
# KFold(n_splits=n_folds, shuffle=True, random_state=12345).
kf = KFold(n_splits=n_folds)

# one table of scores (models x metrics) per fold
results_cv = [eval_function(X_wout_out.iloc[train,:],
                            X_wout_out.iloc[test,:],
                            y_wout_out.iloc[train],
                            y_wout_out.iloc[test]) for train, test in kf.split(X_wout_out)]

# mean of every metric over the folds
sum(results_cv)/n_folds

# :) We have to discuss these results.

In [None]:
# the same cross validation on `X` / `y` taken from the full `white_wine` frame;
# reuses `kf` and `n_folds` defined in the previous cell
results_cv = [
    eval_function(
        X.iloc[train, :],
        X.iloc[test, :],
        y.iloc[train],
        y.iloc[test],
    )
    for train, test in kf.split(X)
]

sum(results_cv)/n_folds