# Języki Programowania Python i R


## dr inż. Patryk Jasik
### Division of Theoretical Physics and Quantum Information
### Institute of Physics and Computer Science
### Faculty of Applied Physics and Mathematics
### Gdansk University of Technology

# scikit-learn docs
## https://scikit-learn.org/stable/

In [2]:
#%config Completer.use_jedi = False

**Regression** is an approach for modelling the relationship between a scalar response (the dependent variable) and one or more explanatory variables (the independent variables).

2D problem\
$$
y_i = a \cdot x_i + b
$$


Multidimensional problem\
$$
y_i = a_1 \cdot x_{i1} + a_2 \cdot x_{i2} + \dots + a_p \cdot x_{ip} + \text{intercept}
$$

In [3]:
#loading the necessary packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.metrics
from sklearn import linear_model
from sklearn import model_selection
from sklearn.model_selection import KFold

In [4]:
# Measurements of physical and chemical properties of Portuguese Vinho Verde wines.
# The data set covers white and red wines; only the white-wine file is loaded here.
white_wine = pd.read_csv("data/white_wine.csv")
white_wine.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,response
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,4
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,4
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,4
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,4
4,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,4


### Goal: we will check whether alcohol is a function of the remaining 10 variables and what the relationship is.
### Thanks to this, we will be able to explain which set of factors determines a given alcohol content, as well as predict the alcohol content of a newly produced batch of wine.

In [5]:
# summary statistics (count, mean, std, min, quartiles, max) of every column
white_wine.describe()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,response
count,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0,3961.0
mean,6.839346,0.280538,0.334332,5.914819,0.045905,34.889169,137.193512,0.99379,3.195458,0.490351,10.589358,3.854835
std,0.86686,0.103437,0.122446,4.861646,0.023103,17.210021,43.129065,0.002905,0.151546,0.113523,1.217076,0.890683
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,1.0
25%,6.3,0.21,0.27,1.6,0.035,23.0,106.0,0.99162,3.09,0.41,9.5,3.0
50%,6.8,0.26,0.32,4.7,0.042,33.0,133.0,0.9935,3.18,0.48,10.4,4.0
75%,7.3,0.33,0.39,8.9,0.05,45.0,166.0,0.99571,3.29,0.55,11.4,4.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,7.0


In [6]:
# column names; the slicing in the next cells relies on 'alcohol' and 'response' being the last two
white_wine.columns

Index(['fixed.acidity', 'volatile.acidity', 'citric.acid', 'residual.sugar',
       'chlorides', 'free.sulfur.dioxide', 'total.sulfur.dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'response'],
      dtype='object')

In [7]:
# predictors: every column except the last two ('alcohol' and 'response')
X = white_wine.iloc[:, :-2]
X.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4
4,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47


In [8]:
# the target variable: the second-to-last column, i.e. 'alcohol'
y = white_wine.iloc[:, -2]
y.head(10)

0     8.8
1     9.5
2    10.1
3     9.9
4     9.6
5    11.0
6    12.0
7     9.7
8    10.8
9    12.4
Name: alcohol, dtype: float64

In [9]:
# last 10 values of the target variable
y.tail(10)

3951     9.2
3952     9.4
3953    11.8
3954    10.6
3955     9.7
3956    11.2
3957     9.6
3958     9.4
3959    12.8
3960    11.8
Name: alcohol, dtype: float64

In [10]:
#we will create a function that fits the linear regression model to a given sample
#and computes errors of prediction
def fit_regression(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training sample and score it on both samples.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place on the training data.
    X_train, X_test : predictors of the training and test samples.
    y_train, y_test : targets of the training and test samples.

    Returns
    -------
    dict
        Keys ``r_score_tr``/``r_score_te`` (R^2), ``RMSE_tr``/``RMSE_te``
        (root mean squared error) and ``MAE_tr``/``MAE_te`` (mean absolute
        error); ``_tr`` is the training sample, ``_te`` the test sample.
    """
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    r2 = sklearn.metrics.r2_score
    mae = sklearn.metrics.mean_absolute_error

    # `mean_squared_error(..., squared=False)` is deprecated since
    # scikit-learn 1.4 and no longer accepted by 1.6; use the dedicated
    # RMSE function when it exists and keep the old call for older versions.
    if hasattr(sklearn.metrics, "root_mean_squared_error"):
        rmse = sklearn.metrics.root_mean_squared_error
    else:
        def rmse(y_true, y_pred):
            return sklearn.metrics.mean_squared_error(y_true, y_pred, squared=False)

    return {
        "r_score_tr": r2(y_train, y_train_pred),
        "r_score_te": r2(y_test, y_test_pred),
        "RMSE_tr": rmse(y_train, y_train_pred),
        "RMSE_te": rmse(y_test, y_test_pred),
        "MAE_tr": mae(y_train, y_train_pred),
        "MAE_te": mae(y_test, y_test_pred)
    }

In [13]:
# load the saved table of model metrics; the first CSV column (model labels) becomes the index
# NOTE(review): presumably produced by an earlier part of the course - confirm where the file comes from
results_final = pd.read_csv("data/results_final.csv", index_col=0)

In [14]:
# metrics of the models collected so far (one row per model)
results_final

Unnamed: 0,r_score_tr,r_score_te,RMSE_tr,RMSE_te,MAE_tr,MAE_te
Lin. Reg.,0.906772,0.621868,0.37257,0.738505,0.282308,0.30647
Lin. Reg. rs123,0.846568,0.903384,0.476746,0.377983,0.307115,0.290608
Lin. Reg. test size 70,0.90513,0.722845,0.375262,0.637879,0.28307,0.296848
Lin Reg CV 5,0.858625,0.810821,0.453313,0.47167,0.300922,0.315083
Lin. Reg. wout out,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl cv,0.914389,0.897387,0.351425,0.362728,0.273571,0.282995


In [16]:
# load the white-wine data saved in the "without outliers" file
white_wine_wout_outl = pd.read_csv("data/white_wine_wout_outl.csv")

In [17]:
# the dataframe without outliers (same columns as white_wine)
white_wine_wout_outl

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,response
0,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,4
1,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,4
2,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,4
3,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.99490,3.18,0.47,9.6,4
4,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.99380,3.22,0.45,11.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
3615,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,4
3616,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,3
3617,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,4
3618,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,5


In [18]:
# now, we will create the new regression model based on the dataset without outliers
# predictors: every column except the last two ('alcohol' and 'response')
X_wout_out = white_wine_wout_outl.iloc[:,:-2]

In [19]:
# predictors of the dataset without outliers
X_wout_out

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates
0,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49
1,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44
2,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40
3,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.99490,3.18,0.47
4,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.99380,3.22,0.45
...,...,...,...,...,...,...,...,...,...,...
3615,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50
3616,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46
3617,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46
3618,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38


In [20]:
# target variable of the dataset without outliers: the second-to-last column ('alcohol')
y_wout_out = white_wine_wout_outl.iloc[:,-2]

In [21]:
# target values (alcohol) of the dataset without outliers
y_wout_out

0        9.5
1       10.1
2        9.9
3        9.6
4       11.0
        ... 
3615    11.2
3616     9.6
3617     9.4
3618    12.8
3619    11.8
Name: alcohol, Length: 3620, dtype: float64

In [22]:
# hold out 20% of the outlier-free data as the test sample;
# the fixed random_state makes the split reproducible
X_train_wo, X_test_wo, y_train_wo, y_test_wo = sklearn.model_selection.train_test_split(X_wout_out,
                                                                        y_wout_out,
                                                                        test_size=0.2,
                                                                        random_state=12345)

## Standardization and normalization of data

In [23]:
# we will work with the dataset without outliers;
# note how different the scales of the individual predictors are
X_wout_out.describe()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates
count,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0
mean,6.825552,0.27295,0.326392,5.904793,0.043,34.381492,136.233287,0.993713,3.195343,0.486287
std,0.811477,0.087275,0.101784,4.697618,0.012007,15.476731,41.627261,0.002791,0.143529,0.105441
min,4.4,0.08,0.0,0.6,0.012,2.0,21.0,0.98711,2.79,0.22
25%,6.3,0.21,0.27,1.6,0.035,23.0,106.0,0.99154,3.1,0.41
50%,6.8,0.26,0.32,4.8,0.042,33.0,132.0,0.9934,3.19,0.47
75%,7.3,0.32,0.38,8.825,0.049,45.0,165.0,0.99566,3.29,0.55
max,9.4,0.59,0.7,20.4,0.115,86.0,260.0,1.00196,3.65,0.83


In [24]:
# summary statistics of the target variable (alcohol)
y_wout_out.describe()

count    3620.000000
mean       10.617172
std         1.207047
min         8.400000
25%         9.600000
50%        10.500000
75%        11.400000
max        14.200000
Name: alcohol, dtype: float64

In [25]:
# let's calculate the mean value of every predictor (column-wise)
X_wout_out_mean = X_wout_out.mean()
X_wout_out_mean

fixed.acidity             6.825552
volatile.acidity          0.272950
citric.acid               0.326392
residual.sugar            5.904793
chlorides                 0.043000
free.sulfur.dioxide      34.381492
total.sulfur.dioxide    136.233287
density                   0.993713
pH                        3.195343
sulphates                 0.486287
dtype: float64

In [26]:
# and the standard deviation of every predictor (column-wise)
X_wout_out_std = X_wout_out.std()

In [29]:
# standardization of the predictors: subtract the column mean, divide by the column std
# NOTE(review): mean and std come from the full data set, i.e. before the
# train/test split, so the test rows influence the scaling
X_wo_std = (X_wout_out - X_wout_out_mean)/X_wout_out_std  # Z-score standardization

In [30]:
# check: after standardization every column has mean ~0 and std 1
X_wo_std.describe()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates
count,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0
mean,-3.768624e-16,1.1776950000000001e-17,-2.3553900000000002e-17,1.40342e-16,-6.752119e-16,8.734572000000001e-17,-3.14052e-16,-2.229769e-14,2.119851e-15,4.102305e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.98906,-2.210836,-3.206702,-1.129252,-2.581749,-2.092269,-2.768217,-2.366292,-2.82412,-2.525457
25%,-0.6476495,-0.7212879,-0.5540363,-0.9163777,-0.6662749,-0.7353938,-0.7262858,-0.7788067,-0.6642746,-0.7235054
50%,-0.03148887,-0.148385,-0.062802,-0.2351815,-0.0833045,-0.0892625,-0.1016951,-0.112278,-0.03722278,-0.154468
75%,0.5846718,0.5390985,0.5266792,0.6216357,0.4996659,0.686095,0.6910547,0.6975903,0.6595015,0.6042486
max,3.172546,3.632774,3.670579,3.08565,5.996244,3.335233,2.973213,2.955188,3.167709,3.259757


In [31]:
# and we will do the same with the target variable: its mean and standard deviation
y_wo_m = y_wout_out.mean()
y_wo_sd = y_wout_out.std()

In [32]:
# standardized target: (value - mean) / std
y_wo_std = (y_wout_out-y_wo_m)/y_wo_sd

In [33]:
# check: the standardized target has mean ~0 and std 1
y_wo_std.describe()

count    3.620000e+03
mean     7.537249e-16
std      1.000000e+00
min     -1.836857e+00
25%     -8.426950e-01
50%     -9.707346e-02
75%      6.485481e-01
max      2.968260e+00
Name: alcohol, dtype: float64

In [34]:
# split with standardized predictors only: the target passed here is the
# original y_wout_out, so y_train_wo_std / y_test_wo_std are NOT standardized
# (the "_std" suffix only marks which predictors they are paired with)
X_train_wo_std, X_test_wo_std, y_train_wo_std, y_test_wo_std = model_selection.train_test_split(
    X_wo_std, y_wout_out, test_size=0.2, random_state=12345)

In [35]:
# fit a linear regression on the standardized predictors and store its
# metrics as a one-row frame labelled with the model name
params = ["Lin. Reg. wout outl std"]
res = [fit_regression(linear_model.LinearRegression(),
                          X_train_wo_std, X_test_wo_std, y_train_wo_std, y_test_wo_std)]
results = pd.DataFrame(res, index=params)

In [36]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new row(s) the same way
# (re-running this cell adds the row again)
results_final = pd.concat([results_final, results])

  results_final = results_final.append(results)


In [38]:
results_final # standardizing the predictors changes NOTHING in a linear regression model (the model does not get any better after standardization)

Unnamed: 0,r_score_tr,r_score_te,RMSE_tr,RMSE_te,MAE_tr,MAE_te
Lin. Reg.,0.906772,0.621868,0.37257,0.738505,0.282308,0.30647
Lin. Reg. rs123,0.846568,0.903384,0.476746,0.377983,0.307115,0.290608
Lin. Reg. test size 70,0.90513,0.722845,0.375262,0.637879,0.28307,0.296848
Lin Reg CV 5,0.858625,0.810821,0.453313,0.47167,0.300922,0.315083
Lin. Reg. wout out,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl cv,0.914389,0.897387,0.351425,0.362728,0.273571,0.282995
Lin. Reg. wout outl std,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433


In [40]:
# the same model with standardized target variable (both the predictors and the target are standardized)
X_train_wo_std_y, X_test_wo_std_y, y_train_wo_std_y, y_test_wo_std_y = model_selection.train_test_split(
    X_wo_std, y_wo_std, test_size=0.2, random_state=12345)

In [41]:
# fit a linear regression on standardized predictors and standardized target;
# the metrics are stored as a one-row frame labelled with the model name
params = ["Lin. Reg. wout outl std y"]
res = [fit_regression(sklearn.linear_model.LinearRegression(),
                          X_train_wo_std_y, X_test_wo_std_y, y_train_wo_std_y, y_test_wo_std_y)]
results = pd.DataFrame(res, index=params)

In [42]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new row(s) the same way
# (re-running this cell adds the row again)
results_final = pd.concat([results_final, results])

  results_final = results_final.append(results)


In [44]:
results_final # RMSE (and MAE) dropped while R2 stayed the same: the errors are now measured in units of the standardized target, so be careful not to be fooled

Unnamed: 0,r_score_tr,r_score_te,RMSE_tr,RMSE_te,MAE_tr,MAE_te
Lin. Reg.,0.906772,0.621868,0.37257,0.738505,0.282308,0.30647
Lin. Reg. rs123,0.846568,0.903384,0.476746,0.377983,0.307115,0.290608
Lin. Reg. test size 70,0.90513,0.722845,0.375262,0.637879,0.28307,0.296848
Lin Reg CV 5,0.858625,0.810821,0.453313,0.47167,0.300922,0.315083
Lin. Reg. wout out,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl cv,0.914389,0.897387,0.351425,0.362728,0.273571,0.282995
Lin. Reg. wout outl std,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl std y,0.914584,0.914229,0.292627,0.291023,0.227196,0.229845


In [46]:
# MinMax normalization of the dataset (every column rescaled to the range 0 to 1)
# NOTE(review): min and max come from the full data set, i.e. before the train/test split
X_wo_norm = (X_wout_out - X_wout_out.min())/(X_wout_out.max() - X_wout_out.min())
y_wo_norm = (y_wout_out - y_wout_out.min())/(y_wout_out.max() - y_wout_out.min())

In [47]:
# check: after normalization every column has min 0 and max 1
X_wo_norm.describe()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates
count,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0,3620.0
mean,0.48511,0.378334,0.466275,0.267919,0.300974,0.385494,0.482148,0.444668,0.471329,0.436537
std,0.162295,0.171127,0.145406,0.237253,0.116577,0.184247,0.174173,0.187918,0.166894,0.172854
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.38,0.254902,0.385714,0.050505,0.223301,0.25,0.355649,0.298316,0.360465,0.311475
50%,0.48,0.352941,0.457143,0.212121,0.291262,0.369048,0.464435,0.423569,0.465116,0.409836
75%,0.58,0.470588,0.542857,0.415404,0.359223,0.511905,0.60251,0.575758,0.581395,0.540984
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [48]:
# split with normalized predictors only: the target passed here is the
# original y_wout_out, so y_train_wo_norm / y_test_wo_norm are NOT normalized
X_train_wo_norm, X_test_wo_norm, y_train_wo_norm, y_test_wo_norm = model_selection.train_test_split(
    X_wo_norm, y_wout_out, test_size=0.2, random_state=12345)

In [49]:
# fit a linear regression on the normalized predictors and store its
# metrics as a one-row frame labelled with the model name
params = ["Lin. Reg. wout outl norm"]
res = [fit_regression(sklearn.linear_model.LinearRegression(),
                          X_train_wo_norm, X_test_wo_norm, y_train_wo_norm, y_test_wo_norm)]
results = pd.DataFrame(res, index=params)

In [50]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new row(s) the same way
# (re-running this cell adds the row again)
results_final = pd.concat([results_final, results])

  results_final = results_final.append(results)


In [51]:
# normalizing the predictors gives the same metrics as the unscaled 'Lin. Reg. wout out' model
results_final

Unnamed: 0,r_score_tr,r_score_te,RMSE_tr,RMSE_te,MAE_tr,MAE_te
Lin. Reg.,0.906772,0.621868,0.37257,0.738505,0.282308,0.30647
Lin. Reg. rs123,0.846568,0.903384,0.476746,0.377983,0.307115,0.290608
Lin. Reg. test size 70,0.90513,0.722845,0.375262,0.637879,0.28307,0.296848
Lin Reg CV 5,0.858625,0.810821,0.453313,0.47167,0.300922,0.315083
Lin. Reg. wout out,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl cv,0.914389,0.897387,0.351425,0.362728,0.273571,0.282995
Lin. Reg. wout outl std,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl std y,0.914584,0.914229,0.292627,0.291023,0.227196,0.229845
Lin. Reg. wout outl norm,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433


In [52]:
# the same model with normalized target variable
# THE BIG TRAP!!! - rescaling the target rescales the error metrics as well
X_train_wo_norm_y, X_test_wo_norm_y, y_train_wo_norm_y, y_test_wo_norm_y = model_selection.train_test_split(
    X_wo_norm, y_wo_norm, test_size=0.2, random_state=12345)

In [53]:
# fit a linear regression on normalized predictors and normalized target;
# the metrics are stored as a one-row frame labelled with the model name
params = ["Lin. Reg. wout outl norm y"]
res = [fit_regression(sklearn.linear_model.LinearRegression(),
                          X_train_wo_norm_y, X_test_wo_norm_y, y_train_wo_norm_y, y_test_wo_norm_y)]
results = pd.DataFrame(res, index=params)

In [54]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new row(s) the same way
# (re-running this cell adds the row again)
results_final = pd.concat([results_final, results])

  results_final = results_final.append(results)


In [56]:
results_final # RMSE dropped significantly again, but R2 is still unchanged relative to 'Lin. Reg. wout out' (standardization/normalization does not improve the model)

Unnamed: 0,r_score_tr,r_score_te,RMSE_tr,RMSE_te,MAE_tr,MAE_te
Lin. Reg.,0.906772,0.621868,0.37257,0.738505,0.282308,0.30647
Lin. Reg. rs123,0.846568,0.903384,0.476746,0.377983,0.307115,0.290608
Lin. Reg. test size 70,0.90513,0.722845,0.375262,0.637879,0.28307,0.296848
Lin Reg CV 5,0.858625,0.810821,0.453313,0.47167,0.300922,0.315083
Lin. Reg. wout out,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl cv,0.914389,0.897387,0.351425,0.362728,0.273571,0.282995
Lin. Reg. wout outl std,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl std y,0.914584,0.914229,0.292627,0.291023,0.227196,0.229845
Lin. Reg. wout outl norm,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl norm y,0.914584,0.914229,0.060899,0.060565,0.047282,0.047833


## Reduction of the problem dimension

In [57]:
# let's have a look at the importance of the variables in the dataset without outliers;
# the model is fitted on standardized predictors and a standardized target
# (the whole data set, no train/test split)
lm_wo_std = linear_model.LinearRegression()
lm_wo_std.fit(X_wo_std, y_wo_std)

In [58]:
# absolute values of the fitted coefficients, labelled with the predictor names
# and sorted from the largest to the smallest
pd.Series(np.abs(lm_wo_std.coef_), index=X_wo_std.columns.to_list()).round(4).sort_values(ascending=False)

density                 1.7333
residual.sugar          0.9902
fixed.acidity           0.3755
pH                      0.2922
sulphates               0.0844
total.sulfur.dioxide    0.0639
free.sulfur.dioxide     0.0464
volatile.acidity        0.0332
citric.acid             0.0328
chlorides               0.0146
dtype: float64

In [59]:
# chlorides has the smallest absolute coefficient (the least important feature), so we will remove it
X_wout_out_chl = X_wout_out.drop(columns=['chlorides']).copy()

In [60]:
# predictors without the 'chlorides' column
X_wout_out_chl

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates
0,6.3,0.30,0.34,1.6,14.0,132.0,0.99400,3.30,0.49
1,8.1,0.28,0.40,6.9,30.0,97.0,0.99510,3.26,0.44
2,7.2,0.23,0.32,8.5,47.0,186.0,0.99560,3.19,0.40
3,6.2,0.32,0.16,7.0,30.0,136.0,0.99490,3.18,0.47
4,8.1,0.22,0.43,1.5,28.0,129.0,0.99380,3.22,0.45
...,...,...,...,...,...,...,...,...,...
3615,6.2,0.21,0.29,1.6,24.0,92.0,0.99114,3.27,0.50
3616,6.6,0.32,0.36,8.0,57.0,168.0,0.99490,3.15,0.46
3617,6.5,0.24,0.19,1.2,30.0,111.0,0.99254,2.99,0.46
3618,5.5,0.29,0.30,1.1,20.0,110.0,0.98869,3.34,0.38


In [61]:
# the same 80/20 split (same random_state) on the predictors without 'chlorides'
X_train_wo_chl, X_test_wo_chl, y_train_wo_chl, y_test_wo_chl = sklearn.model_selection.train_test_split(
    X_wout_out_chl, y_wout_out, test_size=0.2, random_state=12345)

In [62]:
# fit a linear regression without the 'chlorides' predictor and store its
# metrics as a one-row frame labelled with the model name
params = ["Lin. Reg. wout outl chl"]
res = [fit_regression(sklearn.linear_model.LinearRegression(),
                          X_train_wo_chl, X_test_wo_chl, y_train_wo_chl, y_test_wo_chl)]
results = pd.DataFrame(res, index=params)

In [63]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new row(s) the same way
# (re-running this cell adds the row again)
results_final = pd.concat([results_final, results])

  results_final = results_final.append(results)


In [65]:
results_final # after dropping chlorides (a weakly correlated predictor) R2 improved very slightly and RMSE got very slightly worse
# the difference in the "worse" model is very small; if the cost of the deterioration is small, it is better to reduce the model by one predictor (here: chlorides)
# NOTE(review): the saved output of this cell does not contain the 'Lin. Reg. wout outl chl' row - re-run to confirm the numbers

Unnamed: 0,r_score_tr,r_score_te,RMSE_tr,RMSE_te,MAE_tr,MAE_te
Lin. Reg.,0.906772,0.621868,0.37257,0.738505,0.282308,0.30647
Lin. Reg. rs123,0.846568,0.903384,0.476746,0.377983,0.307115,0.290608
Lin. Reg. test size 70,0.90513,0.722845,0.375262,0.637879,0.28307,0.296848
Lin Reg CV 5,0.858625,0.810821,0.453313,0.47167,0.300922,0.315083
Lin. Reg. wout out,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl cv,0.914389,0.897387,0.351425,0.362728,0.273571,0.282995
Lin. Reg. wout outl std,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl std y,0.914584,0.914229,0.292627,0.291023,0.227196,0.229845
Lin. Reg. wout outl norm,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl norm y,0.914584,0.914229,0.060899,0.060565,0.047282,0.047833


In [66]:
# residual.sugar and density are strongly correlated,
# but the correlation of residual.sugar with the target variable (alcohol) is smaller,
# so let's remove residual.sugar

In [67]:
X_wout_out_res = X_wout_out.drop(columns=['residual.sugar']).copy() # dropping one of the two strongly correlated predictors

In [68]:
# predictors without the 'residual.sugar' column
X_wout_out_res

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates
0,6.3,0.30,0.34,0.049,14.0,132.0,0.99400,3.30,0.49
1,8.1,0.28,0.40,0.050,30.0,97.0,0.99510,3.26,0.44
2,7.2,0.23,0.32,0.058,47.0,186.0,0.99560,3.19,0.40
3,6.2,0.32,0.16,0.045,30.0,136.0,0.99490,3.18,0.47
4,8.1,0.22,0.43,0.044,28.0,129.0,0.99380,3.22,0.45
...,...,...,...,...,...,...,...,...,...
3615,6.2,0.21,0.29,0.039,24.0,92.0,0.99114,3.27,0.50
3616,6.6,0.32,0.36,0.047,57.0,168.0,0.99490,3.15,0.46
3617,6.5,0.24,0.19,0.041,30.0,111.0,0.99254,2.99,0.46
3618,5.5,0.29,0.30,0.022,20.0,110.0,0.98869,3.34,0.38


In [69]:
# the same 80/20 split (same random_state) on the predictors without 'residual.sugar'
X_train_wo_res, X_test_wo_res, y_train_wo_res, y_test_wo_res = sklearn.model_selection.train_test_split(
    X_wout_out_res, y_wout_out, test_size=0.2, random_state=12345)

In [70]:
# fit a linear regression without the 'residual.sugar' predictor and store its
# metrics as a one-row frame labelled with the model name
params = ["Lin. Reg. wout outl res"]
res = [fit_regression(sklearn.linear_model.LinearRegression(),
                          X_train_wo_res, X_test_wo_res, y_train_wo_res, y_test_wo_res)]
results = pd.DataFrame(res, index=params)

In [71]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new row(s) the same way
# (re-running this cell adds the row again)
results_final = pd.concat([results_final, results])

  results_final = results_final.append(results)


In [73]:
# ANOTHER BIG TRAP!!!
results_final # it turns out that there are nonlinear dependencies among the predictors (seen after removing one of the two strongly correlated ones)
# NOTE(review): the saved output of this cell does not contain the 'Lin. Reg. wout outl res' row - re-run to confirm

Unnamed: 0,r_score_tr,r_score_te,RMSE_tr,RMSE_te,MAE_tr,MAE_te
Lin. Reg.,0.906772,0.621868,0.37257,0.738505,0.282308,0.30647
Lin. Reg. rs123,0.846568,0.903384,0.476746,0.377983,0.307115,0.290608
Lin. Reg. test size 70,0.90513,0.722845,0.375262,0.637879,0.28307,0.296848
Lin Reg CV 5,0.858625,0.810821,0.453313,0.47167,0.300922,0.315083
Lin. Reg. wout out,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl cv,0.914389,0.897387,0.351425,0.362728,0.273571,0.282995
Lin. Reg. wout outl std,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl std y,0.914584,0.914229,0.292627,0.291023,0.227196,0.229845
Lin. Reg. wout outl norm,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl norm y,0.914584,0.914229,0.060899,0.060565,0.047282,0.047833


## The regression model based on polynomials

In [74]:
# we use the PolynomialFeatures transformer with degree 2
# to generate new features that are products of the base features,
# for example [x1,x2,x3] -> [x1, x2, x3, x1^2, x1x2, x1x3, x2^2, x2x3, x3^2]
# (with include_bias=False there is no constant column of ones in the result)

import sklearn.preprocessing
polynomial2_feature = sklearn.preprocessing.PolynomialFeatures(degree=2, include_bias=False)
polynomial2_feature.fit_transform(np.array([[2,3,5],[1,2,3]]))

array([[ 2.,  3.,  5.,  4.,  6., 10.,  9., 15., 25.],
       [ 1.,  2.,  3.,  1.,  2.,  3.,  4.,  6.,  9.]])

In [77]:
# we can check the powers of the individual variables: after the transpose every
# column describes one generated feature and every row holds the exponents of one input variable
polynomial2_feature.powers_.T

array([[1, 0, 0, 2, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 2, 1, 0],
       [0, 0, 1, 0, 0, 1, 0, 1, 2]])

In [78]:
# we build a polynomial model transforming the training dataset of predictors X_train_wo
# and the test dataset of predictors X_test_wo
# The transformer is fitted on the training data only; the test data is then
# transformed with the already fitted object (a transformer must never be
# re-fitted on test data).
polynomial2 = sklearn.preprocessing.PolynomialFeatures(degree=2, include_bias=False)
X2_wo_train = polynomial2.fit_transform(X_train_wo)
X2_wo_test = polynomial2.transform(X_test_wo)

In [85]:
#now we have 65 columns
X2_wo_train.shape # it is worth building models on np.array rather than on pandas DataFrames (much faster)

(2896, 65)

In [80]:
# we check the new model: a linear regression on all degree-2 polynomial features
params = ["Lin. Reg. wout outl Poly"]
res = [fit_regression(sklearn.linear_model.LinearRegression(),
                          X2_wo_train, X2_wo_test, y_train_wo, y_test_wo)]
results = pd.DataFrame(res, index=params)

In [81]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new row(s) the same way
# (re-running this cell adds the row again)
results_final = pd.concat([results_final, results])

  results_final = results_final.append(results)


In [86]:
results_final # the model is slightly better, but it now has 65 variables

Unnamed: 0,r_score_tr,r_score_te,RMSE_tr,RMSE_te,MAE_tr,MAE_te
Lin. Reg.,0.906772,0.621868,0.37257,0.738505,0.282308,0.30647
Lin. Reg. rs123,0.846568,0.903384,0.476746,0.377983,0.307115,0.290608
Lin. Reg. test size 70,0.90513,0.722845,0.375262,0.637879,0.28307,0.296848
Lin Reg CV 5,0.858625,0.810821,0.453313,0.47167,0.300922,0.315083
Lin. Reg. wout out,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl cv,0.914389,0.897387,0.351425,0.362728,0.273571,0.282995
Lin. Reg. wout outl std,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl std y,0.914584,0.914229,0.292627,0.291023,0.227196,0.229845
Lin. Reg. wout outl norm,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl norm y,0.914584,0.914229,0.060899,0.060565,0.047282,0.047833


#### we obtained smaller prediction errors, but the number of model parameters increased significantly

#### we are looking for a balance between the complexity of the model and its quality

In [88]:
# the choice of variables for the model can be made using the Schwarz criterion (BIC - Bayesian Information Criterion)
# we choose a regression model that minimizes the function
# BIC (MSE_p, p, n) = n * log (MSE_p) + p * log (n)
# MSE_p is calculated for the model built on the basis of p <= d variables
# p * log (n) is a penalty for the complexity of the model

In [89]:
def BIC(mse, p, n):
    """Schwarz criterion: n*log(mse) + p*log(n).

    mse -- mean squared error of the fitted model
    p   -- number of model parameters (the complexity being penalized)
    n   -- number of observations
    """
    fit_term = n * np.log(mse)
    complexity_penalty = p * np.log(n)
    return fit_term + complexity_penalty

In [90]:
# the number of all possible cases to be considered is of the order of 2^d
# 1. we start with the empty model, BIC = +infinity
# 2. we extend the model with the variable for which BIC is the smallest and
# which simultaneously decreases the current BIC value;
# if there is no such variable, we return the current model
# 3. we repeat step 2 until the variables are exhausted

In [91]:
def forward_selection(X, y):
    """Greedy forward selection of columns of ``X`` driven by the BIC.

    Starting from the empty model, each round fits a linear regression for
    every not-yet-selected column added to the current selection and keeps
    the column giving the smallest BIC. Selection stops as soon as the best
    candidate does not decrease the BIC of the current model.

    Parameters
    ----------
    X : 2-D array-like of shape (n, m); converted with ``np.asarray`` so that
        columns can be addressed by position (DataFrames are accepted too).
    y : target values with n entries.

    Returns
    -------
    list of ``(column_index, BIC)`` tuples in the order of selection.
    """
    X = np.asarray(X)
    n, m = X.shape
    best_idx = []
    best_free = set(range(m))
    best_fit = np.inf
    res = []

    for i in range(0, m):
        cur_idx = -1
        cur_fit = np.inf
        for e in best_free:
            r = sklearn.linear_model.LinearRegression()
            test_idx = best_idx + [e]
            r.fit(X[:, test_idx], y)
            mse = sklearn.metrics.mean_squared_error(y, r.predict(X[:, test_idx]))
            # i+1 selected variables plus the intercept give i+2 parameters
            test_fit = BIC(mse, i+2, n)
            if test_fit < cur_fit: cur_idx, cur_fit = e, test_fit
        # Stop when no candidate produced a usable BIC (cur_idx is still the
        # -1 placeholder and must not be used as a column index) or when the
        # best candidate does not strictly decrease the current BIC.
        if cur_idx == -1 or cur_fit >= best_fit: break

        best_idx, best_fit = best_idx + [cur_idx], cur_fit
        best_free.discard(cur_idx)
        res.append((cur_idx, cur_fit))
    return res

In [92]:
# we apply the variable selection algorithm to the polynomial-transformed training set;
# 'variable' is the column index in X2_wo_train, 'BIC' the criterion value after adding it

chosen_df = pd.DataFrame(forward_selection(X2_wo_train, y_train_wo), columns=["variable", "BIC"])

chosen_variables = chosen_df["variable"].tolist()

# readable names: for every selected feature, w is its row of exponents in polynomial2.powers_;
# columns with exponent >= 1 are listed first, then columns with exponent 2 are
# appended once more, so a squared feature is shown as e.g. "density*density"
chosen_df["name_of_variable"] =\
[X_wout_out.columns[w>=1].append(X_wout_out.columns[w==2]).str.cat(sep="*") for w in polynomial2.powers_[chosen_variables]]

chosen_df

Unnamed: 0,variable,BIC,name_of_variable
0,7,-1838.328123,density
1,13,-3865.048679,fixed.acidity*residual.sugar
2,18,-4605.074184,fixed.acidity*pH
3,17,-5361.557596,fixed.acidity*density
4,3,-5599.845252,residual.sugar
5,63,-5819.294774,pH*sulphates
6,21,-5889.590374,volatile.acidity*citric.acid
7,57,-5912.506864,total.sulfur.dioxide*pH
8,54,-5957.281716,free.sulfur.dioxide*sulphates
9,59,-5980.297829,density*density


In [94]:
chosen_variables # only four original variables; the rest are interactions selected according to the BIC criterion

[7,
 13,
 18,
 17,
 3,
 63,
 21,
 57,
 54,
 59,
 41,
 37,
 62,
 22,
 55,
 0,
 30,
 50,
 12,
 2,
 29,
 11,
 25]

In [95]:
# and the results: a linear regression restricted to the polynomial features chosen by the BIC
params = ["Lin. Reg. BIC"]

res = [fit_regression(sklearn.linear_model.LinearRegression(),
                          X2_wo_train[:, chosen_variables],
                          X2_wo_test[:, chosen_variables],
                          y_train_wo, y_test_wo)]

results = pd.DataFrame(res, index=params)


In [96]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new row(s) the same way
# (re-running this cell adds the row again)
results_final = pd.concat([results_final, results])

  results_final = results_final.append(results)


In [97]:
# metrics of all models collected so far
# NOTE(review): the saved output of this cell does not contain the 'Lin. Reg. BIC' row - re-run to confirm
results_final

Unnamed: 0,r_score_tr,r_score_te,RMSE_tr,RMSE_te,MAE_tr,MAE_te
Lin. Reg.,0.906772,0.621868,0.37257,0.738505,0.282308,0.30647
Lin. Reg. rs123,0.846568,0.903384,0.476746,0.377983,0.307115,0.290608
Lin. Reg. test size 70,0.90513,0.722845,0.375262,0.637879,0.28307,0.296848
Lin Reg CV 5,0.858625,0.810821,0.453313,0.47167,0.300922,0.315083
Lin. Reg. wout out,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl cv,0.914389,0.897387,0.351425,0.362728,0.273571,0.282995
Lin. Reg. wout outl std,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl std y,0.914584,0.914229,0.292627,0.291023,0.227196,0.229845
Lin. Reg. wout outl norm,0.914584,0.914229,0.353214,0.351278,0.274237,0.277433
Lin. Reg. wout outl norm y,0.914584,0.914229,0.060899,0.060565,0.047282,0.047833


## Please check the BIC chosen features with cross validation

In [None]:
# and summary with some figures
# RMSE on the training and the test sample for every model in results_final.
# Rows fitted on a rescaled target ("... std y", "... norm y") report the error
# in rescaled units, so their RMSE is not comparable with the other rows.
plt.figure(figsize=(12,6))

plt.plot(results_final['RMSE_tr'], label='training')
plt.plot(results_final['RMSE_te'], label='test')
plt.ylabel('RMSE')
plt.title('RMSE per model')
plt.legend()
plt.xticks(np.arange(len(results_final.index.tolist())), results_final.index.tolist(), rotation=75)

plt.show()

In [None]:
# MAE on the training and the test sample for every model in results_final.
# Rows fitted on a rescaled target ("... std y", "... norm y") report the error
# in rescaled units, so their MAE is not comparable with the other rows.
plt.figure(figsize=(12,6))

plt.plot(results_final['MAE_tr'], label='training')
plt.plot(results_final['MAE_te'], label='test')
plt.ylabel('MAE')
plt.title('MAE per model')
plt.legend()
plt.xticks(np.arange(len(results_final.index.tolist())), results_final.index.tolist(), rotation=75)

plt.show()

In [None]:
# R^2 on the training and the test sample for every model in results_final
plt.figure(figsize=(12,6))

plt.plot(results_final['r_score_tr'], label='training')
plt.plot(results_final['r_score_te'], label='test')
plt.ylabel('R^2')
plt.title('R^2 per model')
plt.legend()
plt.xticks(np.arange(len(results_final.index.tolist())), results_final.index.tolist(), rotation=75)

plt.show()

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
# cross validation for several models based on dataset without outliers

# number of folds; defined once and used both for the splitter and the averaging
n_folds = 5

methods = pd.Series({
    "LinearRegression": sklearn.linear_model.LinearRegression(),
    "ElasticNet": sklearn.linear_model.ElasticNet(),
    "Ridge": sklearn.linear_model.Ridge(),
    "KNN": KNeighborsRegressor(),
    "GP": GaussianProcessRegressor(),
    # fixed seed (the same value as used for the train/test splits) so that
    # the random forest scores are reproducible between runs
    "RF": RandomForestRegressor(random_state=12345),
    "SVR": SVR()
})

#evaluation function
def eval_function(X_train, X_test, y_train, y_test):
    """Fit every estimator in ``methods`` on one train/test split.

    Returns a DataFrame with one row per estimator (indexed like ``methods``)
    and one column per metric returned by ``fit_regression``.
    """
    cv_models = pd.DataFrame(
        [fit_regression(alg, X_train, X_test, y_train, y_test) for alg in methods],
        index=methods.index)
    return cv_models

# consecutive (unshuffled) folds
kf = KFold(n_splits=n_folds)

results_cv = [eval_function(X_wout_out.iloc[train,:],
                            X_wout_out.iloc[test,:],
                            y_wout_out.iloc[train],
                            y_wout_out.iloc[test]) for train, test in kf.split(X_wout_out)]

# metrics averaged over the folds actually evaluated
sum(results_cv)/len(results_cv)

# :) We have to discuss these results.

In [None]:
# the same cross-validated comparison of the models, this time on the
# original data set (X, y) instead of the one without outliers
results_cv = [
    eval_function(X.iloc[fit_rows], X.iloc[eval_rows], y.iloc[fit_rows], y.iloc[eval_rows])
    for fit_rows, eval_rows in kf.split(X)
]

# fold-averaged metrics, one row per model
sum(results_cv) / n_folds