In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.11 jupyter matplotlib
```

```bash 
! pip install -U -r requirements.txt
```

```bash
! pip install -U numpy
! pip install -U scikit-learn
```

In [2]:
! ls

additional_workshop_representation_learning_and_search_images.ipynb
[34mdata[m[m
final_exam_sm.ipynb
first_example_of_notebook_use.ipynb
[34mimages[m[m
mid_terms.ipynb
mid_terms_sm.ipynb
requirements.txt
workshop_1_linear_regression.ipynb
workshop_2_logistic_regression_lda_qda_nb.ipynb
workshop_3_resampling_methods.ipynb
workshop_4_model_selection_and_regularizations.ipynb
workshop_5_beyond_linearity.ipynb
workshop_6_trees_boosting_bagging.ipynb
workshop_7_support_vector_machines.ipynb
workshop_8_pca_clustering.ipynb


In [3]:
! pip install -U -r requirements.txt







## Update repository

In [4]:
! git pull

Already up to date.


## Add import path

In [5]:
import os
import sys
import gc

In [6]:
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [7]:
del module_path

## Organize imports

In [8]:
import multiprocessing
from pathlib import Path

In [9]:
import seaborn as sns

In [86]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis, 
                                           QuadraticDiscriminantAnalysis)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import (mean_absolute_error, mean_squared_error, 
                             classification_report, confusion_matrix)

#### Number of CPU cores

In [11]:
workers = multiprocessing.cpu_count()
workers

8

## Initialize path

In [12]:
DATA = Path('data')
PATH = DATA / 'log_regr_lda_qda_np'
LR_PATH = DATA / 'linear_regression'
PUMPKIN_DIR = PATH / 'Pumpkin_Seeds_Dataset'
IRIS_DIR = PATH / 'iris'
PUMPKIN_DIR.mkdir(exist_ok=True, parents=True)
IRIS_DIR.mkdir(exist_ok=True, parents=True)

In [13]:
! ls

additional_workshop_representation_learning_and_search_images.ipynb
[34mdata[m[m
final_exam_sm.ipynb
first_example_of_notebook_use.ipynb
[34mimages[m[m
mid_terms.ipynb
mid_terms_sm.ipynb
requirements.txt
workshop_1_linear_regression.ipynb
workshop_2_logistic_regression_lda_qda_nb.ipynb
workshop_3_resampling_methods.ipynb
workshop_4_model_selection_and_regularizations.ipynb
workshop_5_beyond_linearity.ipynb
workshop_6_trees_boosting_bagging.ipynb
workshop_7_support_vector_machines.ipynb
workshop_8_pca_clustering.ipynb


## Load IRIS dataset

In [14]:
SEED = 2022

In [15]:
iris_url = 'https://www.kaggle.com/datasets/uciml/iris/download?datasetVersionNumber=2'

#### Load dataset

In [16]:
! ls {IRIS_DIR}

Iris.csv        database.sqlite


In [17]:
df = pd.read_csv(IRIS_DIR / 'Iris.csv')

In [18]:
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [19]:
df['Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [20]:
y = df['Species']
X = df.drop(columns=['Id', 'Species'], axis=1)
X.shape, y.shape, df['Species'].value_counts()

((150, 4),
 (150,),
 Species
 Iris-setosa        50
 Iris-versicolor    50
 Iris-virginica     50
 Name: count, dtype: int64)

```python
# #define predictor and response variables
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']
X.shape, y.shape
```

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)

## Initialize the model

In [22]:
lda = LinearDiscriminantAnalysis()

#### Forward selection

In [23]:
X_train.shape

(120, 4)

In [24]:
sfs_forward = SFS(lda, 
                  n_features_to_select=2, 
                  direction='forward',
                  cv=10,
                  n_jobs=workers)

sfs_forward = sfs_forward.fit(X_train, y_train)
sfs_forward

In [25]:
X_train_fw = sfs_forward.transform(X_train)
X_test_fw = sfs_forward.transform(X_test)
X_train_fw.shape

(120, 2)

In [26]:
X_train.columns

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')

In [27]:
sfs_forward.get_feature_names_out(X_train.columns)

array(['SepalWidthCm', 'PetalWidthCm'], dtype=object)

In [28]:
lda_fw = LinearDiscriminantAnalysis()

In [29]:
lda_fw = lda_fw.fit(X_train_fw, y_train)

In [30]:
y_pred_fw = lda_fw.predict(X_test_fw)

In [31]:
cr = classification_report(y_test, y_pred_fw)
print(cr)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       0.80      1.00      0.89         8
 Iris-virginica       1.00      0.85      0.92        13

       accuracy                           0.93        30
      macro avg       0.93      0.95      0.94        30
   weighted avg       0.95      0.93      0.93        30



#### Backward selection\

In [32]:
lda = LinearDiscriminantAnalysis()

In [33]:
X_train.shape

(120, 4)

In [34]:
sfs_backward = SFS(lda, 
                  n_features_to_select=2, 
                  direction='backward',
                  cv=10,
                  n_jobs=workers)

sfs_backward = sfs_backward.fit(X_train, y_train)
sfs_backward

In [35]:
X_train_bw = sfs_backward.transform(X_train)
X_test_bw = sfs_backward.transform(X_test)
X_train_bw.shape

(120, 2)

In [36]:
sfs_backward.get_feature_names_out(X_train.columns)

array(['SepalWidthCm', 'PetalWidthCm'], dtype=object)

In [37]:
lda_bw = LinearDiscriminantAnalysis()

In [38]:
lda_bw = lda_fw.fit(X_train_bw, y_train)

In [39]:
y_pred_bw = lda_fw.predict(X_test_bw)

In [40]:
cr = classification_report(y_test, y_pred_bw)
print(cr)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       0.80      1.00      0.89         8
 Iris-virginica       1.00      0.85      0.92        13

       accuracy                           0.93        30
      macro avg       0.93      0.95      0.94        30
   weighted avg       0.95      0.93      0.93        30



## Ridge and Lasso regression

In [41]:
pertol_consumption_path = LR_PATH / 'petrol_consumption.csv'

In [42]:
df = pd.read_csv(pertol_consumption_path)
df

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410
5,10.0,5342,1333,0.571,457
6,8.0,5319,11868,0.451,344
7,8.0,5126,2138,0.553,467
8,8.0,4447,8577,0.529,464
9,7.0,4512,8507,0.552,498


In [43]:
df.shape

(48, 5)

In [44]:
print(df.describe().round(2).T)

                              count     mean      std      min      25%  \
Petrol_tax                     48.0     7.67     0.95     5.00     7.00   
Average_income                 48.0  4241.83   573.62  3063.00  3739.00   
Paved_Highways                 48.0  5565.42  3491.51   431.00  3110.25   
Population_Driver_licence(%)   48.0     0.57     0.06     0.45     0.53   
Petrol_Consumption             48.0   576.77   111.89   344.00   509.50   

                                  50%      75%       max  
Petrol_tax                       7.50     8.12     10.00  
Average_income                4298.00  4578.75   5342.00  
Paved_Highways                4735.50  7156.00  17782.00  
Population_Driver_licence(%)     0.56     0.60      0.72  
Petrol_Consumption             568.50   632.75    968.00  


#### Prepare dataset

In [45]:
y = df['Petrol_Consumption']
X = df[['Average_income', 'Paved_Highways',
       'Population_Driver_licence(%)', 'Petrol_tax']]

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=SEED)

In [119]:
X_train.head()

Unnamed: 0,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_tax
32,3063,6524,0.578,8.0
4,4399,431,0.544,8.0
0,3571,1976,0.525,9.0
20,4593,7834,0.663,7.0
10,4391,5939,0.53,8.0


In [120]:
k = 10

In [121]:
abs_scaler = MaxAbsScaler()
std_scaler = StandardScaler()
print(X_train.head(k))
X_train = abs_scaler.fit_transform(X_train)
print(X_train[:k])
X_train = std_scaler.fit_transform(X_train)
print(X_train[:k])
print(X_test.head(k))
X_test = abs_scaler.transform(X_test)
print(X_test[:k])
X_test = std_scaler.transform(X_test)
print(X_test[:k])

    Average_income  Paved_Highways  Population_Driver_licence(%)  Petrol_tax
32            3063            6524                         0.578         8.0
4             4399             431                         0.544         8.0
0             3571            1976                         0.525         9.0
20            4593            7834                         0.663         7.0
10            4391            5939                         0.530         8.0
47            5002            9794                         0.593         7.0
13            4207            6580                         0.545         7.0
12            4817            6930                         0.574         7.0
44            5215            2302                         0.672         6.0
23            4258            4686                         0.517         9.0
[[0.57338076 0.36688786 0.79834254 0.8       ]
 [0.82347435 0.02423799 0.75138122 0.8       ]
 [0.66847623 0.11112361 0.72513812 0.9       ]
 [0.85979034

In [126]:
np.mean(X_train, axis=1), np.var(X_train, axis=1)

(array([-3.68552921e-01, -3.58815861e-01, -4.66409236e-01,  5.86791552e-01,
        -4.13751216e-02,  5.66806082e-01, -2.59987101e-01,  1.85902417e-01,
         2.68754961e-01, -7.49815157e-04, -6.38947536e-01,  7.76349435e-01,
         2.59157538e-01,  2.05577009e-01,  5.49486190e-02, -8.87439188e-02,
        -4.85244211e-02, -4.82073010e-02, -8.15233354e-01, -4.20704604e-01,
        -1.40123487e-01, -4.21161922e-01,  2.19565039e-01,  1.66072690e-01,
        -2.12890079e-01,  2.08287213e-01, -6.28599850e-01,  7.92957879e-01,
         7.26092567e-01,  4.59421832e-02, -6.84702253e-01, -6.06184125e-01,
         9.28932298e-03, -1.15702809e-02,  8.07799941e-01,  3.75223151e-02,
         3.12264933e-01,  3.14014902e-02]),
 array([0.9565035 , 0.62844335, 1.13181503, 0.69611183, 0.28462979,
        0.68278867, 0.1554592 , 0.4304327 , 2.62277171, 0.85769059,
        0.04212655, 2.34160004, 0.47584598, 1.11800594, 4.71688838,
        0.90828438, 0.89658602, 0.75627687, 0.46607425, 0.39644191,


In [127]:
np.mean(X_test, axis=1), np.var(X_test, axis=1)

(array([ 0.10249144,  0.23983635, -0.10811823, -0.20916214,  0.11235256,
         0.41196144,  0.13788113,  0.0500624 , -0.93420769, -0.81996457]),
 array([1.53687596, 1.01344624, 0.77277052, 0.81708323, 0.40910151,
        3.22264369, 1.07387714, 0.40348952, 0.0710209 , 0.62328813]))

In [128]:
print(f'{abs_scaler.scale_ = }\n')

abs_scaler.scale_ = array([5.3420e+03, 1.7782e+04, 7.2400e-01, 1.0000e+01])



In [129]:
print(f'{std_scaler.mean_ = }\n, {std_scaler.var_ = }\n, {std_scaler.scale_ = }\n')

std_scaler.mean_ = array([0.78373958, 0.32621841, 0.79757924, 0.76468421])
, std_scaler.var_ = array([0.01054   , 0.03939755, 0.0049014 , 0.00967816])
, std_scaler.scale_ = array([0.10266452, 0.19848816, 0.07000998, 0.09837766])



#### Ridge

In [196]:
ridge_rg = Ridge(alpha=0.8)
ridge_rg.fit(X_train, y_train)

In [197]:
ridge_rg.intercept_

586.8157894736842

In [198]:
ridge_rg.coef_

array([-30.39448402,  -9.45974861,  55.3170188 , -37.75178753])

In [199]:
feature_names = X.columns
feature_names

Index(['Average_income', 'Paved_Highways', 'Population_Driver_licence(%)',
       'Petrol_tax'],
      dtype='object')

In [200]:
feature_names = X.columns
model_coefficients = ridge_rg.coef_

coefficients_df = pd.DataFrame(data = model_coefficients, 
                              index = feature_names, 
                              columns = ['Coefficient value'])
print(coefficients_df)

                              Coefficient value
Average_income                       -30.394484
Paved_Highways                        -9.459749
Population_Driver_licence(%)          55.317019
Petrol_tax                           -37.751788


#### Inference with the model

In [201]:
y_pred = ridge_rg.predict(X_test)
y_pred

array([431.9928635 , 711.13715818, 527.0252372 , 510.95753041,
       653.20450907, 356.25469177, 504.34575546, 558.58687507,
       568.87777939, 517.24224693])

In [202]:
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results)

    Actual   Predicted
22     464  431.992863
39     968  711.137158
25     566  527.025237
3      414  510.957530
40     587  653.204509
6      344  356.254692
7      467  504.345755
9      498  558.586875
43     591  568.877779
34     487  517.242247


#### Evaluate the model

In [203]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

Mean absolute error: 65.36
Mean squared error: 8892.54
Root mean squared error: 94.30


In [204]:
actual_minus_predicted = sum((y_test - y_pred)**2)
actual_minus_actual_mean = sum((y_test - y_test.mean())**2)
r2 = 1 - actual_minus_predicted/actual_minus_actual_mean
print('R²:', r2)

R²: 0.6561545545120848


In [205]:
ridge_rg.score(X_test, y_test)

0.6561545545120848

In [206]:
ridge_rg.score(X_train, y_train)

0.6513320693694356

#### Lasso

In [218]:
lasso_rg = Lasso(alpha=2.8)
lasso_rg.fit(X_train, y_train)

In [219]:
lasso_rg.intercept_

586.8157894736842

In [220]:
lasso_rg.coef_

array([-28.08064038,  -3.8901699 ,  54.54649613, -32.60660716])

In [221]:
feature_names = X.columns
feature_names

Index(['Average_income', 'Paved_Highways', 'Population_Driver_licence(%)',
       'Petrol_tax'],
      dtype='object')

In [222]:
feature_names = X.columns
model_coefficients = lasso_rg.coef_

coefficients_df = pd.DataFrame(data = model_coefficients, 
                              index = feature_names, 
                              columns = ['Coefficient value'])
print(coefficients_df)

                              Coefficient value
Average_income                       -28.080640
Paved_Highways                        -3.890170
Population_Driver_licence(%)          54.546496
Petrol_tax                           -32.606607


#### Inference with the model

In [223]:
y_pred = lasso_rg.predict(X_test)
y_pred

array([437.78745463, 703.99292596, 530.98129898, 508.36489309,
       648.35657516, 374.37494972, 504.74724684, 561.2333563 ,
       559.65326209, 514.04641982])

In [224]:
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results)

    Actual   Predicted
22     464  437.787455
39     968  703.992926
25     566  530.981299
3      414  508.364893
40     587  648.356575
6      344  374.374950
7      467  504.747247
9      498  561.233356
43     591  559.653262
34     487  514.046420


#### Evaluate the model

In [225]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

Mean absolute error: 67.07
Mean squared error: 9234.26
Root mean squared error: 96.10


In [226]:
actual_minus_predicted = sum((y_test - y_pred)**2)
actual_minus_actual_mean = sum((y_test - y_test.mean())**2)
r2 = 1 - actual_minus_predicted/actual_minus_actual_mean
print('R²:', r2)

R²: 0.6429416209025577


In [227]:
lasso_rg.score(X_test, y_test)

0.6429416209025577

In [228]:
lasso_rg.score(X_train, y_train)

0.645690900240431