In [1]:
# Reload imported modules automatically so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.11 jupyter matplotlib
```

```bash 
! pip install -U -r requirements.txt
```

```bash
! pip install -U numpy
! pip install -U scikit-learn
```

In [2]:
! ls

additional_workshop_representation_learning_and_search_images.ipynb
[34mdata[m[m
final_exam_sm.ipynb
first_example_of_notebook_use.ipynb
[34mimages[m[m
mid_terms.ipynb
mid_terms_sm.ipynb
requirements.txt
workshop_1_linear_regression.ipynb
workshop_2_logistic_regression_lda_qda_nb.ipynb
workshop_3_resampling_methods.ipynb
workshop_4_model_selection_and_regularizations.ipynb
workshop_5_beyond_linearity.ipynb
workshop_6_trees_boosting_bagging.ipynb
workshop_7_support_vector_machines.ipynb
workshop_8_pca_clustering.ipynb


In [3]:
! pip install -U -r requirements.txt

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m21.1 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:04[0mm




Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.0.2
    Uninstalling xgboost-2.0.2:
      Successfully uninstalled xgboost-2.0.2
Successfully installed xgboost-2.0.3


## Update repository

In [4]:
! git pull

Already up to date.


## Add import path

In [5]:
import os
import sys
import gc

In [6]:
# Put the notebook's parent directory on the import path (once) so that
# modules located one level up can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [7]:
# The path is already registered in sys.path; drop the temporary name so it
# does not linger in the notebook namespace.
del module_path

## Organize imports

In [8]:
import multiprocessing
from pathlib import Path

In [9]:
import seaborn as sns

In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis, 
                                           QuadraticDiscriminantAnalysis)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import (mean_absolute_error, mean_squared_error, 
                             classification_report, confusion_matrix)

#### Number of CPU cores

In [11]:
# CPU count reported by the system; passed as `n_jobs` to the feature selectors below.
workers = multiprocessing.cpu_count()
workers

8

## Initialize path

In [12]:
# Dataset locations, all relative to the notebook's working directory.
DATA = Path('data')
LR_PATH = DATA / 'linear_regression'
PATH = DATA / 'log_regr_lda_qda_np'
PUMPKIN_DIR, IRIS_DIR = PATH / 'Pumpkin_Seeds_Dataset', PATH / 'iris'

# Make sure both dataset folders exist (missing parents are created,
# already-existing folders are left untouched).
PUMPKIN_DIR.mkdir(parents=True, exist_ok=True)
IRIS_DIR.mkdir(parents=True, exist_ok=True)

In [13]:
! ls

additional_workshop_representation_learning_and_search_images.ipynb
[34mdata[m[m
final_exam_sm.ipynb
first_example_of_notebook_use.ipynb
[34mimages[m[m
mid_terms.ipynb
mid_terms_sm.ipynb
requirements.txt
workshop_1_linear_regression.ipynb
workshop_2_logistic_regression_lda_qda_nb.ipynb
workshop_3_resampling_methods.ipynb
workshop_4_model_selection_and_regularizations.ipynb
workshop_5_beyond_linearity.ipynb
workshop_6_trees_boosting_bagging.ipynb
workshop_7_support_vector_machines.ipynb
workshop_8_pca_clustering.ipynb


## Load IRIS dataset

In [14]:
SEED = 2022

In [15]:
# Kaggle download page for the Iris dataset, kept for provenance.
# NOTE(review): nothing in this section reads `iris_url`; the CSV is loaded from
# IRIS_DIR, so the files are presumably placed there manually — confirm.
iris_url = 'https://www.kaggle.com/datasets/uciml/iris/download?datasetVersionNumber=2'

#### Load dataset

In [16]:
! ls {IRIS_DIR}

Iris.csv        database.sqlite


In [17]:
df = pd.read_csv(IRIS_DIR / 'Iris.csv')

In [18]:
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [19]:
df['Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [20]:
# Features: every column except the row identifier and the label.
# Target: the species label.
X = df.drop(columns=['Id', 'Species'])
y = df['Species']
X.shape, y.shape, y.value_counts()

((150, 4),
 (150,),
 Species
 Iris-setosa        50
 Iris-versicolor    50
 Iris-virginica     50
 Name: count, dtype: int64)

```python
# #define predictor and response variables
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']
X.shape, y.shape
```

In [21]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED, test_size=0.2)

## Initialize the model

In [22]:
lda = LinearDiscriminantAnalysis()

#### Forward selection

In [23]:
X_train.shape

(120, 4)

In [25]:
# Forward sequential selection: keep 2 features, scoring candidates with
# 10-fold cross-validation of the LDA estimator, parallelised over `workers` jobs.
sfs_forward = SFS(
    lda,
    direction='forward',
    n_features_to_select=2,
    cv=10,
    n_jobs=workers,
)

# fit() returns the selector itself, so no re-assignment is needed.
sfs_forward.fit(X_train, y_train)
sfs_forward

In [26]:
# Reduce both splits to the columns chosen by the forward selector
# (fitted on the training split only).
X_train_fw = sfs_forward.transform(X_train)
X_test_fw = sfs_forward.transform(X_test)
X_train_fw.shape

(120, 2)

In [27]:
X_train.columns

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')

In [28]:
sfs_forward.get_feature_names_out(X_train.columns)

array(['SepalWidthCm', 'PetalWidthCm'], dtype=object)

In [29]:
lda_fw = LinearDiscriminantAnalysis()

In [30]:
lda_fw = lda_fw.fit(X_train_fw, y_train)

In [32]:
y_pred_fw = lda_fw.predict(X_test_fw)

In [43]:
cr_fw = classification_report(y_test, y_pred_fw)
print(cr_fw)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       0.80      1.00      0.89         8
 Iris-virginica       1.00      0.85      0.92        13

       accuracy                           0.93        30
      macro avg       0.93      0.95      0.94        30
   weighted avg       0.95      0.93      0.93        30



#### Backward selection

In [34]:
lda = LinearDiscriminantAnalysis()

In [35]:
X_train.shape

(120, 4)

In [36]:
# Backward sequential selection: start from all features and drop them until
# 2 remain, scoring with 10-fold cross-validation of the LDA estimator.
sfs_backward = SFS(
    lda,
    direction='backward',
    n_features_to_select=2,
    cv=10,
    n_jobs=workers,
)

# fit() returns the selector itself, so no re-assignment is needed.
sfs_backward.fit(X_train, y_train)
sfs_backward

In [37]:
# Reduce both splits to the columns chosen by the backward selector
# (fitted on the training split only).
X_train_bw = sfs_backward.transform(X_train)
X_test_bw = sfs_backward.transform(X_test)
X_train_bw.shape

(120, 2)

In [38]:
sfs_backward.get_feature_names_out(X_train.columns)

array(['SepalWidthCm', 'PetalWidthCm'], dtype=object)

In [39]:
lda_bw = LinearDiscriminantAnalysis()

In [40]:
lda_bw = lda_fw.fit(X_train_bw, y_train)

In [41]:
y_pred_bw = lda_fw.predict(X_test_bw)

In [45]:
cr_bw = classification_report(y_test, y_pred_bw)
print(cr_bw)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       0.80      1.00      0.89         8
 Iris-virginica       1.00      0.85      0.92        13

       accuracy                           0.93        30
      macro avg       0.93      0.95      0.94        30
   weighted avg       0.95      0.93      0.93        30



In [46]:
# Show the forward and backward reports one after the other. A newline separator
# is used because print's default space separator shifts the header row of the
# second report one column to the right.
print(cr_fw, cr_bw, sep='\n')

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       0.80      1.00      0.89         8
 Iris-virginica       1.00      0.85      0.92        13

       accuracy                           0.93        30
      macro avg       0.93      0.95      0.94        30
   weighted avg       0.95      0.93      0.93        30
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       0.80      1.00      0.89         8
 Iris-virginica       1.00      0.85      0.92        13

       accuracy                           0.93        30
      macro avg       0.93      0.95      0.94        30
   weighted avg       0.95      0.93      0.93        30



## Ridge and Lasso regression

In [61]:
# Location of the petrol consumption CSV inside the linear-regression data folder.
# NOTE(review): "pertol" is a typo for "petrol"; the name is kept because the
# next cell reads it — rename both together.
pertol_consumption_path = LR_PATH / 'petrol_consumption.csv'

In [62]:
df = pd.read_csv(pertol_consumption_path)
df

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410
5,10.0,5342,1333,0.571,457
6,8.0,5319,11868,0.451,344
7,8.0,5126,2138,0.553,467
8,8.0,4447,8577,0.529,464
9,7.0,4512,8507,0.552,498


In [63]:
df.shape

(48, 5)

In [64]:
# Summary statistics, rounded to 2 decimals and transposed so each variable is a row.
# Left as the cell's last expression so Jupyter renders a table instead of the
# line-wrapped plain text produced by print().
df.describe().round(2).T

                              count     mean      std      min      25%  \
Petrol_tax                     48.0     7.67     0.95     5.00     7.00   
Average_income                 48.0  4241.83   573.62  3063.00  3739.00   
Paved_Highways                 48.0  5565.42  3491.51   431.00  3110.25   
Population_Driver_licence(%)   48.0     0.57     0.06     0.45     0.53   
Petrol_Consumption             48.0   576.77   111.89   344.00   509.50   

                                  50%      75%       max  
Petrol_tax                       7.50     8.12     10.00  
Average_income                4298.00  4578.75   5342.00  
Paved_Highways                4735.50  7156.00  17782.00  
Population_Driver_licence(%)     0.56     0.60      0.72  
Petrol_Consumption             568.50   632.75    968.00  


#### Prepare dataset

In [69]:
# Response: petrol consumption. Predictors: the four remaining columns.
# The column order chosen here is the order in which fitted coefficients are
# labelled later (coefficient tables are indexed by X.columns).
y = df['Petrol_Consumption']
X = df[['Average_income', 'Paved_Highways',
       'Population_Driver_licence(%)', 'Petrol_tax']]

In [142]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SEED, test_size=0.2)

In [143]:
# Standardise the predictors. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test information leaks into it.
# NOTE(review): X_train / X_test are rebound to their scaled versions, so
# re-running this cell without re-running the split cell above scales
# already-scaled data.
std_scaller = StandardScaler()
X_train = std_scaller.fit_transform(X_train)
X_test = std_scaller.transform(X_test)

In [144]:
X.shape

(48, 4)

#### Ridge

In [145]:
# Ridge regression fitted on the standardised training split.
# NOTE(review): alpha=100.08 is hard-coded and no tuning step is visible here —
# confirm how the value was chosen.
ridge_rg = Ridge(alpha=100.08)
ridge_rg.fit(X_train, y_train)

In [146]:
ridge_rg.intercept_

586.8157894736842

In [147]:
ridge_rg.coef_

array([ -5.03608463,   0.84867214,  15.28124177, -11.63981555])

In [148]:
feature_names = X.columns
feature_names

Index(['Average_income', 'Paved_Highways', 'Population_Driver_licence(%)',
       'Petrol_tax'],
      dtype='object')

In [149]:
# Tabulate the fitted Ridge coefficients, one row per predictor.
model_coefficients = ridge_rg.coef_
feature_names = X.columns

coefficients_df = pd.DataFrame(
    {'Coefficient value': model_coefficients},
    index=feature_names,
)
print(coefficients_df)

                              Coefficient value
Average_income                        -5.036085
Paved_Highways                         0.848672
Population_Driver_licence(%)          15.281242
Petrol_tax                           -11.639816


#### Inference with the model

In [150]:
y_pred = ridge_rg.predict(X_test)
y_pred

array([543.44485904, 621.06578339, 564.74486433, 566.84353847,
       606.41914413, 535.57747171, 565.761224  , 584.46109246,
       576.82133346, 560.86365826])

In [151]:
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results)

    Actual   Predicted
22     464  543.444859
39     968  621.065783
25     566  564.744864
3      414  566.843538
40     587  606.419144
6      344  535.577472
7      467  565.761224
9      498  584.461092
43     591  576.821333
34     487  560.863658


#### Evaluate the model

In [152]:
# Test-split error metrics for the current predictions in `y_pred`.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target

print(
    f'Mean absolute error: {mae:.2f}',
    f'Mean squared error: {mse:.2f}',
    f'Root mean squared error: {rmse:.2f}',
    sep='\n',
)

Mean absolute error: 106.47
Mean squared error: 21000.28
Root mean squared error: 144.91


In [153]:
# Manual R² on the test split: 1 - (residual sum of squares / total sum of squares).
# Despite their names, both intermediates are sums of *squared* differences:
# the first against the predictions, the second against the mean of y_test.
actual_minus_predicted = sum((y_test - y_pred)**2)
actual_minus_actual_mean = sum((y_test - y_test.mean())**2)
r2 = 1 - actual_minus_predicted/actual_minus_actual_mean
print('R²:', r2)

R²: 0.1879884023210907


In [154]:
ridge_rg.score(X_test, y_test)

0.18798840232109082

In [155]:
ridge_rg.score(X_train, y_train)

0.30827784559471216

#### Lasso

In [156]:
# Lasso regression fitted on the standardised training split.
# NOTE(review): alpha=1.8 is hard-coded and no tuning step is visible here —
# confirm how the value was chosen.
lasso_rg = Lasso(alpha=1.8)
lasso_rg.fit(X_train, y_train)

In [157]:
lasso_rg.intercept_

586.8157894736842

In [158]:
lasso_rg.coef_

array([-29.27524322,  -6.09760135,  55.26682079, -34.74871915])

In [159]:
feature_names = X.columns
feature_names

Index(['Average_income', 'Paved_Highways', 'Population_Driver_licence(%)',
       'Petrol_tax'],
      dtype='object')

In [160]:
# Tabulate the fitted Lasso coefficients, one row per predictor.
model_coefficients = lasso_rg.coef_
feature_names = X.columns

coefficients_df = pd.DataFrame(
    {'Coefficient value': model_coefficients},
    index=feature_names,
)
print(coefficients_df)

                              Coefficient value
Average_income                       -29.275243
Paved_Highways                        -6.097601
Population_Driver_licence(%)          55.266821
Petrol_tax                           -34.748719


#### Inference with the model

In [161]:
y_pred = lasso_rg.predict(X_test)
y_pred

array([434.44593636, 707.58603932, 529.23373086, 508.66543487,
       650.61038265, 365.548161  , 503.87573131, 559.87918437,
       563.03196233, 514.86904751])

In [162]:
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results)

    Actual   Predicted
22     464  434.445936
39     968  707.586039
25     566  529.233731
3      414  508.665435
40     587  650.610383
6      344  365.548161
7      467  503.875731
9      498  559.879184
43     591  563.031962
34     487  514.869048


#### Evaluate the model

In [163]:
# Test-split error metrics for the current predictions in `y_pred`.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target

print(
    f'Mean absolute error: {mae:.2f}',
    f'Mean squared error: {mse:.2f}',
    f'Root mean squared error: {rmse:.2f}',
    sep='\n',
)

Mean absolute error: 66.12
Mean squared error: 9026.05
Root mean squared error: 95.01


In [164]:
# Manual R² on the test split: 1 - (residual sum of squares / total sum of squares).
# Despite their names, both intermediates are sums of *squared* differences:
# the first against the predictions, the second against the mean of y_test.
actual_minus_predicted = sum((y_test - y_pred)**2)
actual_minus_actual_mean = sum((y_test - y_test.mean())**2)
r2 = 1 - actual_minus_predicted/actual_minus_actual_mean
print('R²:', r2)

R²: 0.6509922315404321


In [165]:
lasso_rg.score(X_test, y_test)

0.6509922315404321

In [166]:
lasso_rg.score(X_train, y_train)

0.6492085497829607