In [1]:
# Reload edited project modules automatically so code changes are picked up live.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.10 jupyter matplotlib
```

```bash
pip install -U -r requirements.txt
```

```bash
pip install -U numpy
pip install -U scikit-learn
```

In [2]:
! ls

[34mdata[m[m
[34mimages[m[m
mid_terms.ipynb
mid_terms_sm.ipynb
requirements.txt
workshop_1_linear_regression.ipynb
workshop_2_logistic_regression_lda_qda_nb.ipynb
workshop_3_resampling_methods.ipynb
workshop_4_model_selection_and_regularizations.ipynb


In [3]:
! pip install -U -r requirements.txt

Collecting scikit-learn
  Downloading scikit_learn-1.2.0-cp310-cp310-macosx_10_9_x86_64.whl (9.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m


Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.1.3
    Uninstalling scikit-learn-1.1.3:
      Successfully uninstalled scikit-learn-1.1.3
Successfully installed scikit-learn-1.2.0


## Update repository

In [4]:
! git pull

Already up to date.


## Add import path

In [5]:
import os
import sys
import gc

In [6]:
# Add the parent directory to the import path, but only if it is not there yet.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [7]:
del module_path

## Organize imports

In [8]:
import multiprocessing
from pathlib import Path

In [9]:
import seaborn as sns

In [60]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis, 
                                           QuadraticDiscriminantAnalysis)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import (mean_absolute_error, mean_squared_error, 
                             classification_report, confusion_matrix)

#### Number of CPU cores

In [17]:
# CPU count reported by the system; passed as n_jobs to the feature selectors below.
workers = multiprocessing.cpu_count()
workers

8

## Initialize path

In [80]:
# Dataset locations, all relative to the current working directory.
DATA = Path('data')
LR_PATH = DATA.joinpath('linear_regression')
PATH = DATA.joinpath('log_regr_lda_qda_np')
PUMPKIN_DIR = PATH.joinpath('Pumpkin_Seeds_Dataset')
IRIS_DIR = PATH.joinpath('iris')

# Create both classification dataset folders (and missing parents) if absent.
for _dataset_dir in (PUMPKIN_DIR, IRIS_DIR):
    _dataset_dir.mkdir(parents=True, exist_ok=True)

In [19]:
! ls

[34mdata[m[m
[34mimages[m[m
mid_terms.ipynb
mid_terms_sm.ipynb
requirements.txt
workshop_1_linear_regression.ipynb
workshop_2_logistic_regression_lda_qda_nb.ipynb
workshop_3_resampling_methods.ipynb
workshop_4_model_selection_and_regularizations.ipynb


## Load IRIS dataset

In [26]:
SEED = 2022

In [27]:
iris_url = 'https://www.kaggle.com/datasets/uciml/iris/download?datasetVersionNumber=2'

#### Load dataset

In [28]:
! ls {IRIS_DIR}

Iris.csv        database.sqlite


In [29]:
df = pd.read_csv(IRIS_DIR / 'Iris.csv')

In [30]:
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [31]:
df['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [32]:
# Target: the species label. Features: every column except the row Id and the label.
y = df['Species']
# `columns=` already identifies the axis, so the former extra `axis=1` was redundant.
X = df.drop(columns=['Id', 'Species'])
X.shape, y.shape, df['Species'].value_counts()

((150, 4),
 (150,),
 Iris-setosa        50
 Iris-versicolor    50
 Iris-virginica     50
 Name: Species, dtype: int64)

```python
# #define predictor and response variables
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']
X.shape, y.shape
```

In [34]:
# Hold out 20% of the rows for testing; SEED fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=SEED,
                                                    test_size=0.2)

## Initialize the model

In [62]:
lda = LinearDiscriminantAnalysis()

#### Forward selection

In [49]:
X_train.shape

(120, 4)

In [50]:
# Forward sequential feature selection with LDA as the estimator:
# keep 2 features, evaluate with cv=10, run on `workers` parallel jobs.
sfs_forward = SFS(
    estimator=lda,
    n_features_to_select=2,
    direction='forward',
    cv=10,
    n_jobs=workers,
).fit(X_train, y_train)
sfs_forward

In [53]:
# Reduce both splits to the columns chosen by the forward selector.
X_train_fw, X_test_fw = (sfs_forward.transform(part) for part in (X_train, X_test))
X_train_fw.shape

(120, 2)

In [54]:
lda_fw = LinearDiscriminantAnalysis()

In [56]:
lda_fw = lda_fw.fit(X_train_fw, y_train)

In [58]:
y_pred_fw = lda_fw.predict(X_test_fw)

In [61]:
# Text report for the forward-selection model's test predictions.
cr = classification_report(y_true=y_test, y_pred=y_pred_fw)
print(cr)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       0.80      1.00      0.89         8
 Iris-virginica       1.00      0.85      0.92        13

       accuracy                           0.93        30
      macro avg       0.93      0.95      0.94        30
   weighted avg       0.95      0.93      0.93        30



#### Backward selection

In [63]:
lda = LinearDiscriminantAnalysis()

In [64]:
X_train.shape

(120, 4)

In [72]:
sfs_forward = SFS(lda, 
                  n_features_to_select=2, 
                  direction='backward',
                  cv=10,
                  n_jobs=workers)

sfs_forward = sfs_forward.fit(X_train, y_train)
sfs_forward

In [73]:
X_train_bw = sfs_forward.transform(X_train)
X_test_bw = sfs_forward.transform(X_test)
X_train_bw.shape

(120, 2)

In [74]:
lda_bw = LinearDiscriminantAnalysis()

In [75]:
lda_bw = lda_fw.fit(X_train_bw, y_train)

In [76]:
y_pred_bw = lda_fw.predict(X_test_bw)

In [77]:
# Text report for the backward-selection predictions on the test split.
cr = classification_report(y_true=y_test, y_pred=y_pred_bw)
print(cr)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       0.80      1.00      0.89         8
 Iris-virginica       1.00      0.85      0.92        13

       accuracy                           0.93        30
      macro avg       0.93      0.95      0.94        30
   weighted avg       0.95      0.93      0.93        30



## Ridge and Lasso regression

In [81]:
pertol_consumption_path = LR_PATH / 'petrol_consumption.csv'

In [82]:
df = pd.read_csv(pertol_consumption_path)
df

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410
5,10.0,5342,1333,0.571,457
6,8.0,5319,11868,0.451,344
7,8.0,5126,2138,0.553,467
8,8.0,4447,8577,0.529,464
9,7.0,4512,8507,0.552,498


In [84]:
df.shape

(48, 5)

In [85]:
print(df.describe().round(2).T)

                              count     mean      std      min      25%  \
Petrol_tax                     48.0     7.67     0.95     5.00     7.00   
Average_income                 48.0  4241.83   573.62  3063.00  3739.00   
Paved_Highways                 48.0  5565.42  3491.51   431.00  3110.25   
Population_Driver_licence(%)   48.0     0.57     0.06     0.45     0.53   
Petrol_Consumption             48.0   576.77   111.89   344.00   509.50   

                                  50%      75%       max  
Petrol_tax                       7.50     8.12     10.00  
Average_income                4298.00  4578.75   5342.00  
Paved_Highways                4735.50  7156.00  17782.00  
Population_Driver_licence(%)     0.56     0.60      0.72  
Petrol_Consumption             568.50   632.75    968.00  


#### Prepare dataset

In [86]:
# Predictors: the four non-target columns; response: petrol consumption.
X = df.loc[:, ['Average_income', 'Paved_Highways',
               'Population_Driver_licence(%)', 'Petrol_tax']]
y = df.loc[:, 'Petrol_Consumption']

In [87]:
# 80/20 train/test split; SEED fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SEED, test_size=0.2)

In [110]:
X.shape

(48, 4)

#### Ridge

In [97]:
# Ridge regression fitted on the training split with alpha=100.8.
# NOTE(review): the features are passed in unscaled — confirm this is intended,
# since the columns have very different magnitudes.
ridge_rg = Ridge(alpha=100.8).fit(X_train, y_train)
ridge_rg

In [98]:
ridge_rg.intercept_

779.8862818815674

In [99]:
ridge_rg.coef_

array([-2.45650550e-02,  6.05822864e-04,  1.11037060e+00, -1.23421561e+01])

In [102]:
feature_names = X.columns
feature_names

Index(['Average_income', 'Paved_Highways', 'Population_Driver_licence(%)',
       'Petrol_tax'],
      dtype='object')

In [103]:
# Tabulate each fitted Ridge coefficient against the feature it belongs to.
feature_names = X.columns
model_coefficients = ridge_rg.coef_

coefficients_df = pd.Series(model_coefficients,
                            index=feature_names,
                            name='Coefficient value').to_frame()
print(coefficients_df)

                              Coefficient value
Average_income                        -0.024565
Paved_Highways                         0.000606
Population_Driver_licence(%)           1.110371
Petrol_tax                           -12.342156


#### Inference with the model

In [104]:
y_pred = ridge_rg.predict(X_test)
y_pred

array([550.56286211, 589.86793246, 580.87958411, 569.69996881,
       587.70676368, 558.17818835, 557.1378453 , 588.42032063,
       603.64092988, 597.14162032])

In [105]:
# Side-by-side table of observed test targets and the Ridge predictions.
results = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
print(results)

    Actual   Predicted
22     464  550.562862
39     968  589.867932
25     566  580.879584
3      414  569.699969
40     587  587.706764
6      344  558.178188
7      467  557.137845
9      498  588.420321
43     591  603.640930
34     487  597.141620


#### Evaluate the model

In [106]:
# Test-split error metrics for the Ridge predictions; RMSE is derived from MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}',
      f'Mean squared error: {mse:.2f}',
      f'Root mean squared error: {rmse:.2f}',
      sep='\n')

Mean absolute error: 115.35
Mean squared error: 24940.53
Root mean squared error: 157.93


In [107]:
# Manual R² on the test split: 1 - (sum of squared residuals) / (total sum of squares).
actual_minus_predicted = sum((y_test - y_pred).pow(2))
actual_minus_actual_mean = sum(y_test.sub(y_test.mean()).pow(2))
r2 = 1 - actual_minus_predicted / actual_minus_actual_mean
print('R²:', r2)

R²: 0.035631748456249834


In [108]:
ridge_rg.score(X_test, y_test)

0.035631748456249834

In [109]:
ridge_rg.score(X_train, y_train)

0.14343663767753

#### Lasso

In [112]:
# Lasso regression fitted on the training split with alpha=100.8
# (the same alpha value used for the Ridge model).
# NOTE(review): the features are passed in unscaled — confirm this is intended.
lasso_rg = Lasso(alpha=100.8).fit(X_train, y_train)
lasso_rg

In [113]:
lasso_rg.intercept_

673.3778641168543

In [114]:
lasso_rg.coef_

array([-0.02433956,  0.00264467,  0.        , -0.        ])

In [115]:
feature_names = X.columns
feature_names

Index(['Average_income', 'Paved_Highways', 'Population_Driver_licence(%)',
       'Petrol_tax'],
      dtype='object')

In [116]:
# Tabulate each fitted Lasso coefficient against the feature it belongs to.
feature_names = X.columns
model_coefficients = lasso_rg.coef_

coefficients_df = pd.Series(model_coefficients,
                            index=feature_names,
                            name='Coefficient value').to_frame()
print(coefficients_df)

                              Coefficient value
Average_income                        -0.024340
Paved_Highways                         0.002645
Population_Driver_licence(%)           0.000000
Petrol_tax                            -0.000000


#### Inference with the model

In [117]:
y_pred = lasso_rg.predict(X_test)
y_pred

array([560.66383477, 577.94991859, 595.36197633, 561.06182489,
       577.3597949 , 575.30272479, 554.26758137, 586.05600187,
       589.13144765, 596.75102455])

In [118]:
# Side-by-side table of observed test targets and the Lasso predictions.
results = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
print(results)

    Actual   Predicted
22     464  560.663835
39     968  577.949919
25     566  595.361976
3      414  561.061825
40     587  577.359795
6      344  575.302725
7      467  554.267581
9      498  586.056002
43     591  589.131448
34     487  596.751025


#### Evaluate the model

In [119]:
# Test-split error metrics for the Lasso predictions; RMSE is derived from MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}',
      f'Mean squared error: {mse:.2f}',
      f'Root mean squared error: {rmse:.2f}',
      sep='\n')

Mean absolute error: 119.10
Mean squared error: 26498.44
Root mean squared error: 162.78


In [120]:
# Manual R² on the test split: 1 - (sum of squared residuals) / (total sum of squares).
actual_minus_predicted = sum((y_test - y_pred).pow(2))
actual_minus_actual_mean = sum(y_test.sub(y_test.mean()).pow(2))
r2 = 1 - actual_minus_predicted / actual_minus_actual_mean
print('R²:', r2)

R²: -0.02460757968572924


In [121]:
lasso_rg.score(X_test, y_test)

-0.02460757968572924

In [122]:
lasso_rg.score(X_train, y_train)

0.03233822475188153