In [None]:
# Reload imported modules automatically before each cell executes, so edits
# to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.11 jupyter matplotlib
```

```bash 
! pip install -U -r requirements.txt
```

```bash
! pip install -U numpy
! pip install -U scikit-learn
```

In [None]:
! ls

In [None]:
! pip install -U -r requirements.txt

## Update repository

In [None]:
! git pull

## Add import path

In [None]:
import os
import sys
import gc

In [None]:
# Put the absolute path of the parent directory on the import path, once.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
del module_path

## Organize imports

In [None]:
import multiprocessing
from pathlib import Path

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, PolynomialFeatures, 
                                   SplineTransformer)
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis, 
                                           QuadraticDiscriminantAnalysis)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import (mean_absolute_error, mean_squared_error, 
                             classification_report, confusion_matrix)

#### Number of CPU cores

In [None]:
# Number of CPU cores reported by the system; passed as `n_jobs` to the
# sequential feature selectors further down.
workers = multiprocessing.cpu_count()
workers

## Initialize path

In [None]:
# Dataset locations, all relative to the notebook's working directory.
DATA = Path('data')
PATH = DATA.joinpath('log_regr_lda_qda_np')
LR_PATH = DATA.joinpath('linear_regression')
PUMPKIN_DIR = PATH.joinpath('Pumpkin_Seeds_Dataset')
IRIS_DIR = PATH.joinpath('iris')

# Create the dataset folders (and any missing parents); no-op if present.
PUMPKIN_DIR.mkdir(parents=True, exist_ok=True)
IRIS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
! ls

## Load IRIS dataset

In [None]:
SEED = 2022

In [None]:
iris_url = 'https://www.kaggle.com/datasets/uciml/iris/download?datasetVersionNumber=2'

#### Load dataset

In [None]:
! ls {IRIS_DIR}

In [None]:
df = pd.read_csv(IRIS_DIR / 'Iris.csv')

In [None]:
df

In [None]:
df['Species'].value_counts()

In [None]:
# Features are every column except the row id and the class label;
# the class label is the target.
X = df.drop(columns=['Id', 'Species'])
y = df['Species']
X.shape, y.shape, y.value_counts()

```python
# #define predictor and response variables
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']
X.shape, y.shape
```

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

## Generate polynomial features

In [None]:
poly = PolynomialFeatures(degree=2)

In [None]:
# Fit the polynomial expansion on the training split only, then apply the
# already-fitted transformer to the test split.
X_train_pl = poly.fit_transform(X_train)
X_test_pl = poly.transform(X_test)

In [None]:
feature_names_pl = poly.get_feature_names_out(input_features=X_train.columns)
feature_names_pl

In [None]:
X_train.shape, X_test.shape, X_train_pl.shape, X_test_pl.shape

## Initialize the model

In [None]:
lda = LinearDiscriminantAnalysis()

## Train model with polynomial features

In [None]:
lda_pl = lda.fit(X_train_pl, y_train)

In [None]:
y_pred_pl = lda_pl.predict(X_test_pl)

In [None]:
cr = classification_report(y_test, y_pred_pl)
print(cr)

#### Forward selection

In [None]:
X_train_pl.shape

In [None]:
# Forward sequential selection over the polynomial features: keep two
# features, scoring candidates with 10-fold cross-validation in parallel.
sfs_forward = SFS(
    estimator=lda,
    n_features_to_select=2,
    direction='forward',
    cv=10,
    n_jobs=workers,
).fit(X_train_pl, y_train)
sfs_forward

In [None]:
# The selector was fitted on the polynomial feature matrix (`X_train_pl`),
# so it must be applied to matrices from that same feature space. Passing
# the raw `X_train` / `X_test` frames gives it a different number of
# columns than it was fitted on.
X_train_fw = sfs_forward.transform(X_train_pl)
X_test_fw = sfs_forward.transform(X_test_pl)
X_train_fw.shape

In [None]:
X_train.columns

In [None]:
# Map the selection mask back to names using the polynomial feature names
# the selector was fitted against (not the raw input column names).
sfs_forward.get_feature_names_out(input_features=feature_names_pl)

In [None]:
lda_fw = LinearDiscriminantAnalysis()

In [None]:
lda_fw = lda_fw.fit(X_train_fw, y_train)

In [None]:
y_pred_fw = lda_fw.predict(X_test_fw)

In [None]:
cr = classification_report(y_test, y_pred_fw)
print(cr)

#### Backward selection

In [None]:
lda = LinearDiscriminantAnalysis()

In [None]:
X_train_pl.shape

In [None]:
# Backward sequential selection over the polynomial features: keep two
# features, scoring candidates with 10-fold cross-validation in parallel.
sfs_backward = SFS(
    estimator=lda,
    n_features_to_select=2,
    direction='backward',
    cv=10,
    n_jobs=workers,
).fit(X_train_pl, y_train)
sfs_backward

In [None]:
# Reduce both splits to the columns kept by the backward selector; the
# inputs are the same polynomial matrices the selector was fitted on.
X_train_bw = sfs_backward.transform(X_train_pl)
X_test_bw = sfs_backward.transform(X_test_pl)
X_train_bw.shape

In [None]:
sfs_backward.get_feature_names_out(input_features=feature_names_pl)

In [None]:
lda_bw = LinearDiscriminantAnalysis()

In [None]:
lda_bw = lda_bw.fit(X_train_bw, y_train)

In [None]:
y_pred_bw = lda_bw.predict(X_test_bw)

In [None]:
cr = classification_report(y_test, y_pred_bw)
print(cr)

## Ridge and Lasso regression

In [None]:
pertol_consumption_path = LR_PATH / 'petrol_consumption.csv'

In [None]:
df = pd.read_csv(pertol_consumption_path)
df

In [None]:
df.shape

In [None]:
print(df.describe().round(2).T)

#### Prepare dataset

In [None]:
# Predictors and regression target for the petrol consumption data.
X = df.loc[:, [
    'Average_income',
    'Paved_Highways',
    'Population_Driver_licence(%)',
    'Petrol_tax',
]]
y = df['Petrol_Consumption']

In [None]:
# Hold out 20% of the rows for evaluation with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

## Generate polynomial features

In [None]:
poly = PolynomialFeatures(degree=4)

In [None]:
# Fit the polynomial expansion on the training split only, then apply the
# already-fitted transformer to the test split.
X_train_pl = poly.fit_transform(X_train)
X_test_pl = poly.transform(X_test)

In [None]:
feature_names_pl = poly.get_feature_names_out(input_features=X_train.columns)
feature_names_pl

In [None]:
X_train.shape, X_train_pl.shape

In [None]:
# Standardize the polynomial features: the scaler's statistics are learned
# from the training split and reused for the test split.
# NOTE: this overwrites X_train_pl / X_test_pl in place, so re-running the
# cell without re-running the polynomial cell re-scales already scaled data.
std_scaller = StandardScaler()
X_train_pl = std_scaller.fit_transform(X_train_pl)
X_test_pl = std_scaller.transform(X_test_pl)

#### Ridge

In [None]:
# Fit a Ridge regressor on the scaled polynomial features
# (`fit` returns the estimator itself, which is then displayed).
ridge_rg = Ridge(alpha=100.08).fit(X_train_pl, y_train)
ridge_rg

In [None]:
ridge_rg.intercept_

In [None]:
ridge_rg.coef_

In [None]:
# Tabulate the Ridge coefficients, one row per polynomial feature name.
model_coefficients = ridge_rg.coef_
coefficients_df = pd.DataFrame(
    model_coefficients,
    index=feature_names_pl,
    columns=['Coefficient value'],
)
print(coefficients_df)

#### Inference with the model

In [None]:
y_pred_pl = ridge_rg.predict(X_test_pl)
y_pred_pl

In [None]:
# Side-by-side view of the true targets and the Ridge predictions.
results = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred_pl))
print(results)

#### Evaluate the model

In [None]:
# Error metrics of the Ridge predictions on the held-out split.
mae = mean_absolute_error(y_test, y_pred_pl)
mse = mean_squared_error(y_test, y_pred_pl)
rmse = np.sqrt(mse)

print(
    f'Mean absolute error: {mae:.2f}',
    f'Mean squared error: {mse:.2f}',
    f'Root mean squared error: {rmse:.2f}',
    sep='\n',
)

In [None]:
# Manual R² = 1 - RSS / TSS, as a cross-check of `ridge_rg.score`.
# RSS: squared differences between the true targets and the predictions.
actual_minus_predicted = sum((y_test - y_pred_pl)**2)
# TSS: squared differences between the true targets and their mean.
actual_minus_actual_mean = sum((y_test - y_test.mean())**2)
r2 = 1 - actual_minus_predicted/actual_minus_actual_mean
print('R²:', r2)

In [None]:
ridge_rg.score(X_test_pl, y_test)

In [None]:
ridge_rg.score(X_train_pl, y_train)

#### Lasso

In [None]:
# Fit a Lasso regressor on the scaled polynomial features
# (`fit` returns the estimator itself, which is then displayed).
lasso_rg = Lasso(alpha=1.8).fit(X_train_pl, y_train)
lasso_rg

In [None]:
lasso_rg.intercept_

In [None]:
lasso_rg.coef_

In [None]:
# Tabulate the Lasso coefficients, one row per polynomial feature name.
model_coefficients = lasso_rg.coef_
coefficients_df = pd.DataFrame(
    model_coefficients,
    index=feature_names_pl,
    columns=['Coefficient value'],
)
print(coefficients_df)

#### Inference with the model

In [None]:
y_pred = lasso_rg.predict(X_test_pl)
y_pred

In [None]:
# Compare the true targets with the Lasso predictions (`y_pred`).
# `y_pred_pl` holds the earlier Ridge predictions and must not be used here.
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results)

#### Evaluate the model

In [None]:
# Error metrics of the Lasso predictions (`y_pred`) on the held-out split.
# `y_pred_pl` holds the earlier Ridge predictions; scoring it here would
# just repeat the Ridge numbers.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

In [None]:
# Manual R² = 1 - RSS / TSS for the Lasso predictions (`y_pred`), as a
# cross-check of `lasso_rg.score`. `y_pred_pl` holds the Ridge predictions.
# RSS: squared differences between the true targets and the predictions.
actual_minus_predicted = sum((y_test - y_pred)**2)
# TSS: squared differences between the true targets and their mean.
actual_minus_actual_mean = sum((y_test - y_test.mean())**2)
r2 = 1 - actual_minus_predicted/actual_minus_actual_mean
print('R²:', r2)

In [None]:
lasso_rg.score(X_test_pl, y_test)

In [None]:
lasso_rg.score(X_train_pl, y_train)

## Linear / polynomial splines

In [None]:
spline = SplineTransformer(n_knots=32, degree=1)

In [None]:
# Fit the spline basis on the training split only, then apply the
# already-fitted transformer to the test split.
X_train_sp = spline.fit_transform(X_train)
X_test_sp = spline.transform(X_test)

In [None]:
feature_names_sp = spline.get_feature_names_out(input_features=X_train.columns)
feature_names_sp

In [None]:
X_train.shape, X_train_sp.shape

In [None]:
# Standardize the spline features: the scaler's statistics are learned from
# the training split and reused for the test split.
# NOTE: this rebinds `std_scaller` (previously fitted on the polynomial
# features) and overwrites X_train_sp / X_test_sp in place.
std_scaller = StandardScaler()
X_train_sp = std_scaller.fit_transform(X_train_sp)
X_test_sp = std_scaller.transform(X_test_sp)

#### Ridge

In [None]:
# Fit a Ridge regressor on the scaled spline features
# (`fit` returns the estimator itself, which is then displayed).
ridge_rg = Ridge(alpha=36.08).fit(X_train_sp, y_train)
ridge_rg

In [None]:
ridge_rg.intercept_

In [None]:
ridge_rg.coef_

In [None]:
# Tabulate the Ridge coefficients, one row per spline feature name.
model_coefficients = ridge_rg.coef_
coefficients_df = pd.DataFrame(
    model_coefficients,
    index=feature_names_sp,
    columns=['Coefficient value'],
)
print(coefficients_df)

#### Inference with the model

In [None]:
y_pred_sp = ridge_rg.predict(X_test_sp)
y_pred_sp

In [None]:
# Side-by-side view of the true targets and the spline-Ridge predictions.
results = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred_sp))
print(results)

#### Evaluate the model

In [None]:
# Error metrics of the spline-Ridge predictions on the held-out split.
mae = mean_absolute_error(y_test, y_pred_sp)
mse = mean_squared_error(y_test, y_pred_sp)
rmse = np.sqrt(mse)

print(
    f'Mean absolute error: {mae:.2f}',
    f'Mean squared error: {mse:.2f}',
    f'Root mean squared error: {rmse:.2f}',
    sep='\n',
)

In [None]:
R_2_test = ridge_rg.score(X_test_sp, y_test)
print(f'R² (test): {R_2_test}')

In [None]:
R_2_train = ridge_rg.score(X_train_sp, y_train)
print(f'R² (train): {R_2_train}')