In [None]:
# Reload imported modules automatically before each cell is executed
# (autoreload mode 2 covers all modules).
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.11 jupyter matplotlib
```

```bash 
! pip install -U -r requirements.txt
```

```bash
! pip install -U numpy
! pip install -U scikit-learn
```

In [None]:
# List the contents of the current working directory.
! ls

In [None]:
! pip install -U -r requirements.txt

## Update repository

In [None]:
# Pull the latest commits for the repository in the current working directory.
! git pull

## Add import path

In [None]:
import os
import sys
import gc

In [None]:
# Make the parent directory importable (added to sys.path only once).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
# Drop the temporary name so it does not linger in the notebook namespace.
del module_path

## Organize imports

In [None]:
import multiprocessing
from pathlib import Path

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, PolynomialFeatures, 
                                   SplineTransformer, LabelEncoder)
from sklearn.feature_selection import (SequentialFeatureSelector as SFS, 
                                       SelectFromModel)
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis, 
                                           QuadraticDiscriminantAnalysis)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import (mean_absolute_error, mean_squared_error, 
                             classification_report, confusion_matrix)

#### Number of CPU cores

In [None]:
# Number of CPUs reported by the system; passed as `n_jobs` to the models below.
workers = multiprocessing.cpu_count()
workers

## Initialize path

In [None]:
# Dataset locations, all relative to the current working directory.
DATA = Path('data')
PATH = DATA.joinpath('log_regr_lda_qda_np')
LR_PATH = DATA.joinpath('linear_regression')
PUMPKIN_DIR = PATH.joinpath('Pumpkin_Seeds_Dataset')
IRIS_DIR = PATH.joinpath('iris')

# Create the dataset folders together with any missing parents;
# nothing happens when they already exist.
PUMPKIN_DIR.mkdir(parents=True, exist_ok=True)
IRIS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# List the working directory again (the data folders were created above).
! ls

## Load IRIS dataset

In [None]:
# Seed used as `random_state` for the train/test splits and for several of the models below.
SEED = 2024

In [None]:
# Kaggle download link for the Iris dataset. Not referenced by any other
# cell visible in this notebook; the CSV is read from IRIS_DIR instead.
iris_url = 'https://www.kaggle.com/datasets/uciml/iris/download?datasetVersionNumber=2'

#### Load dataset

In [None]:
# Show the files present in the Iris data directory.
! ls {IRIS_DIR}

In [None]:
# Read the Iris CSV from the local data directory into a DataFrame.
df = pd.read_csv(IRIS_DIR / 'Iris.csv')

In [None]:
# Display the loaded frame.
df

In [None]:
# Number of rows per species.
df['Species'].value_counts()

In [None]:
# Encoder used to turn the species names into integer labels.
le = LabelEncoder()

In [None]:
# Fit the encoder on the species column and get one encoded label per row.
iris_labels = le.fit_transform(df.Species)

In [None]:
# Store the encoded labels in the frame as the 'labels' column.
df.loc[:, 'labels'] = iris_labels

In [None]:
# Display the frame again, now including the 'labels' column.
df

In [None]:
# Target: the encoded species labels.
y = df['labels']
# Features: every column except the row id and the two label columns.
X = df.drop(columns=['Id', 'Species', 'labels'])
# Sanity check: shapes of features/target and the class balance.
X.shape, y.shape, df.Species.value_counts()

In [None]:
# Hold out 20% of the rows for testing; the split is seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)

In [None]:
# Inspect the training labels.
y_train

## Classification trees

In [None]:
# Three tree-based classifiers, all seeded with the notebook-wide SEED so
# that runs are reproducible and the models are configured consistently.

# Single decision tree with default hyper-parameters.
cls_tr = DecisionTreeClassifier(random_state=SEED)

# Random forest: 128 trees, depth capped at 64, fitted on all CPU cores.
cls_rf = RandomForestClassifier(n_estimators=128,
                                max_depth=64,
                                n_jobs=workers,
                                random_state=SEED,
                                verbose=True)

# Gradient boosting with 1000 depth-1 trees.
# NOTE(review): device='gpu' requests GPU training and presumably needs a
# GPU-enabled xgboost build - confirm, or switch to 'cpu' on CPU-only machines.
cls_xb = XGBClassifier(n_estimators=1000,
                       max_depth=1,
                       n_jobs=workers,
                       device='gpu',
                       random_state=SEED,
                       verbosity=3)

## Train models

In [None]:
# Fit the single decision tree on the training split.
cls_tr = cls_tr.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split with the decision tree.
y_pred = cls_tr.predict(X_test)

In [None]:
# Build and print scikit-learn's classification report for the test predictions.
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# confusion_matrix puts true classes on the rows and predicted classes on
# the columns, so label the heatmap axes accordingly.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d').set(
    title='Decision tree: confusion matrix',
    xlabel='Predicted label',
    ylabel='True label',
);

In [None]:
# Fit the random forest on the training split.
cls_rf = cls_rf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split with the random forest.
y_pred = cls_rf.predict(X_test)

In [None]:
# Build and print the classification report for the random-forest predictions.
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# confusion_matrix puts true classes on the rows and predicted classes on
# the columns, so label the heatmap axes accordingly.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d').set(
    title='Random forest: confusion matrix',
    xlabel='Predicted label',
    ylabel='True label',
);

In [None]:
# Fit the XGBoost classifier on the training split.
cls_xb = cls_xb.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split with the XGBoost classifier.
y_pred = cls_xb.predict(X_test)

In [None]:
# Build and print the classification report for the XGBoost predictions.
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# confusion_matrix puts true classes on the rows and predicted classes on
# the columns, so label the heatmap axes accordingly.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d').set(
    title='XGBoost: confusion matrix',
    xlabel='Predicted label',
    ylabel='True label',
);

#### Feature importance

In [None]:
# Per-feature importances of the fitted random forest.
cls_rf.feature_importances_

In [None]:
# Importances of the fitted forest, labelled with the feature names.
forest_importances = pd.Series(cls_rf.feature_importances_,
                               index=X_train.columns.tolist())

# Spread of each feature's importance across the individual trees,
# drawn as error bars on the bar chart.
std = np.array([tree.feature_importances_ for tree in cls_rf.estimators_]).std(axis=0)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(title="Feature importances using MDI",
       ylabel="Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Feature names, in column order.
list(X_train.columns)

In [None]:
# Wrap the already-fitted forest as a feature selector (prefit=True: it is not refitted).
rf_feat = SelectFromModel(cls_rf, prefit=True)

In [None]:
# Names of the features kept by the selector.
rf_feat.get_feature_names_out(input_features=list(X_train.columns))

In [None]:
# Boolean mask over the input features: True where the feature is kept.
rf_feat.get_support()

In [None]:
# Keep only the selected features in both splits, then compare the shapes
# before and after the reduction.
X_train_rd = rf_feat.transform(X_train)
X_test_rd = rf_feat.transform(X_test)
X_train.shape, X_train_rd.shape, X_test.shape, X_test_rd.shape

In [None]:
# Same forest configuration as `cls_rf`, to be trained on the reduced
# feature set. Seeded with SEED so the result is reproducible and the
# comparison with the full-feature forest is not affected by random noise.
cls_rf_rd = RandomForestClassifier(n_estimators=128,
                                   max_depth=64,
                                   n_jobs=workers,
                                   random_state=SEED,
                                   verbose=True)

In [None]:
# Fit the second forest on the reduced training features.
cls_rf_rd.fit(X_train_rd, y_train)

In [None]:
# Predict on the reduced test features.
y_pred = cls_rf_rd.predict(X_test_rd)

In [None]:
# Build and print the classification report for the reduced-feature forest.
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# confusion_matrix puts true classes on the rows and predicted classes on
# the columns, so label the heatmap axes accordingly.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d').set(
    title='Random forest (selected features): confusion matrix',
    xlabel='Predicted label',
    ylabel='True label',
);

In [None]:
# Plot the feature importances of the fitted XGBoost classifier.
xgb.plot_importance(cls_xb)

In [None]:
# Draw a tree of the fitted XGBoost classifier using xgboost's default plot settings.
xgb.plot_tree(cls_xb)

In [None]:
# Draw the fitted decision tree.
plot_tree(cls_tr)

In [None]:
# Draw one member of the forest: the estimator at index 64 (of 128).
plot_tree(cls_rf.estimators_[64])

## Regression trees

In [None]:
# Location of the petrol-consumption CSV inside the linear-regression data folder.
pertol_consumption_path = LR_PATH / 'petrol_consumption.csv'

In [None]:
# Load the petrol-consumption data (this rebinds `df`, replacing the Iris frame) and display it.
df = pd.read_csv(pertol_consumption_path)
df

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics rounded to 2 decimals, transposed so each row describes one column of the data.
print(df.describe().round(2).T)

#### Prepare dataset

In [None]:
# Target column and the four predictor columns used for the regression models.
y = df['Petrol_Consumption']
X = df[['Average_income', 'Paved_Highways',
       'Population_Driver_licence(%)', 'Petrol_tax']]

In [None]:
# Hold out 20% of the rows for testing; the split is seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)

## Initialize models

In [None]:
# Three tree-based regressors, all seeded with SEED.

# Single regression tree with default hyper-parameters.
reg_tr = DecisionTreeRegressor(random_state=SEED)

# Random forest: 128 trees, depth capped at 64, fitted on all CPU cores.
reg_rf = RandomForestRegressor(
    n_estimators=128,
    max_depth=64,
    n_jobs=workers,
    random_state=SEED,
    verbose=True,
)

# Gradient boosting with 1000 depth-1 trees.
# NOTE(review): device='gpu' presumably needs a GPU-enabled xgboost build - confirm.
reg_xb = XGBRegressor(
    n_estimators=1000,
    max_depth=1,
    n_jobs=workers,
    device='gpu',
    random_state=SEED,
    verbosity=3,
)

## Train models

In [None]:
# Fit the single regression tree on the training split.
reg_tr = reg_tr.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test split with the regression tree.
y_pred = reg_tr.predict(X_test)

In [None]:
# Error metrics of the regression-tree predictions on the test split;
# RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

In [None]:
# Fit the random-forest regressor on the training split.
reg_rf = reg_rf.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test split with the random forest.
y_pred = reg_rf.predict(X_test)

In [None]:
# Error metrics of the random-forest predictions on the test split;
# RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

In [None]:
# Fit the XGBoost regressor on the training split.
reg_xb = reg_xb.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test split with the XGBoost regressor.
y_pred = reg_xb.predict(X_test)

In [None]:
# Error metrics of the XGBoost predictions on the test split;
# RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

#### Feature importance

In [None]:
# Per-feature importances of the fitted random-forest regressor.
reg_rf.feature_importances_

In [None]:
# Feature names of the regression data, in column order.
list(X_train.columns)

In [None]:
# Select features with the forest that was fitted on THIS dataset (`reg_rf`).
# The previous code passed `cls_rf`, the Iris classifier, whose importances
# say nothing about the petrol-consumption features.
# prefit=True: the already-fitted estimator is used as is, without refitting.
rf_feat = SelectFromModel(reg_rf, prefit=True)
# rf_feat.get_feature_names_out(input_features=list(X_train.columns))

In [None]:
# Keep only the selected features in both splits, then compare the shapes
# before and after the reduction.
X_train_rd = rf_feat.transform(X_train)
X_test_rd = rf_feat.transform(X_test)
X_train.shape, X_train_rd.shape, X_test.shape, X_test_rd.shape

In [None]:
# This section is a regression task (the predictions are scored with
# MAE/MSE below), so the reduced-feature model must be a regressor rather
# than a classifier. Same configuration as `reg_rf`, seeded with SEED for
# reproducibility.
# The variable name `cls_rf_rd` is kept because the following fit/predict
# cells refer to it.
cls_rf_rd = RandomForestRegressor(n_estimators=128,
                                  max_depth=64,
                                  n_jobs=workers,
                                  random_state=SEED,
                                  verbose=True)

In [None]:
# Fit `cls_rf_rd` on the reduced training features.
cls_rf_rd.fit(X_train_rd, y_train)

In [None]:
# Predict on the reduced test features.
y_pred = cls_rf_rd.predict(X_test_rd)

In [None]:
# Error metrics of the reduced-feature model's predictions on the test split;
# RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

In [None]:
# Plot the feature importances of the fitted XGBoost regressor.
xgb.plot_importance(reg_xb)

In [None]:
# Draw a tree of the fitted XGBoost regressor using xgboost's default plot settings.
xgb.plot_tree(reg_xb)

In [None]:
# Draw the fitted regression tree.
plot_tree(reg_tr)

In [None]:
# Draw one member of the regression forest: the estimator at index 64 (of 128).
plot_tree(reg_rf.estimators_[64])