In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.11 jupyter matplotlib
```

```bash 
! pip install -U -r requirements.txt
```

```bash
! pip install -U numpy
! pip install -U scikit-learn
```

In [None]:
! ls

In [None]:
! pip install -U -r requirements.txt



## Update repository

In [None]:
! git pull

## Add import path

In [None]:
import os
import sys
import gc

In [None]:
# Put the parent of the current working directory on the import path (once),
# so packages that live one level above the notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
del module_path

## Organize imports

In [None]:
import multiprocessing
from pathlib import Path

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, PolynomialFeatures, 
                                   SplineTransformer, LabelEncoder)
from sklearn.feature_selection import (SequentialFeatureSelector as SFS, 
                                       SelectFromModel)
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis, 
                                           QuadraticDiscriminantAnalysis)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC, SVR, LinearSVR

from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import KMeans, AgglomerativeClustering

import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import (mean_absolute_error, mean_squared_error, 
                             classification_report, confusion_matrix)


#### Number of CPU cores

In [None]:
# CPU count as reported by multiprocessing; the bare name below displays it.
# NOTE(review): `workers` is not referenced by any later cell visible in this notebook.
workers = multiprocessing.cpu_count()
workers

## Initialize path

In [None]:
# Directory layout, relative to the working directory:
#   data/log_regr_lda_qda_np/{Pumpkin_Seeds_Dataset, iris}  and  data/linear_regression
DATA = Path('data')
PATH = DATA / 'log_regr_lda_qda_np'
LR_PATH = DATA / 'linear_regression'
PUMPKIN_DIR = PATH / 'Pumpkin_Seeds_Dataset'
IRIS_DIR = PATH / 'iris'

# Create the two dataset folders (and any missing parents); no-op when they exist.
for _dataset_dir in (PUMPKIN_DIR, IRIS_DIR):
    _dataset_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Seed passed as random_state to the train/test splits and to the 3-cluster K-means model.
SEED = 2024

In [None]:
! ls

## Load IRIS dataset

#### Load dataset

In [None]:
! ls {IRIS_DIR}

In [None]:
# Load the Iris table. Iris.csv must already be in IRIS_DIR: the cells above only
# create the directory, they do not download the file.
df = pd.read_csv(IRIS_DIR / 'Iris.csv')

In [None]:
df

In [None]:
df['Species'].value_counts()

In [None]:
# Features: every column except the row id and the target. Target: the species string.
X = df.drop(columns=['Id', 'Species'])
y = df['Species']
X.shape, y.shape, y.value_counts()

In [None]:
# Hold out 20% of the samples for testing; SEED makes the shuffle reproducible.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)

In [None]:
y_train

In [None]:
# Integer code for each Iris species, keyed by the title-cased species name
# (the 'Iris-' prefix is stripped before lookup in the cells below).
LB_DCT = {
    'Setosa': 0,
    'Versicolor': 1,
    'Virginica': 2,
}

In [None]:
# Integer-encode the training labels: drop the 'Iris-' prefix, title-case the
# remainder and look it up in LB_DCT (an unknown species raises KeyError).
y_train_ch = np.array(
    list(map(lambda species: LB_DCT[species.replace('Iris-', '').title()], y_train))
)
y_train_ch

In [None]:
# Integer-encode the test labels with the same rule as the training labels:
# drop the 'Iris-' prefix, title-case, look up in LB_DCT.
y_test_ch = np.array(
    list(map(lambda species: LB_DCT[species.replace('Iris-', '').title()], y_test))
)
y_test_ch

## Scaling / standardizing the features

In [None]:
# Fit the scaler on the training split only; the test split is transformed
# with these same statistics in the next cell.
scaler = StandardScaler().fit(X_train)

In [None]:
# Replace both splits with their standardized versions.
# NOTE(review): X_train / X_test are rebound in place, so this cell is not idempotent:
# running it a second time would re-scale data that is already scaled.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## PCA Decomposition on IRIS dataset

In [None]:
# Fit a 3-component PCA on the scaled training split, then project both splits with it.
pca = PCA(n_components=3).fit(X_train)
X_train_pc, X_test_pc = (pca.transform(split) for split in (X_train, X_test))

## Plotting the results

In [None]:
def plot_clusters(X_pc, y_ch):
    """Draw a 3D scatter of PCA-projected Iris samples, coloured and labelled by class.

    Parameters
    ----------
    X_pc : numpy.ndarray of shape (n_samples, n_components), n_components >= 3
        Projected samples; only the first three columns are plotted.
    y_ch : numpy.ndarray of int, shape (n_samples,)
        Integer class codes: 0 = Setosa, 1 = Versicolor, 2 = Virginica.
        Must be a NumPy array because it is used to build boolean masks.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Figure number 1 is reused and cleared, so repeated calls do not pile up figures.
    fig = plt.figure(1, figsize=(4, 3))
    plt.clf()

    ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
    ax.set_position([0, 0, 0.95, 1])

    for name, label in [('Setosa', 0), ('Versicolor', 1), ('Virginica', 2)]:
        members = X_pc[y_ch == label]
        if len(members) == 0:
            # Class absent from this split: skip it rather than place a label at NaN.
            continue
        # Put the class name at the class centroid, lifted along the second axis
        # so the text does not sit on top of the points.
        ax.text3D(
            members[:, 0].mean(),
            members[:, 1].mean() + 1.5,
            members[:, 2].mean(),
            name,
            horizontalalignment="center",
            bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
        )

    # Permute the class codes (0->1, 1->2, 2->0) before colour-mapping them.
    y_chf = np.choose(y_ch, [1, 2, 0]).astype(float)
    ax.scatter(
        X_pc[:, 0], X_pc[:, 1], X_pc[:, 2],
        c=y_chf, cmap=plt.cm.nipy_spectral, edgecolor="k")

    # The principal-component axes have no meaningful units, so hide tick labels.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])

    plt.show()

In [None]:
plot_clusters(X_train_pc, y_train_ch)

In [None]:
plot_clusters(X_test_pc, y_test_ch)

```python
fig = plt.figure(1, figsize=(4, 3))
plt.clf()

ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
ax.set_position([0, 0, 0.95, 1])
plt.cla()
for name, label in [('Setosa', 0), ('Versicolor', 1), ('Virginica', 2)]:
    ax.text3D(
        X_train_pc[y_train_ch == label, 0].mean(),
        X_train_pc[y_train_ch == label, 1].mean() + 1.5,
        X_train_pc[y_train_ch == label, 2].mean(),
        name,
        horizontalalignment="center",
        bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
    )
# Reorder the labels to have colors matching the cluster results
y_train_chf = np.choose(y_train_ch, [1, 2, 0]).astype(float)
ax.scatter(
    X_train_pc[:, 0], X_train_pc[:, 1], X_train_pc[:, 2], 
    c=y_train_chf, cmap=plt.cm.nipy_spectral, edgecolor="k")

ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])

plt.show()
```

$$ 
 \sigma^2 = \frac{1}{N - 1} \sum_{i=1}^{N}(x_i - \mu)^2
$$

## Classification SVM

In [None]:
# RBF-kernel support vector classifier with one-vs-rest decision function shape.
# NOTE(review): random_state is hard-coded to 2022 here, while the splits and
# K-means use the notebook-level SEED constant.
cls_rbf = SVC(
    C=1.0,
    kernel='rbf',
    verbose=True,
    decision_function_shape='ovr',
    random_state=2022,
)

## Train models

In [None]:
# Train on the 3 principal components. The targets are the original species
# strings (y_train), not the integer codes stored in y_train_ch.
cls_rbf = cls_rbf.fit(X_train_pc, y_train)
cls_rbf

In [None]:
# Predicted species for the PCA-projected test split.
y_pred = cls_rbf.predict(X_test_pc)

In [None]:
# Per-class metrics on the test split; printed so the report's line breaks render
# as a table instead of an escaped one-line string.
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# Fix the class order explicitly so matrix rows/columns and the tick labels agree,
# even if some class happens to be missing from the test split.
class_labels = cls_rbf.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Rows are the true species, columns the predicted species.
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted species', ylabel='True species',
       title='RBF SVM on PCA features (test split)');

## Clustering for high dimensional features

Clustering of the high-dimensional features with K-means

#### Initialize model

In [None]:
# K-means with 3 clusters: random centroid initialisation, 10 restarts,
# at most 300 iterations per restart, seeded with the notebook-level SEED.
kmns = KMeans(n_clusters=3, init='random', n_init=10,
              max_iter=300, tol=1e-04, random_state=SEED)

#### Prepare data

In [None]:
# Re-declares SEED with the same value it was given earlier in the notebook.
SEED = 2024

In [None]:
# Path of the pumpkin-seeds Excel workbook.
# NOTE(review): the variable name says "studen_scores" but the file is the pumpkin-seeds dataset.
studen_scores_path = PUMPKIN_DIR / 'Pumpkin_Seeds_Dataset.xlsx'

In [None]:
# Load the pumpkin-seeds table. `df` is rebound here: from this cell on it no
# longer refers to the Iris frame loaded earlier.
df = pd.read_excel(studen_scores_path)
df

In [None]:
df.shape

## Data analysis

In [None]:
df['Class'].value_counts() 

In [None]:
df.describe().T

In [None]:
# Features: every column except the target. Target: the seed-variety label.
X = df.drop(columns=['Class'])
y = df['Class']

In [None]:
# Encode the two seed varieties as integers (0 = Çerçevelik, 1 = Ürgüp Sivrisi).
# An explicit map is used instead of chained .replace() calls because replace()
# relies on silently downcasting the object column to int, which pandas has deprecated.
# Mapping from df['Class'] (rather than from y) keeps the cell safe to re-run.
CLASS_TO_ID = {'Çerçevelik': 0, 'Ürgüp Sivrisi': 1}
y = df['Class'].map(CLASS_TO_ID)

# Fail loudly on any label that is not in the mapping instead of carrying NaN forward.
if y.isna().any():
    unknown = sorted(set(df.loc[y.isna(), 'Class'].astype(str)))
    raise ValueError(f"Unexpected values in 'Class': {unknown}")
y = y.astype(int)

In [None]:
# Hold out a quarter of the pumpkin-seed samples for evaluation; SEED fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=SEED)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

#### Scaling Data

In [None]:
# A fresh scaler replaces the one fitted on Iris. It is fitted on the training
# split only and then applied unchanged to the test split.
# NOTE(review): X_train / X_test are rebound, so re-running this cell re-scales scaled data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Fit the model

In [None]:
# Fit the 3-cluster K-means on the scaled training features and keep each
# training sample's cluster id.
y_km_train = kmns.fit_predict(X_train)

In [None]:
# Cluster id of each test sample under the 3-cluster model.
# NOTE(review): rebinds y_pred, which held the SVM species predictions earlier;
# these values are cluster ids, not class predictions.
y_pred = kmns.predict(X_test)

In [None]:
y_pred

#### Elbow method

In [None]:
# Elbow curve: fit K-means for k = 1..10 on the training features and record
# the inertia of each fitted model.
cluster_counts = range(1, 11)
elbow_kwargs = dict(init='random', n_init=10, max_iter=300, tol=1e-04, random_state=0)
distortions = [
    KMeans(n_clusters=k, **elbow_kwargs).fit(X_train).inertia_
    for k in cluster_counts
]

# plot
plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [None]:
# K-means with 8 clusters, otherwise configured like the elbow-method models above
# (random init, 10 restarts, random_state=0).
# NOTE(review): presumably 8 was read off the elbow plot; the notebook does not say.
kmns_ch = KMeans(n_clusters=8, init='random', n_init=10,
                 max_iter=300, tol=1e-04, random_state=0)

In [None]:
# Fit the 8-cluster model on the training features, then assign both splits.
# y_km_train is rebound here (it previously held the 3-cluster assignment).
kmns_ch = kmns_ch.fit(X_train)
y_km_train = kmns_ch.predict(X_train)
y_km_test = kmns_ch.predict(X_test)

In [None]:
y_km_train

In [None]:
y_km_test

In [None]:
y_km_test.shape, y_test.shape

In [None]:
# Print every test sample's position and assigned cluster id, one pair per line.
for idx, cl in enumerate(y_km_test):
    print(idx, cl)

In [None]:
# Group the true test labels by the K-means cluster each sample was assigned to:
# {cluster id: [true label, true label, ...]}.
clust_dict = {}
for cluster_id, true_label in zip(y_km_test, y_test):
    clust_dict.setdefault(cluster_id, []).append(true_label)

In [None]:
clust_dict

In [None]:
# Class mix inside K-means cluster 2 on the test split.
# .get() tolerates a cluster id that received no test samples.
cl_i = np.array(clust_dict.get(2, []))
un, ct = np.unique(cl_i, return_counts=True)
# Report {class: fraction} for the classes actually present. Indexing ct[0], ct[1]
# raised IndexError whenever the cluster contained a single class.
dict(zip(un.tolist(), (ct / cl_i.shape[0]).tolist()))

In [None]:
# Class mix inside K-means cluster 3 on the test split.
# .get() tolerates a cluster id that received no test samples.
cl_i = np.array(clust_dict.get(3, []))
un, ct = np.unique(cl_i, return_counts=True)
# Report {class: fraction} for the classes actually present. Indexing ct[0], ct[1]
# raised IndexError whenever the cluster contained a single class.
dict(zip(un.tolist(), (ct / cl_i.shape[0]).tolist()))

## Hierarchical clustering

Hierarchical clustering on high-dimensional data

#### Initialize model

In [None]:
# No fixed cluster count: the tree is cut by linkage distance (threshold 0.02) instead.
hrch = AgglomerativeClustering(n_clusters=None, distance_threshold=0.02)

In [None]:
# Build the hierarchy on the scaled pumpkin-seed training features.
hrch = hrch.fit(X_train)

In [None]:
# The model was already fitted on X_train in the previous cell, and agglomerative
# clustering is deterministic, so reuse its labels instead of fitting a second time.
y_hr_train = hrch.labels_

In [None]:
y_hr_train

In [None]:
def plot_dendrogram(model, **kwargs):
    """Plot a dendrogram for a fitted agglomerative clustering model.

    Builds a SciPy-style linkage matrix from the model's merge tree and hands it
    to ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``children_`` (array of shape (n_merges, 2)),
        ``distances_`` (one distance per merge) and ``labels_`` (one entry per sample).
    **kwargs
        Forwarded unchanged to ``dendrogram`` (e.g. ``truncate_mode``, ``p``).

    Returns
    -------
    dict
        The value returned by ``dendrogram``.

    Raises
    ------
    AttributeError
        If ``model`` has no ``distances_`` attribute.
    """
    if not hasattr(model, "distances_"):
        raise AttributeError(
            "model has no `distances_`; fit AgglomerativeClustering with "
            "`compute_distances=True` or with a `distance_threshold`"
        )

    n_samples = len(model.labels_)

    # Number of original samples under each merge node. Child indices below
    # n_samples are leaves; larger ones refer to the earlier merge (idx - n_samples).
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    # Columns: child a, child b, merge distance, sample count.
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    return dendrogram(linkage_matrix, **kwargs)

In [None]:
# Dendrogram of the hierarchy fitted on the pumpkin-seed training features (hrch).
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(hrch, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

## Test on images

```python
! pip install -U scikit-image
```

In [None]:
# Load the sample coins image that ships with scikit-image.
from skimage.data import coins

orig_coins = coins()

In [None]:
from scipy.ndimage import gaussian_filter
from skimage.transform import rescale

# Smooth first (sigma=2), then shrink to 20% of the original size. The rescale's
# own anti-aliasing is switched off, so the Gaussian filter is the only smoothing.
smoothened_coins = gaussian_filter(orig_coins, sigma=2)
rescaled_coins = rescale(
    smoothened_coins,
    0.2,
    mode="reflect",
    anti_aliasing=False,
)

# One row per pixel, with the pixel intensity as the single feature.
# NOTE(review): rebinds X, which held the pumpkin-seed feature frame until here.
X = np.reshape(rescaled_coins, (-1, 1))

In [None]:
from sklearn.feature_extraction.image import grid_to_graph

# Pixel-grid graph with the same height and width as the rescaled image; it is
# passed as `connectivity` to the Ward clustering model below.
connectivity = grid_to_graph(*rescaled_coins.shape)

In [None]:
import time as time

from sklearn.cluster import AgglomerativeClustering

print("Compute structured hierarchical clustering...")
st = time.time()
n_clusters = 27  # number of regions
# compute_distances=True makes the fitted model expose `distances_`, which the
# `ward.distances_` and `plot_dendrogram(ward, ...)` cells further down require.
# Without it those cells fail with AttributeError when n_clusters is given.
ward = AgglomerativeClustering(
    n_clusters=n_clusters,
    linkage="ward",
    connectivity=connectivity,
    compute_distances=True,
)
# Fit once: fit_predict both fits `ward` and returns the per-pixel labels, so the
# elapsed time below covers a single clustering run.
y_ward = ward.fit_predict(X)
# Back to image shape so the labels can be overlaid on the picture.
label = np.reshape(ward.labels_, rescaled_coins.shape)
print(f"Elapsed time: {time.time() - st:.3f}s")
print(f"Number of pixels: {label.size}")
print(f"Number of clusters: {np.unique(label).size}")

In [None]:
import matplotlib.pyplot as plt

# Show the rescaled image in grayscale and outline each cluster on top of it.
plt.figure(figsize=(5, 5))
plt.imshow(rescaled_coins, cmap=plt.cm.gray)
for region_id in range(n_clusters):
    # One colour per cluster, spread evenly over the nipy_spectral colormap.
    region_color = plt.cm.nipy_spectral(region_id / float(n_clusters))
    plt.contour(label == region_id, colors=[region_color])
plt.axis("off")
plt.show()

In [None]:
hrch.distances_

In [None]:
ward.distances_

In [None]:
# Dendrogram of the Ward hierarchy built on the coins image pixels.
# NOTE(review): plot_dendrogram reads `ward.distances_`; confirm the Ward model was
# fitted so that this attribute exists (compute_distances=True or a distance_threshold).
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(ward, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()