# Training

In [71]:
import os

DATASET_NAME = "drsprg"
DATA_BASE_DIR = f"../data/processed/{DATASET_NAME}/"
IMAGES_DIR = os.path.join(DATA_BASE_DIR, "jpgs/")
LBP_DATASET = os.path.join(DATA_BASE_DIR, "artifacts/lbp_dataset.pkl")

# Training related parameters
SEED = 42
TEST_SIZE = 0.2
RS_N_ITER = 40
CV = 20
N_COMPONENTS = 2

In [72]:
import warnings

import joblib
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [73]:
np.random.seed(SEED)

In [74]:
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

## LBP

In [75]:
lbp_dataset = joblib.load(LBP_DATASET)

In [76]:
X = np.array([sample[0] for sample in lbp_dataset])
y = np.array([sample[1] for sample in lbp_dataset])

In [77]:
X.shape, y.shape

((102, 1620), (102,))

In [78]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [88]:
pca = PCA(n_components=N_COMPONENTS)
X_scaled = pca.fit_transform(X_scaled)

array([-8.08913239, -1.9630445 ])

### Supervised learning

#### SVM

In [80]:
def run(model, X, y, cv):
    metrics = ["precision", "recall", "f1", "accuracy"]
    results = [
        cross_val_score(model, X, y, scoring=metric, cv=cv, error_score=np.nan).mean()
        for metric in metrics
    ]
    return results

##### Radial Basis Function (RBF)

In [81]:
results = {}
rbf_svc = SVC(kernel="rbf")
results["svc_rbf"] = run(rbf_svc, X_scaled, y, CV)

##### Kernels Linear

In [82]:
linear_svc = SVC(kernel="linear")
results["svc_linear"] = run(linear_svc, X_scaled, y, CV)

#### Random Forest

In [83]:
rf = RandomForestClassifier()
results["rf"] = run(rf, X_scaled, y, CV)

#### XGBoost

In [84]:
xgbc = XGBClassifier()
results["xgbc"] = run(xgbc, X_scaled, y, CV)

### Unsupervised learning

#### K-Means

In [85]:
n_clusters = len(np.unique(y))
kmeans = KMeans(n_clusters=n_clusters, random_state=SEED)
results["kmeans"] = run(kmeans, X_scaled, y, CV)

In [86]:
columns = ["precision", "recall", "f1", "accuracy"]
pd.DataFrame(data=results, index=columns).transpose()

Unnamed: 0,precision,recall,f1,accuracy
svc_rbf,0.780833,0.791667,0.765238,0.738333
svc_linear,0.825833,0.775,0.772738,0.755
rf,0.763333,0.75,0.7325,0.705
xgbc,0.771667,0.7,0.708214,0.696667
kmeans,0.595,0.465,0.500357,0.555
