## Import Toolkit

In [None]:
# OS libs
import os

# Data handling tools
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score,
                             classification_report, confusion_matrix)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Deep learning libs
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Other
from tqdm import tqdm

# Warnings
import warnings
warnings.filterwarnings('ignore')

## Load data

In [None]:
# IPython shell escape: print the current working directory.
!pwd

In [None]:
def _build_path_label_frame(data_path):
    """Return a DataFrame of file paths and labels found under *data_path*.

    Every immediate sub-directory of ``data_path`` is treated as one class:
    its name becomes the ``label`` of each file listed directly inside it.

    Parameters
    ----------
    data_path : str
        Directory whose sub-directories are named after the classes.

    Returns
    -------
    pandas.DataFrame
        Columns ``filepaths`` and ``label``, one row per file.
    """
    filepaths = []
    labels = []

    # Sorted listings make the row order independent of the filesystem, so
    # later seeded operations on the frame are reproducible across machines.
    for fold in sorted(os.listdir(data_path)):
        f_path = os.path.join(data_path, fold)

        # Stray files next to the class folders (e.g. .DS_Store) are not
        # classes, and calling os.listdir on them would raise.
        if not os.path.isdir(f_path):
            continue

        for file in sorted(os.listdir(f_path)):
            filepaths.append(os.path.join(f_path, file))
            labels.append(fold)

    # Concat data paths with labels
    Fseries = pd.Series(filepaths, name='filepaths')
    Lseries = pd.Series(labels, name='label')
    return pd.concat([Fseries, Lseries], axis=1)


# Train
train_data_path = '../data/Training/'
train_df = _build_path_label_frame(train_data_path)

# Test
test_data_path = '../data/Testing/'
test_df = _build_path_label_frame(test_data_path)

## Split Data into Valid and Test

In [None]:
# Divide the frame built from the Testing folder into two halves:
# the first becomes the validation set, the second the final test set.
valid, test = train_test_split(
    test_df,
    train_size=0.5,
    random_state=42,
    shuffle=True,
)

## Image Data Generator

In [None]:
img_size = (224, 224)
batch_size = 32

# No augmentation or rescaling is configured on either generator.
tr_gen = ImageDataGenerator()
ts_gen = ImageDataGenerator()

# Keyword arguments shared by all three flows; only the source frame,
# the generator object and the shuffle flag differ between splits.
shared_flow_args = dict(
    x_col='filepaths',
    y_col='label',
    target_size=img_size,
    class_mode='categorical',
    color_mode='grayscale',
    batch_size=batch_size,
)

train_gen = tr_gen.flow_from_dataframe(train_df, shuffle=True,
                                       **shared_flow_args)

valid_gen = ts_gen.flow_from_dataframe(valid, shuffle=True,
                                       **shared_flow_args)

# The test flow keeps the frame's row order (shuffle disabled).
test_gen = ts_gen.flow_from_dataframe(test, shuffle=False,
                                      **shared_flow_args)

## Show sample from train data

In [None]:
# Plot up to 16 images from one training batch in a 4x4 grid, each titled
# with its class name.
gen_dict = train_gen.class_indices
classes = list(gen_dict.keys())
images, labels = next(train_gen)

plt.figure(figsize=(20, 20))

# A batch can hold fewer than 16 images (small dataset or small batch size),
# so never index past what was actually returned.
n_show = min(16, len(images))

for i in range(n_show):
    plt.subplot(4, 4, i+1)
    # Drop the single-channel axis so the image is a plain 2-D array, and
    # scale to [0, 1] (assumes raw 0-255 pixel values, as no rescaling is
    # configured on the generator).
    image = np.squeeze(images[i]) / 255
    # Grayscale data needs an explicit gray colormap; without it matplotlib
    # applies its default false-colour map. A fixed value range keeps the
    # brightness comparable between tiles.
    plt.imshow(image, cmap='gray', vmin=0, vmax=1)
    index = np.argmax(labels[i])
    class_name = classes[index]
    plt.title(class_name, color='blue', fontsize=12)
    plt.axis('off')
plt.show()

## Flatten images

In [None]:
def _flatten_generator(generator):
    """Collect every batch of *generator* into flat feature/label arrays.

    Batches are fetched by index, ``0 .. len(generator) - 1``. Each batch of
    images is reshaped to one row per image, and each row of the label array
    is reduced to the index of its largest entry.

    Parameters
    ----------
    generator
        Indexable batch source where ``generator[i]`` returns an
        ``(images, labels)`` pair of arrays.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the row-wise concatenation of all flattened image
        batches, and the concatenation of all label indices.
    """
    feature_batches, label_batches = [], []
    for batch_idx in tqdm(range(len(generator))):
        images, labels = generator[batch_idx]
        # Keep the batch axis, collapse all remaining axes into one.
        feature_batches.append(images.reshape(images.shape[0], -1))
        label_batches.append(np.argmax(labels, axis=1))

    return np.concatenate(feature_batches), np.concatenate(label_batches)


# Training data
X_train, y_train = _flatten_generator(train_gen)

# Validation data
X_valid, y_valid = _flatten_generator(valid_gen)

# Test data
X_test, y_test = _flatten_generator(test_gen)

In [None]:
# Sanity check: print feature-matrix and label-vector shapes for each split
for features, targets in ((X_train, y_train),
                          (X_valid, y_valid),
                          (X_test, y_test)):
    print(features.shape, targets.shape)

## PCA

In [None]:
from sklearn.decomposition import PCA

n_components = 2250

# Standardise the features, then project them onto `n_components`
# principal components. Both steps are fitted on the training split only.
scaling_step = StandardScaler()
projection_step = PCA(n_components=n_components, random_state=42)
pca = make_pipeline(scaling_step, projection_step)

X_train_pca = pca.fit_transform(X_train)

# Validation and test data reuse the already-fitted pipeline.
X_valid_pca, X_test_pca = (pca.transform(split)
                           for split in (X_valid, X_test))

print("Original shape:", X_train.shape, X_valid.shape, X_test.shape)
print("Reduced shape:", X_train_pca.shape, X_valid_pca.shape, X_test_pca.shape)

## Model Structure (SVM)

In [None]:
# Linear support-vector classifier trained on the PCA-reduced features.
svc_settings = {'max_iter': 1000, 'random_state': 42}
model_SVC = LinearSVC(**svc_settings)
model_SVC.fit(X_train_pca, y_train)

In [None]:
# validation: score the linear SVM on the validation features
y_valid_pred = model_SVC.predict(X_valid_pca)
accuracy_valid = accuracy_score(y_valid, y_valid_pred)
print(f'Validation Accuracy: {accuracy_valid:.4f}')

# Each heading is printed first, followed by the matching metric output.
for heading, metric in (('Classification Report:', classification_report),
                        ('Confusion Matrix:', confusion_matrix)):
    print(heading)
    print(metric(y_valid, y_valid_pred))

In [None]:
# test: score the linear SVM on the held-out test features
y_test_pred = model_SVC.predict(X_test_pca)
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {accuracy_test:.4f}')

# Each heading is printed first, followed by the matching metric output.
for heading, metric in (('Classification Report:', classification_report),
                        ('Confusion Matrix:', confusion_matrix)):
    print(heading)
    print(metric(y_test, y_test_pred))

## Model Structure (OVR)

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest logistic regression on the PCA-reduced features.
# The `multi_class='ovr'` argument of LogisticRegression is deprecated in
# recent scikit-learn releases and scheduled for removal; wrapping the
# estimator in OneVsRestClassifier is the replacement the library recommends
# and keeps the same one-binary-classifier-per-class scheme.
model_OVR = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, random_state=42))
model_OVR.fit(X_train_pca, y_train)

In [None]:
# validation: score the one-vs-rest model on the validation features
y_valid_pred = model_OVR.predict(X_valid_pca)
accuracy_valid = accuracy_score(y_valid, y_valid_pred)
print(f'Validation Accuracy: {accuracy_valid:.4f}')

# Each heading is printed first, followed by the matching metric output.
for heading, metric in (('Classification Report:', classification_report),
                        ('Confusion Matrix:', confusion_matrix)):
    print(heading)
    print(metric(y_valid, y_valid_pred))

In [None]:
# test: score the one-vs-rest model on the held-out test features
y_test_pred = model_OVR.predict(X_test_pca)
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {accuracy_test:.4f}')

# Each heading is printed first, followed by the matching metric output.
for heading, metric in (('Classification Report:', classification_report),
                        ('Confusion Matrix:', confusion_matrix)):
    print(heading)
    print(metric(y_test, y_test_pred))