In [None]:
# --- make the project package importable ---
import sys
import os
# Add the parent of the current working directory to the module search path.
# This has to run before the `src.preprocessing` import below
# (presumably `src/` lives one level up -- confirm against the project layout).
sys.path.append(os.path.abspath(".."))

# --- hide warnings ---
# Both filters apply process-wide.
# NOTE(review): silencing ConvergenceWarning also hides any fit that stops at
# an iteration cap before converging -- keep that in mind when reading scores.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# --- third-party and project imports ---
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
# NOTE(review): X and Y imported here are rebound from the CSV further down in
# the code visible in this cell, so the imported values appear unused there --
# confirm whether this import of X/Y is still needed.
from src.preprocessing import preprocessor, X, Y
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix


# --- read the labelled dataset ---
df = pd.read_csv("data/initial_labeling_data.csv")

# --- separate predictors from the label column ---
# The first two columns are left out of the features and the last column is
# the target (kept as a one-column DataFrame). This rebinds the X and Y names
# that were imported from src.preprocessing at the top of the file.
X = df.iloc[:, 2:-1]
Y = df.iloc[:, -1:]

# --- hold out 20% of the rows for testing (fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

# --- turn the one-column targets into 1-D arrays (avoids DataConversionWarning) ---
y_train, y_test = (part.values.ravel() for part in (y_train, y_test))

# --- model: preprocessing -> PCA -> support vector classifier ---
chain = Pipeline(
    steps=[
        ("preprocessing", preprocessor()),
        ("pca", PCA(random_state=42)),
        ("svc", SVC()),
    ]
)

# --- search space, keys follow the "<step>__<parameter>" convention ---
param_grid = dict(
    svc__C=[1, 10],
    svc__max_iter=[1000, 5000],
    svc__kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    svc__gamma=['auto', 'scale'],
    svc__class_weight=['balanced'],
    pca__n_components=[2, 5],
)

# --- grid search with 10-fold cross-validation, ranked by accuracy ---
grid_search = GridSearchCV(
    estimator=chain,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)

# --- fit on the training split, then predict the held-out split ---
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)

# --- held-out metrics (precision and recall are macro-averaged) ---
accuracy_values = accuracy_score(y_true=y_test, y_pred=y_pred)
precision_values = precision_score(y_true=y_test, y_pred=y_pred, average='macro')
recall_values = recall_score(y_true=y_test, y_pred=y_pred, average='macro')
# Rows and columns of the matrix are ordered low, middle, high.
confusionmatrix_values = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=['low', 'middle', 'high'],
)

# --- report the selected hyperparameters and the evaluation results ---
print(
    f"Best Hyperparameters: {grid_search.best_params_}",
    f"Accuracy: {accuracy_values:.4f}",
    f"Precision: {precision_values:.4f}",
    f"Recall: {recall_values:.4f}",
    f"Confusion matrix:\n {confusionmatrix_values}",
    sep="\n",
)

Best Hyperparameters: {'pca__n_components': 5, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'auto', 'svc__kernel': 'linear', 'svc__max_iter': 5000}
Accuracy: 0.5000
Precision: 0.4498
Recall: 0.4588
Confusion matrix:
 [[16  1 10]
 [ 7  2 10]
 [ 0  9 19]]
