In [5]:
import sys
import os
sys.path.append(os.path.abspath(".."))

# --- hide warnings ---
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from src.preprocessing import preprocessor, X, Y
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix


# --- hold out 20 % of the samples as a test set (fixed seed => reproducible split) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

# --- collapse both target parts to 1-D arrays to avoid DataConversionWarning ---
# NOTE(review): relies on Y exposing `.values` (pandas object) - confirm in src.preprocessing.
y_train, y_test = (part.values.ravel() for part in (y_train, y_test))

# --- define pipeline with Logistic Regression ---
# Steps run in order: the imported `preprocessor`, then PCA, then the classifier.
# `multi_class` is deliberately not passed: the argument is deprecated since
# scikit-learn 1.5 (its FutureWarning is filtered out at the top of this cell)
# and is scheduled for removal. With more than two target classes - the
# evaluation below uses 'low', 'middle', 'high' - both solvers in the grid fit
# a multinomial model by default, so the fitted model is unchanged.
chain = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(random_state=42)),
    ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42))
])

# --- parameters for GridSearchCV ---
# Keys use the `<step name>__<parameter>` form so they reach the right pipeline
# step. 2 x 2 x 2 = 8 candidate settings; the solver given in the constructor
# above is always overridden by 'classifier__solver'.
param_grid = {
    'classifier__C': [1, 10],
    'pca__n_components': [2, 5],
    'classifier__solver': ['lbfgs', 'saga']  # solver choice, lbfgs/saga
}

# --- GridSearchCV object ---
# Each candidate is scored by accuracy over 10 cross-validation folds.
grid_search = GridSearchCV(chain, param_grid, scoring='accuracy', cv=10)

# --- train model ---
# Only the training split is used here; the test split stays untouched until prediction.
grid_search.fit(X_train, y_train)

# --- predict the held-out test set with the fitted grid search ---
y_pred = grid_search.predict(X_test)

# --- evaluation metrics ---
# Precision and recall are macro-averaged; the confusion matrix rows/columns
# follow the explicit label order low -> middle -> high.
accuracy_values = accuracy_score(y_true=y_test, y_pred=y_pred)
precision_values = precision_score(y_true=y_test, y_pred=y_pred, average='macro')
recall_values = recall_score(y_true=y_test, y_pred=y_pred, average='macro')
confusionmatrix_values = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=['low', 'middle', 'high'],
)

# --- report the selected hyperparameters and the test-set scores ---
# One print call, one line per item (sep="\n"); the emitted text is unchanged.
print(
    f"Best Hyperparameters: {grid_search.best_params_}",
    f"Accuracy: {accuracy_values:.4f}",
    f"Precision: {precision_values:.4f}",
    f"Recall: {recall_values:.4f}",
    f"Confusionmatrix:\n {confusionmatrix_values}",
    sep="\n",
)

Best Hyperparameters: {'classifier__C': 10, 'classifier__solver': 'lbfgs', 'pca__n_components': 5}
Accuracy: 0.5000
Precision: 0.4594
Recall: 0.4615
Confusionmatrix:
 [[22  1  4]
 [14  2  3]
 [10  5 13]]
