In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))

# --- hide warnings ---
import warnings
warnings.filterwarnings('ignore') 

import numpy as np
import pandas as pd
import tensorflow as tf
from src.preprocessing import preprocessor, X, Y
from tensorflow.keras import models, layers, optimizers
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix


# --- Load data from CSV file ---
# NOTE: the path is relative to the current working directory of the kernel.
df = pd.read_csv("data/initial_labeling_data.csv")

# --- Split features and target ---
# NOTE: these assignments replace the X and Y names imported from src.preprocessing.
X = df.iloc[:, 2:-1]  # input features: third column up to (but excluding) the last
Y = df.iloc[:, -1:]   # target labels: last column, kept as a one-column DataFrame

# --- split data ---
# Stratify on the target so both splits keep the class proportions of the full
# data set. This also ensures every class seen in the test split is present in
# the training split, which the label encoder fitted on y_train relies on.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42, stratify=Y
)

# --- preprocessing ---
# The preprocessor is fitted on the training split only and then applied,
# unchanged, to both splits.
preprocessor.fit(X_train)
X_train_preprocessed, X_test_preprocessed = (
    preprocessor.transform(X_train),
    preprocessor.transform(X_test),
)
input_dim = X_train_preprocessed.shape[1]  # number of columns fed to the network

# --- label encoding + one-hot ---
# Step 1: map the raw labels to integer class ids (encoder fitted on train only).
encoding = LabelEncoder()
train_class_ids = encoding.fit_transform(np.ravel(y_train.values))
test_class_ids = encoding.transform(np.ravel(y_test.values))

# Step 2: expand the integer ids into one-hot rows, one column per training class.
num_classes = np.unique(train_class_ids).size
y_train_encoded = tf.keras.utils.to_categorical(train_class_ids, num_classes=num_classes)
y_test_encoded = tf.keras.utils.to_categorical(test_class_ids, num_classes=num_classes)

# --- define Multi-layer perceptron model ---
# Seed the Python, NumPy and TensorFlow random generators before any weights are
# created, so that weight initialisation, dropout masks and batch shuffling are
# repeatable between runs (the train/test split is already seeded separately).
# NOTE: this reseeds the global generators for the rest of the session; some
# GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

optimizer = optimizers.Adam(learning_rate=0.001)

# Two hidden ReLU layers (20 and 10 units), each followed by 30% dropout, and a
# softmax output with one unit per class.
model = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(20, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(10, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation="softmax")
])

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

# --- train model ---
# validation_split=0.1 holds out part of the training data for per-epoch
# validation; Keras takes it from the end of the arrays, before shuffling.
model.fit(X_train_preprocessed, y_train_encoded, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

# --- predictions ---
# The network outputs one probability per class; the predicted class is the
# column with the highest probability. The one-hot test targets are collapsed
# back to integer class ids the same way.
y_pred = model.predict(X_test_preprocessed, verbose=0)
y_test_labels = y_test_encoded.argmax(axis=1)
y_pred_selected = y_pred.argmax(axis=1)

# --- evaluation metrics ---
# Precision and recall are macro-averaged: every class contributes equally.
accuracy_values = accuracy_score(y_true=y_test_labels, y_pred=y_pred_selected)
precision_values = precision_score(
    y_true=y_test_labels, y_pred=y_pred_selected, average='macro'
)
recall_values = recall_score(
    y_true=y_test_labels, y_pred=y_pred_selected, average='macro'
)
# Passing the full list of class ids keeps the matrix num_classes x num_classes
# even when a class is missing from the test targets and the predictions.
all_class_ids = np.arange(num_classes)
confusionmatrix_values = confusion_matrix(
    y_true=y_test_labels, y_pred=y_pred_selected, labels=all_class_ids
)

# --- report evaluation results ---
print(f"Accuracy: {accuracy_values:.4f}")
print(f"Precision: {precision_values:.4f}")
print(f"Recall: {recall_values:.4f}")
print(f"Confusion matrix:\n {confusionmatrix_values}")

Accuracy: 0.5000
Precision: 0.5063
Recall: 0.4545
Confusion matrix:
 [[17 10  1]
 [ 8 19  0]
 [ 5 13  1]]
