# **Imports**

In [33]:
import numpy as np
from pathlib import Path
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# **Loading Data**

In [34]:
# Read the arrays written by Notebook 2 from the shared feature directory.
FEATURE_DIR = Path("../artifacts/features/ML")

# One np.load per artifact, unpacked in a fixed order: features, labels, languages.
# `languages` is loaded here but not referenced in the cells shown below.
X, y, languages = (
    np.load(FEATURE_DIR / artifact)
    for artifact in ("X_features.npy", "y_labels.npy", "languages.npy")
)

# Quick shape check so a reader can confirm the expected sample/feature counts.
print("X Shape: ", X.shape)
print("Y Shape: ", y.shape)

X Shape:  (2000, 282)
Y Shape:  (2000,)


# **Train / Validation Split**

In [35]:
# Hold out 20% of the samples for validation. Stratifying on y keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=3,
)

# **Pipelines**

### **Logistic Regression**

In [36]:
# Standardise the features, then fit a logistic-regression classifier.
# The scaler lives inside the pipeline so it is re-fit on each CV training fold.
logreg_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=300)),
    ]
)

# Grid over the inverse regularisation strength of the "clf" step.
logreg_params = {"clf__C": [0.01, 0.1, 1, 10]}

### **Random Forest**

In [37]:
# Random forest wrapped in a single-step pipeline so it exposes the same
# "clf__<param>" grid-key convention as the other models. No scaler step is
# used here. The seed is fixed for repeatable forests.
rf_pipe = Pipeline(
    steps=[
        ("clf", RandomForestClassifier(random_state=42)),
    ]
)

# Grid over forest size and maximum tree depth (None = unlimited depth).
rf_params = {
    "clf__max_depth": [None, 20, 40],
    "clf__n_estimators": [200, 400],
}

### **SVM**

In [38]:
# Standardise the features, then fit an SVC (default RBF kernel).
# probability=True enables predict_proba on the fitted model; scikit-learn
# implements this with an internal cross-validated calibration that shuffles
# the data, so random_state is fixed to keep the calibrated probabilities
# reproducible between runs (same seed as the random forest).
svm_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(probability=True, random_state=42))
])

# Grid over the penalty C and the kernel coefficient gamma
# ("scale" is scikit-learn's data-dependent default).
svm_params = {
    "clf__C": [1, 10, 50],
    "clf__gamma": ["scale", 0.01, 0.001]
}

# **Training and Evaluation**

In [39]:
def train_and_eval(pipe, params, name, *, X_tr=None, y_tr=None,
                   X_va=None, y_va=None, cv=5, scoring="f1"):
    """Grid-search a pipeline, then score the refit best estimator on held-out data.

    Parameters
    ----------
    pipe : estimator
        Pipeline (or any estimator) handed to ``GridSearchCV``.
    params : dict
        Parameter grid for ``GridSearchCV``.
    name : str
        Human-readable model name used in the progress message.
    X_tr, y_tr : array-like, optional
        Training data. When omitted, the notebook-level ``X_train`` /
        ``y_train`` are used, as in the original implementation.
    X_va, y_va : array-like, optional
        Validation data. When omitted, the notebook-level ``X_val`` /
        ``y_val`` are used.
    cv : int or CV splitter, default 5
        Cross-validation strategy forwarded to ``GridSearchCV``.
    scoring : str or callable, default "f1"
        Model-selection metric forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_estimator, f1)``: the estimator refit with the best
        parameters, and its F1 score on the validation data.
    """
    # Fall back to the notebook globals only when no explicit data is given,
    # so existing three-argument calls behave exactly as before.
    X_tr = X_train if X_tr is None else X_tr
    y_tr = y_train if y_tr is None else y_tr
    X_va = X_val if X_va is None else X_va
    y_va = y_val if y_va is None else y_va

    print(f"\nTraining {name}...")

    # n_jobs=-1 runs the grid search on all available cores.
    grid = GridSearchCV(pipe, params, cv=cv, scoring=scoring, n_jobs=-1)
    grid.fit(X_tr, y_tr)

    # GridSearchCV.predict delegates to the refit best estimator.
    preds = grid.predict(X_va)

    acc = accuracy_score(y_va, preds)
    f1 = f1_score(y_va, preds)

    print("Best Parameters:", grid.best_params_)
    print("Accuracy:", acc)
    print("F1:", f1)
    print(classification_report(y_va, preds))

    return grid.best_estimator_, f1

In [40]:
# Tune and evaluate each candidate; every call yields the refit best
# estimator together with its F1 score on the validation split.
best_logreg, logreg_f1 = train_and_eval(
    pipe=logreg_pipe,
    params=logreg_params,
    name="Logistic Regression",
)
best_rf, rf_f1 = train_and_eval(
    pipe=rf_pipe,
    params=rf_params,
    name="Random Forest",
)
best_svm, svm_f1 = train_and_eval(
    pipe=svm_pipe,
    params=svm_params,
    name="SVM (RBF)",
)


Training Logistic Regression...
Best Parameters: {'clf__C': 1}
Accuracy: 0.995
F1: 0.995
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       200
           1       0.99      0.99      0.99       200

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400


Training Random Forest...
Best Parameters: {'clf__max_depth': None, 'clf__n_estimators': 400}
Accuracy: 0.9875
F1: 0.9874686716791979
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       200
           1       0.99      0.98      0.99       200

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400


Training SVM (RBF)...
Best Parameters: {'clf__C': 1, 'clf__gamma': 'scale'}
Accuracy: 1.0
F1: 1.0
              precisio

# **Selecting Best Model**

In [41]:
# Fitted estimators and their validation F1 scores, keyed by the same short names.
models = dict(logreg=best_logreg, rf=best_rf, svm=best_svm)
results = dict(logreg=logreg_f1, rf=rf_f1, svm=svm_f1)

# Pick the name with the highest validation F1; on a tie the earliest entry
# in insertion order wins.
best_model_name = max(results.items(), key=lambda entry: entry[1])[0]
best_model = models[best_model_name]

print("\nSelected Best Model:", best_model_name)


Selected Best Model: svm


# **Save Best Model**

In [42]:
# Make sure the output directory exists (a no-op if it is already there).
MODEL_DIR = Path("../artifacts/models")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# Persist the selected fitted pipeline with joblib.
model_path = MODEL_DIR.joinpath("best_ml_model.joblib")
joblib.dump(best_model, model_path)

print("Saved model to:", model_path)

Saved model to: ..\artifacts\models\best_ml_model.joblib
