In [1]:
# 04_modeling_improved.ipynb

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import joblib

# -----------------------
# Step 1: Load cleaned dataset
# -----------------------
data = pd.read_csv("data/processed/student_scores_processed.csv")

# -----------------------
# Step 2: Features & target
# -----------------------
X = data.drop(["feedback", "student_id"], axis=1, errors="ignore")
y = data["feedback"]

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# -----------------------
# Step 3: Train-test split
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# -----------------------
# Logistic Regression Tuning
# -----------------------
log_reg = LogisticRegression(max_iter=1000)
param_grid_lr = {"C": [0.01, 0.1, 1, 10], "solver": ["liblinear", "lbfgs"]}

grid_lr = GridSearchCV(log_reg, param_grid_lr, cv=5, scoring="accuracy")
grid_lr.fit(X_train, y_train)

print("Best Logistic Regression Params:", grid_lr.best_params_)
y_pred_lr = grid_lr.predict(X_test)
print("LogReg Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr, target_names=le.classes_))

# -----------------------
# Random Forest Tuning
# -----------------------
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {"n_estimators": [100, 200], "max_depth": [None, 5, 10], "min_samples_split": [2, 5]}

grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring="accuracy")
grid_rf.fit(X_train, y_train)

print("\nBest Random Forest Params:", grid_rf.best_params_)
y_pred_rf = grid_rf.predict(X_test)
print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))

# -----------------------
# XGBoost Model
# -----------------------
xgb_clf = xgb.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False)
xgb_clf.fit(X_train, y_train)

y_pred_xgb = xgb_clf.predict(X_test)
print("\nXGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb, target_names=le.classes_))

# -----------------------
# Save Models & LabelEncoder
# -----------------------
joblib.dump(grid_lr.best_estimator_, "models/logistic_regression_best.pkl")
joblib.dump(grid_rf.best_estimator_, "models/random_forest_best.pkl")
joblib.dump(xgb_clf, "models/xgb_best.pkl")
joblib.dump(le, "models/label_encoder.pkl")

print("✅ Models and label encoder saved in 'models/'")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Logistic Regression Params: {'C': 0.1, 'solver': 'liblinear'}
LogReg Accuracy: 0.3
                                                precision    recall  f1-score   support

                          Excellent engagement       0.00      0.00      0.00         3
Good understanding but slow in problem solving       0.00      0.00      0.00         2
                   Needs more practice in math       0.50      0.75      0.60         4
      Strong in theory but weak in application       0.00      0.00      0.00         3
                    Struggles with assignments       0.33      0.25      0.29         4
                   Very consistent performance       0.33      0.50      0.40         4

                                      accuracy                           0.30        20
                                     macro avg       0.19      0.25      0.21        20
                                  weighted avg       0.23      0.30      0.26        20


Best Random Forest Params: {

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Accuracy: 0.25
                                                precision    recall  f1-score   support

                          Excellent engagement       0.33      0.33      0.33         3
Good understanding but slow in problem solving       0.20      0.50      0.29         2
                   Needs more practice in math       0.00      0.00      0.00         4
      Strong in theory but weak in application       0.00      0.00      0.00         3
                    Struggles with assignments       0.33      0.50      0.40         4
                   Very consistent performance       0.33      0.25      0.29         4

                                      accuracy                           0.25        20
                                     macro avg       0.20      0.26      0.22        20
                                  weighted avg       0.20      0.25      0.22        20

✅ Models and label encoder saved in 'models/'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
