In [44]:
import pandas as pd
import numpy as np

# Modeling
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Regression models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report

import warnings

# Silence all warnings for the rest of the session to keep cell output compact.
# NOTE(review): with no category given this matches every Warning subclass,
# so library DeprecationWarning/FutureWarning messages are hidden as well.
warnings.filterwarnings(action="ignore")

In [45]:
# Load the cleaned student dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/processed/student_cleaned.csv")
df.head(n=5)

Unnamed: 0,age,Medu,Fedu,traveltime,self_study_hours,backlog_count,famrel,freetime,goout,Dalc,Walc,health,internal_marks_1,internal_marks_2,final_score,attendance_percentage
0,18,4,4,2,2,0,4,3,4,1,1,3,5,6,6,88
1,17,1,1,1,2,0,5,3,3,1,1,3,5,5,6,92
2,15,1,1,1,2,3,4,3,2,2,3,3,7,8,10,80
3,15,4,2,1,3,0,3,2,2,1,1,5,15,14,15,96
4,16,3,3,1,2,0,4,3,2,1,2,5,6,10,10,92


In [46]:
# Display the available column names before engineering new features.
df.columns

Index(['age', 'Medu', 'Fedu', 'traveltime', 'self_study_hours',
       'backlog_count', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc',
       'health', 'internal_marks_1', 'internal_marks_2', 'final_score',
       'attendance_percentage'],
      dtype='object')

In [47]:
# Engineered features added to df:
#   performance_index - attendance percentage multiplied by the second internal mark
#   consistency_score - absolute gap between the two internal marks
df["performance_index"] = df["attendance_percentage"].mul(df["internal_marks_2"])
df["consistency_score"] = df["internal_marks_2"].sub(df["internal_marks_1"]).abs()

In [48]:
def risk_label(score, at_risk_below=10, average_below=14):
    """Map a single final score to a performance tier.

    Parameters
    ----------
    score : int or float
        Final score of one student.
    at_risk_below : int or float, default 10
        Scores strictly below this value are labelled "At Risk".
    average_below : int or float, default 14
        Scores that are not "At Risk" and are strictly below this value are
        labelled "Average"; all remaining scores are "Top Performer".

    Returns
    -------
    str or None
        "At Risk", "Average" or "Top Performer"; None when the score is
        missing (None or NaN).
    """
    # NaN compares False against every threshold, so without this guard a
    # missing score would fall through to "Top Performer".
    if pd.isna(score):
        return None
    if score < at_risk_below:
        return "At Risk"
    if score < average_below:
        return "Average"
    return "Top Performer"

# Label every student from their final score, then show how many fall in each tier.
df["risk_category"] = df["final_score"].map(risk_label)
df["risk_category"].value_counts()

risk_category
Average          165
At Risk          130
Top Performer    100
Name: count, dtype: int64

# Feature Selection 

In [49]:
# Model inputs: five raw columns plus the two engineered features.
features = [
    "attendance_percentage", "self_study_hours",
    "internal_marks_1", "internal_marks_2",
    "backlog_count",
    "performance_index", "consistency_score",
]

# One shared feature matrix and two targets: the numeric final score for
# regression and the tier label for classification.
X = df.loc[:, features]
y_reg, y_cls = df["final_score"], df["risk_category"]

# Train-Test Split

In [50]:
# Split the features and both targets in ONE call: train_test_split applies the
# same shuffle to every array it receives, so the regression and classification
# labels are guaranteed to stay row-aligned with X_train / X_test. This also
# avoids binding `_`, which would shadow IPython's last-output variable.
(
    X_train, X_test,
    y_train_reg, y_test_reg,
    y_train_cls, y_test_cls,
) = train_test_split(X, y_reg, y_cls, test_size=0.2, random_state=42)

# Baseline Regression Models

In [51]:
# Baseline linear models fitted on the training split.
lr = LinearRegression()
ridge = Ridge(alpha=1.0)

lr.fit(X_train, y_train_reg)
ridge.fit(X_train, y_train_reg)

# RMSE = sqrt(MSE). The root is taken explicitly because the `squared=False`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6; this form gives the same value on every version.
lr_rmse = np.sqrt(mean_squared_error(y_test_reg, lr.predict(X_test)))
ridge_rmse = np.sqrt(mean_squared_error(y_test_reg, ridge.predict(X_test)))

lr_rmse, ridge_rmse

(2.1302438543368867, 2.129398598771566)

# Random Forest Pipeline

In [52]:
# Scaling step followed by a seeded random-forest regressor. The step names
# ("scaler", "model") are the prefixes used by the grid-search parameter keys.
rf_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

# Hyperparameter Grid

In [53]:
# Search space for the "model" step of the regression pipeline
# (2 x 3 x 2 = 12 candidate settings).
param_grid_rf = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
)

# GridSearchCV

In [54]:
# Exhaustive 5-fold search over param_grid_rf, scored by negated RMSE and
# parallelised across all available cores.
grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid_rf,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

grid_rf.fit(X_train, y_train_reg)

# Best Regression Model Evaluation

In [55]:
# Pipeline with the best parameter combination found by the grid search.
best_rf = grid_rf.best_estimator_

# Hold-out RMSE computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the root is taken explicitly; the value is unchanged.
rf_rmse = np.sqrt(mean_squared_error(y_test_reg, best_rf.predict(X_test)))

rf_rmse

1.756069342639461

# Cross-Validation Score

In [56]:
# 5-fold cross-validation of the tuned pipeline. The scorer yields negated
# RMSE values, so the sign is flipped when reporting the mean.
# NOTE(review): this runs on the full X / y_reg, which includes the rows held
# out in X_test - confirm that is intended.
cv_rmse = cross_val_score(
    estimator=best_rf,
    X=X,
    y=y_reg,
    scoring="neg_root_mean_squared_error",
    cv=5,
)

-cv_rmse.mean()

1.781978438698367

# Regression Model Comparison Table

In [57]:
# Side-by-side hold-out RMSE of the three regressors, one row per model.
regression_results = pd.DataFrame(
    [
        ("Linear Regression", lr_rmse),
        ("Ridge Regression", ridge_rmse),
        ("Random Forest (Tuned)", rf_rmse),
    ],
    columns=["Model", "RMSE"],
)

regression_results

Unnamed: 0,Model,RMSE
0,Linear Regression,2.130244
1,Ridge Regression,2.129399
2,Random Forest (Tuned),1.756069


# Classification Pipeline

In [58]:
# Same two-step layout as the regression pipeline, with a seeded
# random-forest classifier as the "model" step.
rf_cls_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", RandomForestClassifier(random_state=42)),
    ]
)

# Classification Hyperparameter Grid

In [59]:
# Search space for the "model" step of the classification pipeline
# (2 x 3 = 6 candidate settings).
param_grid_cls = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
)

# GridSearchCV for Classification 

In [60]:
# Exhaustive 5-fold search over param_grid_cls, scored by accuracy and
# parallelised across all available cores.
grid_cls = GridSearchCV(
    estimator=rf_cls_pipeline,
    param_grid=param_grid_cls,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

grid_cls.fit(X_train, y_train_cls)

# Classification Result

In [61]:
# Classifier pipeline with the best parameter combination from the grid search.
best_cls = grid_cls.best_estimator_

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test_cls, y_pred=best_cls.predict(X_test)))

               precision    recall  f1-score   support

      At Risk       0.86      0.93      0.89        27
      Average       0.77      0.77      0.77        26
Top Performer       0.92      0.85      0.88        26

     accuracy                           0.85        79
    macro avg       0.85      0.85      0.85        79
 weighted avg       0.85      0.85      0.85        79



In [62]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before persisting the tuned pipelines (a fresh checkout
# may not have it yet).
os.makedirs("../models", exist_ok=True)

joblib.dump(best_rf, "../models/final_score_model.pkl")
joblib.dump(best_cls, "../models/risk_classifier.pkl")

['../models/risk_classifier.pkl']

In [64]:
# Persist the frame, which now also carries the engineered columns and
# risk_category, without writing the row index.
df.to_csv(path_or_buf="../data/processed/student_featured.csv", index=False)