# In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# 1️⃣ Read the preprocessed student-score table from disk
data = pd.read_csv("data/processed/student_scores_processed.csv")

# 2️⃣ Replace every object-dtype column with integer label codes.
# Values are cast to str before encoding, so each distinct string form
# (including the string form of a missing value) becomes its own class.
# The fitted encoder for each column is kept in `label_encoders`, keyed by
# column name.
label_encoders = {}
object_columns = data.select_dtypes(include=["object"]).columns
for col in object_columns:
    as_text = data[col].astype(str)
    le = LabelEncoder()
    le.fit(as_text)
    data[col] = le.transform(as_text)
    label_encoders[col] = le

# 3️⃣ Separate the regression target ("final_score") from the feature columns
y = data["final_score"]
X = data.drop(columns=["final_score"])

# 4️⃣ Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible from run to run
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# 5️⃣ Configure and fit the XGBoost regressor on the training split.
# Hyperparameters are collected in one mapping so they can be read (or
# logged) in a single place before being handed to the estimator.
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.1,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
xgb_reg = XGBRegressor(**xgb_params)
xgb_reg.fit(X_train, y_train)

# 6️⃣ Save model and encoders
# joblib.dump does not create missing parent directories; without this the
# script would fail with FileNotFoundError only after training has finished.
Path("models").mkdir(parents=True, exist_ok=True)

# The fitted regressor and the per-column label encoders are written as two
# separate artifacts under models/.
joblib.dump(xgb_reg, "models/xgb_best_model.pkl")
joblib.dump(label_encoders, "models/label_encoders.pkl")
print("✅ Model and label encoders saved successfully!")


# Output: ✅ Model and label encoders saved successfully!
