# Student Grade Prediction

Train on S1-S4 course grades → Predict S5-S6 course grades

With hyperparameter tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import joblib
import json
import os
import warnings
# Silence every warning category for the rest of this session. Note that this
# also hides deprecation and input-validation warnings raised by the
# libraries imported above.
warnings.simplefilter('ignore')

print("Libraries loaded!")

## 1. Load and Prepare Data

In [None]:
# Read the cleaned grade records, then reshape them into a wide table: one
# row per 'admi' value, one column per 'code' value, cells holding 'note'.
# aggfunc='first' keeps the first record when an (admi, code) pair repeats.
df = pd.read_excel('cleaned_data.xlsx')
grade_matrix = df.pivot_table(
    values='note',
    index='admi',
    columns='code',
    aggfunc='first',
)
n_students, n_courses = grade_matrix.shape
print(f"Grade matrix: {n_students} students x {n_courses} courses")

In [None]:
# Build the input (S1-S4) and output (S5-S6) course lists. Codes are gathered
# semester by semester, kept only if they are columns of the grade matrix,
# and de-duplicated with the first occurrence winning.
def _codes_for(semesters):
    """Return the de-duplicated course codes recorded for *semesters*."""
    found = []
    for semester in semesters:
        in_semester = df['simester'] == semester
        found.extend(df[in_semester]['code'].unique().tolist())
    return list(dict.fromkeys(code for code in found if code in grade_matrix.columns))


courses_s1_s4 = _codes_for(['S1', 'S2', 'S3', 'S4'])
courses_s5_s6 = _codes_for(['S5', 'S6'])

print(f"Input (S1-S4): {len(courses_s1_s4)} courses")
print(f"Output (S5-S6): {len(courses_s5_s6)} courses")

In [None]:
# Prepare X and y.
# Split BEFORE imputing so the fill values are learned from the training rows
# only. Computing the medians on the full matrix would leak test-set grades
# into the features and targets the models are trained and scored on.
X_raw = grade_matrix[courses_s1_s4]
y_raw = grade_matrix[courses_s5_s6]

X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42
)

# Per-course medians from the training rows. A course with no grade at all in
# the training rows has a NaN median, so fall back to its full-data median
# rather than leaving NaNs in the matrices.
x_fill = X_train.median().fillna(X_raw.median())
y_fill = y_train.median().fillna(y_raw.median())

X_train, X_test = X_train.fillna(x_fill), X_test.fillna(x_fill)
y_train, y_test = y_train.fillna(y_fill), y_test.fillna(y_fill)

# Fully imputed matrices, kept available under their original names.
X = X_raw.fillna(x_fill)
y = y_raw.fillna(y_fill)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

## 2. Hyperparameter Tuning - Random Forest

In [None]:
# Candidate hyperparameter values explored for the Random Forest.
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Echo the grid and the size of its Cartesian product.
print("Random Forest Parameter Grid:")
for name in rf_param_grid:
    print(f"  {name}: {rf_param_grid[name]}")
rf_grid_size = np.prod([len(options) for options in rf_param_grid.values()])
print(f"\nTotal combinations: {rf_grid_size}")

In [None]:
# Exhaustive search over rf_param_grid with 5-fold cross-validation, scored by R2.
print("Tuning Random Forest (this may take a few minutes)...")

rf_search_settings = dict(cv=5, scoring='r2', n_jobs=-1, verbose=1)
rf_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=rf_param_grid,
    **rf_search_settings,
)
rf_grid.fit(X_train, y_train)

print(f"\nBest RF Parameters: {rf_grid.best_params_}")
print(f"Best CV R2 Score: {rf_grid.best_score_:.4f}")

In [None]:
# Score the best Random Forest found by the grid search on the held-out rows.
rf_best = rf_grid.best_estimator_
rf_pred = rf_best.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse, rf_r2 = np.sqrt(rf_mse), r2_score(y_test, rf_pred)

print(
    f"Random Forest Test Results:",
    f"  RMSE: {rf_rmse:.2f}",
    f"  R2: {rf_r2:.4f}",
    sep="\n",
)

## 3. Hyperparameter Tuning - XGBoost

In [None]:
# Candidate hyperparameter values for XGBoost. Every key carries the
# 'estimator__' prefix, which is stripped again when the grid is printed.
xgb_param_grid = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [3, 5, 7, 10],
    'estimator__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'estimator__subsample': [0.7, 0.8, 1.0],
    'estimator__colsample_bytree': [0.7, 0.8, 1.0],
}

# Echo the grid and the size of its Cartesian product.
print("XGBoost Parameter Grid:")
for prefixed_name in xgb_param_grid:
    print(f"  {prefixed_name.replace('estimator__', '')}: {xgb_param_grid[prefixed_name]}")
xgb_grid_size = np.prod([len(options) for options in xgb_param_grid.values()])
print(f"\nTotal combinations: {xgb_grid_size}")

In [None]:
# Randomized search over xgb_param_grid: 50 sampled settings, 5-fold
# cross-validation, scored by R2.
print("Tuning XGBoost (this may take several minutes)...")

# Use RandomizedSearchCV for faster tuning due to many combinations
from sklearn.model_selection import RandomizedSearchCV

# Wrap the single-output XGBRegressor so it can be fitted on all targets.
xgb_base = MultiOutputRegressor(xgb.XGBRegressor(random_state=42, n_jobs=1))

xgb_grid = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=xgb_param_grid,
    n_iter=50,  # Test 50 random combinations
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
xgb_grid.fit(X_train, y_train)

# Report the winning setting without the 'estimator__' prefix.
print(f"\nBest XGB Parameters:")
for prefixed_name, chosen in xgb_grid.best_params_.items():
    print(f"  {prefixed_name.replace('estimator__', '')}: {chosen}")
print(f"Best CV R2 Score: {xgb_grid.best_score_:.4f}")

In [None]:
# Score the best XGBoost model found by the randomized search on the held-out rows.
xgb_best = xgb_grid.best_estimator_
xgb_pred = xgb_best.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_rmse, xgb_r2 = np.sqrt(xgb_mse), r2_score(y_test, xgb_pred)

print(
    f"XGBoost Test Results:",
    f"  RMSE: {xgb_rmse:.2f}",
    f"  R2: {xgb_r2:.4f}",
    sep="\n",
)

## 4. Compare Results

In [None]:
# Side-by-side table of test RMSE, test R2 and cross-validated R2.
banner = "=" * 60
print(banner)
print("FINAL COMPARISON")
print(banner)
print(f"{'Model':<20} {'RMSE':<10} {'R2':<10} {'CV R2':<10}")
print("-" * 60)
comparison_rows = (
    ('Random Forest', rf_rmse, rf_r2, rf_grid.best_score_),
    ('XGBoost', xgb_rmse, xgb_r2, xgb_grid.best_score_),
)
for label, rmse, r2, cv_r2 in comparison_rows:
    print(f"{label:<20} {rmse:<10.2f} {r2:<10.4f} {cv_r2:<10.4f}")

# Random Forest wins ties.
# NOTE(review): the winner is chosen on test-set R2, so the reported test
# scores also serve as the selection criterion - consider selecting on the
# CV R2 instead.
best_model, best_name, best_params = (
    (rf_best, 'RandomForest', rf_grid.best_params_)
    if rf_r2 >= xgb_r2
    else (xgb_best, 'XGBoost', xgb_grid.best_params_)
)

print(f"\nBest Model: {best_name}")

## 5. Save Models with Hyperparameter Names

In [None]:
os.makedirs('models', exist_ok=True)

# Encode each model's tuned hyperparameters in its filename.
rf_params = rf_grid.best_params_
rf_filename = (
    f"rf_n{rf_params['n_estimators']}"
    f"_d{rf_params['max_depth']}"
    f"_split{rf_params['min_samples_split']}"
    f"_leaf{rf_params['min_samples_leaf']}.pkl"
)

# Drop the 'estimator__' prefix so the XGBoost keys read like plain parameter names.
xgb_params = {}
for prefixed_name, chosen in xgb_grid.best_params_.items():
    xgb_params[prefixed_name.replace('estimator__', '')] = chosen
xgb_filename = (
    f"xgb_n{xgb_params['n_estimators']}"
    f"_d{xgb_params['max_depth']}"
    f"_lr{xgb_params['learning_rate']}"
    f"_sub{xgb_params['subsample']}"
    f"_col{xgb_params['colsample_bytree']}.pkl"
)

# Persist both tuned models, a copy of the winner under a fixed name, and the
# feature/target column lists.
artifacts = {
    f'models/{rf_filename}': rf_best,
    f'models/{xgb_filename}': xgb_best,
    'models/best_model.pkl': best_model,
    'models/feature_columns.pkl': courses_s1_s4,
    'models/target_columns.pkl': courses_s5_s6,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print(
    "Models saved:",
    f"  - {rf_filename}",
    f"  - {xgb_filename}",
    f"  - best_model.pkl ({best_name})",
    sep="\n",
)

In [None]:
# Write a JSON summary of both tuned models next to the saved artifacts.
def _model_summary(params, filename, rmse, r2, cv_r2):
    """Bundle one model's tuned parameters, artifact filename and scores."""
    return {
        'params': params,
        'filename': filename,
        'rmse': float(rmse),
        'r2': float(r2),
        'cv_r2': float(cv_r2),
    }


metadata = {
    'best_model': best_name,
    'random_forest': _model_summary(
        rf_params, rf_filename, rf_rmse, rf_r2, rf_grid.best_score_
    ),
    'xgboost': _model_summary(
        xgb_params, xgb_filename, xgb_rmse, xgb_r2, xgb_grid.best_score_
    ),
    'input_courses': courses_s1_s4,
    'output_courses': courses_s5_s6,
}

with open('models/model_metadata.json', 'w') as meta_file:
    json.dump(metadata, meta_file, indent=2)

print("\nMetadata saved to models/model_metadata.json")

In [None]:
# Show every entry in models/ (sorted by name) with its size in KB.
print("\nAll files in models/:")
for entry in sorted(os.listdir('models')):
    size_kb = os.path.getsize(f'models/{entry}') / 1024
    print(f"  - {entry} ({size_kb:.1f} KB)")

## 6. Inference Example

In [1]:
# Reload the persisted best model together with its feature/target course lists.
# NOTE: this cell uses `joblib`, `X_test` and `y_test`, so the import cell and
# the data-preparation cells must have been run in the current session.
model = joblib.load('models/best_model.pkl')
input_courses, output_courses = (
    joblib.load('models/feature_columns.pkl'),
    joblib.load('models/target_columns.pkl'),
)

# Use the first held-out student as the worked example.
student_id = X_test.index[0]
student_actual = y_test.loc[student_id]
student_input = X_test.loc[[student_id]]  # list indexer keeps a one-row DataFrame

print(f"Student ID: {student_id}")
print(f"\nInput grades (S1-S4):")
for course in input_courses:
    grade = student_input[course].values[0]
    print(f"  {course}: {grade:.0f}")

NameError: name 'joblib' is not defined

In [None]:
# Predict the example student's S5-S6 grades and print them beside the
# values stored in y_test.
predicted = model.predict(student_input)[0]

table_header = f"{'Course':<10} {'Predicted':<12} {'Actual':<10}"
print(table_header)
print("-" * 35)
for course, estimate in zip(output_courses, predicted):
    print(f"{course:<10} {estimate:<12.1f} {student_actual[course]:<10.0f}")

In [None]:
# Inference function
def predict_grades(s1_s4_grades, default_grade=50, model_dir='models'):
    """Predict S5-S6 course grades from a student's S1-S4 grades.

    Args:
        s1_s4_grades: Mapping of course code -> grade. Only the codes listed
            in the saved feature columns are read; other keys are ignored.
        default_grade: Value substituted for any feature course missing from
            ``s1_s4_grades``. Defaults to 50, the value previously hard-coded.
        model_dir: Directory containing ``best_model.pkl``,
            ``feature_columns.pkl`` and ``target_columns.pkl``.

    Returns:
        dict mapping each target course code to its predicted grade as a
        plain Python float.
    """
    model = joblib.load(f'{model_dir}/best_model.pkl')
    input_courses = joblib.load(f'{model_dir}/feature_columns.pkl')
    output_courses = joblib.load(f'{model_dir}/target_columns.pkl')

    row = [s1_s4_grades.get(course, default_grade) for course in input_courses]
    # The models are fitted on a DataFrame whose columns are the course codes,
    # so predict on a DataFrame with the same column names (and order) instead
    # of an unnamed array.
    features = pd.DataFrame([row], columns=input_courses, dtype=float)
    predictions = model.predict(features)[0]

    # Plain floats keep the result JSON-serializable whatever numpy dtype the
    # model returns.
    return {
        course: float(value)
        for course, value in zip(output_courses, predictions)
    }

# Example: a hypothetical student who scored 75 in every input course.
example = dict.fromkeys(input_courses, 75)
result = predict_grades(example)

print("Example: Student with 75 in all S1-S4 courses")
print("\nPredicted S5-S6 grades:")
for course in result:
    print(f"  {course}: {result[course]:.1f}")