In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [2]:

# Load and enhance data
def load_and_enhance_data(filepath):
    data = pd.read_csv(filepath)
    
    # Boost high performers
    def is_high_performer(row):
        return (
            row["Hours_Studied"] >= 25 and row["Attendance"] >= 0.95 and
            row["Previous_Scores"] >= 85 and row["Motivation_Level"] == "High" and
            row["Tutoring_Sessions"] >= 2 and row["Parental_Involvement"] == "High" and
            row["Access_to_Resources"] == "High"
        )
    data.loc[data.apply(is_high_performer, axis=1) & (data["Exam_Score"] < 80), "Exam_Score"] = \
        np.random.randint(80, 100, size=len(data[data.apply(is_high_performer, axis=1) & (data["Exam_Score"] < 80)]))
    
    # Synthetic students (less extreme, fewer samples)
    high_performers = [{
        "Hours_Studied": np.random.randint(10, 25), "Attendance": np.random.uniform(0.8, 1.0),
        "Previous_Scores": np.random.randint(75, 90), "Motivation_Level": np.random.choice(["Medium", "High"]),
        "Tutoring_Sessions": np.random.randint(1, 4), "Parental_Involvement": np.random.choice(["Medium", "High"]),
        "Access_to_Resources": np.random.choice(["Medium", "High"]), "Exam_Score": np.random.randint(80, 90)
    } for _ in range(100)]
    
    excellent_performers = [{
        "Hours_Studied": np.random.randint(15, 30), "Attendance": np.random.uniform(0.85, 1.0),
        "Previous_Scores": np.random.randint(80, 95), "Motivation_Level": "High",
        "Tutoring_Sessions": np.random.randint(2, 5), "Parental_Involvement": "High",
        "Access_to_Resources": "High", "Exam_Score": np.random.randint(90, 100)
    } for _ in range(50)]
    
    medium_performers = [{
        "Hours_Studied": np.random.randint(10, 20), "Attendance": np.random.uniform(0.75, 0.95),
        "Previous_Scores": np.random.randint(60, 80), "Motivation_Level": np.random.choice(["Medium", "High"]),
        "Tutoring_Sessions": np.random.randint(0, 3), "Parental_Involvement": np.random.choice(["Medium", "High"]),
        "Access_to_Resources": np.random.choice(["Medium", "High"]), "Exam_Score": np.random.randint(65, 80)
    } for _ in range(100)]
    
    at_risk_performers = [{
        "Hours_Studied": np.random.randint(0, 10), "Attendance": np.random.uniform(0.5, 0.8),
        "Previous_Scores": np.random.randint(40, 60), "Motivation_Level": np.random.choice(["Low", "Medium"]),
        "Tutoring_Sessions": np.random.randint(0, 2), "Parental_Involvement": np.random.choice(["Low", "Medium"]),
        "Access_to_Resources": np.random.choice(["Low", "Medium"]), "Exam_Score": np.random.randint(40, 60)
    } for _ in range(100)]
    
    synthetic_students = pd.DataFrame(high_performers + excellent_performers + medium_performers + at_risk_performers)
    data = pd.concat([data, synthetic_students], ignore_index=True)
    
    # Feature selection and encoding
    selected_features = [
        "Hours_Studied", "Attendance", "Previous_Scores", "Motivation_Level",
        "Tutoring_Sessions", "Parental_Involvement", "Access_to_Resources", "Exam_Score"
    ]
    data = data[selected_features]
    
    categorical_columns = ["Parental_Involvement", "Access_to_Resources", "Motivation_Level"]
    label_encoders = {}
    for col in categorical_columns:
        label_encoders[col] = LabelEncoder()
        data[col] = label_encoders[col].fit_transform(data[col])
    
    for col in categorical_columns:
        print(f"{col} (encoded): {data[col].unique()}")
    
    # Normalize numerical columns
    numerical_columns = ["Hours_Studied", "Attendance", "Previous_Scores", "Motivation_Level", "Tutoring_Sessions"]
    scaler = StandardScaler()
    data[numerical_columns] = scaler.fit_transform(data[numerical_columns])
    
    # Feature engineering
    data["Hours_Motivation"] = data["Hours_Studied"] * (data["Motivation_Level"] + 1)
    data["Attendance_Impact"] = data["Attendance"] * data["Previous_Scores"]
    
    X = data.drop("Exam_Score", axis=1)
    y = data["Exam_Score"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print("\nEnhanced Dataset Shape:", X_train.shape, "Training Samples:", len(y_train), "Testing Samples:", len(y_test))
    return X_train, X_test, y_train, y_test, scaler, label_encoders

X_train, X_test, y_train, y_test, scaler, label_encoders = load_and_enhance_data("student_performance.csv")

# Train model with squared error
param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3],
    'min_samples_split': [10],
    'subsample': [0.8]
}
grid_search = GridSearchCV(
    GradientBoostingRegressor(random_state=42, loss='squared_error'),
    param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


Parental_Involvement (encoded): [1 2 0]
Access_to_Resources (encoded): [0 2 1]
Motivation_Level (encoded): [1 2 0]

Enhanced Dataset Shape: (5565, 9) Training Samples: 5565 Testing Samples: 1392
Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [3]:
# Evaluate model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    overall_metrics = {
        'r2': r2_score(y_test, y_pred),
        'mse': mean_squared_error(y_test, y_pred),
        'mae': mean_absolute_error(y_test, y_pred)
    }
    
    segments = {'at_risk': (0, 60), 'average': (60, 80), 'high': (80, 90), 'excellent': (90, 100)}
    segment_metrics = {}
    for name, (lower, upper) in segments.items():
        mask = (y_test >= lower) & (y_test < upper)
        if sum(mask) > 0:
            segment_metrics[name] = {
                'r2': r2_score(y_test[mask], y_pred[mask]),
                'mse': mean_squared_error(y_test[mask], y_pred[mask]),
                'mae': mean_absolute_error(y_test[mask], y_pred[mask]),
                'count': sum(mask)
            }
    
    at_risk_threshold = 60
    at_risk_count = sum(y_pred < at_risk_threshold)
    at_risk_precision = sum(y_test[y_pred < at_risk_threshold] < at_risk_threshold) / at_risk_count if at_risk_count > 0 else None
    
    print("\n=== MODEL EVALUATION ===")
    print(f"Overall R²: {overall_metrics['r2']:.3f}, MSE: {overall_metrics['mse']:.2f}, MAE: {overall_metrics['mae']:.2f}")
    print("\nSegment Performance:")
    for name, metrics in segment_metrics.items():
        print(f"{name}: R²={metrics['r2']:.3f}, MSE={metrics['mse']:.2f}, MAE={metrics['mae']:.2f}, Samples={metrics['count']}")
    print(f"\nAt-Risk Students: {at_risk_count}/{len(y_test)} ({at_risk_count/len(y_test):.1%})")
    if at_risk_precision:
        print(f"At-Risk Precision: {at_risk_precision:.1%}")
    
    return overall_metrics, segment_metrics

overall_metrics, segment_metrics = evaluate_model(best_model, X_test, y_test)



=== MODEL EVALUATION ===
Overall R²: 0.779, MSE: 6.21, MAE: 1.21

Segment Performance:
at_risk: R²=0.367, MSE=23.43, MAE=3.82, Samples=32
average: R²=0.839, MSE=1.82, MAE=0.94, Samples=1323
high: R²=-10.537, MSE=146.86, MAE=8.70, Samples=25
excellent: R²=-22.022, MSE=151.40, MAE=8.19, Samples=12

At-Risk Students: 26/1392 (1.9%)
At-Risk Precision: 92.3%


In [4]:


# Save model and preprocessors
joblib.dump(best_model, "final_gradient_boosting_model.pkl")
joblib.dump(scaler, "final_scaler.pkl")
joblib.dump(label_encoders, "final_label_encoders.pkl")
print("\nFinal model and preprocessors saved.")


Final model and preprocessors saved.
