# Student Performance Risk Analysis

This notebook provides a step-by-step analysis of the student performance data to predict academic risk.

In [1]:
import os
import pandas as pd
import sys

# Ensure the parent of the notebook's working directory (where the `src`
# package imported below is expected to live) is on the import path.
# The membership check keeps this cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.data_loader import load_data
from src.preprocessing import Preprocessor
from src.models import ModelTrainer
from src.evaluation import Evaluator

## 1. Load Data

In [2]:
# Location of the raw dataset. The path is relative, so it resolves against the
# notebook's current working directory (one level up from it).
DATA_PATH = '../student_performance_risk_dataset.csv'
# load_data comes from src.data_loader; in the recorded run it reported a
# (120, 6) frame. NOTE(review): its behavior on a missing/unreadable file is not
# visible here — confirm whether it raises or returns None.
df = load_data(DATA_PATH)
# Bare last expression: show the first rows as a rich table for a quick sanity check.
df.head()

Successfully loaded data from ../student_performance_risk_dataset.csv. Shape: (120, 6)


Unnamed: 0,student_id,attendance_percentage,assignment_average,internal_marks,previous_sem_gpa,at_risk
0,S1000,93,79,78,7.67,0
1,S1001,83,61,66,4.49,1
2,S1002,69,66,66,4.03,1
3,S1003,97,74,89,7.77,0
4,S1004,62,40,90,5.17,1


## 2. Preprocessing
The `student_id` column is an identifier, not a predictor, so it is dropped before training; the four remaining numeric features are then scaled.

In [3]:
# Preprocessor comes from src.preprocessing.
preprocessor = Preprocessor()
# preprocess() returns four values: the scaled feature matrix, the target, a
# processed copy of the frame, and the feature names. `full_processed_df` is
# not used again in the cells shown in this notebook.
X_scaled, y, full_processed_df, feature_names = preprocessor.preprocess(df)
# Hold out a test set. The recorded run produced 96 training / 24 test rows.
# NOTE(review): whether split_data fixes a random seed or stratifies on the
# target is not visible here — confirm, since it affects reproducibility.
X_train, X_test, y_train, y_test = preprocessor.split_data(X_scaled, y)
# Report the split sizes as (rows, features).
print("Training Shape:", X_train.shape)
print("Testing Shape:", X_test.shape)

Training Shape: (96, 4)
Testing Shape: (24, 4)


## 3. Model Training & Comparison
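We train four classifiers on the training split and compare their accuracy, precision, recall, and F1-score on the held-out test split. Because the test split contains only 24 rows, small differences between models (and perfect scores) should be interpreted with caution.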

In [5]:
# ModelTrainer fits and stores models by name; Evaluator scores predictions.
trainer = ModelTrainer()
evaluator = Evaluator()

# Model identifiers passed to ModelTrainer.
# presumably these must match keys that ModelTrainer recognizes — verify in src.models.
models = ['LogisticRegression', 'RandomForest', 'SVM', 'GradientBoosting']
# Maps model name -> whatever evaluator.evaluate returns for that model.
results = {}

for name in models:
    # Fit on the training split only, then score on the held-out test split.
    trainer.train_model(name, X_train, y_train)
    y_pred = trainer.predict(name, X_test)
    results[name] = evaluator.evaluate(y_test, y_pred)

# DataFrame(results) puts one model per column; transposing gives one row per
# model, which reads better as a comparison table.
# NOTE(review): in the recorded run two models score 1.0 on every metric with
# only 24 test rows — worth confirming with cross-validation before trusting.
results_df = pd.DataFrame(results).T
results_df

Trained LogisticRegression
Trained RandomForest
Trained SVM
Trained GradientBoosting


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
LogisticRegression,0.875,0.9,0.947368,0.923077
RandomForest,1.0,1.0,1.0,1.0
SVM,0.916667,0.947368,0.947368,0.947368
GradientBoosting,1.0,1.0,1.0,1.0


## 4. Feature Importance
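We examine which features the best-performing model relies on most when predicting risk. These importances describe the model's behavior and do not by themselves establish what causes a student to be at risk.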

In [6]:
# Pick the model with the highest F1-Score. idxmax returns the FIRST label when
# several rows share the maximum, so ties are broken by the order of results_df
# (in the recorded run RandomForest and GradientBoosting tie at 1.0 and
# RandomForest, listed first, is selected).
best_model_name = results_df['F1-Score'].idxmax()
print(f"Best Model: {best_model_name}")

# Fetch the fitted estimator from the trainer's name -> model mapping.
best_model = trainer.models[best_model_name]
# NOTE(review): how get_feature_importance behaves for models without native
# importances (e.g. if SVM were selected) is not visible here — confirm in
# src.evaluation.
feature_imp = evaluator.get_feature_importance(best_model, feature_names)
# Bare last expression: display the importance table.
feature_imp

Best Model: RandomForest


Unnamed: 0,Feature,Importance
0,attendance_percentage,0.356758
3,previous_sem_gpa,0.316245
1,assignment_average,0.166485
2,internal_marks,0.160512
