# Student Performance Risk Analysis

This notebook provides a step-by-step analysis of the student performance data to predict academic risk.

In [None]:
import os
import sys

import pandas as pd

# Make the parent of the notebook's working directory importable so that the
# project-local `src` package resolves. The membership check keeps this cell
# idempotent: re-running it does not add a duplicate entry to sys.path.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Project-local helpers (must come after the sys.path adjustment above).
from src.data_loader import load_data
from src.preprocessing import Preprocessor
from src.models import ModelTrainer
from src.evaluation import Evaluator

## 1. Load Data

In [None]:
# Dataset location, relative to the notebook's working directory.
DATA_PATH = "../student_performance_risk_dataset.csv"

# Load the dataset through the project loader and preview it.
df = load_data(DATA_PATH)
df.head()

## 2. Preprocessing
We drop the ID column for training and scale the features.

In [None]:
# Run the project's preprocessing step; it hands back four objects, named
# here after how the rest of the notebook uses them.
preprocessor = Preprocessor()
X_scaled, y, full_processed_df, feature_names = preprocessor.preprocess(df)

# Split features and target into train/test partitions.
X_train, X_test, y_train, y_test = preprocessor.split_data(X_scaled, y)

# Report the shape of each feature partition.
for label, split in (("Training Shape:", X_train), ("Testing Shape:", X_test)):
    print(label, split.shape)

## 3. Model Training & Comparison
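We train four classifiers (logistic regression, random forest, SVM, and gradient boosting) on the training split and compare their evaluation metrics on the test split.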

In [None]:
trainer, evaluator = ModelTrainer(), Evaluator()

# Model identifiers understood by ModelTrainer.
models = ["LogisticRegression", "RandomForest", "SVM", "GradientBoosting"]

# Fit each model on the training split, predict on the test split, and keep
# the evaluator's result keyed by model name.
results = {}
for name in models:
    trainer.train_model(name, X_train, y_train)
    y_pred = trainer.predict(name, X_test)
    results.update({name: evaluator.evaluate(y_test, y_pred)})

# One row per model, one column per entry returned by the evaluator.
results_df = pd.DataFrame(results).transpose()
results_df

## 4. Feature Importance
We select the model with the highest F1-Score and analyze which factors contribute most to its risk predictions.

In [None]:
# Pick the model whose row has the highest value in the 'F1-Score' column.
f1_by_model = results_df["F1-Score"]
best_model_name = f1_by_model.idxmax()
print(f"Best Model: {best_model_name}")

# Fetch the fitted model from the trainer and ask the evaluator for its
# feature importances, labelled with the preprocessed feature names.
best_model = trainer.models[best_model_name]
feature_imp = evaluator.get_feature_importance(best_model, feature_names)
feature_imp