# In [None]:
# Cell 1: Install and import
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Cell 2: Load or create dataset
# A real dataset can be downloaded from:
#   https://www.kaggle.com/datasets/aljaroudi/student-performance-dataset
# This cell instead builds a small, reproducible synthetic stand-in.
n_samples = 200
np.random.seed(42)  # fixed seed: every run generates the same table

# (column name, sampler, low, high). The columns are drawn in exactly this
# order from NumPy's global RNG, so reordering the rows changes the values.
# randint's upper bound is exclusive: backlogs is 0..4 and pass is 0 or 1.
# NOTE: 'pass' is drawn independently of the feature columns, so this
# synthetic table carries no real feature -> label relationship.
_column_specs = (
    ('attendance', np.random.uniform, 60, 100),
    ('internal_marks', np.random.uniform, 30, 90),
    ('study_hours', np.random.uniform, 0, 8),
    ('backlogs', np.random.randint, 0, 5),
    ('participation', np.random.uniform, 1, 5),
    ('pass', np.random.randint, 0, 2),
)

data = pd.DataFrame({
    column: sampler(low, high, n_samples)
    for column, sampler, low, high in _column_specs
})

# Quick look: first rows, then per-column summary statistics.
for preview in (data.head(), data.describe()):
    print(preview)

# Cell 3: Data preprocessing
# Check for missing values
print(data.isnull().sum())

# Feature engineering: select the model inputs and the binary target.
feature_columns = ['attendance', 'internal_marks', 'study_hours', 'backlogs', 'participation']
X = data[feature_columns]
y = data['pass']

# Train-test split
# Split BEFORE imputing or scaling. Fitting those steps on the full dataset
# would let statistics of the held-out rows leak into training and make the
# test metrics optimistic. `stratify=y` keeps the pass/fail ratio the same in
# both partitions (it requires at least two samples of each class).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing values (if any) using column means learned from the
# training rows only; the same means are reused for every other partition.
train_means = X_train_raw.mean()
X_train_raw = X_train_raw.fillna(train_means)
X_test_raw = X_test_raw.fillna(train_means)
X = X.fillna(train_means)

# Standardize features: fit on the training rows, then apply the fitted
# transform unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaler; retained so the
# `X_scaled` name stays available to any later cell that relies on it.
X_scaled = scaler.transform(X)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Cell 4: Train multiple models
# Candidate classifiers, keyed by display name. Seeds are fixed so repeated
# runs give the same fitted models.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# name -> {'accuracy', 'f1', 'roc_auc', 'cv_accuracy', 'model'}
results = {}

for name, model in models.items():
    # 5-fold cross-validated accuracy computed on the training data only.
    # This is the score used for model selection below, so the test set is
    # never consulted when choosing a model. cross_val_score fits clones of
    # the estimator, leaving `model` itself unfitted at this point.
    cv_accuracy = cross_val_score(
        model, X_train, y_train, cv=5, scoring='accuracy'
    ).mean()

    # Final fit on the whole training set, then held-out evaluation.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results[name] = {
        'accuracy': accuracy,
        'f1': f1,
        'roc_auc': roc_auc,
        'cv_accuracy': cv_accuracy,
        'model': model
    }

    print(f"\n{name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  ROC-AUC: {roc_auc:.4f}")
    print(f"  CV Accuracy: {cv_accuracy:.4f}")

# Cell 5: Model comparison
# Select by cross-validated accuracy rather than test accuracy: picking the
# winner on the test set would bias the reported test metrics upward.
best_model_name = max(results, key=lambda x: results[x]['cv_accuracy'])
best_model = results[best_model_name]['model']

print(f"\nBest Model: {best_model_name}")

# Cell 6: Evaluate best model
# Compute everything needed for both plots up front, then draw.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # positive-class column
cm = confusion_matrix(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
best_auc = results[best_model_name]["roc_auc"]

# Confusion matrix: rows are actual labels, columns are predicted labels.
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title(f'Confusion Matrix - {best_model_name}')
cm_ax.set_ylabel('Actual')
cm_ax.set_xlabel('Predicted')
plt.show()

# ROC curve, with the diagonal marking a random classifier for reference.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {best_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()

# Cell 7: Save model and preprocessor
# The path is relative to the current working directory. Create the target
# directory first, because open(..., 'wb') raises FileNotFoundError when a
# parent folder is missing.
model_dir = Path('../backend/models')
model_dir.mkdir(parents=True, exist_ok=True)

# The fitted scaler is saved alongside the model so that inference can apply
# the same standardization that was used during training.
# NOTE: pickle files must only be loaded from trusted sources, since
# unpickling can execute arbitrary code.
with open(model_dir / 'model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open(model_dir / 'preprocessor.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("Model and preprocessor saved successfully!")
