In [1]:
# Full pipeline with corrected column names
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("study_performance.csv")

# Create target
# Step 1: row-wise mean of the three exam-score columns.
df['avg_score'] = df.loc[:, ['math_score', 'reading_score', 'writing_score']].mean(axis='columns')
def score_label(x):
    """Bucket an average score into a performance level.

    Parameters
    ----------
    x : number
        The average score to classify.

    Returns
    -------
    str
        'Low' when ``x < 60``, 'Medium' when ``60 <= x <= 80``, and 'High'
        for everything else.

    Notes
    -----
    A NaN input fails both comparisons and therefore also comes back as 'High'.
    """
    # Guard clauses, checked from the lowest band upwards.
    if x < 60:
        return 'Low'
    if x <= 80:
        return 'Medium'
    return 'High'
# Step 2: bucket each row's average score into its performance level.
df['performance_level'] = df['avg_score'].map(score_label)

print("Class distribution:")
display(df['performance_level'].value_counts())

# Features and target
# NOTE(review): only avg_score and performance_level are dropped, so X still holds
# math_score, reading_score and writing_score. The label is a deterministic function
# of the mean of exactly those three columns, i.e. the target leaks into the features
# and any accuracy measured downstream mostly reflects the model re-deriving the
# thresholds. Confirm whether this is intended before trusting the metrics; removing
# the score columns would change the input schema of the saved pipeline.
X = df.drop(columns=['avg_score', 'performance_level'])
y = df['performance_level']

# Partition the feature columns by dtype so each group can get its own preprocessing.
categorical_cols = list(X.select_dtypes(include='object').columns)
numeric_cols = list(X.select_dtypes(include='number').columns)
print("Categorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)

# Per-dtype preprocessing: standardise the numeric columns, one-hot encode the
# categorical ones. handle_unknown='ignore' lets the encoder accept categories at
# predict time that were not present during fit instead of raising.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Preprocessing and model are chained so that fit/predict always apply the same transforms.
clf = RandomForestClassifier(random_state=42, n_estimators=200)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', clf),
])

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix in Low -> Medium -> High order (rows: true class, columns: predicted class).
level_order = ['Low', 'Medium', 'High']
cm = confusion_matrix(y_test, y_pred, labels=level_order)
cm_df = pd.DataFrame(
    cm,
    index=level_order,
    columns=[f'Pred {level}' for level in level_order],
)
print("Confusion Matrix:")
display(cm_df)

# Feature importances
# Map the forest's importances back to readable column names. The classifier sees the
# ColumnTransformer output, which is laid out as the 'num' columns followed by the
# one-hot 'cat' columns, so the name list below is assembled in that same order.
ohe = pipe.named_steps['preprocessor'].named_transformers_['cat']
if categorical_cols:
    # Called directly rather than behind a blanket `except Exception`: the encoder is
    # constructed with `sparse_output=`, which already requires a scikit-learn release
    # that provides get_feature_names_out, so a failure here is a real problem
    # (e.g. an unfitted pipeline) and should surface instead of being swallowed.
    ohe_feature_names = list(ohe.get_feature_names_out(categorical_cols))
else:
    # No categorical inputs: the encoder contributes no output columns.
    ohe_feature_names = []

feature_names = numeric_cols + ohe_feature_names
importances = pipe.named_steps['clf'].feature_importances_

# Keep only the 20 largest importances, highest first; pd.Series raises if the number
# of names and importances disagree, which guards against a misaligned name list.
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False).head(20)
print("\nTop 20 Feature Importances:")
display(feat_imp)
feat_imp.to_csv('feature_importances.csv')

# Save pipeline and streamlit app
# The whole fitted pipeline (preprocessing + classifier) is serialised as one object.
# NOTE(review): only the pipeline is saved in the lines visible here; the Streamlit
# app is presumably written elsewhere — confirm.
model_path = "student_performance_pipeline.joblib"
joblib.dump(value=pipe, filename=model_path)

Class distribution:


performance_level
Medium    521
Low       285
High      194
Name: count, dtype: int64

Categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
Numeric columns: ['math_score', 'reading_score', 'writing_score']
Test Accuracy: 0.9650

Classification Report:
              precision    recall  f1-score   support

        High       0.97      0.95      0.96        39
         Low       1.00      0.93      0.96        57
      Medium       0.94      0.99      0.97       104

    accuracy                           0.96       200
   macro avg       0.97      0.96      0.96       200
weighted avg       0.97      0.96      0.96       200

Confusion Matrix:


Unnamed: 0,Pred Low,Pred Medium,Pred High
Low,53,4,0
Medium,0,103,1
High,0,2,37



Top 20 Feature Importances:


writing_score                                     0.337505
reading_score                                     0.318719
math_score                                        0.237338
gender_male                                       0.011083
gender_female                                     0.010495
lunch_free/reduced                                0.008879
test_preparation_course_none                      0.007899
lunch_standard                                    0.006858
test_preparation_course_completed                 0.006410
parental_level_of_education_high school           0.006334
race_ethnicity_group E                            0.006276
race_ethnicity_group C                            0.005913
race_ethnicity_group D                            0.005566
parental_level_of_education_some college          0.005507
parental_level_of_education_some high school      0.004962
parental_level_of_education_associate's degree    0.004852
race_ethnicity_group B                            0.0045

['student_performance_pipeline.joblib']