<a href="https://colab.research.google.com/github/KudratBatta/Exams-Student-Performance/blob/main/Student_perform.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("spscientist/students-performance-in-exams")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/spscientist/students-performance-in-exams?dataset_version_number=1...


100%|██████████| 8.70k/8.70k [00:00<00:00, 12.6MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/spscientist/students-performance-in-exams/versions/1





In [2]:
import pandas as pd
import os

# Locate the CSV file inside the downloaded path
csv_path = os.path.join(path, "StudentsPerformance.csv")

# Load the dataset
df = pd.read_csv(csv_path)

# Show the first few rows
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
# Binary target: 1 if math score > 70
df['high_math_score'] = (df['math score'] > 70).astype(int)

# Combine reading and writing scores
df['rw_avg'] = (df['reading score'] + df['writing score']) / 2

# Ordinal encoding for parental level of education
edu_order = {
    "some high school": 0,
    "high school": 1,
    "some college": 2,
    "associate's degree": 3,
    "bachelor's degree": 4,
    "master's degree": 5
}
df['parent_edu_level'] = df['parental level of education'].map(edu_order)

In [7]:
# Select useful features
X = df[['gender', 'race/ethnicity', 'lunch', 'test preparation course', 'rw_avg', 'parent_edu_level']]
y = df['high_math_score']

# One-hot encode categorical columns
X = pd.get_dummies(X, drop_first=True)

print("Feature columns:", X.columns.tolist())

Feature columns: ['rw_avg', 'parent_edu_level', 'gender_male', 'race/ethnicity_group B', 'race/ethnicity_group C', 'race/ethnicity_group D', 'race/ethnicity_group E', 'lunch_standard', 'test preparation course_none']


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.84


In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1]
}

grid = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                    param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
y_pred_tuned = grid.best_estimator_.predict(X_test)
print("Tuned XGBoost Accuracy:", accuracy_score(y_test, y_pred_tuned))

Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Tuned XGBoost Accuracy: 0.87


In [11]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(random_state=42)
xgb = grid.best_estimator_

ensemble = VotingClassifier(estimators=[
    ('rf', rf),
    ('gb', gb),
    ('xgb', xgb)
], voting='soft')

ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)

print("Ensemble Accuracy:", accuracy_score(y_test, y_pred_ensemble))

Parameters: { "use_label_encoder" } are not used.



Ensemble Accuracy: 0.855


In [12]:
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:\n", classification_report(y_test, y_pred_ensemble))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ensemble))

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89       125
           1       0.82      0.79      0.80        75

    accuracy                           0.85       200
   macro avg       0.85      0.84      0.84       200
weighted avg       0.85      0.85      0.85       200

Confusion Matrix:
 [[112  13]
 [ 16  59]]


In [13]:
import joblib

# Save the final ensemble model
joblib.dump(ensemble, 'high_accuracy_model.pkl')
print("Model saved.")

Model saved.


In [14]:
# Simulate new data (can also load new CSV)
new_data = df.sample(5).drop(columns=['math score', 'high_math_score', 'average_score'], errors='ignore')

# Feature engineering on new data
new_data['rw_avg'] = (new_data['reading score'] + new_data['writing score']) / 2
new_data['parent_edu_level'] = new_data['parental level of education'].map(edu_order)

# Select features
X_new = new_data[['gender', 'race/ethnicity', 'lunch', 'test preparation course', 'rw_avg', 'parent_edu_level']]

# One-hot encoding (match train columns)
X_new = pd.get_dummies(X_new, drop_first=True)

# Ensure columns match training data
for col in X.columns:
    if col not in X_new:
        X_new[col] = 0
X_new = X_new[X.columns]  # reorder

In [15]:
# Predict probability and class
pred_probs = ensemble.predict_proba(X_new)[:, 1]
pred_classes = ensemble.predict(X_new)

# Show results
results = new_data.copy()
results['Predicted_High_Math_Score'] = pred_classes
results['Probability'] = pred_probs
results[['gender', 'rw_avg', 'parental level of education', 'Predicted_High_Math_Score', 'Probability']]

Unnamed: 0,gender,rw_avg,parental level of education,Predicted_High_Math_Score,Probability
85,female,81.0,some college,1,0.773079
371,female,71.5,some college,0,0.069426
447,male,80.5,high school,1,0.911837
511,male,46.5,some high school,0,0.046461
299,male,81.0,associate's degree,1,0.867944
