In [1]:
pip install numpy pandas scikit-learn matplotlib seaborn joblib


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib


In [3]:
# Load the semicolon-separated dataset. The path is relative, so the CSV file
# is expected to be in the same folder as this notebook.
df = pd.read_csv(
    "student-mat.csv",
    sep=';',
)

# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [4]:
# Binary target column: 1 when G3 is at least 10, otherwise 0.
df['pass'] = df['G3'].ge(10).astype(int)

# Show how many rows fall into each class of the new target.
df['pass'].value_counts()


pass
1    265
0    130
Name: count, dtype: int64

In [5]:
# Input columns used by the model.
selected_features = [
    'sex',
    'age',
    'studytime',
    'failures',
    'absences',
    'goout',
    'health',
]

# Keep only the chosen inputs plus the 'pass' target column.
df_model = df[[*selected_features, 'pass']]

df_model.head()


Unnamed: 0,sex,age,studytime,failures,absences,goout,health,pass
0,F,18,2,0,6,4,3,0
1,F,17,2,0,4,3,3,0
2,F,15,2,3,10,2,3,1
3,F,15,3,0,2,2,5,1
4,F,16,2,0,4,2,5,1


In [6]:
# Column groups that the preprocessing step treats differently.
categorical_features = ['sex']
numeric_features = ['age', 'studytime', 'failures', 'absences', 'goout', 'health']

# Separate the target (y) from the feature matrix (X).
y = df_model['pass']
X = df_model.drop(columns='pass')

# Display both column groups as the cell output.
numeric_features, categorical_features


(['age', 'studytime', 'failures', 'absences', 'goout', 'health'], ['sex'])

In [7]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions approximately equal in both splits; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [8]:
# Numeric columns are standardized.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded. handle_unknown='ignore' means a
# category that was not seen during fit does not raise at transform time.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# End-to-end estimator: preprocessing followed by a random forest with a
# fixed seed.
rf_clf = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(n_estimators=200, random_state=42)),
])


In [9]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
rf_clf.fit(X_train, y_train)

# Predict on the held-out split and score the predictions against y_test.
y_pred = rf_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

# The report is printed on its own: passing it as a second print() argument
# inserts print's default separator (a space) in front of the report's first
# line, which shifts the header row one column to the right of the data rows.
print("\nClassification report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.6708860759493671

Classification report:
               precision    recall  f1-score   support

           0       0.50      0.31      0.38        26
           1       0.71      0.85      0.78        53

    accuracy                           0.67        79
   macro avg       0.61      0.58      0.58        79
weighted avg       0.64      0.67      0.65        79



In [10]:
# Persist the complete pipeline object (preprocessing and classifier together)
# to disk, then confirm the file name to the reader.
joblib.dump(value=rf_clf, filename="student_performance_model.joblib")
print("Model saved as student_performance_model.joblib")


Model saved as student_performance_model.joblib
