In [3]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,classification_report
from sklearn.ensemble import RandomForestClassifier

import joblib


In [4]:
# Load the cleaned student dataset and preview it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run on a machine that has this exact directory layout.
DATA_PATH = r'C:\Users\KEERTHAN\OneDrive\Documents\GitHub\Keerthan-Projects\Python Project 3\Student_analyser\data\student_por_cleaned1.csv'
df = pd.read_csv(DATA_PATH)
print("shape:", df.shape)
df.head()

shape: (202, 36)


Unnamed: 0,school,sex,age,address,famsize,pstatus,medu,fedu,mjob,fjob,...,dalc,walc,health,absences,g1,g2,g3,alcohol_total,grade_variation,performance
0,0,female,0.338598,1,0,1,1,1,0,2,...,0.0,-0.765759,-0.277493,-0.10098,-1.258255,-0.792123,-1.013882,2,1.154701,average
1,0,female,-1.461318,1,0,1,4,2,1,3,...,0.0,-0.765759,1.073195,-0.829482,0.743064,0.441878,0.227355,2,0.0,average
2,0,female,-0.56136,1,0,1,3,3,2,2,...,0.0,0.615342,1.073195,-0.829482,-0.457727,0.030545,-0.186391,3,1.154701,average
3,0,male,-0.56136,1,1,1,4,3,3,2,...,0.0,0.615342,1.073195,1.356023,-0.057464,-0.380789,-0.186391,3,0.57735,average
4,0,female,0.338598,1,0,0,4,4,2,4,...,0.0,-0.765759,-1.62818,-0.10098,-0.857991,0.030545,-0.186391,2,1.732051,average


In [6]:
# Split the frame into the label column and the feature matrix.
target = 'performance'

# 'Unnamed: 0' holds 0, 1, 2, ... in the preview, i.e. it looks like a row
# index that was written into the CSV. It carries no information about the
# student, so it must not be used as a feature. errors='ignore' keeps this
# cell working if the column is not present in the file.
x = df.drop(columns=[target, 'Unnamed: 0'], errors='ignore')
y = df[target]

# NOTE(review): the features still include g1, g2, g3 and grade_variation.
# If 'performance' was derived from the grades, these columns leak the label;
# the perfect test accuracy reported further down is consistent with that.
# Confirm how the target was constructed before trusting the score.

In [7]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The classes are imbalanced (the test report shows far more 'average' rows
# than 'excellent' or 'poor'), so the split is stratified on the label to keep
# the class proportions the same in the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", X_train.shape)
print("Testing samples:", X_test.shape)

Training samples: (161, 35)
Testing samples: (41, 35)


In [8]:
# Partition the feature columns by dtype so that every column is routed to
# exactly one transformer. Listing specific dtypes (int64/float64/object)
# would leave out other numeric widths and string/category/bool columns, and
# ColumnTransformer drops any column that is not listed by default.
numerical_cols = x.select_dtypes(include='number').columns
categorical_cols = x.select_dtypes(exclude='number').columns

In [9]:
# Preprocessing step for each column group: one-hot encoding for categorical
# columns (categories not seen during fit are ignored at transform time) and
# standard scaling for numeric columns.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numeric_transformer = StandardScaler()

In [10]:
# Bind each preprocessing step to the column group it applies to.
column_steps = [
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [11]:
# Random forest classifier: 200 trees, fixed seed so runs are repeatable.
forest_params = {'n_estimators': 200, 'random_state': 42}
model = RandomForestClassifier(**forest_params)

In [12]:
# Chain preprocessing and the classifier so both are fitted and applied together.
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
clf = Pipeline(steps=pipeline_steps)

In [13]:
# Fit the preprocessing steps and the classifier on the training split only.
clf.fit(X=X_train, y=y_train)

In [14]:
# Score the fitted pipeline on the held-out test split.
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

     average       1.00      1.00      1.00        30
   excellent       1.00      1.00      1.00         7
        poor       1.00      1.00      1.00         4

    accuracy                           1.00        41
   macro avg       1.00      1.00      1.00        41
weighted avg       1.00      1.00      1.00        41



In [17]:
# Persist the whole fitted pipeline (preprocessing + classifier) to disk.
# The destination is defined once so the confirmation message always reports
# the path that was actually written.
# NOTE(review): this is an absolute, machine-specific path.
MODEL_PATH = 'C:/Users/KEERTHAN/OneDrive/Documents/GitHub/Keerthan-Projects/Python Project 3/Student_analyser/models/best_model.pkl'
joblib.dump(clf, MODEL_PATH)

print(f"Model saved at {MODEL_PATH}")

Model saved at C:/Users/KEERTHAN/OneDrive/Documents/GitHub/Keerthan-Projects/Python Project 3/Student_analyser/models/best_model.pkl
