In [17]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
# Load the student-performance table from a CSV given by a relative path.
DATA_PATH = "student_performance_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [9]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Student_ID,Gender,Study_Hours_per_Week,Attendance_Rate,Past_Exam_Scores,Parental_Education_Level,Internet_Access_at_Home,Extracurricular_Activities,Final_Exam_Score,Pass_Fail
0,S147,Male,31,68.267841,86,High School,Yes,Yes,63,Pass
1,S136,Male,16,78.222927,73,PhD,No,No,50,Fail
2,S209,Female,21,87.525096,74,PhD,Yes,No,55,Fail
3,S458,Female,27,92.076483,99,Bachelors,No,No,65,Pass
4,S078,Female,37,98.655517,63,Masters,No,Yes,70,Pass


In [10]:
# Student_ID looks like a per-row identifier (S147, S136, ...), so it is removed
# before modelling.
# errors="ignore" keeps this cell safe to re-run: without it, a second execution
# raises KeyError because the column has already been dropped from `df`.
df = df.drop(columns=["Student_ID"], errors="ignore")

In [11]:
# Check the frame again now that the identifier column has been dropped.
df.head(n=5)

Unnamed: 0,Gender,Study_Hours_per_Week,Attendance_Rate,Past_Exam_Scores,Parental_Education_Level,Internet_Access_at_Home,Extracurricular_Activities,Final_Exam_Score,Pass_Fail
0,Male,31,68.267841,86,High School,Yes,Yes,63,Pass
1,Male,16,78.222927,73,PhD,No,No,50,Fail
2,Female,21,87.525096,74,PhD,Yes,No,55,Fail
3,Female,27,92.076483,99,Bachelors,No,No,65,Pass
4,Female,37,98.655517,63,Masters,No,Yes,70,Pass


In [12]:
# Encode the target as integers: Fail -> 0, Pass -> 1.
label_codes = {"Fail": 0, "Pass": 1}

# Skip the work when the column already holds only 0/1. Series.map turns every
# value that is not a key of the mapping into NaN, so re-running the bare map
# on an already-encoded column would wipe out the whole target.
already_encoded = df["Pass_Fail"].isin(list(label_codes.values())).all()
if not already_encoded:
    encoded = df["Pass_Fail"].map(label_codes)
    if encoded.isna().any():
        # Fail loudly instead of letting NaN labels flow into model fitting.
        unexpected = sorted(
            df.loc[encoded.isna(), "Pass_Fail"].astype(str).unique()
        )
        raise ValueError(
            f"Unexpected Pass_Fail values (expected 'Fail' or 'Pass'): {unexpected}"
        )
    df["Pass_Fail"] = encoded.astype("int64")

In [13]:
# Confirm that Pass_Fail now shows the numeric 0/1 codes.
df.head(n=5)

Unnamed: 0,Gender,Study_Hours_per_Week,Attendance_Rate,Past_Exam_Scores,Parental_Education_Level,Internet_Access_at_Home,Extracurricular_Activities,Final_Exam_Score,Pass_Fail
0,Male,31,68.267841,86,High School,Yes,Yes,63,1
1,Male,16,78.222927,73,PhD,No,No,50,0
2,Female,21,87.525096,74,PhD,Yes,No,55,0
3,Female,27,92.076483,99,Bachelors,No,No,65,1
4,Female,37,98.655517,63,Masters,No,Yes,70,1


In [14]:
# Final_Exam_Score is removed from the frame before modelling.
# NOTE(review): in the preview rows, scores of 63/65/70 are Pass and 50/55 are
# Fail, so the score presumably determines the label and would leak the
# target — confirm.
# errors="ignore" makes the cell re-runnable: a plain drop raises KeyError on a
# second execution because the column is already gone.
df = df.drop(columns=["Final_Exam_Score"], errors="ignore")
df.head()

Unnamed: 0,Gender,Study_Hours_per_Week,Attendance_Rate,Past_Exam_Scores,Parental_Education_Level,Internet_Access_at_Home,Extracurricular_Activities,Pass_Fail
0,Male,31,68.267841,86,High School,Yes,Yes,1
1,Male,16,78.222927,73,PhD,No,No,0
2,Female,21,87.525096,74,PhD,Yes,No,0
3,Female,27,92.076483,99,Bachelors,No,No,1
4,Female,37,98.655517,63,Masters,No,Yes,1


In [15]:
# Separate the target column from the remaining feature columns.
TARGET = "Pass_Fail"
y = df[TARGET]
x = df.drop(columns=[TARGET])

In [16]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, and every other column is treated as numerical.
numerical_cols = x.select_dtypes(exclude=["object"]).columns
categorical_cols = x.select_dtypes(include=["object"]).columns

In [18]:
# Column-wise preprocessing: one-hot encode the categorical columns (dropping
# the first level of each) and pass the numerical columns through untouched.
# NOTE(review): numerical features reach the classifier unscaled; consider
# whether scaling them would help the downstream LogisticRegression.
categorical_step = ("cat", OneHotEncoder(drop="first"), categorical_cols)
numerical_step = ("num", "passthrough", numerical_cols)

prepocessor = ColumnTransformer(transformers=[categorical_step, numerical_step])

In [20]:
# Chain the preprocessing and the classifier so both are fitted and applied
# together through a single estimator object.
classifier = LogisticRegression(max_iter=1000)
model = Pipeline(
    steps=[
        ("preprocessing", prepocessor),
        ("classifier", classifier),
    ]
)

In [21]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Fit the preprocessing and the classifier on the training portion only.
model.fit(X=x_train, y=y_train)

In [23]:
# Predict Pass_Fail codes for the held-out rows.
prediction = model.predict(X=x_test)

In [24]:
# Score the held-out predictions first, then print each metric under its label.
accuracy = accuracy_score(y_test, prediction)
matrix = confusion_matrix(y_test, prediction)
report = classification_report(y_test, prediction)

print("accuracy:", accuracy)
print("\n confusion matrix : ", matrix)
print("\n report : ", report)

accuracy: 0.8309859154929577

 confusion matrix :  [[58 13]
 [11 60]]

 report :                precision    recall  f1-score   support

           0       0.84      0.82      0.83        71
           1       0.82      0.85      0.83        71

    accuracy                           0.83       142
   macro avg       0.83      0.83      0.83       142
weighted avg       0.83      0.83      0.83       142

