In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw student-performance table. The path is relative, so the CSV
# must sit in the notebook's working directory.
df = pd.read_csv("Student_Performance.csv")

In [3]:
# A bare `df` renders pandas' truncated preview (first and last rows).
df

Unnamed: 0,student_id,age,gender,school_type,parent_education,study_hours,attendance_percentage,internet_access,travel_time,extra_activities,study_method,math_score,science_score,english_score,overall_score,final_grade
0,1,14,male,public,post graduate,3.1,84.3,yes,<15 min,yes,notes,42.7,55.4,57.0,53.1,e
1,2,18,female,public,graduate,3.7,87.8,yes,>60 min,no,textbook,57.6,68.8,64.8,61.3,d
2,3,17,female,private,post graduate,7.9,65.5,no,<15 min,no,notes,84.8,95.0,79.2,89.6,b
3,4,16,other,public,high school,1.1,58.1,no,15-30 min,no,notes,44.4,27.5,54.7,41.6,e
4,5,16,female,public,high school,1.3,61.0,yes,30-60 min,yes,group study,8.9,32.7,30.0,25.4,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,12047,17,female,public,phd,1.8,55.2,yes,15-30 min,no,mixed,55.8,48.5,46.7,46.1,e
24996,1102,16,female,private,diploma,2.7,97.1,yes,<15 min,no,coaching,64.8,48.2,52.3,56.5,d
24997,4422,19,other,private,post graduate,1.0,63.0,yes,<15 min,no,group study,50.5,20.3,36.1,36.7,f
24998,7858,14,male,private,diploma,1.0,69.4,yes,15-30 min,yes,group study,13.0,34.2,7.3,34.1,f


In [4]:
# First five rows, to eyeball column names and value formats.
df.head()

Unnamed: 0,student_id,age,gender,school_type,parent_education,study_hours,attendance_percentage,internet_access,travel_time,extra_activities,study_method,math_score,science_score,english_score,overall_score,final_grade
0,1,14,male,public,post graduate,3.1,84.3,yes,<15 min,yes,notes,42.7,55.4,57.0,53.1,e
1,2,18,female,public,graduate,3.7,87.8,yes,>60 min,no,textbook,57.6,68.8,64.8,61.3,d
2,3,17,female,private,post graduate,7.9,65.5,no,<15 min,no,notes,84.8,95.0,79.2,89.6,b
3,4,16,other,public,high school,1.1,58.1,no,15-30 min,no,notes,44.4,27.5,54.7,41.6,e
4,5,16,female,public,high school,1.3,61.0,yes,30-60 min,yes,group study,8.9,32.7,30.0,25.4,f


In [5]:
# (rows, columns) of the raw table.
df.shape

(25000, 16)

In [6]:
# Column labels of the raw table, before any column is dropped.
df.columns


Index(['student_id', 'age', 'gender', 'school_type', 'parent_education',
       'study_hours', 'attendance_percentage', 'internet_access',
       'travel_time', 'extra_activities', 'study_method', 'math_score',
       'science_score', 'english_score', 'overall_score', 'final_grade'],
      dtype='object')

In [7]:
# Remove student_id (presumably just a row identifier) before modelling.
# errors="ignore" keeps this cell re-runnable: `df` is rebound here, so a
# second execution would otherwise raise KeyError for the already-dropped
# column.
# NOTE(review): the tail of the preview shows non-sequential ids (e.g. 1102 at
# row 24996), so ids may repeat — check for duplicated students first, since
# duplicates could end up on both sides of the train/test split.
df = df.drop(columns=["student_id"], errors="ignore")

In [8]:
# Target: the final_grade column (still raw letter strings at this point).
y = df["final_grade"]

In [9]:
# Feature matrix: every remaining column except the target.
# NOTE(review): overall_score (and the three subject scores) stay in X. In the
# preview rows final_grade tracks overall_score closely, so this may leak the
# target into the features — confirm that is intended.
X = df.drop(columns="final_grade")

In [10]:
# Sanity check: same row count as df, minus the id and target columns.
X.shape

(25000, 14)

In [11]:
# Sanity check: one target value per feature row.
y.shape

(25000,)

In [12]:
# Dtype per feature; the `object` columns still hold strings and need encoding.
X.dtypes

age                        int64
gender                    object
school_type               object
parent_education          object
study_hours              float64
attendance_percentage    float64
internet_access           object
travel_time               object
extra_activities          object
study_method              object
math_score               float64
science_score            float64
english_score            float64
overall_score            float64
dtype: object

In [13]:
# Integer-encode gender. In the rows shown in this notebook the codes come out
# as female -> 0, male -> 1, other -> 2.
le = LabelEncoder()
gender_codes = le.fit_transform(X["gender"])
X["gender"] = gender_codes

In [14]:
# Spot-check the encoded gender codes.
X["gender"].head()

0    1
1    0
2    0
3    2
4    0
Name: gender, dtype: int64

In [15]:
# Gender frequencies, read from `df`, which still holds the original strings
# (only X was encoded).
df["gender"].value_counts()

gender
other     8463
female    8290
male      8247
Name: count, dtype: int64

In [16]:
# Confirm gender is now an integer column.
X.dtypes

age                        int64
gender                     int64
school_type               object
parent_education          object
study_hours              float64
attendance_percentage    float64
internet_access           object
travel_time               object
extra_activities          object
study_method              object
math_score               float64
science_score            float64
english_score            float64
overall_score            float64
dtype: object

In [17]:
# Integer-encode school_type by refitting the shared `le` on this column, which
# replaces whatever classes it had learned before. In the rows shown in this
# notebook the codes come out as private -> 0, public -> 1.
school_type_codes = le.fit_transform(X["school_type"])
X["school_type"] = school_type_codes

In [18]:
# Spot-check the encoded school_type codes.
X["school_type"].head()

0    1
1    1
2    0
3    1
4    1
Name: school_type, dtype: int64

In [19]:
# Confirm school_type is now an integer column; list what is left to encode.
X.dtypes

age                        int64
gender                     int64
school_type                int64
parent_education          object
study_hours              float64
attendance_percentage    float64
internet_access           object
travel_time               object
extra_activities          object
study_method              object
math_score               float64
science_score            float64
english_score            float64
overall_score            float64
dtype: object

In [20]:
# Integer-encode every feature that is still non-numeric.
# - select_dtypes(exclude="number") replaces the `dtype == "object"` test,
#   which silently skips columns stored with pandas' dedicated string or
#   categorical dtypes and would leave raw strings in X.
# - Each column gets its own encoder, kept in `feature_encoders`, so its
#   code -> label mapping (`encoder.classes_`) stays available. Refitting one
#   shared encoder discards every mapping except the last.
# The integer codes produced are the same as with the shared encoder.
feature_encoders = {}
for col in X.select_dtypes(exclude="number").columns:
    col_encoder = LabelEncoder()
    X[col] = col_encoder.fit_transform(X[col])
    feature_encoders[col] = col_encoder

In [21]:
# Every feature should now be numeric (int64 or float64).
X.dtypes

age                        int64
gender                     int64
school_type                int64
parent_education           int64
study_hours              float64
attendance_percentage    float64
internet_access            int64
travel_time                int64
extra_activities           int64
study_method               int64
math_score               float64
science_score            float64
english_score            float64
overall_score            float64
dtype: object

In [22]:
# Encode the letter grades as integers with a fresh encoder fitted on the raw
# column. Fitting on df["final_grade"] rather than on `y` makes the cell
# idempotent: re-running it no longer refits on already-encoded integers,
# which would replace the letter classes in `le.classes_` with 0..n-1.
# `le` is kept as the name, so `le.inverse_transform(...)` maps predictions
# back to letter grades.
le = LabelEncoder()
y = le.fit_transform(df["final_grade"])

In [23]:
# First ten encoded targets (in the preview rows: b -> 1, d -> 3, e -> 4, f -> 5).
y[:10]

array([4, 3, 1, 4, 5, 3, 5, 3, 3, 3])

In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): the split is not stratified. If some grades are rare, consider
# passing stratify=y so both splits keep the same class proportions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [25]:
# Re-attach the encoded target to each split and write both to CSV in the
# working directory (row index omitted). The saved values are the integer
# codes, not the original strings.
train_df = X_train.assign(final_grade=y_train)
test_df = X_test.assign(final_grade=y_test)

for frame, out_path in ((train_df, "train_data.csv"), (test_df, "test_data.csv")):
    frame.to_csv(out_path, index=False)

In [26]:
# Confirm the 80/20 row split and that both parts have the same columns.
X_train.shape, X_test.shape

((20000, 14), (5000, 14))

In [27]:
# Random forest with library-default hyperparameters; the seed fixes the
# randomness so repeated runs give the same model.
model = RandomForestClassifier(random_state=42)

In [28]:
# Fit on the training split only; the test split stays unseen until evaluation.
model.fit(X_train,y_train)

In [29]:
# Predicted (integer-encoded) grades for the held-out rows.
y_pred = model.predict(X_test)

In [30]:
# First ten predictions, as integer grade codes.
y_pred[:10]

array([1, 4, 3, 4, 5, 1, 5, 3, 4, 2])

In [31]:
# Fraction of held-out rows whose grade was predicted exactly.
# NOTE(review): a near-perfect score is suspicious here. X still contains
# overall_score, which appears to determine final_grade in the preview rows,
# so the model may simply be recovering grade thresholds. Confirm whether that
# feature should be available at prediction time.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

0.9988
