In [50]:
import kagglehub
import pandas as pd
import os

In [51]:
# Download latest version
path = kagglehub.dataset_download("ayeshaseherr/student-performance")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\Eni-o\.cache\kagglehub\datasets\ayeshaseherr\student-performance\versions\1


In [52]:
data_path = r"C:\Users\Eni-o\.cache\kagglehub\datasets\ayeshaseherr\student-performance\versions\1"

# See what files are inside
print(os.listdir(data_path))

['StudentPerformanceFactors.csv']


In [53]:
# df_raw is your untouchable extracted data
df_raw = pd.read_csv(f"{data_path}/StudentPerformanceFactors.csv")

In [54]:
# df_spf is a copy of the original dataset where I'll do all my cleaning, processing etc 
df_spf = df_raw.copy()
display(df_spf.head())

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [55]:
df_spf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

# Imputation

#### Filling in the missing values (and performing some ordinal enconding).

In [64]:
# Uses boolean to show columns with missing values.
df_spf.isnull().any()

Hours_Studied                 False
Attendance                    False
Parental_Involvement          False
Access_to_Resources           False
Extracurricular_Activities    False
Sleep_Hours                   False
Previous_Scores               False
Motivation_Level              False
Internet_Access               False
Tutoring_Sessions             False
Family_Income                 False
Teacher_Quality               False
School_Type                   False
Peer_Influence                False
Physical_Activity             False
Learning_Disabilities         False
Parental_Education_Level       True
Distance_from_Home             True
Gender                        False
Exam_Score                    False
dtype: bool

In [93]:
# shows the sum of rows that have no data in the Teacher_Quality column
display(df_spf['Teacher_Quality'].isna().sum())

# displays all the unique values in a column
display(df_spf['Teacher_Quality'].unique())

np.int64(0)

array([2, 3, 1])

In [58]:
# Checks the mode
mode_value = df_spf['Teacher_Quality'].mode()[0]
print("Most frequent value:", mode_value)

Most frequent value: Medium


In [99]:
# Replace missing values
df_spf['Teacher_Quality'] = df_spf['Teacher_Quality'].fillna(mode_value)

# check to see if there are still missing values after change
display(df_spf['Teacher_Quality'].isna().sum())

np.int64(0)

In [85]:
# mapped each category to a numeric value
mapping = {'Low': 1, 'Medium': 2, 'High': 3}

# used the mapping dictionary to change the Teacher_Quality data type
df_spf['Teacher_Quality'] = df_spf['Teacher_Quality'].map(mapping)

# checking the new data type
df_spf.info()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,2,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,2,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,2,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,2,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,3,Public,Neutral,4,No,College,Near,Female,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,High,Medium,No,7,76,Medium,Yes,1,High,2,Public,Positive,2,No,High School,Near,Female,68
6603,23,76,High,Medium,No,8,81,Medium,Yes,3,Low,3,Public,Positive,2,No,High School,Near,Female,69
6604,20,90,Medium,Low,Yes,6,65,Low,Yes,3,Low,2,Public,Negative,2,No,Postgraduate,Near,Female,68
6605,10,86,High,High,Yes,6,91,High,Yes,2,Low,2,Private,Positive,3,No,High School,Far,Female,68


In [100]:
display(df_spf['Parental_Education_Level'].unique())

display(df_spf['Parental_Education_Level'].isna().sum())

array(['High School', 'College', 'Postgraduate'], dtype=object)

np.int64(0)

In [95]:
mode_value = df_spf['Parental_Education_Level'].mode()[0]
print("Most frequent value:", mode_value)

Most frequent value: High School


In [98]:
df_spf['Parental_Education_Level'] = df_spf['Parental_Education_Level'].fillna(mode_value)

# check to see if there are still missing values after change
display(df_spf['Parental_Education_Level'].isna().sum())

np.int64(0)

In [107]:
df_spf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6607 non-null   int64 
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [104]:
display(df_spf['Distance_from_Home'].unique())

display(df_spf['Distance_from_Home'].isna().sum())

array(['Near', 'Moderate', 'Far', nan], dtype=object)

np.int64(67)

In [105]:
# Checks the mode
mode_value = df_spf['Distance_from_Home'].mode()[0]
print("Most frequent value:", mode_value)

Most frequent value: Near


In [106]:
# Replace missing values
df_spf['Distance_from_Home'] = df_spf['Distance_from_Home'].fillna(mode_value)

# check to see if there are still missing values after change
display(df_spf['Distance_from_Home'].isna().sum())

np.int64(0)

In [109]:
distance_mapping = {'Near': 1, 'Moderate': 2, 'Far': 3}

df_spf['Distance_from_Home'] = df_spf['Distance_from_Home'].map(distance_mapping)

df_spf.head(60)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,2,Public,Positive,3,No,High School,,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,2,Public,Negative,4,No,College,,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,2,Public,Neutral,4,No,Postgraduate,,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,2,Public,Negative,4,No,High School,,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,3,Public,Neutral,4,No,College,,Female,70
5,19,88,Medium,Medium,Yes,8,89,Medium,Yes,3,Medium,2,Public,Positive,3,No,Postgraduate,,Male,71
6,29,84,Medium,Low,Yes,7,68,Low,Yes,1,Low,2,Private,Neutral,2,No,High School,,Male,67
7,25,78,Low,High,Yes,6,50,Medium,Yes,1,High,3,Public,Negative,2,No,High School,,Male,66
8,17,94,Medium,High,No,6,80,High,Yes,0,Medium,1,Private,Neutral,1,No,College,,Male,69
9,23,98,Medium,Medium,Yes,8,71,Medium,Yes,0,High,3,Public,Positive,5,No,High School,,Male,72


# Encoding

#### Changing data types, to make it more easier for analysis.

In [112]:
display(df_spf['Parental_Involvement'].unique())

# used mapping dictionary defined earlier to change data type
df_spf['Parental_Involvement'] = df_spf['Parental_Involvement'].map(mapping)

array(['Low', 'Medium', 'High'], dtype=object)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Hours_Studied               6607 non-null   int64  
 1   Attendance                  6607 non-null   int64  
 2   Parental_Involvement        6607 non-null   int64  
 3   Access_to_Resources         6607 non-null   object 
 4   Extracurricular_Activities  6607 non-null   object 
 5   Sleep_Hours                 6607 non-null   int64  
 6   Previous_Scores             6607 non-null   int64  
 7   Motivation_Level            6607 non-null   object 
 8   Internet_Access             6607 non-null   object 
 9   Tutoring_Sessions           6607 non-null   int64  
 10  Family_Income               6607 non-null   object 
 11  Teacher_Quality             6607 non-null   int64  
 12  School_Type                 6607 non-null   object 
 13  Peer_Influence              6607 

In [119]:
display(df_spf['Access_to_Resources'].unique())

# used mapping dictionary defined earlier to change data type
df_spf['Access_to_Resources'] = df_spf['Access_to_Resources'].map(mapping)

# checking the new data type
df_spf.tail(60)

array([nan])

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
6547,15,94,2,,Yes,6,77,Low,Yes,3,High,3,Private,Neutral,4,No,College,,Female,70
6548,22,75,3,,No,7,52,High,Yes,0,Low,2,Private,Neutral,3,No,College,,Female,66
6549,23,90,2,,No,7,89,Low,Yes,2,High,2,Public,Positive,4,No,College,,Female,71
6550,20,79,2,,Yes,8,71,Medium,Yes,1,Medium,3,Public,Neutral,3,No,College,,Female,68
6551,27,68,1,,Yes,8,80,High,Yes,3,Medium,2,Public,Positive,4,Yes,High School,,Male,66
6552,22,65,1,,No,8,98,High,Yes,4,Medium,2,Private,Neutral,3,No,High School,,Female,66
6553,16,74,1,,No,8,76,Medium,Yes,5,Low,2,Public,Negative,3,No,College,,Male,65
6554,18,66,2,,Yes,7,90,Medium,Yes,0,Medium,2,Public,Negative,4,No,Postgraduate,,Male,64
6555,12,100,1,,Yes,7,65,Medium,Yes,1,Low,2,Private,Positive,2,No,High School,,Male,67
6556,19,62,2,,Yes,8,86,Low,Yes,1,High,2,Public,Positive,3,No,College,,Male,65
