In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('../datasets/Student_Performance.csv')
print(df.shape)
df.head()


(25000, 16)


Unnamed: 0,student_id,age,gender,school_type,parent_education,study_hours,attendance_percentage,internet_access,travel_time,extra_activities,study_method,math_score,science_score,english_score,overall_score,final_grade
0,1,14,male,public,post graduate,3.1,84.3,yes,<15 min,yes,notes,42.7,55.4,57.0,53.1,e
1,2,18,female,public,graduate,3.7,87.8,yes,>60 min,no,textbook,57.6,68.8,64.8,61.3,d
2,3,17,female,private,post graduate,7.9,65.5,no,<15 min,no,notes,84.8,95.0,79.2,89.6,b
3,4,16,other,public,high school,1.1,58.1,no,15-30 min,no,notes,44.4,27.5,54.7,41.6,e
4,5,16,female,public,high school,1.3,61.0,yes,30-60 min,yes,group study,8.9,32.7,30.0,25.4,f


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   student_id             25000 non-null  int64  
 1   age                    25000 non-null  int64  
 2   gender                 25000 non-null  object 
 3   school_type            25000 non-null  object 
 4   parent_education       25000 non-null  object 
 5   study_hours            25000 non-null  float64
 6   attendance_percentage  25000 non-null  float64
 7   internet_access        25000 non-null  object 
 8   travel_time            25000 non-null  object 
 9   extra_activities       25000 non-null  object 
 10  study_method           25000 non-null  object 
 11  math_score             25000 non-null  float64
 12  science_score          25000 non-null  float64
 13  english_score          25000 non-null  float64
 14  overall_score          25000 non-null  float64
 15  fi

In [5]:
df.describe()


Unnamed: 0,student_id,age,study_hours,attendance_percentage,math_score,science_score,english_score,overall_score
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,7493.0438,16.48276,4.253224,75.084084,63.785944,63.74532,63.681948,64.006172
std,4323.56215,1.703895,2.167541,14.373171,20.875262,20.970529,20.792693,18.932025
min,1.0,14.0,0.5,50.0,0.0,0.0,0.0,14.5
25%,3743.75,15.0,2.4,62.8,48.3,48.2,48.3,49.0
50%,7461.5,16.0,4.3,75.1,64.1,64.1,64.2,64.2
75%,11252.0,18.0,6.1,87.5,80.0,80.0,80.0,79.0
max,15000.0,19.0,8.0,100.0,100.0,100.0,100.0,100.0


In [None]:
before = df.shape[0]
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
after = df.shape[0]
print(f"Duplicates removed : {before - after}")
print(f"Remaining rows     : {after}")


Duplicates removed : 10000
Remaining rows     : 15000


In [7]:
missing = df.isnull().sum().sum()
print(missing)



0


In [12]:
df.drop(columns=['student_id'], inplace=True)


KeyError: "['student_id'] not found in axis"

In [13]:
cat_cols = df.select_dtypes(include='object').columns.tolist()
for col in cat_cols:
    df[col] = df[col].str.strip().str.lower()

print("Unique values per categorical column:")
for col in cat_cols:
    print(f" {col:25s}: {sorted(df[col].unique().tolist())}")


Unique values per categorical column:
 gender                   : ['female', 'male', 'other']
 school_type              : ['private', 'public']
 parent_education         : ['diploma', 'graduate', 'high school', 'no formal', 'phd', 'post graduate']
 internet_access          : ['no', 'yes']
 travel_time              : ['15-30 min', '30-60 min', '<15 min', '>60 min']
 extra_activities         : ['no', 'yes']
 study_method             : ['coaching', 'group study', 'mixed', 'notes', 'online videos', 'textbook']
 final_grade              : ['a', 'b', 'c', 'd', 'e', 'f']


In [14]:
numeric_cols = ['study_hours', 'attendance_percentage',
                'math_score', 'science_score', 'english_score', 'overall_score']

print("Outlier summary (IQR method):")
for col in numeric_cols:
    Q1  = df[col].quantile(0.25)
    Q3  = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    n_out = ((df[col] < lower) | (df[col] > upper)).sum()
    print(f"  {col:25s}  lower={lower:.2f}  upper={upper:.2f}  outliers={n_out}")


Outlier summary (IQR method):
  study_hours                lower=-3.15  upper=11.65  outliers=0
  attendance_percentage      lower=25.75  upper=124.55  outliers=0
  math_score                 lower=0.75  upper=127.55  outliers=10
  science_score              lower=0.50  upper=127.70  outliers=12
  english_score              lower=0.75  upper=127.55  outliers=2
  overall_score              lower=4.00  upper=124.00  outliers=0


In [15]:
for col in numeric_cols:
    Q1  = df[col].quantile(0.25)
    Q3  = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower, upper=upper)

print(" Outliers capped using IQR Winsorisation.")
df[numeric_cols].describe()


 Outliers capped using IQR Winsorisation.


Unnamed: 0,study_hours,attendance_percentage,math_score,science_score,english_score,overall_score
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,4.253224,75.084084,63.786244,63.74552,63.682008,64.006172
std,2.167541,14.373171,20.87435,20.969924,20.79251,18.932025
min,0.5,50.0,0.75,0.5,0.75,14.5
25%,2.4,62.8,48.3,48.2,48.3,49.0
50%,4.3,75.1,64.1,64.1,64.2,64.2
75%,6.1,87.5,80.0,80.0,80.0,79.0
max,8.0,100.0,100.0,100.0,100.0,100.0


In [None]:

binary_map = {'yes': 1, 'no': 0}
for col in ['internet_access', 'extra_activities']:
    df[col] = df[col].map(binary_map)
    print(f" {col} → {df[col].unique()}")

travel_order = [['<15 min', '15-30 min', '30-60 min', '>60 min']]
ordinal_enc = OrdinalEncoder(categories=travel_order)
df['travel_time'] = ordinal_enc.fit_transform(df[['travel_time']]).astype(int)

print(f"\n  travel_time categories: {travel_order[0]}")
print(f"  travel_time → {sorted(df['travel_time'].unique())}")

# 9c. Ordinal column: parent_education (has a natural order)
edu_order = [['no formal', 'high school', 'diploma', 'graduate', 'post graduate', 'phd']]
ord_edu = OrdinalEncoder(categories=edu_order)
df['parent_education'] = ord_edu.fit_transform(df[['parent_education']]).astype(int)

print(f"\n  parent_education categories: {edu_order[0]}")
print(f"  parent_education → {sorted(df['parent_education'].unique())}")




 internet_access → [nan]
 extra_activities → [nan]


ValueError: invalid literal for int() with base 10: '<15 min'

In [None]:
nominal_cols = ['gender', 'school_type', 'study_method']
df = pd.get_dummies(df, columns=nominal_cols, drop_first=False, dtype=int)

grade_order = [['f', 'e', 'd', 'c', 'b', 'a']]
ord_grade = OrdinalEncoder(categories=grade_order)
df['final_grade'] = ord_grade.fit_transform(df[['final_grade']]).astype(int)
print("Grade mapping (f=0, e=1, d=2, c=3, b=4, a=5):")
print(df['final_grade'].value_counts().sort_index())

df.head(3)


Grade mapping (f=0, e=1, d=2, c=3, b=4, a=5):
final_grade
0    1796
1    3378
2    3770
3    3697
4    1638
5     721
Name: count, dtype: int64

Dataset shape after encoding: (15000, 23)


Unnamed: 0,age,parent_education,study_hours,attendance_percentage,internet_access,travel_time,extra_activities,math_score,science_score,english_score,...,gender_male,gender_other,school_type_private,school_type_public,study_method_coaching,study_method_group study,study_method_mixed,study_method_notes,study_method_online videos,study_method_textbook
0,14,4,3.1,84.3,1,0,1,42.7,55.4,57.0,...,1,0,0,1,0,0,0,1,0,0
1,18,3,3.7,87.8,1,3,0,57.6,68.8,64.8,...,0,0,0,1,0,0,0,0,0,1
2,17,4,7.9,65.5,0,0,0,84.8,95.0,79.2,...,0,0,1,0,0,0,0,1,0,0


In [18]:
X = df.drop(columns=['final_grade'])
y = df['final_grade']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Training set   : {X_train.shape}")
print(f"Test set       : {X_test.shape}")


Training set   : (20000, 14)
Test set       : (5000, 14)


In [19]:
scale_cols = ['age', 'study_hours', 'attendance_percentage',
              'math_score', 'science_score', 'english_score',
              'overall_score', 'travel_time', 'parent_education']

scaler = StandardScaler()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols]  = scaler.transform(X_test[scale_cols])

print("Scaling applied to:", scale_cols)
print("\nSample of scaled training features:")
X_train[scale_cols].describe().round(2)


Scaling applied to: ['age', 'study_hours', 'attendance_percentage', 'math_score', 'science_score', 'english_score', 'overall_score', 'travel_time', 'parent_education']

Sample of scaled training features:


Unnamed: 0,age,study_hours,attendance_percentage,math_score,science_score,english_score,overall_score,travel_time,parent_education
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.46,-1.73,-1.74,-3.03,-3.03,-3.03,-2.62,-1.36,-1.47
25%,-0.87,-0.86,-0.86,-0.74,-0.74,-0.73,-0.79,-0.45,-0.88
50%,-0.29,0.02,-0.0,0.01,0.01,0.02,0.01,0.45,-0.29
75%,0.89,0.85,0.87,0.77,0.78,0.79,0.79,0.45,0.89
max,1.48,1.73,1.73,1.74,1.73,1.75,1.9,1.35,1.48


In [21]:
X_train.assign(final_grade=y_train.values).to_csv(
    '../datasets/train_cleaned.csv', index=False)
X_test.assign(final_grade=y_test.values).to_csv(
    '../datasets/test_cleaned.csv', index=False)




  Final training features : (12000, 22)
  Final test features     : (3000, 22)
  Feature columns         : ['age', 'parent_education', 'study_hours', 'attendance_percentage', 'internet_access', 'travel_time', 'extra_activities', 'math_score', 'science_score', 'english_score', 'overall_score', 'gender_female', 'gender_male', 'gender_other', 'school_type_private', 'school_type_public', 'study_method_coaching', 'study_method_group study', 'study_method_mixed', 'study_method_notes', 'study_method_online videos', 'study_method_textbook']
