In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the student dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("data/studentdata.csv")

In [3]:
# Preview the first five rows (the head() default) to sanity-check the load.
df.head()

Unnamed: 0,Age,Grade,Gender,Race,SES_Quartile,ParentalEducation,SchoolType,Locale,TestScore_Math,TestScore_Reading,...,GPA,AttendanceRate,StudyHours,InternetAccess,Extracurricular,PartTimeJob,ParentSupport,Romantic,FreeTime,GoOut
0,15,10,Female,White,1,HS,Public,City,72.346053,62.217134,...,2.521745,0.868836,0.310172,0,1,1,1,0,3,3
1,16,11,Female,Hispanic,1,<HS,Private,City,77.889157,72.74803,...,3.275626,0.909595,1.175586,1,1,0,0,1,3,1
2,17,12,Female,Black,2,HS,Public,Rural,72.966587,65.585472,...,2.974137,0.870952,1.112556,1,1,0,0,0,3,3
3,16,11,Female,White,2,HS,Public,Town,96.674049,88.035852,...,3.67659,1.0,1.067679,0,0,0,0,1,4,5
4,16,11,Male,Black,3,Bachelors+,Public,Rural,81.98927,77.485372,...,2.255014,0.897957,0.841936,0,1,0,1,0,4,2


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(999997, 21)

In [5]:
# Per-column dtype and non-null count, to spot mis-parsed types or missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999997 entries, 0 to 999996
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Age                999997 non-null  int64  
 1   Grade              999997 non-null  int64  
 2   Gender             999997 non-null  object 
 3   Race               999997 non-null  object 
 4   SES_Quartile       999997 non-null  int64  
 5   ParentalEducation  999997 non-null  object 
 6   SchoolType         999997 non-null  object 
 7   Locale             999997 non-null  object 
 8   TestScore_Math     999997 non-null  float64
 9   TestScore_Reading  999997 non-null  float64
 10  TestScore_Science  999997 non-null  float64
 11  GPA                999997 non-null  float64
 12  AttendanceRate     999997 non-null  float64
 13  StudyHours         999997 non-null  float64
 14  InternetAccess     999997 non-null  int64  
 15  Extracurricular    999997 non-null  int64  
 16  Pa

In [6]:
# Summary statistics for every column: include='all' adds unique/top/freq for
# the object (string) columns alongside the numeric mean/std/quantiles.
df.describe(include='all')

Unnamed: 0,Age,Grade,Gender,Race,SES_Quartile,ParentalEducation,SchoolType,Locale,TestScore_Math,TestScore_Reading,...,GPA,AttendanceRate,StudyHours,InternetAccess,Extracurricular,PartTimeJob,ParentSupport,Romantic,FreeTime,GoOut
count,999997.0,999997.0,999997,999997,999997.0,999997,999997,999997,999997.0,999997.0,...,999997.0,999997.0,999997.0,999997.0,999997.0,999997.0,999997.0,999997.0,999997.0,999997.0
unique,,,2,6,,4,2,4,,,...,,,,,,,,,,
top,,,Female,White,,HS,Public,Suburban,,,...,,,,,,,,,,
freq,,,510091,439714,,325268,844422,390734,,,...,,,,,,,,,,
mean,15.999193,10.799551,,,2.500284,,,,74.980073,74.978168,...,2.99676,0.899562,0.999933,0.850403,0.549809,0.18061,0.422772,0.200068,3.002909,2.072477
std,1.413712,1.165967,,,1.117889,,,,9.944334,9.954602,...,0.459472,0.048024,0.348642,0.356677,0.497513,0.384695,0.494,0.400051,1.008419,0.919165
min,14.0,9.0,,,1.0,,,,29.146751,30.796723,...,0.752867,0.7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,15.0,10.0,,,2.0,,,,68.248703,68.250126,...,2.684924,0.866991,0.763883,1.0,0.0,0.0,0.0,0.0,2.0,1.0
50%,16.0,11.0,,,3.0,,,,74.998702,74.985049,...,2.999782,0.899881,0.999841,1.0,1.0,0.0,0.0,0.0,3.0,2.0
75%,17.0,12.0,,,3.0,,,,81.75024,81.746642,...,3.314308,0.932912,1.236215,1.0,1.0,0.0,1.0,0.0,4.0,3.0


In [7]:
# Count rows that exactly repeat an earlier row across all columns
# (duplicated() flags every occurrence after the first).
df.duplicated().sum()

np.int64(0)

In [8]:
# Number of missing (NaN/None) entries in each column.
df.isna().sum()

Age                  0
Grade                0
Gender               0
Race                 0
SES_Quartile         0
ParentalEducation    0
SchoolType           0
Locale               0
TestScore_Math       0
TestScore_Reading    0
TestScore_Science    0
GPA                  0
AttendanceRate       0
StudyHours           0
InternetAccess       0
Extracurricular      0
PartTimeJob          0
ParentSupport        0
Romantic             0
FreeTime             0
GoOut                0
dtype: int64

In [9]:
# Expected inclusive (min, max) bounds per column; the validation loop flags a
# value only when it is strictly below min or strictly above max.
range_checks = {
    "Age": (14, 18),
    "Grade": (9, 12),
    "GPA": (0.0, 4.0),
    "AttendanceRate": (0.7, 1.0),  # stored as a fraction, not a percentage
    "StudyHours": (0, 4),
    "FreeTime": (1, 5),
    "GoOut": (1, 5),
    "TestScore_Math": (0, 100),
    "TestScore_Reading": (0, 100),
    "TestScore_Science": (0, 100),
}

In [10]:
# Validate each configured column against its inclusive (min, max) bounds and
# report how many rows fall outside them. `issues_found` is left in the
# namespace so a later cell can print a confirmation when nothing was flagged.
issues_found = False
for col, (min_val, max_val) in range_checks.items():
    if col not in df.columns:
        # Surface skipped checks instead of letting them pass silently.
        print(f"⚠️ {col}: column not found, range check skipped")
        continue
    # Sum the boolean mask directly rather than materialising a filtered copy
    # of the frame for every column. NaN compares False on both sides, so
    # missing values are not counted here (see the separate isnull() check).
    out_of_range = (df[col] < min_val) | (df[col] > max_val)
    invalid_count = int(out_of_range.sum())
    if invalid_count > 0:
        print(f"⚠️ {col}: {invalid_count} values outside valid range ({min_val}–{max_val})")
        issues_found = True

In [11]:
# Confirmation message, printed only when the validation loop flagged no column.
if not issues_found:
    print("All column values are within valid ranges.")

All column values are within valid ranges.


In [12]:
# Write the checked frame to a new CSV; index=False omits the row index column.
# NOTE(review): no cell shown here modifies df, and issues_found is not
# consulted before saving — confirm that an unchanged copy is what is intended.
output_path_clean = "data/studentdata_clean.csv"
df.to_csv(output_path_clean, index=False)
print(f"\nClean dataset saved as: {output_path_clean}")


Clean dataset saved as: data/studentdata_clean.csv
