In [None]:
!pip install pandas numpy scikit-learn


In [3]:
# =========================================
# Week 2 – Data Cleansing & Preprocessing
# Dataset: StudentsPerformance.csv
# Tool: Jupyter Notebook (Pandas)
# =========================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# -----------------------------------------
# Step 1: Load Dataset
# -----------------------------------------
# The CSV is read from the notebook's working directory.
raw_csv_path = "StudentsPerformance.csv"
df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape, then show the first five rows as the
# cell's displayed result.
print("Initial Dataset Shape:", df.shape)
df.head(5)


Initial Dataset Shape: (1000, 8)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# Only the last expression of a cell is displayed automatically, so the
# intermediate checks are printed explicitly; otherwise their results are lost.

# Check missing values (count of nulls per column)
print("Missing values per column:")
print(df.isnull().sum())

# Check duplicates (number of fully duplicated rows)
print("Duplicate rows:", df.duplicated().sum())

# Check data types (info() prints its own report)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [8]:
# Impute gaps in the three score columns with each column's own mean.
numeric_cols = ['math score', 'reading score', 'writing score']
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(column_means)

# Impute gaps in the object-typed columns with each column's mode.
# mode() returns a frame; its first row holds one most-frequent value per column.
categorical_cols = df.select_dtypes(include='object').columns
most_frequent = df[categorical_cols].mode().iloc[0]
df[categorical_cols] = df[categorical_cols].fillna(most_frequent)


In [9]:
# Drop fully duplicated rows (the first occurrence is kept). Reassigning
# instead of using inplace=True keeps the step explicit and safe to re-run.
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)


Shape after removing duplicates: (1000, 8)


In [12]:
# Average of the three exam scores per student, rounded to two decimals.
score_total = df['math score'] + df['reading score'] + df['writing score']
df['average_score'] = (score_total / 3).round(2)


In [13]:
# Label each row 'pass' when its average score is at least 50, else 'fail'.
meets_threshold = df['average_score'] >= 50
df['pass_status'] = np.where(meets_threshold, 'pass', 'fail')


In [14]:
# Min-max scale the three score columns into new *_scaled columns; the
# original score columns are left unchanged.
score_columns = ['math score', 'reading score', 'writing score']
scaled_columns = ['math_score_scaled', 'reading_score_scaled', 'writing_score_scaled']

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[score_columns])
df[scaled_columns] = scaled_values


In [15]:
# Preview the first five rows, now including the derived and scaled columns.
# NOTE(review): the saved output of this cell shows lowercased values (e.g. 'group b'),
# while the initial preview shows 'group B' and no visible cell lowercases text.
# The execution counts also skip numbers, so a removed cell presumably did this;
# confirm and restore it so Restart & Run All reproduces the saved output.
df.head()


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average_score,pass_status,math_score_scaled,reading_score_scaled,writing_score_scaled
0,female,group b,bachelor's degree,standard,none,72,72,74,72.67,pass,0.72,0.662651,0.711111
1,female,group c,some college,standard,completed,69,90,88,82.33,pass,0.69,0.879518,0.866667
2,female,group b,master's degree,standard,none,90,95,93,92.67,pass,0.9,0.939759,0.922222
3,male,group a,associate's degree,free/reduced,none,47,57,44,49.33,fail,0.47,0.481928,0.377778
4,male,group c,some college,standard,none,76,78,75,76.33,pass,0.76,0.73494,0.722222


In [16]:
# Write the processed frame to disk without the index column, then confirm.
cleaned_csv_path = "StudentsPerformance_CLEANED.csv"
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved successfully!")


Cleaned dataset saved successfully!


In [17]:
# Data dictionary: one (column name, description) pair per documented column,
# listed in the order the columns appear in the cleaned dataset.
column_descriptions = [
    ("gender", "Student gender"),
    ("race/ethnicity", "Student race/ethnicity group"),
    ("parental level of education", "Highest education level of parents"),
    ("lunch", "Lunch type (standard or free/reduced)"),
    ("test preparation course", "Test prep course completion status"),
    ("math score", "Math exam score"),
    ("reading score", "Reading exam score"),
    ("writing score", "Writing exam score"),
    ("average_score", "Average of math, reading, and writing scores"),
    ("pass_status", "Pass if average_score >= 50, else Fail"),
    ("math_score_scaled", "Normalized math score"),
    ("reading_score_scaled", "Normalized reading score"),
    ("writing_score_scaled", "Normalized writing score"),
]
data_dictionary = dict(column_descriptions)

# Print the dictionary as "column: description" lines.
for column, description in data_dictionary.items():
    print(f"{column}: {description}")


gender: Student gender
race/ethnicity: Student race/ethnicity group
parental level of education: Highest education level of parents
lunch: Lunch type (standard or free/reduced)
test preparation course: Test prep course completion status
math score: Math exam score
reading score: Reading exam score
writing score: Writing exam score
average_score: Average of math, reading, and writing scores
pass_status: Pass if average_score >= 50, else Fail
math_score_scaled: Normalized math score
reading_score_scaled: Normalized reading score
writing_score_scaled: Normalized writing score
