In [1]:
import pandas as pd
import numpy as np

data = {
    'Student_ID': range(1, 21),
    'Name': [f'Student_{i}' for i in range(1, 21)],
    'Age': [18, 19, np.nan, 20, 21, 17, 19, np.nan, 20, 18, 22, 19, 20, 17, 18, 19, 20, 21, 17, 18],
    'Gender': ['M', 'Male', 'F', 'Female', 'M', 'F', 'm', 'f', 'Male', 'Female', 'M', 'F', 'M', 'F', 'Male', 'Female', 'm', 'f', 'M', 'F'],
    'Math_Score': [85, 92, 88, np.nan, 120, 75, 82, 90, 95, 100, 110, 78, 88, 92, 85, 76, 105, 89, 94, 99],
    'Science_Score': [78, 85, 90, 88, 92, 80, 84, 91, 76, 89, 95, 77, 83, 85, 90, 87, 96, 82, 88, 93],
    'English_Score': [80, 88, 85, 90, 78, 82, 91, 84, 89, 92, 75, 86, 79, 93, 87, 84, 80, 85, 90, 95],
    'Attendance_Percentage': [90, 85, 95, 88, 92, 75, 80, 98, 89, 93, 100, 82, 87, 91, 84, 79, 96, 83, 88, 94],
    'Study_Hours_Per_Week': [15, 20, 25, 18, 50, 10, 22, 30, 19, 24, 40, 12, 28, 16, 21, 14, 35, 17, 23, 29]
}

df = pd.DataFrame(data)

In [2]:
print(df.isnull().sum()) #Check Missing Values

Student_ID               0
Name                     0
Age                      2
Gender                   0
Math_Score               1
Science_Score            0
English_Score            0
Attendance_Percentage    0
Study_Hours_Per_Week     0
dtype: int64


In [3]:
df['Age'].fillna(df['Age'].median(), inplace=True)#Handle Missing Values
df['Math_Score'].fillna(df['Math_Score'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Math_Score'].fillna(df['Math_Score'].median(), inplace=True)


In [9]:
#Check Inconsistencies
gender_mapping = {'Male': 'M', 'Female': 'F', 'm': 'M', 'f': 'F'}
df['Gender'] = df['Gender'].replace(gender_mapping).str.upper()

In [6]:
#Identify Outliers in Numeric Variables
# Cap Math_Score
df['Math_Score'] = df['Math_Score'].clip(upper=100)

# Cap Study_Hours using IQR
Q1 = df['Study_Hours_Per_Week'].quantile(0.25)
Q3 = df['Study_Hours_Per_Week'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR
df['Study_Hours_Per_Week'] = df['Study_Hours_Per_Week'].clip(upper=upper_bound)

In [7]:
#Check Skewness and Transform
# Check skewness before
print(df['Study_Hours_Per_Week'].skew())  # Likely positive (right-skewed)

# Apply log transformation
df['Study_Hours_Log'] = np.log1p(df['Study_Hours_Per_Week'])

# Check skewness after
print(df['Study_Hours_Log'].skew())  # Reduced skewness

0.9075788395532246
0.06843187165824348


In [8]:
print(df.head())

   Student_ID       Name   Age Gender  Math_Score  Science_Score  \
0           1  Student_1  18.0      M        85.0             78   
1           2  Student_2  19.0      M        92.0             85   
2           3  Student_3  19.0      F        88.0             90   
3           4  Student_4  20.0      F        90.0             88   
4           5  Student_5  21.0      M       100.0             92   

   English_Score  Attendance_Percentage  Study_Hours_Per_Week  Study_Hours_Log  
0             80                     90                  15.0         2.772589  
1             88                     85                  20.0         3.044522  
2             85                     95                  25.0         3.258097  
3             90                     88                  18.0         2.944439  
4             78                     92                  45.5         3.839452  
