In [1]:
# Perform the following operations using Python by creating a student performance dataset.
# 1.Display Missing Values
# 2.Replace missing values using any 2 suitable 
# 3.Identify outliers using IQR and ZScore
# 4 Handle outlier using any technique
# 5.Perform data normalization using decimal scaling

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zscore

In [5]:
data={
    'name':['A','B','C','D','E'],
    'math': [85, 90, np.nan, 45, 300],        # 300 is an outlier
    'science': [88, 95, 89, 30, 500],         # 500 is an outlier
    'english': [78, 82, 85, np.nan, 40]
}
df=pd.DataFrame(data)
df


Unnamed: 0,name,math,science,english
0,A,85.0,88,78.0
1,B,90.0,95,82.0
2,C,,89,85.0
3,D,45.0,30,
4,E,300.0,500,40.0


In [6]:
# 1.Display Missing Values
df.isnull().sum()

name       0
math       1
science    0
english    1
dtype: int64

In [7]:
# 2.Replace missing values using any 2 suitable 
# Replace with mean (for math)
df['math'] = df['math'].fillna(df['math'].mean())

# Replace with median (for english)
df['english'] = df['english'].fillna(df['english'].median())

df

Unnamed: 0,name,math,science,english
0,A,85.0,88,78.0
1,B,90.0,95,82.0
2,C,130.0,89,85.0
3,D,45.0,30,80.0
4,E,300.0,500,40.0


In [10]:
# 3.Identify outliers using IQR and ZScore
Q1 = df['science'].quantile(0.25)
Q3 = df['science'].quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = df[(df['science'] < (Q1 - 1.5 * IQR)) | (df['science'] > (Q3 + 1.5 * IQR))]
print("Outliers using IQR:")
print(outliers_iqr)

# Zscore
z_scores = np.abs(zscore(df[['math', 'science', 'english']]))
outliers_zscore = df[(z_scores > 3).any(axis=1)]
print("Outliers using Z-Score:")
print(outliers_zscore)


Outliers using IQR:
  name   math  science  english
3    D   45.0       30     80.0
4    E  300.0      500     40.0
Outliers using Z-Score:
Empty DataFrame
Columns: [name, math, science, english]
Index: []


In [11]:
# 4 Handle outlier using any technique
df['math'] = df['math'].clip(0, 100)
df['science'] = df['science'].clip(0, 100)
df['english'] = df['english'].clip(0, 100)

df

Unnamed: 0,name,math,science,english
0,A,85.0,88,78.0
1,B,90.0,95,82.0
2,C,100.0,89,85.0
3,D,45.0,30,80.0
4,E,100.0,100,40.0


In [12]:
# 5.Perform data normalization using decimal scaling
def decimal_scaling(col):
    max_val = col.abs().max()
    j = len(str(int(max_val)))
    return col / (10**j)

df['math_scaled'] = decimal_scaling(df['math'])
df['science_scaled'] = decimal_scaling(df['science'])
df['english_scaled'] = decimal_scaling(df['english'])

df[['math_scaled', 'science_scaled', 'english_scaled']]

Unnamed: 0,math_scaled,science_scaled,english_scaled
0,0.085,0.088,0.78
1,0.09,0.095,0.82
2,0.1,0.089,0.85
3,0.045,0.03,0.8
4,0.1,0.1,0.4
