In [3]:
import pandas as pd
import numpy as np

In [5]:
# Step 1: Create dataset with missing values and outliers
data = {
    'StudentID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Math': [85, 90, np.nan, 92, 88, 300, 95, np.nan, 83, 91],         # 300 is outlier, NaNs present
    'Physics': [80, 89, 75, 85, 82, np.nan, 96, 70, 84, 200],          # 200 is outlier, NaN present
    'Chemistry': [78, 85, 82, 89, 87, 95, 92, 76, np.nan, 88]          # 1 missing value
}

In [7]:
dataf = pd.DataFrame(data)
print("Initial Dataset:\n", dataf)

Initial Dataset:
    StudentID   Math  Physics  Chemistry
0          1   85.0     80.0       78.0
1          2   90.0     89.0       85.0
2          3    NaN     75.0       82.0
3          4   92.0     85.0       89.0
4          5   88.0     82.0       87.0
5          6  300.0      NaN       95.0
6          7   95.0     96.0       92.0
7          8    NaN     70.0       76.0
8          9   83.0     84.0        NaN
9         10   91.0    200.0       88.0


In [9]:
# Step 2: Handle missing values (Requirement 1)
dataf.fillna(dataf.mean(numeric_only=True), inplace=True)
print("\nAfter Handling Missing Values:\n", dataf)


After Handling Missing Values:
    StudentID   Math     Physics  Chemistry
0          1   85.0   80.000000  78.000000
1          2   90.0   89.000000  85.000000
2          3  115.5   75.000000  82.000000
3          4   92.0   85.000000  89.000000
4          5   88.0   82.000000  87.000000
5          6  300.0   95.666667  95.000000
6          7   95.0   96.000000  92.000000
7          8  115.5   70.000000  76.000000
8          9   83.0   84.000000  85.777778
9         10   91.0  200.000000  88.000000


In [13]:
# Step 3: Remove outliers using IQR method (Requirement 2)
numeric_cols = dataf.select_dtypes(include='number')
Q1 = numeric_cols.quantile(0.25)
Q3 = numeric_cols.quantile(0.75)
IQR = Q3 - Q1
dataf = dataf[(numeric_cols >= Q1 - 1.5 * IQR) & (numeric_cols <= Q3 + 1.5 * IQR)].dropna()
print("\nAfter Removing Outliers:\n", dataf)


After Removing Outliers:
    StudentID   Math  Physics  Chemistry
0          1   85.0     80.0  78.000000
1          2   90.0     89.0  85.000000
2          3  115.5     75.0  82.000000
3          4   92.0     85.0  89.000000
4          5   88.0     82.0  87.000000
6          7   95.0     96.0  92.000000
7          8  115.5     70.0  76.000000
8          9   83.0     84.0  85.777778


In [15]:
# Step 4: Apply log transformation to 'Math' (Requirement 3)
dataf['Math'] = np.log1p(dataf['Math'])
print("\nAfter Log Transformation on 'Math':\n", dataf)


After Log Transformation on 'Math':
    StudentID      Math  Physics  Chemistry
0          1  4.454347     80.0  78.000000
1          2  4.510860     89.0  85.000000
2          3  4.757891     75.0  82.000000
3          4  4.532599     85.0  89.000000
4          5  4.488636     82.0  87.000000
6          7  4.564348     96.0  92.000000
7          8  4.757891     70.0  76.000000
8          9  4.430817     84.0  85.777778


In [17]:
# Step 5: Final summary
print("\nFinal Summary Statistics:\n")
print(dataf.describe())


Final Summary Statistics:

       StudentID      Math    Physics  Chemistry
count   8.000000  8.000000   8.000000   8.000000
mean    4.875000  4.562174  82.625000  84.347222
std     2.900123  0.127846   8.034524   5.414214
min     1.000000  4.430817  70.000000  76.000000
25%     2.750000  4.480064  78.750000  81.000000
50%     4.500000  4.521729  83.000000  85.388889
75%     7.250000  4.612734  86.000000  87.500000
max     9.000000  4.757891  96.000000  92.000000
