In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [2]:
data = {
    'Age': [25, np.nan, 30, 22, 40, np.nan, 28],
    'Salary': [50000, 60000, np.nan, 52000, 58000, 62000, np.nan],
    'City': ['New York', 'Los Angeles', 'New York', np.nan, 'Chicago', 'Chicago', 'Los Angeles'],
    'Purchased': ['Yes', 'No', np.nan, 'No', 'Yes', 'Yes', 'No']
}

In [3]:
df_dummy = pd.DataFrame(data)
print("=== Original Dummy Dataset with Missing Values ===")
print(df_dummy)

=== Original Dummy Dataset with Missing Values ===
    Age   Salary         City Purchased
0  25.0  50000.0     New York       Yes
1   NaN  60000.0  Los Angeles        No
2  30.0      NaN     New York       NaN
3  22.0  52000.0          NaN        No
4  40.0  58000.0      Chicago       Yes
5   NaN  62000.0      Chicago       Yes
6  28.0      NaN  Los Angeles        No


In [4]:
# Count missing values
print("\n=== Missing Values Count ===")
print(df_dummy.isnull().sum())


=== Missing Values Count ===
Age          2
Salary       2
City         1
Purchased    1
dtype: int64


# TODO: Drop rows/columns with more than 50% missing values


In [5]:
df_drop = df_dummy.copy()
print("\n=== After Drop Strategy ===")
print(df_drop)


=== After Drop Strategy ===
    Age   Salary         City Purchased
0  25.0  50000.0     New York       Yes
1   NaN  60000.0  Los Angeles        No
2  30.0      NaN     New York       NaN
3  22.0  52000.0          NaN        No
4  40.0  58000.0      Chicago       Yes
5   NaN  62000.0      Chicago       Yes
6  28.0      NaN  Los Angeles        No


In [6]:
df_imputed = df_dummy.copy()

# TODO: Impute 'Age' with mean
# TODO: Impute 'Salary' with median

print("\n=== After Numerical Imputation ===")
print(df_imputed[['Age', 'Salary']])



=== After Numerical Imputation ===
    Age   Salary
0  25.0  50000.0
1   NaN  60000.0
2  30.0      NaN
3  22.0  52000.0
4  40.0  58000.0
5   NaN  62000.0
6  28.0      NaN


In [7]:
df_imputed = df_dummy.copy()

# TODO: Impute 'Age' with mean
# TODO: Impute 'Salary' with median

print("\n=== After Numerical Imputation ===")
print(df_imputed[['Age', 'Salary']])


=== After Numerical Imputation ===
    Age   Salary
0  25.0  50000.0
1   NaN  60000.0
2  30.0      NaN
3  22.0  52000.0
4  40.0  58000.0
5   NaN  62000.0
6  28.0      NaN


In [8]:
# TODO: Impute 'City' and 'Purchased' with mode

print("\n=== After Categorical Imputation ===")
print(df_imputed[['City', 'Purchased']])


=== After Categorical Imputation ===
          City Purchased
0     New York       Yes
1  Los Angeles        No
2     New York       NaN
3          NaN        No
4      Chicago       Yes
5      Chicago       Yes
6  Los Angeles        No


In [9]:
# TODO: Apply KNN Imputer on numerical columns

print("\n=== After KNN Imputation (Numerical Columns) ===")
print(df_imputed[['Age', 'Salary']])


=== After KNN Imputation (Numerical Columns) ===
    Age   Salary
0  25.0  50000.0
1   NaN  60000.0
2  30.0      NaN
3  22.0  52000.0
4  40.0  58000.0
5   NaN  62000.0
6  28.0      NaN


In [10]:
# TODO: Verify missing values
print("\n=== Missing Values After Imputation ===")
print(df_imputed.isnull().sum())

# TODO: Print summary statistics
print("\n=== Summary Statistics ===")
print(df_imputed.describe())

# TODO: Print unique values of categorical columns
print("\n=== Unique Values (City) ===", df_imputed['City'].unique())
print("=== Unique Values (Purchased) ===", df_imputed['Purchased'].unique())


=== Missing Values After Imputation ===
Age          2
Salary       2
City         1
Purchased    1
dtype: int64

=== Summary Statistics ===
             Age        Salary
count   5.000000      5.000000
mean   29.000000  56400.000000
std     6.855655   5176.871642
min    22.000000  50000.000000
25%    25.000000  52000.000000
50%    28.000000  58000.000000
75%    30.000000  60000.000000
max    40.000000  62000.000000

=== Unique Values (City) === ['New York' 'Los Angeles' nan 'Chicago']
=== Unique Values (Purchased) === ['Yes' 'No' nan]
