In [1]:
import pandas as pd
from io import StringIO

# Sample CSV data with missing values (simulates employees.csv).
# The leading blank line inside the literal is skipped by read_csv's default
# skip_blank_lines=True, so the first parsed line is the header row.
csv_data = """
id,name,age,salary,department
1,Alice,25,50000,HR
2,Bob,,60000,Finance
3,Charlie,30,,IT
4,David,22,45000,
5,Eve,28,52000,HR
6,Frank,,,
"""

# Load dataset; empty fields are parsed as NaN.
df = pd.read_csv(StringIO(csv_data))

print("Original Data:")
print(df)

# 1. Dropping Missing Data
# dropna() returns a new frame containing only fully populated rows; `df`
# itself is left untouched so the later strategies start from the raw data.
df_dropped = df.dropna()
print("\nAfter Dropping Rows with Missing Values:")
print(df_dropped)

# 2. Imputation using Mean (numerical columns only)
df_mean_imputed = df.copy()
for col in ['age', 'salary']:
    # Assign the filled column back rather than calling
    # `frame[col].fillna(..., inplace=True)`: that form mutates a temporary
    # selection, which pandas warns about and which is a silent no-op once
    # Copy-on-Write is enabled.
    df_mean_imputed[col] = df_mean_imputed[col].fillna(df_mean_imputed[col].mean())

print("\nAfter Imputation with Mean:")
print(df_mean_imputed)

# 3. Imputation using Median (numerical) and Mode (categorical)
df_median_mode_imputed = df.copy()

# Impute numerical columns with median
for col in ['age', 'salary']:
    df_median_mode_imputed[col] = df_median_mode_imputed[col].fillna(
        df_median_mode_imputed[col].median()
    )

# Impute categorical columns with mode
for col in ['department', 'name']:
    mode_val = df_median_mode_imputed[col].mode()
    # mode() returns an empty Series when the column has no non-missing
    # values; in that case there is nothing to fill with, so skip the column.
    if not mode_val.empty:
        # mode() is sorted, so position 0 is a deterministic tie-break.
        df_median_mode_imputed[col] = df_median_mode_imputed[col].fillna(mode_val.iloc[0])

print("\nAfter Imputation with Median (numerical) and Mode (categorical):")
print(df_median_mode_imputed)


Original Data:
   id     name   age   salary department
0   1    Alice  25.0  50000.0         HR
1   2      Bob   NaN  60000.0    Finance
2   3  Charlie  30.0      NaN         IT
3   4    David  22.0  45000.0        NaN
4   5      Eve  28.0  52000.0         HR
5   6    Frank   NaN      NaN        NaN

After Dropping Rows with Missing Values:
   id   name   age   salary department
0   1  Alice  25.0  50000.0         HR
4   5    Eve  28.0  52000.0         HR

After Imputation with Mean:
   id     name    age   salary department
0   1    Alice  25.00  50000.0         HR
1   2      Bob  26.25  60000.0    Finance
2   3  Charlie  30.00  51750.0         IT
3   4    David  22.00  45000.0        NaN
4   5      Eve  28.00  52000.0         HR
5   6    Frank  26.25  51750.0        NaN

After Imputation with Median (numerical) and Mode (categorical):
   id     name   age   salary department
0   1    Alice  25.0  50000.0         HR
1   2      Bob  26.5  60000.0    Finance
2   3  Charlie  30.0  51000.0         IT
3   4    David  22.0  45000.0         HR
4   5      Eve  28.0  52000.0         HR
5   6    Frank  26.5  51000.0         HR

In [2]:
import pandas as pd
from io import StringIO
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
import numpy as np

# Inline CSV fixture with gaps in age, salary and department.
csv_data = """
id,name,age,salary,department
1,Alice,25,50000,HR
2,Bob,,60000,Finance
3,Charlie,30,,IT
4,David,22,45000,
5,Eve,28,52000,HR
6,Frank,,,
"""

df = pd.read_csv(StringIO(csv_data))

# Only these numeric columns are handed to the scikit-learn imputers.
num_cols = ['age', 'salary']

# 4. ML-based Imputation with SimpleImputer (mean strategy)
df_simple_imputed = df.copy()
simple_imputer = SimpleImputer(strategy='mean')
simple_filled = simple_imputer.fit_transform(df_simple_imputed[num_cols])
df_simple_imputed[num_cols] = simple_filled
print("4. SimpleImputer with Mean:")
print(df_simple_imputed)

# 5. Imputation using Regression Model (predict missing 'salary' based on 'age')
df_reg = df.copy()

# Row masks describing which of the two fields are populated.
has_age = df_reg['age'].notna()
has_salary = df_reg['salary'].notna()

# Training rows have both age and salary; rows to predict have an age but
# no salary. Rows with no age are left as they are.
train_data = df_reg[has_age & has_salary]
test_data = df_reg[has_age & ~has_salary]

# Fit and predict only when there is something to learn from and to fill.
if not (train_data.empty or test_data.empty):
    lr = LinearRegression().fit(train_data[['age']], train_data['salary'])

    predicted_salary = lr.predict(test_data[['age']])
    df_reg.loc[test_data.index, 'salary'] = predicted_salary

print("\n5. Regression-based Imputation for 'salary':")
print(df_reg)

# 6. K-Nearest Neighbors Imputation
df_knn = df.copy()

# Only the numeric columns are passed to KNNImputer; text columns are untouched.
knn_imputer = KNNImputer(n_neighbors=2)
knn_filled = knn_imputer.fit_transform(df_knn[num_cols])
df_knn[num_cols] = knn_filled

print("\n6. KNN Imputation:")
print(df_knn)


4. SimpleImputer with Mean:
   id     name    age   salary department
0   1    Alice  25.00  50000.0         HR
1   2      Bob  26.25  60000.0    Finance
2   3  Charlie  30.00  51750.0         IT
3   4    David  22.00  45000.0        NaN
4   5      Eve  28.00  52000.0         HR
5   6    Frank  26.25  51750.0        NaN

5. Regression-based Imputation for 'salary':
   id     name   age        salary department
0   1    Alice  25.0  50000.000000         HR
1   2      Bob   NaN  60000.000000    Finance
2   3  Charlie  30.0  54833.333333         IT
3   4    David  22.0  45000.000000        NaN
4   5      Eve  28.0  52000.000000         HR
5   6    Frank   NaN           NaN        NaN

6. KNN Imputation:
   id     name    age   salary department
0   1    Alice  25.00  50000.0         HR
1   2      Bob  26.50  60000.0    Finance
2   3  Charlie  30.00  51000.0         IT
3   4    David  22.00  45000.0        NaN
4   5      Eve  28.00  52000.0         HR
5   6    Frank  26.25  51750.0        NaN