In [1]:
# Question: Predictive Imputation Using Machine Learning
# Description: Use a simple predictive model to impute missing values in a column.

# Ques_9.ipynb
# Predictive Imputation Using Machine Learning
# Objective: Use a simple predictive model to impute missing values in a column

# ----------------------------------------
# Step 1: Install necessary libraries
# ----------------------------------------
# Uncomment the next line if pandas or scikit-learn is not already installed.
# (%pip installs into the environment of the running kernel.)
# %pip install pandas scikit-learn

# ----------------------------------------
# Step 2: Import Libraries
# ----------------------------------------
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# ----------------------------------------
# Step 3: Create Sample Dataset with Missing Values
# ----------------------------------------
# Toy dataset: 'Age' has two missing entries (None -> NaN), 'Salary' is complete.
data = {
    'ID': [1, 2, 3, 4, 5, 6],
    'Age': [25, None, 30, 22, None, 27],  # 'Age' column has missing values
    'Salary': [50000, 60000, 65000, 70000, 72000, 75000]
}

df = pd.DataFrame(data)
print("🔹 Sample Dataset with Missing Values:")
display(df)

# Keep an untouched copy: `df` is filled in place further down, so without
# this the pre-imputation values could not be inspected or compared afterwards.
df_raw = df.copy()

# ----------------------------------------
# Step 4: Prepare Data for Predictive Imputation
# ----------------------------------------

# Column to impute and the column(s) used to predict it.
TARGET_COL = 'Age'
FEATURE_COLS = ['Salary']

# Step 4.1: Split rows by whether the target is known.
# The mask is computed once and reused for the final assignment, so the rows
# that are predicted are guaranteed to be the rows that are filled.
missing_mask = df[TARGET_COL].isnull()
train_data = df[~missing_mask]  # rows with a known target -> training set
test_data = df[missing_mask]    # rows with a missing target -> to be imputed

# Without at least one known target value there is nothing to learn from;
# fail with an explicit message instead of an opaque estimator error.
if train_data.empty:
    raise ValueError(
        f"Cannot impute '{TARGET_COL}': no rows with a known value to train on."
    )

# Step 4.2: Define the predictor (X) and target (y) for the model
X_train = train_data[FEATURE_COLS]
y_train = train_data[TARGET_COL]

# Step 4.3: Train a Decision Tree Regressor model
# random_state is fixed so repeated runs produce the same tree.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# ----------------------------------------
# Step 5: Predict Missing Target Values
# ----------------------------------------
X_test = test_data[FEATURE_COLS]

if X_test.empty:
    # Nothing to impute. Calling predict() on zero rows raises a ValueError,
    # so skip it and leave `df` unchanged.
    predicted_ages = np.array([], dtype=float)
else:
    predicted_ages = model.predict(X_test)
    # Step 5.1: Fill missing target values with predictions.
    # Boolean-mask selection preserves row order, so predictions line up
    # one-to-one with the rows selected by `missing_mask`.
    df.loc[missing_mask, TARGET_COL] = predicted_ages

# Sanity check: every row must now have a target value.
assert df[TARGET_COL].notnull().all(), f"'{TARGET_COL}' still contains missing values"

# ----------------------------------------
# Step 6: Display Imputed Dataset
# ----------------------------------------
print("\n✅ Dataset After Predictive Imputation:")
display(df)


🔹 Sample Dataset with Missing Values:


Unnamed: 0,ID,Age,Salary
0,1,25.0,50000
1,2,,60000
2,3,30.0,65000
3,4,22.0,70000
4,5,,72000
5,6,27.0,75000



✅ Dataset After Predictive Imputation:


Unnamed: 0,ID,Age,Salary
0,1,25.0,50000
1,2,30.0,60000
2,3,30.0,65000
3,4,22.0,70000
4,5,22.0,72000
5,6,27.0,75000
