In [1]:
import pandas as pd
import numpy as np

# Read the demo data set from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="mice_demo.csv")

# Echo the raw frame as the cell output so the gaps can be inspected.
df

Unnamed: 0,Age,Salary,Experience,Score
0,25.0,50000.0,2.0,78.0
1,30.0,60000.0,5.0,
2,35.0,,7.0,85.0
3,40.0,80000.0,,88.0
4,45.0,90000.0,15.0,92.0
5,,100000.0,20.0,95.0
6,50.0,110000.0,,98.0


In [2]:
from sklearn.impute import SimpleImputer

In [3]:
# Starting point for the chained-equation imputation: fill every gap with the
# mean of its column so that each column can later act as a regression
# predictor without containing NaN.
mean_imputer = SimpleImputer(strategy="mean")

# fit_transform hands back an unlabeled array, so both axes of labels are
# reattached. Carrying over df.index (not only the column names) keeps the
# boolean masks derived from `df` aligned with this frame even when `df` does
# not use the default 0..n-1 index.
df_init = pd.DataFrame(
    mean_imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [4]:
from sklearn.linear_model import LinearRegression

# Boolean frame marking the cells that were NaN in the raw data; only these
# positions are overwritten by the imputation loops.
original_missing = df.isnull()

# Independent working copy of the mean-filled frame, so df_init itself is
# never modified by the imputation.
df_mice = df_init.copy(deep=True)


In [5]:
# Single chained-equation pass: each column that originally had gaps is
# regressed on all other columns, and its gaps in df_mice are replaced by the
# prediction plus Gaussian noise.

# Start from the mean-filled frame so that re-running this cell gives the same
# result instead of regressing on values imputed by a previous run.
df_mice = df_init.copy()

# Dedicated, seeded generator: the noise draws (and therefore the imputed
# values) are reproducible across kernel restarts.
rng = np.random.default_rng(42)

for col in df.columns:
    # rows where this column was originally missing
    missing_rows = original_missing[col]

    # fully observed column: nothing to impute
    if not missing_rows.any():
        continue

    # Fit on rows whose target was observed; the predictors are all other
    # columns, which may hold mean-filled or already-imputed values.
    X_train = df_mice.loc[~missing_rows].drop(columns=[col])
    y_train = df_mice.loc[~missing_rows, col]

    # Rows whose target has to be predicted.
    X_test = df_mice.loc[missing_rows].drop(columns=[col])

    model = LinearRegression()
    model.fit(X_train, y_train)

    # predict missing values
    y_pred = model.predict(X_test)

    # Add randomness (important for true MICE): perturb each prediction with
    # zero-mean noise scaled by the spread of the training residuals.
    residual_std = np.std(y_train - model.predict(X_train))
    y_pred = y_pred + rng.normal(0, residual_std, size=len(y_pred))

    # replace only the originally missing cells
    df_mice.loc[missing_rows, col] = y_pred


In [6]:
# Show the working frame; at this point it holds the result of the imputation
# pass run in the previous cell.
df_mice


Unnamed: 0,Age,Salary,Experience,Score
0,25.0,50000.0,2.0,78.0
1,30.0,60000.0,5.0,81.329898
2,35.0,71404.226815,7.0,85.0
3,40.0,80000.0,11.364589,88.0
4,45.0,90000.0,15.0,92.0
5,51.786632,100000.0,20.0,95.0
6,50.0,110000.0,14.061603,98.0


In [7]:
# Iterative chained-equation imputation: repeat the per-column regression pass
# n_iterations times, each pass reusing the values imputed by the previous one
# as predictors.

# Restart from the mean-filled frame so the cell is independent of earlier
# imputation passes.
df_mice = df_init.copy()

n_iterations = 10

# Dedicated, seeded generator: the noise draws (and therefore the imputed
# values) are reproducible across kernel restarts and cell re-runs.
rng = np.random.default_rng(42)

for iteration in range(n_iterations):
    for col in df.columns:
        # rows where this column was missing in the raw data
        missing_rows = original_missing[col]

        # fully observed column: nothing to impute
        if not missing_rows.any():
            continue

        # Fit on rows whose target was observed; predict the remaining rows
        # from all other columns (observed or currently imputed values).
        X_train = df_mice.loc[~missing_rows].drop(columns=[col])
        y_train = df_mice.loc[~missing_rows, col]
        X_test = df_mice.loc[missing_rows].drop(columns=[col])

        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Perturb each prediction with zero-mean Gaussian noise scaled by the
        # spread of the training residuals.
        residual_std = np.std(y_train - model.predict(X_train))
        y_pred = y_pred + rng.normal(0, residual_std, size=len(y_pred))

        # overwrite only the originally missing cells
        df_mice.loc[missing_rows, col] = y_pred

# Sanity check: every originally missing cell must now hold a value.
assert not df_mice.isna().any().any(), "imputation left missing values"


In [9]:
# Persist the imputed frame to CSV; the row index is left out of the file.
df_mice.to_csv(path_or_buf="mice_imputed_output.csv", index=False)
