In [37]:
import numpy as np
import pandas as pd
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [38]:
np.random.seed(42)

# Example dataset
data = pd.DataFrame({
    "Feature1": np.random.randn(10),
    "Feature2": np.random.randn(10),
    "Feature3": np.random.randn(10)
})

# Introduce missing values
data.loc[2, "Feature1"] = np.nan
data.loc[5, "Feature2"] = np.nan
data.loc[7, "Feature3"] = np.nan
data.loc[9, "Feature1"] = np.nan


# MICE (Multivariate Imputation by Chained Equation)

Types of Missing data patterns:
1. MCAR : randomly missing. no way to know why
2. MAR (Missing at random) : user didnt fill in the data. Hence it can be filled in with other columns values.
3. MNAR (Missing not at random) : data has been purposely removed hence we cant predict the relation with other cols


Above algo works great with MAR.
- adv :  accurate
- disadv : slow, and training data to be put on server

## Working

1. fill NAN with Mean of each col (Iteration 1)
2. move from left col to right and replace tha mean value that we put with NAN (for the pred)
3. Predict row-wise for each NAN (for the row which has no NAN will act as input for prediction, which will help predict the current NAN using its own row i.e. other cols). Any algo can be used LinearRegression, KNN, Dt
4. Do it for each col one by one
5. Take the Difference From Iteration 1 i.e. When we put fake mean and Iteration 1 i.e. predicted. (All the other values will be 0 except the where the NAN was)
6. repeat above till that difference becomes 0 or max possible

Flow :

Fake Mean -> NAN -> predict -> difference -> Pass Pred1 ->  NAN -> predict -> difference ---->

In [39]:
print("Original Data (with missing values):\n", data)

Original Data (with missing values):
    Feature1  Feature2  Feature3
0  0.496714 -0.463418  1.465649
1 -0.138264 -0.465730 -0.225776
2       NaN  0.241962  0.067528
3  1.523030 -1.913280 -1.424748
4 -0.234153 -1.724918 -0.544383
5 -0.234137       NaN  0.110923
6  1.579213 -1.012831 -1.150994
7  0.767435  0.314247       NaN
8 -0.469474 -0.908024 -0.600639
9       NaN -1.412304 -0.291694


In [40]:
# Step 3: Initialize Iterative Imputer with Linear Regression
imputer = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=10,
    random_state=42
)

In [41]:
imputed_data = imputer.fit_transform(data)

# Step 5: Convert back to DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=data.columns)

print("\nImputed Data (missing values filled):\n", imputed_df)


Imputed Data (missing values filled):
    Feature1  Feature2  Feature3
0  0.496714 -0.463418  1.465649
1 -0.138264 -0.465730 -0.225776
2  0.463762  0.241962  0.067528
3  1.523030 -1.913280 -1.424748
4 -0.234153 -1.724918 -0.544383
5 -0.234137 -0.668702  0.110923
6  1.579213 -1.012831 -1.150994
7  0.767435  0.314247  0.422611
8 -0.469474 -0.908024 -0.600639
9  0.343819 -1.412304 -0.291694


## Manual Background work

In [57]:
data = pd.DataFrame({
    "A": [150, 160, np.nan, 170, 180, np.nan, 200],
    "B": [50, np.nan, 70, 80, np.nan, 100, 110],
    "C": [20, 25, 30, np.nan, 40, 45, np.nan]
})

In [58]:
# 2. Simple loop to mimic iterative imputation
data_copy = data.copy()

In [59]:
# 2. Initialize missing values with column means (like IterativeImputer does initially)
data_copy = data.copy()
for col in data_copy.columns:
    data_copy[col].fillna(data_copy[col].mean(), inplace=True)

print("\nAfter initial mean imputation:\n", data_copy)


After initial mean imputation:
        A      B     C
0  150.0   50.0  20.0
1  160.0   82.0  25.0
2  172.0   70.0  30.0
3  170.0   80.0  32.0
4  180.0   82.0  40.0
5  172.0  100.0  45.0
6  200.0  110.0  32.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_copy[col].fillna(data_copy[col].mean(), inplace=True)


In [62]:
for step in range(3):
    print('--------------')
    print(f"\nIteration {step+1}")
    
    for col in data.columns:
        miss = data[col].isnull()
        print(f"  Checking column {col}, missing count = {miss.sum()}")
        
        if miss.sum() == 0:
            continue  
        
        not_miss = ~miss
        X_train = data_copy.loc[not_miss].drop(columns=[col])
        y_train = data_copy.loc[not_miss, col]
        X_pred = data_copy.loc[miss].drop(columns=[col])
        
        model = LinearRegression()
        model.fit(X_train, y_train)
        data_copy.loc[miss, col] = model.predict(X_pred)
        
        print(f"  Filled missing in column {col} with {data_copy.loc[miss, col]}")



print("\nData after manual iterative imputation:\n", data_copy)

--------------

Iteration 1
  Checking column A, missing count = 2
  Filled missing in column A with 2    165.226469
5    188.066449
Name: A, dtype: float64
  Checking column B, missing count = 2
  Filled missing in column B with 1    59.711062
4    90.054859
Name: B, dtype: float64
  Checking column C, missing count = 2
  Filled missing in column C with 3    34.842611
6    50.239437
Name: C, dtype: float64
--------------

Iteration 2
  Checking column A, missing count = 2
  Filled missing in column A with 2    165.226565
5    188.066438
Name: A, dtype: float64
  Checking column B, missing count = 2
  Filled missing in column B with 1    59.711066
4    90.054857
Name: B, dtype: float64
  Checking column C, missing count = 2
  Filled missing in column C with 3    34.842610
6    50.239437
Name: C, dtype: float64
--------------

Iteration 3
  Checking column A, missing count = 2
  Filled missing in column A with 2    165.226572
5    188.066432
Name: A, dtype: float64
  Checking column B, 