Import Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


Load Dataset

For this example, we will build a small dataset with missing values: we load the Iris dataset and then randomly replace about 10% of its entries with NaN.

In [4]:
# Load Iris and keep the feature matrix and the class labels separately.
data = load_iris()
X, y = data.data, data.target

# Wrap the features in a DataFrame so each column carries its feature name.
df = pd.DataFrame(X, columns=data.feature_names)

# Draw a reproducible 0/1 flag for every cell; a 1 (probability 0.1) marks the cell as missing.
np.random.seed(42)
missing_values = np.random.choice([1, 0], size=df.shape, p=[0.1, 0.9])

# Keep only the unflagged cells; every flagged cell becomes NaN.
df_with_missing = df.where(missing_values != 1)

# Preview the first rows of the frame that now contains gaps.
df_with_missing.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,,0.2
2,4.7,3.2,,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


Imputation Techniques

Imputation with the Mean (for Numerical Data)

The most common strategy for imputing missing values in numerical data is to replace them with the mean of the column.

In [5]:
# Mean strategy: each NaN is replaced by the mean of the observed values in its column.
imputer = SimpleImputer(strategy='mean')

# By default fit_transform hands back a plain NumPy array, so rebuild the DataFrame
# with the original column labels *and* the original row index. Without `index=`
# the rows would be renumbered 0..n-1 and would stop lining up with
# df_with_missing whenever that frame carries a non-default index.
df_imputed_mean = pd.DataFrame(
    imputer.fit_transform(df_with_missing),
    columns=df_with_missing.columns,
    index=df_with_missing.index,
)

# Display the first few rows after imputation
df_imputed_mean.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,3.737879,0.2
2,4.7,3.2,3.737879,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


Imputation with the Median (for Robustness to Outliers)

If the dataset contains outliers, you might want to use the median instead of the mean, as the median is more robust to extreme values.

In [6]:
# Median strategy: each NaN is replaced by the median of the observed values in its column.
imputer = SimpleImputer(strategy='median')

# By default fit_transform hands back a plain NumPy array, so rebuild the DataFrame
# with the original column labels *and* the original row index. Without `index=`
# the rows would be renumbered 0..n-1 and would stop lining up with
# df_with_missing whenever that frame carries a non-default index.
df_imputed_median = pd.DataFrame(
    imputer.fit_transform(df_with_missing),
    columns=df_with_missing.columns,
    index=df_with_missing.index,
)

# Display the first few rows after imputation
df_imputed_median.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,4.3,0.2
2,4.7,3.2,4.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


Imputation with the Most Frequent Value (for Categorical Data)

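For categorical data, the most common imputation strategy is to replace missing values with the most frequent value in the column. When several values tie for most frequent, SimpleImputer fills with the smallest of them (for strings, the first in sorted order); this is why 'color', where red, blue and green each appear twice, is filled with 'blue' below.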

In [8]:
# Small toy frame with two string-valued columns, each containing two gaps.
categorical_data = pd.DataFrame({
    'color': ['red', 'blue', np.nan, 'blue', 'green', np.nan, 'red', 'green'],
    'size': ['S', 'M', 'L', np.nan, 'L', 'M', 'M', np.nan]
})

# Most-frequent strategy: each NaN is replaced by the mode of its column.
imputer = SimpleImputer(strategy='most_frequent')

# By default fit_transform hands back a plain NumPy array, so rebuild the DataFrame
# with the original column labels *and* the original row index. Without `index=`
# the rows would be renumbered 0..n-1 and would stop lining up with
# categorical_data whenever that frame carries a non-default index.
categorical_imputed = pd.DataFrame(
    imputer.fit_transform(categorical_data),
    columns=categorical_data.columns,
    index=categorical_data.index,
)

categorical_imputed


Unnamed: 0,color,size
0,red,S
1,blue,M
2,blue,L
3,blue,M
4,green,L
5,blue,M
6,red,M
7,green,M


Imputation with a Constant Value

You can also choose to replace missing values with a constant value (such as 0, -1, or a placeholder like 'Unknown').

In [10]:
# Constant strategy: each NaN is replaced by the fixed fill_value (0 here),
# regardless of the other values in the column.
imputer = SimpleImputer(strategy='constant', fill_value=0)

# By default fit_transform hands back a plain NumPy array, so rebuild the DataFrame
# with the original column labels *and* the original row index. Without `index=`
# the rows would be renumbered 0..n-1 and would stop lining up with
# df_with_missing whenever that frame carries a non-default index.
df_imputed_constant = pd.DataFrame(
    imputer.fit_transform(df_with_missing),
    columns=df_with_missing.columns,
    index=df_with_missing.index,
)

# Display the first few rows after imputation
df_imputed_constant.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,0.0,0.2
2,4.7,3.2,0.0,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


Imputation on Training and Test Data

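In practice, you fit the imputer on the training data only and then use that fitted imputer to transform both the training and the test data. This way the fill values (here, the column means) are learned from the training set alone, so no information from the test set leaks into preprocessing, and both sets are filled with exactly the same values.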


In [11]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_with_missing,
    y,
    test_size=0.3,
    random_state=42,
)

# Mean imputer; it is fitted on the training rows only.
imputer = SimpleImputer(strategy='mean')

# Learn the fill values from the training split, then fill the training split's gaps.
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)

# Fill the test split with the already-fitted imputer (no refit on the test rows).
X_test_imputed = imputer.transform(X_test)

# Show the first five imputed rows of each split.
(X_train_imputed[:5], X_test_imputed[:5])


(array([[5.5       , 2.4       , 3.7       , 1.        ],
        [6.3       , 2.8       , 5.1       , 1.5       ],
        [6.4       , 3.1       , 5.5       , 1.8       ],
        [5.88764045, 3.        , 4.4       , 1.4       ],
        [5.88764045, 3.6       , 6.1       , 1.19032258]]),
 array([[6.1       , 2.8       , 4.7       , 1.2       ],
        [5.88764045, 3.8       , 1.7       , 0.3       ],
        [5.88764045, 2.6       , 6.9       , 2.3       ],
        [5.88764045, 2.9       , 4.5       , 1.5       ],
        [6.8       , 2.8       , 4.8       , 1.4       ]]))

Summary of Imputation Strategies

Mean Imputation: Replaces missing values with the mean of the feature. Best for numerical features when the data is fairly symmetric.

Median Imputation: Replaces missing values with the median of the feature. Useful for numerical data with outliers.

Most Frequent Imputation: Replaces missing values with the most frequent value in the column. Best for categorical data.

Constant Imputation: Replaces missing values with a constant value (e.g., 0 or 'Unknown'). Often used in special cases where missingness represents a meaningful value.