In [2]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [13]:
# Load the Iris dataset bundled with scikit-learn and wrap its feature matrix
# in a DataFrame whose columns are labelled with the dataset's feature names.
irisRaw = datasets.load_iris()
iris = pd.DataFrame(irisRaw.data, columns=irisRaw.feature_names)

### Simulating Missing Data

In [17]:
# Simulating MCAR (missing completely at random) data
irisSepalLengthMCAR = iris.copy()
irisAllMCAR = iris.copy()

# Fraction of rows to blank out in each affected column.
missing = 0.3

# One seeded generator shared by every draw in this cell: the cell becomes
# reproducible under Restart & Run All, while each sample() call still advances
# the generator and therefore selects a different set of rows.
mcar_rng = np.random.RandomState(42)

# DataFrame in which only the sepal length column is MCAR.
mcar_rows = irisSepalLengthMCAR.sample(frac=missing, random_state=mcar_rng).index
irisSepalLengthMCAR.loc[mcar_rows, 'sepal length (cm)'] = np.nan

# DataFrame in which every column is MCAR; rows are drawn separately per column.
for col in irisAllMCAR.columns:
    mcar_rows = irisAllMCAR.sample(frac=missing, random_state=mcar_rng).index
    irisAllMCAR.loc[mcar_rows, col] = np.nan

In [21]:
# Simulating MAR (missing at random) data
irisMAR = iris.copy()

# Fraction of rows whose sepal length is blanked out.
missing = 0.3

# The `weights` argument makes sample() draw rows with probability proportional
# to their 'sepal length (cm)' value, so rows with larger sepal lengths are
# more likely to end up null. A fixed random_state keeps the draw reproducible.
# NOTE(review): missingness driven by the very value being removed is usually
# classed as MNAR rather than MAR; weighting by another observed column would
# give textbook MAR -- confirm which mechanism is intended.
mar_rows = irisMAR.sample(
    frac=missing,
    weights='sepal length (cm)',
    random_state=42,
).index
irisMAR.loc[mar_rows, 'sepal length (cm)'] = np.nan

### Dealing with Missing data

### Listwise Deletion  
Listwise deletion simply deletes every row that contains a missing value. It can produce unbiased estimates when the data is MCAR, but it reduces statistical power because the discarded rows shrink the sample.

In [22]:
# Listwise deletion: keep only the rows that have no missing value at all.
# dropna() returns a new frame rather than modifying its input, so the result
# must be assigned for `irisMCAR_listwise` to actually hold the reduced data.
# The source is irisSepalLengthMCAR because no frame named `irisMCAR` is
# created by the cells above.
irisMCAR_listwise = irisSepalLengthMCAR.dropna()
irisMCAR_listwise

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
2,4.7,3.2,1.3,0.2
6,4.6,3.4,1.4,0.3
10,5.4,3.7,1.5,0.2
11,4.8,3.4,1.6,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### Pairwise Deletion
Pairwise deletion (available-case analysis) leaves the rows in place and skips only the null values in the columns that the computation at hand actually uses.  
This is automatically conducted on functions such as mean(), sum(), etc.  
Unsurprisingly, this produces more accurate results than simple listwise deletion.

In [27]:
# No frame named `irisMCAR` is created by the cells above; the sepal-length
# MCAR frame is used so the comparison with irisMAR (which also has nulls only
# in sepal length) is like for like.
irisMCAR_pairwise = irisSepalLengthMCAR.copy()
irisMAR_pairwise = irisMAR.copy()

# mean() and std() skip NaN by default, which is the pairwise deletion being
# demonstrated: only the missing entries of the column in use are ignored.
print(f"Mean estimation in MCAR data with pairwise deletion: {irisMCAR_pairwise['sepal length (cm)'].mean()}", 
      f"\nSD estimation in MCAR data with pairwise deletion: {irisMCAR_pairwise['sepal length (cm)'].std()}")
print(f"Mean estimation in MAR data with pairwise deletion: {irisMAR_pairwise['sepal length (cm)'].mean()}", 
      f"\nSD estimation in MAR data with pairwise deletion: {irisMAR_pairwise['sepal length (cm)'].std()}")
# Reference values from the complete data, for comparison.
print(f"Mean estimation in the original data: {iris['sepal length (cm)'].mean()}",
      f"\nSD estimation in the original data: {iris['sepal length (cm)'].std()}")

Mean estimation in MCAR data with pairwise deletion: 5.835238095238094 
SD estimation in MCAR data with pairwise deletion: 0.8046715708886063
Mean estimation in MAR data with pairwise deletion: 5.893333333333332 
SD estimation in MAR data with pairwise deletion: 0.8576877083928657
Mean estimation in the original data: 5.843333333333335 
SD estimation in the original data: 0.8280661279778629


### Single Imputation
As opposed to deletion, we can impute (or in plain English -- "guess") the missing value with various imputation methods.

### Simple Imputation
The simplest ones include mean/median/mode imputation.

In [29]:
from sklearn.impute import SimpleImputer

# Work on a copy so the simulated MAR frame itself stays untouched.
irisMAR_mean = irisMAR.copy()

# Mean imputation: fit() learns one mean per column, and transform() writes
# that mean into every NaN cell of the column. fit() returns the imputer
# itself, so the construction and the fit can be chained.
imp = SimpleImputer(strategy='mean', missing_values=np.nan).fit(irisMAR_mean)
print(imp.transform(irisMAR_mean))

[[5.1        3.5        1.4        0.2       ]
 [5.89333333 3.         1.4        0.2       ]
 [5.89333333 3.2        1.3        0.2       ]
 [5.89333333 3.1        1.5        0.2       ]
 [5.         3.6        1.4        0.2       ]
 [5.4        3.9        1.7        0.4       ]
 [4.6        3.4        1.4        0.3       ]
 [5.         3.4        1.5        0.2       ]
 [4.4        2.9        1.4        0.2       ]
 [4.9        3.1        1.5        0.1       ]
 [5.89333333 3.7        1.5        0.2       ]
 [4.8        3.4        1.6        0.2       ]
 [5.89333333 3.         1.4        0.1       ]
 [4.3        3.         1.1        0.1       ]
 [5.8        4.         1.2        0.2       ]
 [5.7        4.4        1.5        0.4       ]
 [5.4        3.9        1.3        0.4       ]
 [5.1        3.5        1.4        0.3       ]
 [5.7        3.8        1.7        0.3       ]
 [5.1        3.8        1.5        0.3       ]
 [5.89333333 3.4        1.7        0.2       ]
 [5.1        

### Regression Imputation/Conditional Mean Imputation
A more common way to impute null values is to use regression or ANOVA models.  
This is useful when we can identify which observed variables the missing values depend on (the 'rule' behind the missing values).  

In [35]:
from sklearn import linear_model

irisMAR_reg = irisMAR.copy()

# Training data for the regression: the complete rows only.
irisMAR_reg_model = irisMAR_reg.dropna()

# Predict sepal length from sepal width; both are passed as (n, 1) column arrays.
Xs = irisMAR_reg_model[['sepal width (cm)']].to_numpy()
ys = irisMAR_reg_model[['sepal length (cm)']].to_numpy()
model = linear_model.LinearRegression()
model.fit(X=Xs, y=ys)

# Boolean mask of the rows whose sepal length still has to be imputed.
null_index = irisMAR_reg['sepal length (cm)'].isnull()

# Predicted sepal length for those rows, based on their sepal width.
na_result = model.predict(
    irisMAR_reg.loc[null_index, ['sepal width (cm)']].to_numpy()
)

# The predictions come back as a (k, 1) array; flatten them to length k before
# writing them into the masked cells.
irisMAR_reg.loc[null_index, 'sepal length (cm)'] = na_result.ravel()

print(irisMAR_reg)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0             5.100000               3.5                1.4               0.2
1             5.902854               3.0                1.4               0.2
2             5.865130               3.2                1.3               0.2
3             5.883992               3.1                1.5               0.2
4             5.000000               3.6                1.4               0.2
..                 ...               ...                ...               ...
145           5.902854               3.0                5.2               2.3
146           6.300000               2.5                5.0               1.9
147           6.500000               3.0                5.2               2.0
148           6.200000               3.4                5.4               2.3
149           5.900000               3.0                5.1               1.8

[150 rows x 4 columns]


### Nearest Neighbors or Hot Deck Imputation


In [36]:
from sklearn.impute import KNNImputer

# Work on a copy so the simulated MAR frame itself stays untouched.
irisMAR_knn = irisMAR.copy()

# Nearest-neighbour imputer configured with two neighbours and uniform weights.
knn_imputer = KNNImputer(weights="uniform", n_neighbors=2)

# Fit and impute in one step; the resulting array is the cell's displayed output.
irisMAR_knn_imputed = knn_imputer.fit_transform(irisMAR_knn)
irisMAR_knn_imputed

array([[5.1 , 3.5 , 1.4 , 0.2 ],
       [4.4 , 3.  , 1.4 , 0.2 ],
       [4.8 , 3.2 , 1.3 , 0.2 ],
       [4.85, 3.1 , 1.5 , 0.2 ],
       [5.  , 3.6 , 1.4 , 0.2 ],
       [5.4 , 3.9 , 1.7 , 0.4 ],
       [4.6 , 3.4 , 1.4 , 0.3 ],
       [5.  , 3.4 , 1.5 , 0.2 ],
       [4.4 , 2.9 , 1.4 , 0.2 ],
       [4.9 , 3.1 , 1.5 , 0.1 ],
       [5.1 , 3.7 , 1.5 , 0.2 ],
       [4.8 , 3.4 , 1.6 , 0.2 ],
       [4.65, 3.  , 1.4 , 0.1 ],
       [4.3 , 3.  , 1.1 , 0.1 ],
       [5.8 , 4.  , 1.2 , 0.2 ],
       [5.7 , 4.4 , 1.5 , 0.4 ],
       [5.4 , 3.9 , 1.3 , 0.4 ],
       [5.1 , 3.5 , 1.4 , 0.3 ],
       [5.7 , 3.8 , 1.7 , 0.3 ],
       [5.1 , 3.8 , 1.5 , 0.3 ],
       [4.95, 3.4 , 1.7 , 0.2 ],
       [5.1 , 3.7 , 1.5 , 0.4 ],
       [4.6 , 3.6 , 1.  , 0.2 ],
       [5.1 , 3.3 , 1.7 , 0.5 ],
       [4.9 , 3.4 , 1.9 , 0.2 ],
       [4.85, 3.  , 1.6 , 0.2 ],
       [5.  , 3.4 , 1.6 , 0.4 ],
       [5.2 , 3.5 , 1.5 , 0.2 ],
       [5.1 , 3.4 , 1.4 , 0.2 ],
       [4.7 , 3.2 , 1.6 , 0.2 ],
       [4.