# Simple Imputation

Investigation of principled imputation for very simple cases, i.e. under the assumption that the observations are Gaussian and that the data are missing at random (MAR) or missing completely at random (MCAR).

In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.datasets import load_boston
from scipy import stats
from scipy import linalg

Load the data:

In [None]:
# Load the feature matrix X and the target vector y as plain arrays.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn (or another dataset) to run.
X,y = load_boston(return_X_y=True)

In [None]:
# Sanity check: dimensions of the feature matrix and of the target vector.
print(X.shape, y.shape)

In [None]:
# Display the complete (ground-truth) feature matrix as a table.
pd.DataFrame(X)

Now create a mask for the missingness:

In [None]:
# One uniform draw per entry of X; an entry is flagged as missing when its
# draw exceeds 0.9, i.e. roughly 10% of entries, independently of the data.
n_rows, n_cols = X.shape
mask = np.random.rand(n_rows, n_cols) > 0.9

In [None]:
# Work on a copy so the ground truth X stays intact, then blank out the
# entries selected by the mask.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
masked_X = X.copy()
masked_X[mask] = np.nan
pd.DataFrame(masked_X)

In [None]:
# Standardise every column using statistics of its observed entries only
# (the nan-aware reductions skip missing values); missing entries stay NaN.
mean = np.nanmean(masked_X, axis=0)
std = np.nanstd(masked_X, axis=0)
centred_X = masked_X - mean
scaled_X = centred_X / std

In [None]:
# Display the standardised data (missing entries are still NaN).
pd.DataFrame(scaled_X)

## Simplest possible approach (without losing data) - Mean Imputation

In [None]:
# Fill each missing entry with the mean of the observed values in its column
# (computed on the standardised data); the column means broadcast across rows.
column_means = np.nanmean(scaled_X, axis=0)
mean_imputed_X = np.where(np.isnan(scaled_X), column_means, scaled_X)

# Undo the standardisation so the result is in the original units.
mean_imputed_X = mean_imputed_X * std + mean

pd.DataFrame(mean_imputed_X)

In [None]:
# RMSE of the mean-imputed data against the ground truth, averaged over
# every entry of the matrix (observed entries included).
np.sqrt(np.mean(np.square(mean_imputed_X - X)))

## Now let's try using a multivariate Gaussian with ML estimation

In [None]:
def g_ll(imputed_X, μ, Σ):
    """Return the Gaussian log-likelihood of the rows of ``imputed_X``.

    The rows are treated as independent draws from a multivariate normal
    with mean ``μ`` and covariance ``Σ``, so the log-likelihood is the sum
    of the per-row log-densities.

    Parameters
    ----------
    imputed_X : 2-D array with no missing values, one observation per row.
    μ : 1-D array, mean vector.
    Σ : 2-D array, covariance matrix (not the precision matrix).

    Returns
    -------
    float
        Sum over rows of ``log N(x_row | μ, Σ)``.
    """
    # Sum log-densities directly: taking the log of a sum of densities is not
    # a log-likelihood, and densities underflow to 0 for rows far from μ.
    return np.sum(stats.multivariate_normal.logpdf(imputed_X, mean=μ, cov=Σ))

In [None]:
# Estimate of the mean: per-column average over the observed (non-NaN)
# entries of the unscaled masked data.
μML = np.nanmean(masked_X, axis = 0)
μML

In [None]:
# Pairwise-complete estimate of the covariance matrix: entry (j, k) is the
# average of (x_j - μ_j)(x_k - μ_k) over the rows in which features j and k
# are both observed.
N_max = masked_X.shape[0]
n_features = X.shape[1]

# observed[r, j] is True when feature j is present in row r
observed = ~np.isnan(masked_X)
observed_f = observed.astype(float)

# to keep track of how often the xi combinations have been seen together
counts = observed_f.T @ observed_f

# deviations from the mean; missing entries are set to zero so that they
# contribute nothing to any of the sums below
centred = np.where(observed, masked_X - μML, 0.0)
scatter = centred.T @ centred

# Divide by the co-occurrence count (the ML estimator) rather than count - 1.
# The original author noted that the -1 variant performed worse and seemed
# to make the matrix non positive definite.
# Pairs that were never observed together keep a covariance of zero.
ΣML = np.zeros((n_features, n_features))
seen_together = counts > 0
ΣML[seen_together] = scatter[seen_together] / counts[seen_together]
# NOTE: a pairwise-complete estimate is not guaranteed to be positive
# definite, hence the explicit eigenvalue check that follows.

In [None]:
# check that the covariance matrix is +ve def
# ΣML is symmetric, so use the symmetric eigen-solver: it returns real
# eigenvalues, whereas the general solver returns complex ones and the
# comparison with 0 would then rely on complex ordering.
np.all(linalg.eigvalsh(ΣML) > 0)

In [None]:
# impute by taking the mean of the conditional distro
def impute(input_X, μ, Λ):
    """Fill missing values with the mean of the conditional Gaussian.

    For each row, the missing block ``a`` is replaced by the mean of
    ``p(x_a | x_b)``, where ``b`` is the observed block and the joint
    distribution is a Gaussian with mean ``μ`` and precision matrix ``Λ``::

        μ_a|b = μ_a - Λaa⁻¹ Λab (x_b - μ_b)

    Parameters
    ----------
    input_X : 2-D float array, NaN marks a missing value. Not modified.
    μ : 1-D array, mean vector of the joint Gaussian.
    Λ : 2-D array, precision matrix (inverse covariance) of the joint Gaussian.

    Returns
    -------
    ndarray
        Copy of ``input_X`` with every NaN replaced. A row with no observed
        value at all is filled with the marginal mean ``μ``.
    """
    imputed_X = input_X.copy()
    missing = np.isnan(imputed_X)

    # only rows that contain at least one missing value need any work
    for row in np.flatnonzero(missing.any(axis=1)):
        a_locs = np.flatnonzero(missing[row])
        b_locs = np.flatnonzero(~missing[row])

        if b_locs.size == 0:
            # nothing to condition on: the conditional mean is the marginal mean
            imputed_X[row, a_locs] = μ[a_locs]
            continue

        # get the subsets of the precision matrix
        Λaa = Λ[np.ix_(a_locs, a_locs)]
        Λab = Λ[np.ix_(a_locs, b_locs)]

        # calculate the mean of a|b; solving the linear system avoids forming
        # the explicit inverse of Λaa
        residual_b = imputed_X[row, b_locs] - μ[b_locs]
        imputed_X[row, a_locs] = μ[a_locs] - linalg.solve(Λaa, Λab @ residual_b)

    return imputed_X

In [None]:
# calc the inverse of the covariance matrix - the precision matrix
# calc the inverse of the covariance matrix - the precision matrix,
# which is the form impute() expects
ΛML = np.linalg.inv(ΣML)

In [None]:
# Impute the unscaled masked data using the conditional mean under the
# Gaussian with mean μML and precision ΛML.
ML_imputed_X = impute(masked_X, μML, ΛML)

In [None]:
# Display the data after conditional-mean imputation.
pd.DataFrame(ML_imputed_X)

In [None]:
# RMSE of the conditional-mean imputation against the ground truth, averaged
# over every entry of the matrix (observed entries included).
np.sqrt(np.mean(np.square(ML_imputed_X - X)))

In [None]:
# Evaluate g_ll on the imputed data with the estimated mean and covariance.
g_ll(ML_imputed_X, μML, ΣML)

In [None]:
# Element-wise difference between the imputed data and the ground truth.
pd.DataFrame(ML_imputed_X - X)

## EM estimation of ML

The above ML estimation is a bit of a hack - let's see how it compares to the EM algorithm.

In [None]:
# # start with random μ and Σ
# Start EM from random parameters.
μEM = np.random.rand(*μML.shape)

# A raw uniform random matrix is neither symmetric nor positive definite, so
# it is not a valid covariance. A @ A.T is symmetric positive semi-definite,
# and adding the identity makes it strictly positive definite and invertible.
_A = np.random.rand(*ΣML.shape)
ΣEM = _A @ _A.T + np.eye(ΣML.shape[0])

# Deterministic alternative: start from the observed column means of the
# scaled data and an identity covariance, i.e.
#   μEM = np.nanmean(scaled_X, axis=0)
#   ΣEM = np.eye(scaled_X.shape[1])

In [None]:
# Initial E-step: using the starting parameters, estimate the values of the
# missing data by taking the mean of the conditional distribution.
ΛEM = np.linalg.inv(ΣEM)
EM_imputed_X = impute(scaled_X, μEM, ΛEM)

for _ in range(10):
    # M-step: re-estimate μEM and ΣEM from the completed data.
    # NOTE(review): only the conditional means are plugged in; the conditional
    # covariance of the missing entries is not added to ΣEM, so this is an
    # approximation to the full EM update - confirm whether that is intended.
    μEM = np.mean(EM_imputed_X, axis=0)
    diff = EM_imputed_X - μEM
    ΣEM = (diff.T @ diff) / diff.shape[0]

    # E-step: using the current parameters, re-estimate the missing values.
    ΛEM = linalg.inv(ΣEM)
    EM_imputed_X = impute(scaled_X, μEM, ΛEM)

    # RMSE in the original units (the imputation runs on the scaled data).
    print(np.sqrt(np.mean(np.power(EM_imputed_X*std + mean - X, 2))))
    # g_ll takes the covariance matrix, not the precision matrix.
    print(g_ll(EM_imputed_X, μEM, ΣEM))

In [None]:
# check that the covariance matrix is +ve def
# ΣEM is symmetric (it is built as diff.T @ diff / n), so use the symmetric
# eigen-solver, which always returns real eigenvalues.
np.all(np.linalg.eigvalsh(ΣEM) > 0)