# Simple Imputation

Investigation of principled imputation for very simple cases, i.e. under the assumption that the observations are Gaussian and that the data are MAR/MCAR.

In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.datasets import load_boston
from scipy import stats
from scipy import linalg

Load the data:

In [None]:
# Fully observed feature matrix, used later as the ground truth for the RMSE
# of each imputation; the second return value is discarded.
# NOTE(review): load_boston was removed from scikit-learn in release 1.2 -
# confirm the environment pins an older version.
TrueX, _ = load_boston(return_X_y=True)

In [None]:
# Data matrix with missing entries; the rest of the notebook detects them via
# np.isnan. Presumably the Boston data with 10% of the values removed
# completely at random (going by the file name) - confirm.
X = np.genfromtxt("../data/boston-10-MCAR.csv", delimiter=",")

In [None]:
# Render the incomplete data as a table.
pd.DataFrame(X)

In [None]:
# Column-wise mean / standard deviation computed over the observed (non-NaN)
# entries only, then used to standardise X. NaNs stay NaN in scaled_X.
# NOTE(review): scaled_X is not used by any later cell visible here.
mean = np.nanmean(X, axis=0)
std = np.nanstd(X, axis=0)
scaled_X = (X - mean)/std

## Simplest possible approach (without losing data) - Mean Imputation

In [None]:
# Mean imputation: every missing entry is replaced by the mean of the
# observed values in the same column; observed entries are kept as they are.
missing_mask = np.isnan(X)
column_means = np.nanmean(X, axis=0)
# column_means broadcasts across rows, so each NaN picks up its own column mean
mean_imputed_X = np.where(missing_mask, column_means, X)

pd.DataFrame(mean_imputed_X)

In [None]:
# RMSE of the mean-imputed matrix against the ground truth, averaged over
# every entry of the matrix (observed and imputed alike).
np.sqrt(np.mean(np.power(mean_imputed_X-TrueX,2)))

## Now let's try using a multivariate Gaussian with ML estimation

In [None]:
def g_ll(input_X, μ, Σ):
    """Return the Gaussian log-likelihood of the rows of ``input_X``.

    The rows are treated as independent draws from a multivariate normal
    N(μ, Σ), so the log-likelihood is the sum of the per-row log-densities.
    Working with log-densities directly (rather than taking the log of
    summed densities) gives the actual log-likelihood and avoids underflow
    to ``-inf`` when individual densities are tiny.

    Parameters
    ----------
    input_X : 2-D array, one observation per row, no NaNs.
    μ : 1-D array, mean vector.
    Σ : 2-D array, covariance matrix.

    Returns
    -------
    Scalar log-likelihood; ``0.0`` when ``input_X`` has no rows.
    """
    if input_X.shape[0] == 0:
        # an empty sample contributes nothing to the likelihood
        return 0.0
    # logpdf evaluates every row in one call
    return np.sum(stats.multivariate_normal.logpdf(input_X, mean=μ, cov=Σ))

In [None]:
# Mean estimate: per-column average over the observed (non-NaN) entries.
μML = np.nanmean(X, axis = 0)
μML

In [None]:
# Covariance estimate from incomplete rows: entry (j, k) is the running mean
# of (x_j - μ_j)(x_k - μ_k) over the rows in which features j and k are both
# observed.
N_max = X.shape[0]
# to keep track of how often the xi combinations have been seen together
counts = np.zeros((X.shape[1], X.shape[1]))
ΣML = np.zeros((X.shape[1], X.shape[1]))

for i in range(X.shape[0]):
    X_row = X[i, :]
    # figure out which values are observed in this row
    valid_locs = np.flatnonzero(~np.isnan(X_row))
    if valid_locs.size == 0:
        # a row with nothing observed carries no covariance information
        continue
    # open mesh selecting the (observed x observed) sub-block of the matrices
    pair_block = np.ix_(valid_locs, valid_locs)

    # update the counts
    counts[pair_block] += 1

    # outer product of the centred observed values for this row
    diff = X_row[valid_locs] - μML[valid_locs]
    Σ = np.outer(diff, diff)

    # running-mean update: new = (sample + (n - 1) * old) / n.
    # Dividing by n (not n - 1) gives the ML estimate.
    ΣML[pair_block] = (Σ + (counts[pair_block] - 1) * ΣML[pair_block]) / counts[pair_block]
# small ridge on the diagonal
ΣML += np.eye(X.shape[1]) * 1e-3

In [None]:
# Render the estimated covariance matrix as a table.
pd.DataFrame(ΣML)

In [None]:
# check that the covariance matrix is +ve def
# ΣML is symmetric, so use the symmetric eigenvalue routine: it returns real
# eigenvalues, which makes the comparison with 0 well defined.
np.all(linalg.eigvalsh(ΣML) > 0)

In [None]:
# impute by taking the mean of the conditional distro
def impute(input_X, μ, Σ):
    """Fill the NaNs of ``input_X`` with their Gaussian conditional means.

    For each row the missing entries m are replaced by
    E[x_m | x_o] = μ_m + Σ_mo Σ_oo^{-1} (x_o - μ_o), where o indexes the
    observed entries of that row. A row with nothing observed is filled
    with μ itself. Rows without missing values are left untouched.

    Parameters
    ----------
    input_X : 2-D float array, NaN marks a missing value. Not modified.
    μ : 1-D array, mean vector.
    Σ : 2-D array, covariance matrix.

    Returns
    -------
    A copy of ``input_X`` with every NaN replaced.
    """
    imputed_X = input_X.copy()
    missing = np.isnan(imputed_X)

    # only visit rows that actually have something to impute
    for i in np.flatnonzero(missing.any(axis=1)):
        m_locs = np.flatnonzero(missing[i])
        o_locs = np.flatnonzero(~missing[i])

        # calculate the mean of m|o, starting from the marginal mean
        μmo = μ[m_locs]
        if o_locs.size:  # if there are any observations
            # get the sub-blocks of the covariance matrix
            Σoo = Σ[np.ix_(o_locs, o_locs)]
            Σmo = Σ[np.ix_(m_locs, o_locs)]
            # solve the linear system instead of forming the explicit inverse
            μmo = μmo + Σmo @ linalg.solve(Σoo, imputed_X[i, o_locs] - μ[o_locs])

        imputed_X[i, m_locs] = μmo

    return imputed_X

In [None]:
# Fill the missing entries of X with their conditional means under (μML, ΣML).
ML_imputed_X = impute(X, μML, ΣML)

In [None]:
# Render the imputed data as a table.
pd.DataFrame(ML_imputed_X)

In [None]:
# RMSE of the conditional-mean imputation against the ground truth, averaged
# over every entry of the matrix (observed and imputed alike).
np.sqrt(np.mean(np.power(ML_imputed_X-TrueX,2)))

In [None]:
# Score the imputed data with g_ll under the estimated parameters (μML, ΣML).
g_ll(ML_imputed_X, μML, ΣML)

## EM estimation of ML

The above ML estimation is a bit of a hack; let's see how it compares to the EM algorithm.

In [None]:
# Starting point for the EM parameters. Instead of random values
# (np.random.rand with the shapes of μML / ΣML) the data itself is used:
# μEM is the column-wise mean of the observed entries and ΣEM the NaN-ignoring
# mean, over rows, of the outer products of the centred rows.
μEM = np.nanmean(X, axis=0)
centred_X = X - μEM
# centred_X[:, :, None] * centred_X[:, None, :] stacks one outer product per row
ΣEM = np.nanmean(centred_X[:, :, None] * centred_X[:, None, :], axis=0)

In [None]:
# Score the fully observed data with g_ll under the initial mean μEM.
# NOTE(review): the covariance passed here is ΣML, not ΣEM - confirm that
# this is intended.
g_ll(input_X=TrueX,μ=μEM,Σ=ΣML)

In [None]:
# Root-mean-square difference between the EM starting values and the
# estimates from the previous section: first the means, then the covariances.
for em_param, ml_param in ((μEM, μML), (ΣEM, ΣML)):
    squared_gap = np.power(em_param - ml_param, 2)
    print(np.sqrt(np.mean(squared_gap)))

In [None]:
# Alternate between filling in the missing data with the current parameters
# and re-estimating the parameters from the filled-in data.
# NOTE: the re-estimation uses only the conditional means of the missing
# entries; the textbook Gaussian EM M-step would also add their conditional
# covariance to ΣEM.
n_em_iters = 1

# initial E-step: using the current parameters, estimate the values of the
# missing data by taking the mean of the conditional distro
EM_imputed_X = impute(X, μEM, ΣEM)

for em_iter in range(n_em_iters):
    # now re-estimate μEM and ΣEM from the completed data
    μEM = np.mean(EM_imputed_X, axis=0)
    centred = EM_imputed_X - μEM
    # centred.T @ centred is the sum over rows of the outer products
    ΣEM = centred.T @ centred / EM_imputed_X.shape[0]
    # small ridge on the diagonal
    ΣEM += np.eye(EM_imputed_X.shape[1]) * 1e-3

    # using the current parameters, estimate the values of the missing data:
    # impute by taking the mean of the conditional distro
    EM_imputed_X = impute(X, μEM, ΣEM)

    # calc RMSE and LL
    print(np.sqrt(np.mean(np.power(EM_imputed_X - TrueX, 2))))
    print(g_ll(EM_imputed_X, μEM, ΣEM))

In [None]:
# check that the covariance matrix is +ve def
# ΣEM is symmetric, so use the symmetric eigenvalue routine: it always
# returns real eigenvalues, which makes the comparison with 0 well defined.
np.all(np.linalg.eigvalsh(ΣEM) > 0)