In [2]:
# Runs various forms of Missing Value Imputation on 'BESW8numeric' (BES Panel W8 substantive data in numeric form)
# Most thorough approach is *slooooow* (8+hrs)
# So a compromise has been added at the top to drop variables which tend to get ditched further down the line

# Ultimately, it would be preferable to use algorithms which are okay with missing values
# (or, even better, okay with weighted samples)
# But most algorithms are not

# Bottom half of the notebook contains a noble attempt to actually test and compare the imputation methods
# The fact that median imputation appears to more or less win suggests
# That either the methods suck or the measure is rubbish

In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import gc
import re

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from itertools import cycle
from IPython.display import display
import pickle, os

import seaborn as sns
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, MICE

BES_data_folder = "../BES_analysis_data/"
BES_code_folder = "../BES_analysis_code/"

Using Theano backend.


In [3]:
BES_numeric = pd.read_hdf( BES_data_folder+"BESW8numeric.hdf", "BESW8numeric" )
print("BES_numeric",  BES_numeric.shape )
var_type    = pd.read_hdf( BES_data_folder+"var_type.hdf", "var_type" )
print("var_type"   ,  var_type.shape )



BES_numeric (31409, 659)
var_type (733, 1)


In [3]:
BES_numeric = pd.read_hdf( BES_data_folder+"BESW10numeric.hdf", "BESW10numeric" )
print("BES_numeric",  BES_numeric.shape )
var_type    = pd.read_hdf( BES_data_folder+"var_type.hdf", "var_type" )
print("var_type"   ,  var_type.shape )



BES_numeric (64689, 3955)
var_type (4247, 1)


In [4]:
var_type

Unnamed: 0,type
id,-5
wt_core_W3,-1
wt_core_W4,-1
wt_core_W5,-1
wt_core_W6,-1
wt_core_W7,-1
wt_core_W8,-1
wt_core_W9,-1
wt_core_W1,-1
wt_core_W2,-1


In [None]:
### COMPROMISE DUE TO SLOOOOOWNESS OF MICE


# thresh = 0.5 , 226 -> 2 hours
# thresh = 0.02, 378 -> 6 hours
thresh = .02
counts = BES_numeric.count()
mostly_not_filled_out = counts[counts<= (counts.max()*thresh)].index
# mostly_filled_out     = counts[counts>  (counts.max()*thresh)].index
W9_vars = pd.Index([x for x in BES_numeric.columns if "W9" in x])

many_cat_drop_list = ['Age', 'EUMIICategory', 'age', 'country_of_residence1',
       'country_of_residence2', 'euRefExpectation', 'ns_sec', 'pano',
       'profile_lea', 'profile_oslaua']

high_corr_drop_list = ['ageGroup', 'euRefVote2', 'euRefVotePost', 'euRefVoteUnsqueeze',
       'recallVote15', 'voted2015']

very_low_var = ['partyContactLD_6', 'partyContactLD_7', 'partyContactUKIP_1']


BES_numeric.drop(W9_vars,               axis=1, inplace=True, errors='ignore')
BES_numeric.drop(mostly_not_filled_out, axis=1, inplace=True, errors='ignore')
BES_numeric.drop(many_cat_drop_list   , axis=1, inplace=True, errors='ignore')
BES_numeric.drop(high_corr_drop_list  , axis=1, inplace=True, errors='ignore')
BES_numeric.drop(very_low_var         , axis=1, inplace=True, errors='ignore')
print(BES_numeric.shape)

In [4]:
## MULTIPLE IMPUTATION WITH CHAINED EQUATIONS
# Gold standard missing value imputation
# Only improvement is to create a lots of different imputed sets and run your final analysis on all of them
# to get an implict sensitivity analysis
# I'm still pathetically hoping to find a more efficient way of doing this
# Ideally one that also tested itself to tell you whether it was spitting out crap

%%time

#  n_pmm_neighbors=BES_numeric.shape[1],impute_type = "pmm",
mice = MICE(n_imputations = 100, n_burn_in = 10,# impute_type = "pmm",n_pmm_neighbors= 20
            verbose = True, init_fill_method='median') # 1hr+ ->  2h 7min 43s
X_filled_mice = mice.complete( BES_numeric.values )
BESW8mice = pd.DataFrame( X_filled_mice, columns = BES_numeric.columns )
BESW8mice.to_hdf( BES_data_folder+"BESW8mice.hdf", "BESW8mice" )

[MICE] Completing matrix with shape (31409, 378)
[MICE] Starting imputation round 1/110, elapsed time 0.494
[MICE] Starting imputation round 2/110, elapsed time 219.795
[MICE] Starting imputation round 3/110, elapsed time 446.740
[MICE] Starting imputation round 4/110, elapsed time 675.138
[MICE] Starting imputation round 5/110, elapsed time 906.328
[MICE] Starting imputation round 6/110, elapsed time 1134.556
[MICE] Starting imputation round 7/110, elapsed time 1365.437
[MICE] Starting imputation round 8/110, elapsed time 1595.760
[MICE] Starting imputation round 9/110, elapsed time 1825.520
[MICE] Starting imputation round 10/110, elapsed time 2055.469
[MICE] Starting imputation round 11/110, elapsed time 2286.088
[MICE] Starting imputation round 12/110, elapsed time 2515.985
[MICE] Starting imputation round 13/110, elapsed time 2747.761
[MICE] Starting imputation round 14/110, elapsed time 2976.440
[MICE] Starting imputation round 15/110, elapsed time 3206.947
[MICE] Starting imputa

In [None]:
# Not nearly as hardcore as MICE
# But can run into memory issues
%%time
X_filled_knn = KNN(k=3).complete(BES_numeric) # 20 mins+
BESW8knn = pd.DataFrame( X_filled_knn, columns = BES_numeric.columns)
BESW8knn.to_hdf(BES_data_folder+"BESW8knn.hdf", "BESW8knn")

In [6]:
%%time
# infers values outside of normal range
X_filled_softimpute = SoftImpute().complete(BES_numeric) # 40s
# BESW8softimpute = pd.DataFrame( X_filled_softimpute, columns = BES_numeric.columns)
# BESW8softimpute.to_hdf(BES_data_folder+"BESW8softimpute.hdf","BESW8softimpute")
BESW10softimpute = pd.DataFrame( X_filled_softimpute, columns = BES_numeric.columns)
BESW10softimpute.to_hdf(BES_data_folder+"BESW10softimpute.hdf","BESW10softimpute")

MemoryError: 

In [3]:
X_filled_median = BES_numeric.fillna(BES_numeric.median()).values
BESW8median = pd.DataFrame( X_filled_median, columns = BES_numeric.columns)
BESW8median.to_hdf(BES_data_folder+"BESW8median.hdf", "BESW8median")

In [4]:
X_filled_mean = BES_numeric.fillna(BES_numeric.mean()).values
BESW8mean = pd.DataFrame( X_filled_mean, columns = BES_numeric.columns)
BESW8mean.to_hdf(BES_data_folder+"BESW8mean.hdf", "BESW8mean")

In [9]:
# try to clear memory a bit
gc.collect()

0

In [None]:
# How do we test imputation?
# Impute on a dataset where we know the values
   # Warning - this may not be representative of situation for values we *don't know*
# Get representation subset of data
   # Warning - subsamples may respond differently just because of size (MICE should work better on big datasets ...)
    

# Practical questions
    # Memory requirements
    # variable typing (does it impute discrete values as discrete values or as continuous values?)


In [369]:
# take in missing_mask, number of missing_values
# take in variable types
# go through each missing_value
    # normalise to 0 - for random chance; 1 - for correct
    # ordinal: mean (0), correct value (1)
    
# Start with dumb algorithm -> right/wrong -> %
   


def score_mv_function( dataset_imputed,dataset_complete, missing_mask, num_values_to_drop, var_type ):
    score = 0
    dataset_maxdistance = dataset_complete.max()-dataset_complete.min()

    for (row, col) in missing_mask:
        col_name = dataset_complete.columns[col]
        col_type = var_type.loc[col_name]["type"]
#         print(col_name)
        if np.isnan( dataset_imputed.iloc[row,col] ):
            print("dataset_imputed isnan")
        if np.isnan( dataset_complete.iloc[row,col] ):
            print("dataset_complete isnan")            
        # ordinal: 0, 1, 2, 5, 6
        # non-ordinal: 3, 7
        if col_type in [3,7]:
            if round( dataset_imputed.iloc[row,col] ) == dataset_complete.iloc[row,col]:
                score = score+1 # 0/1
        elif col_type in [0, 1, 2, 5, 6]:
            if dataset_maxdistance[col_name]  == 0:
                continue
                # print(col_name, "dataset_maxdistance[col_name]  == 0")
            distance = np.abs( dataset_imputed.iloc[row,col] - dataset_complete.iloc[row,col])
            score = score + 1 - ( distance/dataset_maxdistance[col_name] )

            
    score = score / num_values_to_drop
    return score

In [254]:
def drop_fraction(dataset_complete,fraction_missing):

    dataset = dataset_complete.copy()
    n_rows, n_cols = dataset.shape

    nan_values = dataset.isnull()
    num_non_missing_values = (~nan_values).sum().sum()
    
    num_values_to_drop = int( num_non_missing_values * fraction_missing )
    
    # check to see that there are at least that many


    
#     missing_mask = np.zeros( (n_rows, n_cols), dtype=bool )

    missing_mask = []
    if num_values_to_drop > num_non_missing_values*10:
        raise Exception('not enough non-missing values!')
    
#     missing_raw_values = np.random.uniform( 0, 1, (n_rows, n_cols) )
#     missing_mask = missing_raw_values < fraction_missing    
#     overlap = sum(sum(nan_values.values & missing_mask))
    
#     dataset[missing_mask] = np.nan
#     drop_counter = overlap    #num_values_to_drop
    
    
    drop_counter = num_values_to_drop
    while drop_counter>0:
        row = int(np.random.rand()*n_rows)
        col = int(np.random.rand()*n_cols)
        if not np.isnan( dataset.iloc[row,col] ):
            dataset.iloc[row,col] = np.nan
#             missing_mask[row][col] = True
            missing_mask.append( (row,col) )
            drop_counter = drop_counter - 1

    return dataset, missing_mask, num_values_to_drop

In [298]:
%%time
BES_sample = BES_numeric.sample( frac=.1 ).dropna(axis=0, how='all').sample(axis=1, frac=.4 ).dropna(axis=1, how='all')
print( BES_sample.shape, BES_sample.isnull().mean().mean() )

(3141, 264) 0.5010286725902774
Wall time: 64 ms


In [299]:
%%time
incomplete_dataset, missing_mask, num_values_to_drop = drop_fraction(BES_sample,.1)
print( num_values_to_drop, incomplete_dataset.isnull().mean().mean() )

41375 0.5509247199791615
Wall time: 21.1 s


In [375]:
%%time
mice = MICE(n_imputations = 100, n_burn_in = 10, impute_type = "pmm",
            verbose = True, n_pmm_neighbors=BES_sample.shape[1], init_fill_method='median')
X_filled = mice.complete( incomplete_dataset.values )
dataset_imputed = pd.DataFrame( X_filled, columns = BES_sample.columns )
score = score_mv_function( dataset_imputed, BES_sample, missing_mask, num_values_to_drop, var_type  )
print( ("mice",score) )

In [370]:
X_filled = incomplete_dataset.fillna(incomplete_dataset.median()).values
dataset_imputed = pd.DataFrame( X_filled, columns = BES_sample.columns )
score = score_mv_function( dataset_imputed, BES_sample, missing_mask, num_values_to_drop, var_type  )
print( ("median",score) )

('median', 0.7323767945515518)


In [371]:
X_filled = incomplete_dataset.fillna(incomplete_dataset.mean()).values
dataset_imputed = pd.DataFrame( X_filled, columns = BES_sample.columns )
score = score_mv_function( dataset_imputed, BES_sample, missing_mask, num_values_to_drop, var_type  )
print( ("mean",score) )

('mean', 0.69239417561201744)


In [372]:
X_filled = SoftImpute().complete(incomplete_dataset) # 40s
dataset_imputed = pd.DataFrame( X_filled, columns = BES_sample.columns )
score = score_mv_function( dataset_imputed, BES_sample, missing_mask, num_values_to_drop, var_type  )
print( ("soft",score) )

[SoftImpute] Max Singular Value of X_init = 11286.870446
[SoftImpute] Iter 1: observed MAE=1.252531 rank=13
[SoftImpute] Iter 2: observed MAE=1.180320 rank=10
[SoftImpute] Iter 3: observed MAE=1.158438 rank=10
[SoftImpute] Iter 4: observed MAE=1.149442 rank=10
[SoftImpute] Iter 5: observed MAE=1.144342 rank=10
[SoftImpute] Iter 6: observed MAE=1.140927 rank=10
[SoftImpute] Iter 7: observed MAE=1.138421 rank=10
[SoftImpute] Iter 8: observed MAE=1.136483 rank=10
[SoftImpute] Iter 9: observed MAE=1.134926 rank=10
[SoftImpute] Iter 10: observed MAE=1.133639 rank=10
[SoftImpute] Iter 11: observed MAE=1.132550 rank=10
[SoftImpute] Iter 12: observed MAE=1.131624 rank=10
[SoftImpute] Iter 13: observed MAE=1.130833 rank=10
[SoftImpute] Iter 14: observed MAE=1.130152 rank=10
[SoftImpute] Iter 15: observed MAE=1.129559 rank=10
[SoftImpute] Iter 16: observed MAE=1.129037 rank=10
[SoftImpute] Iter 17: observed MAE=1.128574 rank=10
[SoftImpute] Iter 18: observed MAE=1.128159 rank=10
[SoftImpute] Ite

In [373]:
%%time
X_filled = KNN(k=30).complete(incomplete_dataset) # 20 mins+
dataset_imputed = pd.DataFrame( X_filled, columns = BES_sample.columns )
score = score_mv_function( dataset_imputed, BES_sample, missing_mask, num_values_to_drop, var_type  )
print( ("knn",score) )

Computing pairwise distances between 3141 samples
Computing distances for sample #1/3141, elapsed time: 2.468
Computing distances for sample #101/3141, elapsed time: 3.698
Computing distances for sample #201/3141, elapsed time: 4.878
Computing distances for sample #301/3141, elapsed time: 6.039
Computing distances for sample #401/3141, elapsed time: 7.218
Computing distances for sample #501/3141, elapsed time: 8.399
Computing distances for sample #601/3141, elapsed time: 9.586
Computing distances for sample #701/3141, elapsed time: 10.774
Computing distances for sample #801/3141, elapsed time: 11.962
Computing distances for sample #901/3141, elapsed time: 13.144
Computing distances for sample #1001/3141, elapsed time: 14.329
Computing distances for sample #1101/3141, elapsed time: 15.504
Computing distances for sample #1201/3141, elapsed time: 16.688
Computing distances for sample #1301/3141, elapsed time: 17.881
Computing distances for sample #1401/3141, elapsed time: 19.068
Computing

In [None]:
##
## ANOTHER MEASURE OF IMPUTING SUCCESS
## ---> NOT REDUCING CORRELATION IN DATASET
## ---> NOT RESPECTING RANGES (imputing values of 8 for categories with only 7 options)