In [1]:
import numpy as np
import numpy.ma as ma
import sys
from sklearn.datasets import load_boston, load_iris
np.random.seed(42)

In [2]:
# Put ../auto_impute/ on the import path (presumably where the local modules
# imported by this notebook live -- the path is relative to the kernel's
# working directory). Guarded so that re-running the cell does not keep
# appending duplicate entries to sys.path.
if "../auto_impute/" not in sys.path:
    sys.path.append("../auto_impute/")

In [3]:
from csv_reader import CSVReader
from dp import DP
from sg import SingleGaussian
from gmm import GMM
from mi import MeanImpute

Generate the data for plots of RMSE and log-likelihood (LL) versus the percentage of missing values (MCAR) for the DP, SG, MI, and GMM models on the Iris and Boston Housing datasets

In [12]:
def run_model_on_dataset(data_loader, model_constructor, model_params={}):
    # load dataset
    X, _ = data_loader(return_X_y=True)
    rmses = np.zeros(9)
    lls = np.zeros_like(rmses)
    
    for i in range(1, 10):
        # randomly remove a % of the dataset
        mask = np.random.rand(*X.shape) <= i/10
        masked_X = ma.masked_array(X, mask)
        
#         model = model_constructor(masked_X,m0 = ma.mean(masked_X, axis=0),**model_params)
        model = model_constructor(masked_X,**model_params)
        if hasattr(model, 'fit'):
            model.fit(max_iters=100, ϵ=0.01)
            
        imputed_X = model.ml_imputation()
        rmses[i-1] = np.sqrt(np.mean(np.power(X - imputed_X, 2)))
        
        ll = model.test_ll(X)
        lls[i-1] = np.mean(ll[mask])
        
    return rmses, lls       

In [15]:
# Single sweep of SingleGaussian on Iris with verbose=True, to inspect the fitting log.
r, l = run_model_on_dataset(
    data_loader=load_iris,
    model_constructor=SingleGaussian,
    model_params={"verbose": True},
)

Fitting single gaussian using EM:
Starting Avg LL: -0.918939
Fitting single gaussian using EM:
Starting Avg LL: -0.918939
Fitting single gaussian using EM:
Starting Avg LL: -0.918939
Fitting single gaussian using EM:
Starting Avg LL: -0.918939
Fitting single gaussian using EM:
Starting Avg LL: -0.918939
Fitting single gaussian using EM:
Starting Avg LL: -0.918939
Fitting single gaussian using EM:
Starting Avg LL: -0.918939
Fitting single gaussian using EM:
Starting Avg LL: -0.918939
Fitting single gaussian using EM:
Starting Avg LL: -0.918939
Iter: 0			Avg LL: -0.799613
Iter: 1			Avg LL: -0.732875
Iter: 2			Avg LL: -0.669218
Iter: 3			Avg LL: -0.608802
Iter: 4			Avg LL: -0.551744
Iter: 5			Avg LL: -0.498118
Iter: 6			Avg LL: -0.447955
Iter: 7			Avg LL: -0.401248
Iter: 8			Avg LL: -0.357955
Iter: 9			Avg LL: -0.318010
Iter: 10			Avg LL: -0.281321
Iter: 11			Avg LL: -0.247781
Iter: 12			Avg LL: -0.217264
Iter: 13			Avg LL: -0.189632
Iter: 14			Avg LL: -0.164737
Iter: 15			Avg LL: -0.1424

In [18]:
# SG on iris (constructed with independent_vars=True)
# 10 repetitions so a mean and a spread can be reported;
# one row per repetition, one column per missing-data level
rmses = np.zeros((10, 9))
lls = np.zeros((10, 9))

for trial in range(10):
    print(trial)
    rmses[trial, :], lls[trial, :] = run_model_on_dataset(
        load_iris, SingleGaussian, {"independent_vars": True}
    )

0
1
2
3
4
5
6
7
8
9


In [19]:
# RMSE per missing-data level: mean over repetitions, then standard deviation
print(rmses.mean(axis=0), rmses.std(axis=0), sep="\n")

[0.33944266 0.47378417 0.58659555 0.66429128 0.75628652 0.85036415
 0.90458599 0.96594156 1.05581621]
[0.04477307 0.04154822 0.02546068 0.0290023  0.03420639 0.03222403
 0.02374863 0.01348576 0.05099736]


In [20]:
# Log-likelihood per missing-data level: mean over repetitions, then standard deviation
print(lls.mean(axis=0), lls.std(axis=0), sep="\n")

[-1.26865567 -1.25435851 -1.2874415  -1.30447339 -1.44070357 -1.59017655
 -1.92969948 -2.4495219  -5.10813152]
[0.12616627 0.07193603 0.0592975  0.06399185 0.07411055 0.12715965
 0.27084744 0.26129236 1.33725509]


In [21]:
# np.savetxt(X=np.stack([np.arange(1,10)*10, np.mean(rmses, axis=0), np.std(rmses, axis=0), np.mean(lls, axis=0), np.std(lls, axis=0)], axis=1),
#            delimiter=",",header="x,rm,rs,lm,ls",fname="msg_iris_test.csv", fmt="%.7g", comments='')

In [6]:
# DP on iris
# 10 repetitions so a mean and a spread can be reported;
# one row per repetition, one column per missing-data level
rmses = np.zeros((10, 9))
lls = np.zeros((10, 9))

for trial in range(10):
    print(trial)
    rmses[trial, :], lls[trial, :] = run_model_on_dataset(load_iris, DP)

0
1
2
3
4
5
6
7
8
9


In [7]:
# Summary over repetitions, one value per missing-data level:
# RMSE mean, RMSE std, LL mean, LL std
print(
    rmses.mean(axis=0),
    rmses.std(axis=0),
    lls.mean(axis=0),
    lls.std(axis=0),
    sep="\n",
)

[0.55774574 0.73373964 0.92318899 1.08093537 1.19371543 1.26009159
 1.36411511 1.44981175 1.51227675]
[0.07996679 0.07047086 0.05337376 0.05792573 0.03612686 0.09429135
 0.10781913 0.12249172 0.16580825]
[-4.01079614 -3.92478502 -4.05376544 -4.24009885 -4.21582627 -4.76353797
 -5.25689288 -6.19328716 -7.86228848]
[0.47713732 0.15868954 0.24120106 0.32835007 0.20902054 0.20700079
 0.39938846 0.40508566 0.47662959]


In [8]:
# np.savetxt(X=np.stack([np.arange(1,10)*10, np.mean(rmses, axis=0), np.std(rmses, axis=0), np.mean(lls, axis=0), np.std(lls, axis=0)], axis=1),
#            delimiter=",",header="x,rm,rs,lm,ls",fname="dp_iris_test.csv", fmt="%.7g", comments='')

In [5]:
# MI on iris
# 10 repetitions so a mean and a spread can be reported;
# one row per repetition, one column per missing-data level
rmses = np.zeros((10, 9))
lls = np.zeros((10, 9))

for trial in range(10):
    print(trial)
    rmses[trial, :], lls[trial, :] = run_model_on_dataset(load_iris, MeanImpute)

0
1


  lls[n, d] = np.log(stats.norm.pdf(test_data[n, d], loc=self.μ[d], scale=1e-1))


2
3
4
5
6
7
8
9


In [6]:
# Summary over repetitions, one value per missing-data level:
# RMSE mean, RMSE std, LL mean, LL std
print(
    rmses.mean(axis=0),
    rmses.std(axis=0),
    lls.mean(axis=0),
    lls.std(axis=0),
    sep="\n",
)

[0.35322416 0.48198945 0.60061413 0.68006311 0.74858741 0.82310308
 0.90609139 0.97588824 1.05232453]
[0.03881226 0.04310652 0.02108531 0.03021143 0.02024974 0.0156349
 0.0227959  0.03103227 0.03738613]
[-58.68820131 -57.14382078 -58.01103283 -56.72235946 -54.97268174
 -54.68677115 -57.23462467 -58.30725201         -inf]
[12.40845092  7.19065295  2.93250312  4.39693955  2.07086561  2.34304571
  2.11388371  2.70482156         nan]


  x = asanyarray(arr - arrmean)


In [7]:
# np.savetxt(X=np.stack([np.arange(1,10)*10, np.mean(rmses, axis=0), np.std(rmses, axis=0), np.mean(lls, axis=0), np.std(lls, axis=0)], axis=1),
#            delimiter=",",header="x,rm,rs,lm,ls",fname="mi_iris_test.csv", fmt="%.7g", comments='')

  x = asanyarray(arr - arrmean)


In [16]:
# GMM on iris (3 mixture components)
# 10 repetitions so a mean and a spread can be reported;
# one row per repetition, one column per missing-data level
rmses = np.zeros((10, 9))
lls = np.zeros((10, 9))

for trial in range(10):
    print(trial)
    rmses[trial, :], lls[trial, :] = run_model_on_dataset(
        load_iris, GMM, {"num_components": 3}
    )

0
1
2
3
4
5
6
7
8
9


In [17]:
# Summary over repetitions, one value per missing-data level:
# RMSE mean, RMSE std, LL mean, LL std
print(
    rmses.mean(axis=0),
    rmses.std(axis=0),
    lls.mean(axis=0),
    lls.std(axis=0),
    sep="\n",
)

[0.8247901  1.39741195 0.84539919 1.37009881 0.83022527 1.03687559
 4.02625484 6.07836903 6.99796845]
[0.54763152 0.48197278 0.86062669 0.49086184 0.59280844 1.57382629
 5.92128991 7.66876541 9.64189914]
[0.097766   0.08641216 0.12317278 0.11233042 0.13800434 0.13067409
 0.12326236 0.10933529 0.08261871]
[0.03985159 0.02420976 0.03553549 0.01757556 0.01640817 0.03753369
 0.04503257 0.04727305 0.04315229]


In [18]:
# np.savetxt(X=np.stack([np.arange(1,10)*10, np.mean(rmses, axis=0), np.std(rmses, axis=0), np.mean(lls, axis=0), np.std(lls, axis=0)], axis=1),
#            delimiter=",",header="x,rm,rs,lm,ls",fname="gmm3_iris_test.csv", fmt="%.7g", comments='')