In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

from keras.objectives import mse
from keras.models import Sequential
from keras.layers.core import Dropout, Dense
from keras.regularizers import l1_l2

from collections import defaultdict

Using TensorFlow backend.


In [2]:
df = pd.read_csv('Adult_copy_sample.csv')

In [14]:
def make_reconstruction_loss(n_features):

    def reconstruction_loss(input_and_mask, y_pred):
        X_values = input_and_mask[:, :n_features]
#        X_values.name = "$X_values"

        missing_mask = input_and_mask[:, n_features:]
#         missing_mask.name = "$missing_mask"
        observed_mask = 1 - missing_mask
#         observed_mask.name = "$observed_mask"

        X_values_observed = X_values * observed_mask
#         X_values_observed.name = "$X_values_observed"

        pred_observed = y_pred * observed_mask
#         pred_observed.name = "$y_pred_observed"

        return mse(y_true=X_values_observed, y_pred=pred_observed)
    return reconstruction_loss

In [4]:
def masked_mae(X_true, X_pred, mask):
    masked_diff = X_true[mask] - X_pred[mask]
    return np.mean(np.abs(masked_diff))

In [7]:
df_try = df.iloc[:, 2:193]
df_try = df_try.select_dtypes(exclude=['object'])


np.any(np.isnan(df_try))

False

In [8]:
df_try.head()

Unnamed: 0,age,sex,height,weight,ethnicity,abdose_abx_1___1,abdose_abx_1___2,abdose_abx_1___3,abdose_abx_1___4,abdose_abx_1___5,...,other_antimicrobial_28,other_antimicrobial_29,other_antimicrobial_30,other_antimicrobial_31,other_antimicrobial_32,other_antimicrobial_33,other_antimicrobial_34,other_antimicrobial_35,reason_for_icu_admission_1,reason_for_icu_admission_2
0,60.5,1,160.0,48.2,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,75.5,1,166.0,70.0,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,61.3,0,192.0,120.0,10,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,45.9,0,170.0,127.0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,83.7,1,154.0,81.0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [15]:
class Autoencoder:

    def __init__(self, data,
                 recurrent_weight=0.5,
                 optimizer="adam",
                 dropout_probability=0.5,
                 hidden_activation="relu",
                 output_activation="sigmoid",
                 init="glorot_normal",
                 l1_penalty=0.01,
                 l2_penalty=0.01):
        self.data = data.copy()
        self.recurrent_weight = recurrent_weight
        self.optimizer = optimizer
        self.dropout_probability = dropout_probability
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.init = init
        self.l1_penalty = l1_penalty
        self.l2_penalty = l2_penalty

    def _get_hidden_layer_sizes(self):
        n_dims = self.data.shape[1]
        return [min(2000, 8*n_dims),
                min(500, 2*n_dims),
                int(np.ceil(0.5*n_dims))]

    def _create_model(self):

        hidden_layer_sizes = self._get_hidden_layer_sizes()
        first_layer_size = hidden_layer_sizes[0]
        n_dims = self.data.shape[1]
        
        model = Sequential()

        model.add(Dense(
            first_layer_size,
            input_dim=2*n_dims,
            activation=self.hidden_activation,
            W_regularizer=l1_l2(self.l1_penalty, self.l2_penalty),
            init=self.init))
        model.add(Dropout(self.dropout_probability))

        for layer_size in hidden_layer_sizes[1:]:
            model.add(Dense(
                layer_size,
                activation=self.hidden_activation,
                W_regularizer=l1_l2(self.l1_penalty, self.l2_penalty),
                init=self.init))
            model.add(Dropout(self.dropout_probability))

        model.add(Dense(
            n_dims,
            activation=self.output_activation,
            W_regularizer=l1_l2(self.l1_penalty, self.l2_penalty),
            init=self.init))

        loss_function = make_reconstruction_loss(n_dims)

        model.compile(optimizer=self.optimizer, loss=loss_function)
        return model

    def fill(self, missing_mask):
        self.data[missing_mask] = -1

    def _create_missing_mask(self):
        if self.data.dtype != "f" and self.data.dtype != "d":
            self.data = self.data.astype(float)

        return np.isnan(self.data)

    def _train_epoch(self, model, missing_mask, batch_size):
        input_with_mask = np.hstack([self.data, missing_mask])
        n_samples = len(input_with_mask)
        n_batches = int(np.ceil(n_samples / batch_size))
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        X_shuffled = input_with_mask[indices]

        for batch_idx in range(n_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx + 1) * batch_size
            batch_data = X_shuffled[batch_start:batch_end, :]
            model.train_on_batch(batch_data, batch_data)
        return model.predict(input_with_mask)

    def train(self, batch_size=256, train_epochs=100):
        missing_mask = self._create_missing_mask()
        self.fill(missing_mask)
        self.model = self._create_model()

        observed_mask = ~missing_mask

        for epoch in range(train_epochs):
            X_pred = self._train_epoch(self.model, missing_mask, batch_size)
            observed_mae = masked_mae(X_true=self.data,
                                    X_pred=X_pred,
                                    mask=observed_mask)
            if epoch % 50 == 0:
                print("observed mae:", observed_mae)

            old_weight = (1.0 - self.recurrent_weight)
            self.data[missing_mask] *= old_weight
            pred_missing = X_pred[missing_mask]
            self.data[missing_mask] += self.recurrent_weight * pred_missing
        return self.data.copy()

In [9]:
prob_missing = 0.1
df_incomplete = pd.DataFrame(index=df_try.index, columns=df_try.columns)

ix = [(row, col) for row in range(df_try.shape[0]) for col in range(df_try.shape[1])]
for row, col in random.sample(ix, int(round((1 - prob_missing)*len(ix)))):
      df_incomplete.iat[row, col] = df_try.iat[row, col]

In [10]:
missing_encoded = pd.get_dummies(df_incomplete)

for col in df_try.columns:
    missing_cols = missing_encoded.columns.str.startswith(str(col) + "_")
    missing_encoded.loc[df_incomplete[col].isnull(), missing_cols] = np.nan

In [11]:
missing_encoded.head()

Unnamed: 0,age_17.2,age_19.1,age_19.8,age_22.5,age_24.9,age_25.0,age_25.5,age_25.7,age_29.3,age_30.0,...,other_antimicrobial_33_0,other_antimicrobial_33_1,other_antimicrobial_34_0,other_antimicrobial_34_1,other_antimicrobial_35_0,other_antimicrobial_35_1,reason_for_icu_admission_1_0,reason_for_icu_admission_1_1,reason_for_icu_admission_2_0,reason_for_icu_admission_2_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,,,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,,,1.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [17]:
imputer = Autoencoder(missing_encoded.values)
complete_encoded = imputer.train(train_epochs=1500, batch_size=256)



observed mae: 0.499893032682
observed mae: 0.488022494941
observed mae: 0.476423392148
observed mae: 0.465017065276
observed mae: 0.453826598943
observed mae: 0.442875000934
observed mae: 0.432177995197
observed mae: 0.42172930425
observed mae: 0.411526287215
observed mae: 0.401618503616
observed mae: 0.391992295314
observed mae: 0.382636064734
observed mae: 0.373546829468
observed mae: 0.3647429145
observed mae: 0.356214871012
observed mae: 0.347953604093
observed mae: 0.339957755432
observed mae: 0.332218293544
observed mae: 0.324735938469
observed mae: 0.317490821648
observed mae: 0.310489370752
observed mae: 0.3037237793
observed mae: 0.297182975691
observed mae: 0.290860302252
observed mae: 0.284745800168
observed mae: 0.278838383992
observed mae: 0.273129574021
observed mae: 0.267605760924
observed mae: 0.262263052341
observed mae: 0.257107200635


In [63]:
complete_encoded.shape

(144, 3033)

In [59]:
def mle(row):
    res = np.zeros(row.shape[0])
    res[np.argmax(row)] = 1
    return res


col_classes = [len(df_try[c].unique()) for c in df_try.columns]

dummy_df = pd.get_dummies(df_try)

mle_complete = None

# te=-1
for i, cnt in enumerate(col_classes):
#     te+=1
    start_idx = int(sum(col_classes[0:i]))
    col_true = dummy_df.values[:, start_idx:start_idx+cnt]
    col_completed = complete_encoded[:, start_idx:start_idx+cnt]
#     print(te,'col_completed done')
    mle_completed = np.apply_along_axis(mle, axis=1, arr=col_completed)
#     print(te,'mle_completed done')
    if mle_complete is None:
        mle_complete = mle_completed
    else:
        mle_complete = np.hstack([mle_complete, mle_completed])
        
print('Done')

ValueError: attempt to get argmax of an empty sequence

In [40]:
dummy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Columns: 3240 entries, age_17.2 to reason_for_icu_admission_2_1
dtypes: uint8(3240)
memory usage: 455.7 KB


In [29]:
dummy_df.head()

Unnamed: 0,age,sex,height,weight,ethnicity,abdose_abx_1___1,abdose_abx_1___2,abdose_abx_1___3,abdose_abx_1___4,abdose_abx_1___5,...,other_antimicrobial_28,other_antimicrobial_29,other_antimicrobial_30,other_antimicrobial_31,other_antimicrobial_32,other_antimicrobial_33,other_antimicrobial_34,other_antimicrobial_35,reason_for_icu_admission_1,reason_for_icu_admission_2
0,60.5,1,160.0,48.2,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,75.5,1,166.0,70.0,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,61.3,0,192.0,120.0,10,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,45.9,0,170.0,127.0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,83.7,1,154.0,81.0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [95]:
def reverse_dummy(df_dummies):
    pos = defaultdict(list)
    vals = defaultdict(list)

    for i, c in enumerate(df_dummies.columns):
        if "_" in c:
            k, v = c.rsplit("_", 1)
            pos[k].append(i)
            vals[k].append(v)
        else:
            pos["_"].append(i)

    df = pd.DataFrame({k: pd.Categorical.from_codes(
                              np.argmax(df_dummies.iloc[:, pos[k]].values, axis=1),
                              vals[k])
                      for k in vals})

    df[df_dummies.columns[pos["_"]]] = df_dummies.iloc[:, pos["_"]]
    return df

In [96]:
rev_df = reverse_dummy(pd.DataFrame(data=mle_complete, columns=missing_encoded.columns))
rev_df = rev_df[list(df_try.columns)]

In [118]:
for column in rev_df.columns:
    rev_df[column] = rev_df[column].astype(float)

In [125]:
df = df_try.astype(float)

In [141]:
incorrect = (rev_df != df)
incorrect_cnts = incorrect.apply(pd.value_counts)
incorrect_sum = incorrect_cnts.sum(axis=1)
incorrect_sum[1]

5550.0

In [144]:
sum_incorrect = 0

for col in df_try.columns:
    ind_incorrect = sum(df[col]!=rev_df[col])
    print(ind_incorrect)   

7
53
134
18
122
16
1
1
0
18
34
17
0
1
0
30
10
9
3
6
3
2
1
0
2
0
2
0
3
1
3
49
133
45
15
13
19
6
1
0
3
3
58
64
34
62
4
22
19
6
0
7
4
1
2
1
0
1
11
104
84
141
44
135
10
69
141
0
3
0
1
138
4
10
70
76
11
11
8
6
54
127
79
8
2
36
127
6
116
132
29
56
73
44
36
24
31
30
31
9
25
27
30
11
6
82
103
17
22
113
92
32
85
87
58
64
6
37
36
51
53
3
41
5
47
24
19
30
89
43
67
95
11
26
72
86
86
18
46
135
2
7
10
4
34
16
4
0
5
1
2
18
1
1
3
6
1
2
1
1
1
9
2
2
1
1
6
4
8
1
28
27
1
2
1
9
0
0
1
1
2
1
1
1
1
99
45


In [142]:
missing = df_incomplete.apply(pd.isnull)
missing_cnts = missing.apply(pd.value_counts)
missing_sum = missing_cnts.sum(axis=1)
missing_sum[1]

2693

In [143]:
accuracy = 1.0 - (incorrect_sum[1] / missing_sum[1])
print(accuracy)

-1.06089862607
