In [122]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model
import numpy as np
import warnings
warnings.filterwarnings('ignore') #to avoid some ugly warnings

In [4]:
def autoencoder(n, theta):
    """Build an overcomplete denoising autoencoder as a Keras model.

    The input is corrupted with dropout (rate 0.5), widened by ``theta``
    units per layer over three encoding layers (``n + theta``,
    ``n + 2*theta``, ``n + 3*theta``), then narrowed symmetrically over two
    decoding layers and projected back to ``n`` units. Every dense layer,
    including the output, uses ``tanh``, so reconstructions lie in (-1, 1).

    Args:
        n: Number of input features (width of the input and output layers).
        theta: Number of units added at each successive encoding layer.

    Returns:
        An uncompiled ``tensorflow.keras.Model`` mapping an input of shape
        ``(batch, n)`` to its reconstruction of the same shape.
    """
    # Keep a handle on the Input tensor: it is required to build the Model
    # and must not be overwritten by the dropout output.
    inputs = layers.Input(shape=(n,))
    # Dropout on the input acts as the corruption step.
    hidden = layers.Dropout(0.5)(inputs)

    # H1..H5: widths grow up to the widest (shared) layer H3, then shrink.
    hidden_widths = (
        n + theta,
        n + 2 * theta,
        n + 3 * theta,
        n + 2 * theta,
        n + theta,
    )
    for width in hidden_widths:
        hidden = layers.Dense(width, activation='tanh')(hidden)

    # Output layer restores the original dimensionality.
    output = layers.Dense(n, activation='tanh')(hidden)

    return Model(inputs=inputs, outputs=output)
    
    
    

In [293]:
class clean_data(object):
    """Load the benchmark CSV files and expose one cleaning method per dataset.

    Each ``XX_df`` method returns a DataFrame derived from the matching
    ``self.xx_df`` attribute. Methods that recode or encode columns work on
    a copy, so they can be called repeatedly and never alter the loaded data.

    Args:
        data_dir: Directory containing the CSV files (default ``'data'``).
    """

    # attribute name -> CSV file name inside the data directory
    _SOURCES = {
        'bh_df': 'bh.csv',
        'bc_df': 'bc.csv',
        'dn_df': 'dna.csv',
        'gl_df': 'gl.csv',
        'hv_df': 'hv.csv',
        'is_df': 'is.csv',
        'on_df': 'on.csv',
        'sl_df': 'sl.csv',
        'sr_df': 'sr.csv',
        'st_df': 'st.csv',
        'sn_df': 'sn.csv',
        'sb_df': 'sb.csv',
        'vc_df': 'vc.csv',
        'vw_df': 'vw.csv',
        'zo_df': 'zo.csv',
    }

    def __init__(self, data_dir='data'):
        # The first CSV column is used as the index for every dataset.
        for attribute, file_name in self._SOURCES.items():
            frame = pd.read_csv(f'{data_dir}/{file_name}', index_col=0)
            setattr(self, attribute, frame)

    @staticmethod
    def _recode(column, mapping):
        """Return ``column`` with values found in ``mapping`` replaced.

        Values that are not keys of ``mapping`` (including NaN) are kept
        unchanged.
        """
        return column.map(lambda value: mapping.get(value, value))

    @staticmethod
    def _one_hot(df, column):
        """Return ``df`` with ``column`` replaced by its one-hot columns.

        The indicator columns are appended after the existing columns.
        """
        dummies = pd.get_dummies(df[column])
        return pd.concat([df, dummies], axis=1).drop(columns=column)

    def BH_df(self):
        '''
        No modifications: the loaded BH dataframe is returned as is
        (the same object, not a copy).
        '''
        return self.bh_df

    def BC_df(self):
        '''
        Returns cleaned BC dataframe:

            - Class feature converted in 0 (malignant) and 1 (benign);

            - Id feature deleted (not useful information for the model).
        '''
        df = self.bc_df.copy()
        # only two opposite labels, so a single binary column is sufficient
        df['Class'] = self._recode(df['Class'], {'benign': 1, 'malignant': 0})
        return df.drop(columns='Id')

    def DN_df(self):
        '''
        Returns cleaned DN dataframe:
            - Class feature: one hot encoding is applied.
        '''
        return self._one_hot(self.dn_df, 'Class')

    def GL_df(self):
        '''
        No modifications: the loaded GL dataframe is returned as is
        (the same object, not a copy).
        '''
        return self.gl_df

    def HV_df(self):
        '''
        Returns cleaned HV dataframe:

            - y values converted in 1 and n values converted in 0
              (any other value, e.g. missing, is left untouched);

            - Class feature: one hot encoding is applied.
        '''
        df = self.hv_df.copy()
        # recode the features before encoding so that the indicator
        # columns created from Class are never touched
        for feature in df.columns:
            if feature != 'Class':
                df[feature] = self._recode(df[feature], {'y': 1, 'n': 0})
        return self._one_hot(df, 'Class')

    def IS_df(self):
        '''
        Returns cleaned IS dataframe:

            - Class feature converted in 0 (bad) and 1 (good).
        '''
        df = self.is_df.copy()
        df['Class'] = self._recode(df['Class'], {'bad': 0, 'good': 1})
        return df

    def ON_df(self):
        '''
        No modifications: the loaded ON dataframe is returned as is
        (the same object, not a copy).
        '''
        return self.on_df

    def SL_df(self):
        '''
        Returns cleaned SL dataframe:
            - classes: one hot encoding is applied.
        '''
        return self._one_hot(self.sl_df, 'classes')

    def SR_df(self):
        '''
        Returns cleaned SR dataframe with the Motor and Screw labels
        replaced by their 1-based alphabetical rank, i.e. for labels A-E:
            - 1 (A),
            - 2 (B),
            - 3 (C),
            - 4 (D),
            - 5 (E).

        Each column is ranked on its own labels, so the two columns may
        contain a different number of distinct labels.
        '''
        df = self.sr_df.copy()
        # labels are ordered (A, B, ...), so an ordinal code is used instead
        # of one hot encoding; sorting makes the mapping deterministic
        for column in ('Motor', 'Screw'):
            labels = sorted(df[column].dropna().unique())
            ranks = {label: rank for rank, label in enumerate(labels, start=1)}
            df[column] = self._recode(df[column], ranks)
        return df

    def ST_df(self):
        '''
        Returns cleaned ST dataframe:
            - Class feature: one hot encoding.
        '''
        return self._one_hot(self.st_df, 'Class')

    def SN_df(self):
        '''
        Returns cleaned SN dataframe:
            - Class feature: one hot encoding.
        '''
        return self._one_hot(self.sn_df, 'Class')

    def SB_df(self):
        '''
        Returns cleaned SB dataframe:
            - Class feature: one hot encoding is applied.
        '''
        return self._one_hot(self.sb_df, 'Class')

    def VC_df(self):
        '''
        Returns cleaned VC dataframe:
            - Class feature: one hot encoding is applied.
        '''
        return self._one_hot(self.vc_df, 'Class')

    def VW_df(self):
        '''
        Returns cleaned VW dataframe:
            - Class feature: one hot encoding is applied.
        '''
        return self._one_hot(self.vw_df, 'Class')

    def ZO_df(self):
        '''
        Returns cleaned ZO dataframe:
            - features having boolean values: substitution with 1 (True) and 0 (False).
            - type feature: one hot encoding.
        '''
        df = self.zo_df.copy()
        # values other than True/False pass through unchanged, so numeric
        # features keep their values
        for feature in df.columns:
            if feature != 'type':
                df[feature] = self._recode(df[feature], {True: 1, False: 0})
        return self._one_hot(df, 'type')
    

In [294]:
# Load every raw CSV once; each call below builds one model-ready frame.
cleaner = clean_data()

# Datasets returned without modification.
bh_df = cleaner.BH_df()
gl_df = cleaner.GL_df()
on_df = cleaner.ON_df()

# Datasets whose label columns are recoded as numbers.
bc_df = cleaner.BC_df()
is_df = cleaner.IS_df()
sr_df = cleaner.SR_df()

# Datasets whose class column is one-hot encoded.
dn_df = cleaner.DN_df()
sl_df = cleaner.SL_df()
st_df = cleaner.ST_df()
sn_df = cleaner.SN_df()
sb_df = cleaner.SB_df()
vc_df = cleaner.VC_df()
vw_df = cleaner.VW_df()

# Datasets that get both a value recoding and a one-hot encoding.
hv_df = cleaner.HV_df()
zo_df = cleaner.ZO_df()


In [123]:
class missingness(object):
    """Inject artificial missing values into a dataset loaded from CSV.

    One uniform random draw in [0, 1] is generated per observation at
    construction time. A row is eligible for missingness when its draw is
    less than or equal to ``th``. Every method works on a copy of the loaded
    data, so calls are independent of each other and ``self.df`` is never
    modified.

    Args:
        file: Path of the CSV file; its first column is used as the index.
        th: Missingness threshold compared against the per-row draws.
    """

    def __init__(self, file, th):
        # load dataset
        self.df = pd.read_csv(file, index_col=0)
        # set missingness proportion
        self.th = th
        # random uniform vector with values in [0,1], length = n obs.
        self.vector_1 = np.random.uniform(0, 1, len(self.df))
        # number of features
        self.n_attributes = len(self.df.columns)
        # list of feature indices
        self.attr_idx = list(range(self.n_attributes))

    def _with_draws(self, column_name):
        """Return a copy of the data with the random draws stored in ``column_name``."""
        frame = self.df.copy()
        frame[column_name] = self.vector_1
        return frame

    def _eligible_rows(self):
        """Return the positions of the rows whose draw is at or below the threshold."""
        return np.flatnonzero(np.asarray(self.vector_1) <= self.th)

    @staticmethod
    def _blank(frame, rows, columns=None):
        """Return ``frame`` with NaN at the given row/column positions.

        ``columns=None`` blanks the whole row. ``DataFrame.mask`` is used so
        that columns which cannot hold NaN are upcast instead of raising.
        """
        cells = np.zeros(frame.shape, dtype=bool)
        if columns is None:
            cells[rows, :] = True
        else:
            cells[np.ix_(rows, columns)] = True
        return frame.mask(cells)

    def MCAR_uniform(self):
        """Set every value of each eligible row to NaN.

        Returns:
            A copy of the data with an extra ``'MCAR uniform'`` column holding
            the draws; eligible rows are NaN in all columns, that one included.
        """
        frame = self._with_draws('MCAR uniform')
        return self._blank(frame, self._eligible_rows())

    def MCAR_random(self):
        """Set a random half of the features to NaN in each eligible row.

        The same randomly chosen ``n_attributes // 2`` features are blanked
        in every eligible row.

        Returns:
            A copy of the data with an extra ``'MCAR random'`` column holding
            the draws (this column is never blanked).
        """
        frame = self._with_draws('MCAR random')
        # random half sample of feature indices
        half_attr = np.random.choice(
            self.attr_idx, size=self.n_attributes // 2, replace=False
        )
        return self._blank(frame, self._eligible_rows(), half_attr)

    def MNAR_uniform(self):
        """Set whole rows to NaN depending on the values of two random features.

        Two distinct features x1 and x2 are drawn at random and their medians
        m1 and m2 computed. A row is blanked when it is eligible and
        ``x1 <= m1`` or ``x2 >= m2``.

        Returns:
            A copy of the data with an extra ``'MNAR uniform'`` column holding
            the draws; blanked rows are NaN in all columns, that one included.
        """
        frame = self._with_draws('MNAR uniform')
        # pick 2 random attributes
        first, second = np.random.choice(self.attr_idx, size=2, replace=False)
        x1 = frame.iloc[:, first]
        x2 = frame.iloc[:, second]
        # skipna=False: a feature that already contains NaN yields a NaN
        # median, which makes its comparison below False for every row
        median_1 = x1.median(skipna=False)
        median_2 = x2.median(skipna=False)
        # each comparison is parenthesised because & and | bind tighter
        # than <= and >=
        value_condition = ((x1 <= median_1) | (x2 >= median_2)).to_numpy()
        eligible = np.asarray(self.vector_1) <= self.th
        return self._blank(frame, np.flatnonzero(eligible & value_condition))
   
