## Package installation

In [None]:
!pip install feature_engine
!pip install tabgan

In [None]:
!pip install variants
!pip install smote_variants

### Imports

In [None]:
import pandas as pd 
import numpy as np
import os
from datetime import time
import collections 
from matplotlib import pyplot as plt
import argparse
import json
import scipy.io as sio
from sklearn import metrics, preprocessing
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier 
import tensorflow as tf 
tf.config.experimental_run_functions_eagerly(True)
from tensorflow.keras.layers import Dense
from sklearn.decomposition import PCA
import variants as variants
from sklearn.metrics import accuracy_score, make_scorer
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import cohen_kappa_score
import smote_variants as sv
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import scipy.stats as stats
from scipy.stats import ks_2samp
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
from tabgan.sampler import OriginalGenerator, GANGenerator
np.random.seed(42)

## WGAN and related functions

In [None]:
# generator
class Generator(tf.keras.Model):
  """WGAN generator: turns a batch of noise vectors into synthetic feature rows.

  Args:
    n_inp: number of units of the output layer (one per generated feature).
    n_noise: number of units of the first, activation-free dense layer.
    n_hid: number of units of the hidden ReLU layer.
  """

  def __init__(self,n_inp,n_noise,n_hid=128):
    super().__init__()
    # The initializer *class* is handed to every layer (not one shared instance).
    glorot=tf.keras.initializers.GlorotUniform
    # linear projection -> ReLU hidden layer -> sigmoid output
    self.input_layer=Dense(units=n_noise,kernel_initializer=glorot)
    self.hidden_layer=Dense(units=n_hid,kernel_initializer=glorot,activation="relu")
    self.output_layer=Dense(units=n_inp,kernel_initializer=glorot,activation="sigmoid")

  def call(self,inputs):
    """Feed `inputs` through the three dense layers in order and return the result."""
    activations=inputs
    for layer in (self.input_layer,self.hidden_layer,self.output_layer):
      activations=layer(activations)
    return activations
# critic   
class Critic(tf.keras.Model):
  """WGAN critic: assigns one unbounded score to every input row.

  Args:
    n_inp: number of units of the first, activation-free dense layer.
    n_hid: number of units of the hidden ReLU layer.
  """

  def __init__(self,n_inp,n_hid=128):
    super().__init__()
    # The initializer *class* is handed to every layer (not one shared instance).
    glorot=tf.keras.initializers.GlorotUniform
    self.input_layer=Dense(units=n_inp,kernel_initializer=glorot)
    self.hidden_layer=Dense(units=n_hid,kernel_initializer=glorot,activation="relu")
    # single linear unit: the raw critic score, no activation
    self.logits=Dense(units=1,kernel_initializer=glorot,activation=None)

  def call(self,inputs):
    """Feed `inputs` through the three dense layers in order and return the scores."""
    activations=inputs
    for layer in (self.input_layer,self.hidden_layer,self.logits):
      activations=layer(activations)
    return activations

@tf.function
def train_step(real_data,gen,critic,noise_dim,generator_optimizer,critic_optimizer):
  """Run one WGAN update of the generator and the critic on a single batch.

  Args:
    real_data: batch of real rows; its first dimension sets the number of noise vectors.
    gen: generator model, called as gen(noise, training=True).
    critic: critic model, called as critic(rows, training=True).
    noise_dim: length of each gaussian noise vector.
    generator_optimizer: optimizer applied to the generator's trainable variables.
    critic_optimizer: optimizer applied to the critic's trainable variables.

  Returns:
    Tuple (wasserstein, gen_loss, critic_loss_real, critic_loss_fake), where
    wasserstein = mean critic score on real rows - mean critic score on fake rows.
  """
  # gaussian noise z: one vector per real row of the batch
  n_rows=real_data.shape[0]
  z=tf.random.normal([n_rows,noise_dim])
  with tf.GradientTape() as gen_tape,tf.GradientTape() as critic_tape:
    # x' = G(z)
    fake_data=gen(z,training=True)
    # critic scores for the real rows and for the generated rows
    real_output=critic(real_data,training=True)
    fake_output=critic(fake_data,training=True)
    critic_loss_real=tf.reduce_mean(real_output)
    critic_loss_fake=tf.reduce_mean(fake_output)
    # the critic minimises (fake score - real score)
    critic_loss=critic_loss_fake-critic_loss_real
    # the generator minimises the negated critic score of its samples
    gen_loss=-critic_loss_fake
  wasserstein=critic_loss_real-critic_loss_fake
  # gradients of each loss w.r.t. the matching network's weights
  gen_grads=gen_tape.gradient(gen_loss,gen.trainable_variables)
  critic_grads=critic_tape.gradient(critic_loss,critic.trainable_variables)
  # update generator and critic weights
  generator_optimizer.apply_gradients(zip(gen_grads,gen.trainable_variables))
  critic_optimizer.apply_gradients(zip(critic_grads,critic.trainable_variables))
  # clip every critic weight into [-0.01, 0.01] after the update
  for weight in critic.trainable_variables:
    weight.assign(tf.clip_by_value(weight,-0.01,0.01))
  return wasserstein,gen_loss,critic_loss_real,critic_loss_fake

In [None]:
def generate_synthetic_samples(generator,class_id,headers_name,nb_instance,NOISE_DIM):
  """Sample rows from a trained generator and return them as a labelled DataFrame.

  Args:
    generator: callable mapping a [nb_instance, NOISE_DIM] noise tensor to rows.
    class_id: label written into every generated row.
    headers_name: column names for the generated features.
    nb_instance: number of rows to generate.
    NOISE_DIM: length of each gaussian noise vector.

  Returns:
    DataFrame with the columns `headers_name` plus a column named "0" that
    holds `class_id` for every row.
  """
  # draw the noise and generate the instances
  noise=tf.random.normal([nb_instance,NOISE_DIM])
  generated=generator(noise)
  # wrap the generated rows and attach the class label column
  frame=pd.DataFrame(data=np.array(generated),columns=headers_name)
  frame["0"]=np.repeat(class_id,len(generated))
  return frame
def fake_data_generation(training_data,nb_instances_to_generate,target,
                         batch_size=8,noise_dim=10,learning_rate=0.001,epochs=150):
  """Train a WGAN on the rows of one class and sample synthetic rows from it.

  Args:
    training_data: DataFrame holding the feature columns and the `target` column.
      The class label is read from its first row.
    nb_instances_to_generate: number of synthetic rows to return.
    target: name of the label column in `training_data`.
    batch_size: rows per training batch.
    noise_dim: length of the gaussian noise vectors fed to the generator.
    learning_rate: learning rate of both RMSprop optimizers.
    epochs: number of passes over the training rows.

  Returns:
    DataFrame with one column per feature plus a column named "0" holding the
    class label (see generate_synthetic_samples).

  Raises:
    ValueError: if `training_data` has no rows.
  """
  if len(training_data)==0:
    raise ValueError("fake_data_generation needs at least one training row")
  class_id=training_data[target].values[0]
  print('CLASS ID',class_id)
  # Select the features by name instead of "everything but the last column";
  # identical when the target is the last column, correct when it is not.
  features=training_data.drop(columns=[target])
  headers_name=list(features.columns.values)
  X=features.values.astype("float32")
  # number of features in the training data
  n_inp=X.shape[1]
  # slice the training data into small batches
  train_dataset=tf.data.Dataset.from_tensor_slices(X).batch(batch_size)
  # generator output width = number of features; critic consumes the same rows
  generator=Generator(n_inp,noise_dim)
  critic=Critic(n_inp)
  # one RMSprop optimizer per network
  generator_optimizer=tf.keras.optimizers.RMSprop(learning_rate)
  critic_optimizer=tf.keras.optimizers.RMSprop(learning_rate)
  for epoch in range(epochs):
    n_batches=0
    sum_gen=0
    sum_critic_real=0
    sum_critic_fake=0
    for batch in train_dataset:
      _wasserstein,gen_loss,critic_loss_real,critic_loss_fake=train_step(
          batch,generator,critic,noise_dim,generator_optimizer,critic_optimizer)
      sum_gen+=gen_loss
      sum_critic_real+=critic_loss_real
      sum_critic_fake+=critic_loss_fake
      n_batches+=1
    # report the per-epoch mean losses every 50 epochs
    if epoch%50==0:
      print("Epoch %d / %d completed. Gen loss: %.8f. Desc loss_real: %.8f . Desc loss_fake: %.8f"%(epoch+1,epochs,sum_gen/n_batches,sum_critic_real/n_batches,sum_critic_fake/n_batches))
  return generate_synthetic_samples(generator,class_id,headers_name,nb_instances_to_generate,noise_dim)

In [None]:
# the function to generate fake data with WGAN for the given classes 
def gen_data(X_train,y_train,target,classes):
  """Generate WGAN samples so each requested class reaches the largest class size.

  Args:
    X_train: DataFrame of training features.
    y_train: training labels as a Series, a single-column DataFrame or an
      array-like.
    target: name used for the label column of the returned frame.
    classes: iterable of class labels to oversample.

  Returns:
    DataFrame of synthetic rows (features plus `target` column). Empty when no
    requested class needs additional rows.
  """
  # Counter(DataFrame) would count column names rather than labels, so reduce
  # the labels to one dimension before counting.
  if isinstance(y_train,pd.DataFrame):
    y_train=y_train.iloc[:,0]
  elif not isinstance(y_train,pd.Series):
    y_train=np.ravel(y_train)
  count_classes=collections.Counter(np.ravel(y_train))
  max_class=max(count_classes.values(),default=0)
  print('MAX CLASS',max_class)
  tmp=X_train.copy()
  tmp[target]=y_train
  generated=[]
  for c in set(classes):
    if count_classes[c]==0:
      # a GAN cannot be trained without any real row of the class
      print('SKIPPING CLASS',c,'- no training samples')
      continue
    nb_instances_to_generate=max_class-count_classes[c]
    if nb_instances_to_generate<=0:
      continue
    training_data=tmp[tmp[target]==c]
    syhtnetic_data=fake_data_generation(training_data,nb_instances_to_generate,target)
    syhtnetic_data=syhtnetic_data.rename(columns={'0':target})
    syhtnetic_data[target]=c
    generated.append(syhtnetic_data)
  if not generated:
    return pd.DataFrame()
  # pd.concat replaces DataFrame.append; the per-class row labels are kept
  return pd.concat(generated)

## TABGAN related functions

In [None]:
def run_tabgan(X_train, y_train, X_test,y_test,target,classes):
  """Generate synthetic rows per class with tabgan's GANGenerator.

  Args:
    X_train: DataFrame of training features.
    y_train: DataFrame of training labels containing the column `target`.
    X_test: DataFrame of test features, passed to generate_data_pipe per class.
    y_test: test labels aligned with X_test.
    target: name of the label column.
    classes: iterable of class labels to oversample.

  Returns:
    Tuple (features, labels): the generated feature rows and a single-column
    DataFrame named `target` with their labels.
  """
  count_classes=dict(y_train[target].value_counts())
  max_class=max(count_classes.values())
  tmp=X_train.copy()
  tmp[target]=y_train
  tmp_test=X_test.copy()
  tmp_test[target]=y_test
  generated_features=[]
  generated_targets=[]
  for c in set(classes):
    if c not in count_classes:
      # nothing to learn from when the class has no training rows
      print('SKIPPING CLASS',c,'- no training samples')
      continue
    training_data=tmp[tmp[target]==c]
    print('CLASS',c)
    nb_instances_to_generate=1+max_class/count_classes[c]
    # NOTE(review): the ratio is >= 1, so this value is always >= 2 and the
    # check below never skips a class (not even the majority one) - confirm intent.
    if nb_instances_to_generate !=1:
      new_tr, new_tar = GANGenerator(gen_x_times=nb_instances_to_generate, cat_cols=None,
            bot_filter_quantile=0.001, top_filter_quantile=0.999, is_post_process=False,
            adversarial_model_params={
                "metrics": "AUC", "max_depth": 2, "max_bin": 100,
                "learning_rate": 0.02, "random_state": 42, "n_estimators": 500,
            }, pregeneration_frac=2, only_generated_data=True,
            gan_params = {"batch_size": 16, "patience": 5, "epochs" : 150,}).generate_data_pipe(
                pd.DataFrame(training_data.drop(columns=[target])),
                pd.DataFrame(training_data[target]),
                tmp_test[tmp_test[target]==c].drop(columns=[target]),
                deep_copy=True, only_adversarial=False,
                use_adversarial=True)
      generated_features.append(new_tr)
      # keep only the label values and name the column after `target`
      # (it used to be hard-coded as 9)
      generated_targets.append(pd.DataFrame(np.ravel(new_tar),columns=[target]))
  if not generated_features:
    return pd.DataFrame(),pd.DataFrame(columns=[target])
  # pd.concat replaces DataFrame.append / Series.append
  return pd.concat(generated_features),pd.concat(generated_targets)

## Helper functions

In [None]:
# function to load data 
def data_loader(filename):
  """Read `<filename>.csv` into a DataFrame; `filename` is given without extension."""
  csv_path=filename+".csv"
  return pd.read_csv(csv_path)
# check data on null values
def check_notnull(data):
    """Draw a bar chart with the number of non-missing values of every column."""
    plt.figure(figsize=(15, 5))
    plt.xticks(rotation=90)
    plt.ylabel('Number')
    n_instances = data.shape[0]
    plt.title('Non-Missing Values in columns within %d instances ' % n_instances)
    # one bar per column, height = count of non-null cells
    present_counts = data.notnull().sum()
    plt.bar(data.columns, present_counts)

# functions for EDA
def plot_displot(data):
    """Plot a histogram with KDE for every column of `data`.

    Each subplot also shows the column mean (green line) and median (blue line).

    Args:
        data: DataFrame whose columns are plotted one per subplot.
    """
    n_grid_cols = 5
    n_features = len(data.columns)
    # keep the usual 10x5 grid, adding rows only when there are more than 50 columns
    n_grid_rows = max(10, -(-n_features // n_grid_cols))
    fig = plt.figure(1, figsize=(20, 40))

    for i in range(n_features):
        fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
        # the i-th COLUMN: data.iloc[i] selected the i-th row, which did not
        # match the mean/median lines drawn below
        column_values = data.iloc[:, i]
        sns.histplot(column_values, kde=True)
        plt.axvline(column_values.mean(), c='green')
        plt.axvline(column_values.median(), c='blue')

def plot_scatter(data, x, y, target):
    """Show a scatter plot of column `x` against column `y`, coloured by `target`."""
    plt.figure(1, figsize=(8, 5))
    sns.scatterplot(data=data, x=x, y=y, hue=target)
    # label both axes with the feature they show
    for set_label, feature in ((plt.xlabel, x), (plt.ylabel, y)):
        set_label('ftr# {}'.format(feature))
    plt.show()


def plot_class_dist(target_column):
    """Bar chart of the number of samples per class, each bar annotated with its count.

    NOTE: a second `plot_class_dist` is defined further down in this file; when
    the file is run top to bottom that later definition replaces this one.
    """
    bar_colors = ['#6ca5ce', '#a06cce', '#6cb4ce',
                  '#6cce81', '#c92c4c', '#c726c9']
    class_counts = target_column.value_counts()
    ax = class_counts.plot(kind='bar', figsize=(12, 8), fontsize=12,
                           color=bar_colors)
    ax.set_title('Target class\n', size=20, pad=30)
    ax.set_ylabel('Number of samples', fontsize=12)
    # write each bar's height on top of it
    for bar in ax.patches:
        height = bar.get_height()
        ax.text(bar.get_x() + 0.19, height, str(round(height, 2)),
                fontsize=12)

def plot_pie(data,labels,title):
    """Show a pie chart of `data` with one wedge per entry of `labels`.

    Usage:
        data = df[target].value_counts()
        print(df[target].value_counts(True)*100)
        plot_pie(data,classes,'Gallagher Dataset')
    """
    _, ax = plt.subplots(figsize=(20, 10))
    pastel = sns.color_palette('pastel')
    ax.pie(data, labels=labels, colors=pastel)
    ax.set_title(title, fontsize=14)
    plt.show()

def plot_class_dist(target_column, figsize=(12, 6), title_size=16):
    """Bar chart of the number of samples per class, each bar annotated with its count.

    Usage: plot_class_dist(df[target])

    This definition is the one in effect when the file is run top to bottom: an
    earlier `plot_class_dist` differs only in figure height and title size, and
    both can be reproduced through the keyword arguments
    (e.g. figsize=(12, 8), title_size=20).

    Args:
        target_column: Series of class labels.
        figsize: figure size handed to the pandas bar plot.
        title_size: font size of the plot title.
    """
    ax = target_column.value_counts().plot(kind='bar', figsize=figsize,
         fontsize=12,
         color=['#6ca5ce','#a06cce','#6cb4ce','#6cce81','#c92c4c','#c726c9'])
    ax.set_title('Target class\n', size=title_size, pad=30)
    ax.set_ylabel('Number of samples', fontsize=12)
    # write each bar's height on top of it
    for i in ax.patches:
        ax.text(i.get_x() + 0.19, i.get_height(), str(round(i.get_height(), 2)),
                fontsize=12)

def fill_missing_values(data, num_features, cat_features):
    """Impute missing values of the listed columns directly in `data`.

    Numerical columns are filled with the column mean, categorical columns with
    their most frequent value. Columns are re-assigned on `data` rather than
    filled through a chained `fillna(inplace=True)`, which is not guaranteed to
    write back to the frame.

    Args:
        data: DataFrame to modify in place.
        num_features: names of the numerical columns to impute.
        cat_features: names of the categorical columns to impute.
    """
    for f in num_features:
        # NOTE(review): the previous local was called `median` although the
        # mean is what has always been computed - confirm the mean is intended.
        fill_value = data[f].mean()
        data[f] = data[f].fillna(fill_value)
    for col in cat_features:
        modes = data[col].mode()
        if modes.empty:
            # column is entirely missing: there is no frequent value to use
            continue
        data[col] = data[col].fillna(modes.iloc[0])


def encode_target(data, target):
    """Return the `target` column of `data` label-encoded with a fresh LabelEncoder."""
    return LabelEncoder().fit_transform(data[target])


def standardize_data(data, num_features):
    """Return the `num_features` columns of `data` standardized with a fresh StandardScaler."""
    numeric_block = data[num_features]
    return StandardScaler().fit_transform(numeric_block)


def transfrom_cat_features(data, cat_features):
    """One-hot encode the categorical columns of `data` in place.

    For every column in `cat_features` the dummy columns (named
    `<column>_<value>`) are added to `data`, then the original categorical
    columns are dropped. Previously the merged frame was only bound to a local
    name and never returned, so the caller's frame was left unchanged.

    Args:
        data: DataFrame to modify in place.
        cat_features: names of the categorical columns to encode.

    Returns:
        The same `data` object, for convenience.
    """
    cat_features = list(cat_features)
    for c in cat_features:
        dummies = pd.get_dummies(data[c], prefix=c)
        # add the columns one by one so the caller's frame itself is updated
        for dummy_name in dummies.columns:
            data[dummy_name] = dummies[dummy_name]
    data.drop(columns=cat_features, inplace=True)
    return data


def split_data(data, target):
    """Stratified 67/33 train/test split with a fixed random state of 42.

    Returns:
        Tuple (X_train, X_test, y_train, y_test).
    """
    split_options = dict(stratify=target, test_size=0.33, random_state=42)
    parts = train_test_split(data, target, **split_options)
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test

# get the oversampler model by key from dict
def get_oversampler(oversamplers_dict, oversampler_num, proportion):
    """Instantiate the oversampler registered under `oversampler_num`.

    Args:
        oversamplers_dict: mapping from key to oversampler class (or factory).
        oversampler_num: key of the oversampler to build.
        proportion: value forwarded as the `proportion` keyword; when None the
            oversampler is built with its own defaults.

    Returns:
        The constructed oversampler instance.
    """
    factory = oversamplers_dict[oversampler_num]
    # identity check: `== None` can be hijacked by a custom __eq__
    if proportion is None:
        return factory()
    return factory(proportion=proportion)

def get_filter(filters_dict, filter_num):
    """Return the entry of `filters_dict` stored under the key `filter_num`."""
    selected = filters_dict[filter_num]
    return selected

# function to preprocess input data
def preprocess(data, target, num_features, cat_features):
    """Impute, encode and scale `data` ready for modelling.

    Steps: fill missing values of the listed feature columns, label-encode the
    target, standardize the numerical features and hand the categorical
    features to `transfrom_cat_features`.

    Args:
        data: raw DataFrame; it is modified in place.
        target: name of the label column.
        num_features: names of the numerical feature columns.
        cat_features: names of the categorical feature columns.

    Returns:
        The preprocessed DataFrame.
    """
    # Impute on `data` itself: imputing `data.drop(target, 1)` only changed a
    # temporary copy, so the filled values were thrown away.
    fill_missing_values(data, num_features, cat_features)
    data[target] = encode_target(data, target)
    if len(num_features) > 0:
        data[num_features] = standardize_data(data, num_features)
    # use the encoder's return value when it provides one
    encoded = transfrom_cat_features(data, cat_features)
    if encoded is not None:
        data = encoded
    return data

# function to print evaluation metrics values
def print_eval_results(y_test, preds):
    """Print the classification report, both geometric means and Cohen's kappa."""
    print('Classification report:')
    print(classification_report(y_test, preds))
    # (label, metric) pairs; each metric is computed right before it is printed
    metric_lines = (
        ('Geometric mean:',
         lambda: geometric_mean_score(y_test, preds, average='weighted')),
        ('Geometric mean default:',
         lambda: geometric_mean_score(y_test, preds)),
        ('Cohen Kappa',
         lambda: cohen_kappa_score(y_test, preds)),
    )
    for label, compute in metric_lines:
        print(label, compute())

# function to get optimal DT model
def get_model(X_train, y_train):
    """Grid-search a DecisionTreeClassifier and return the best estimator.

    The search uses 5-fold cross-validation scored with weighted F1 over the
    split criterion, max_depth in [3, 200) and the max_features strategy.

    Args:
        X_train: training features.
        y_train: training labels.

    Returns:
        The refitted best DecisionTreeClassifier found by the search.
    """
    param_grid = { 'criterion':['gini','entropy'],
                  'max_depth': np.arange(3, 200),
                  # 'sqrt' is what 'auto' meant for classification trees;
                  # 'auto' itself is rejected by recent scikit-learn releases
                  'max_features': ['sqrt', 'log2'],
                  }
    model=DecisionTreeClassifier()
    search = GridSearchCV(model, param_grid, cv=5,scoring='f1_weighted')
    search.fit(X_train, y_train)
    return search.best_estimator_

# function for calculation of error per class
def error_per_class(y_test,preds,classes):
  """Compute, print and return the error rate (1 - recall) of every class.

  Args:
    y_test: true labels.
    preds: predicted labels.
    classes: class labels to report, in the order they should be evaluated.

  Returns:
    Dict mapping each class to its error rate; NaN when the class has no
    sample in `y_test`.
  """
  labels=list(classes)
  # Pin the row/column order to `classes`; without `labels` the matrix is
  # ordered by the sorted labels that happen to occur, which does not have to
  # line up with the position of a class in `classes`.
  cm = confusion_matrix(y_test, preds, labels=labels)
  per_class_error={}
  for idx, cls in enumerate(labels):
      # samples whose true class is `cls`
      support = np.sum(cm[idx,:])
      if support == 0:
          per_class_error[cls] = float('nan')
          continue
      # correctly predicted samples of `cls` sit on the diagonal
      per_class_error[cls] = 1-cm[idx, idx]/support
  print('PER CLASS ERROR', per_class_error)
  return per_class_error

# functions to perform KS test for gen/real data
def ks_test(real, gen):
  """Run a two-sample Kolmogorov-Smirnov test on two Series and print the outcome.

  Prints the KS statistic, the p-value and whether the p-value exceeds 0.05.
  """
  real_values=np.array(real.values)
  gen_values=np.array(gen.values)
  result=ks_2samp(real_values, gen_values)
  report=(
      ("Gen vs Real: ks statistic",result.statistic),
      ("Gen vs Real: ks pvalue",result.pvalue),
      ("Gen & Real distributions are equal",result.pvalue>0.05),
  )
  for label,value in report:
    print(label,value)

def run_kstwo(X_sample,X_train):
    """Run a per-feature KS test between generated rows and the real training rows.

    Rows of `X_sample` are first masked against `X_train` with `DataFrame.isin`
    (which matches on row and column labels); rows with any masked cell are
    dropped and the remaining rows are treated as the generated data.

    Args:
        X_sample: oversampled feature set (real + generated rows).
        X_train: DataFrame with the real training features.
    """
    df_gen=pd.DataFrame(X_sample.copy(),columns=X_train.columns)
    df_gen=df_gen[~df_gen.isin(X_train)].dropna()
    if df_gen.empty:
        # a KS test needs at least one generated observation
        print('No generated rows left to test')
        return
    for col in df_gen.columns:
      print('Feature',col)
      ks_test(df_gen[col], X_train[col])

# functions for filtering data points 

# function to find N neighbors for a point
def get_neighbours(X_train,X_gen):
  """Return, for every row of `X_gen`, the positions of its 5 nearest rows in `X_train`.

  Neighbours are searched with a ball tree and the Minkowski metric with p=2.
  """
  train_points=np.array(X_train.values)
  query_points=np.array(X_gen.values)
  knn=NearestNeighbors(n_neighbors=5, algorithm='ball_tree', p=2)
  knn.fit(train_points)
  # kneighbors returns (distances, indices); only the indices are needed
  neighbour_positions=knn.kneighbors(query_points)[1]
  return neighbour_positions

# filtering
def filter_data(X_train,y_train,X_gen,y_gen,X_test,init_error,c,y_test,classes):
  """Keep only the generated samples that agree with, or help, the real data.

  A generated sample is kept when its class equals the majority class of its
  nearest training neighbours. Otherwise a model is trained on the training
  data plus that one sample, and the sample is kept only when the error of
  class `c` drops below `init_error[c]`.

  NOTE(review): the acceptance check is scored on X_test/y_test, so later
  evaluation on the same test set is likely optimistic - confirm this is
  acceptable.

  Args:
    X_train: DataFrame of real training features.
    y_train: real training labels, positionally aligned with X_train.
    X_gen: DataFrame of generated features.
    y_gen: labels of the generated rows, positionally aligned with X_gen.
    X_test: test features used to score candidate samples.
    init_error: dict of baseline per-class error rates.
    c: class whose error rate decides acceptance.
    y_test: test labels.
    classes: class labels forwarded to error_per_class.

  Returns:
    Tuple (X_filtered, y_filtered): DataFrame of accepted rows and a Series of
    their labels.
  """
  y_train=pd.Series(y_train)
  y_gen=pd.Series(y_gen)
  if len(X_gen)==0:
    return pd.DataFrame(X_gen),pd.Series(dtype=y_gen.dtype)
  # positions of the nearest training rows of every generated sample
  k_neighbours=get_neighbours(X_train,X_gen)
  accepted=[]
  # enumerate keeps the sample position in step with its neighbour list
  # (the counter used to be incremented only once, after the loop)
  for i,kn in enumerate(k_neighbours):
    sample_class=y_gen.iloc[i]
    # most frequent class among the neighbours (not the largest label)
    neighbour_classes=collections.Counter(y_train.iloc[kn])
    majority_class=neighbour_classes.most_common(1)[0][0]
    if majority_class==sample_class:
      accepted.append(i)
      continue
    # otherwise check whether adding the sample lowers the error of class c
    X_tmp=pd.concat([X_train,X_gen.iloc[[i]]])
    y_tmp=pd.concat([y_train,pd.Series([sample_class])])
    clf_model=get_model(X_tmp,y_tmp)
    preds=clf_model.predict(X_test)
    error=error_per_class(y_test,preds,classes)
    if init_error[c]>error[c]:
      accepted.append(i)
  X_filtered=X_gen.iloc[accepted]
  # unnamed Series so callers can wrap it in a frame under any column name
  y_filtered=pd.Series(y_gen.iloc[accepted].to_numpy())
  return pd.DataFrame(X_filtered),y_filtered

## Main

In [None]:
def main():
    """Run the whole experiment: baseline, WGAN, SMOTE, TabGAN and filtered oversampling.

    The dataset configuration sits between the START/END SECTION markers and
    can be swapped for one of the snippets of the README section.
    """

    # 1. Data Upload and Pre-processing

    ### START SECTION ###
    ### PUT HERE STRINGS FROM README SECTION TO UPLOAD THE REQUIRED DATASET ###
    filename = 'glass.csv'
    target = 'TypeGlass'
    classes=[0,1,2,3,4,5]
    data = pd.read_csv(filename, header=0)
    cat_features = []
    num_features = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

    data = preprocess(data, target, num_features, cat_features)
    X_train, X_test, y_train, y_test = split_data(data.drop(columns=[target]),
                                                  data[target])
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    ### END SECTION ###

    print('Trainset shape', X_train.shape)
    print('Testset shape', X_test.shape)
    print(y_train.value_counts() )
    print(y_test.value_counts() / y_test.shape[0])

    # 2. Baseline results

    print('BASELINE MODEL')
    clf_model=get_model(X_train,y_train)
    preds = clf_model.predict(X_test)
    print_eval_results(y_test, preds)
    # initial error rate per class
    init_error=error_per_class(y_test,preds,classes)

    # 3. WGAN oversampling
    print('WGAN OVERSAMPLING')
    wgan_rows=gen_data(X_train,y_train,target,classes)
    train_with_target=X_train.copy()
    train_with_target[target]=y_train
    # Row labels are deliberately kept (no ignore_index): the KS step below
    # matches real rows against X_train by label.
    wgan_augmented=pd.concat([wgan_rows,train_with_target])
    y_sample=wgan_augmented[target]
    X_sample=wgan_augmented.drop(columns=[target])
    clf_model=get_model(X_sample, y_sample)
    preds = clf_model.predict(X_test)
    print_eval_results(y_test, preds)
    error_per_class(y_test,preds,classes)

    # 4. KS test for WGAN data
    print('KS tests for WGAN')
    run_kstwo(X_sample,X_train)

    # 5. SMOTE based oversampling
    print('SMOTE BASED OVERSAMPLING')
    oversamplers_dict = {1: sv.G_SMOTE,
                         2: sv.SMOTE,
                         3: sv.RWO_sampling,
                         5: sv.ANS,
                         6: sv.kmeans_SMOTE}
    oversampler = sv.MulticlassOversampling(get_oversampler(oversamplers_dict, 2, None))
    X_sample, y_sample = oversampler.sample(X_train.values, y_train.values)
    clf_model=get_model(X_sample, y_sample)
    preds = clf_model.predict(X_test)
    print_eval_results(y_test, preds)
    error_per_class(y_test,preds,classes)

    # 6. TabGAN based oversampling
    print('TABGAN OVERSAMPLING')
    X_tabgan,y_tabgan = run_tabgan(X_train, pd.DataFrame(y_train),X_test,
                                   pd.DataFrame(y_test),target,classes)
    X_sample=pd.concat([X_tabgan,X_train])
    # Combine the labels by value, so the name of the generated label column
    # does not matter.
    y_sample=np.concatenate([np.ravel(y_tabgan),np.ravel(y_train)])
    clf_model=get_model(X_sample,y_sample)
    preds = clf_model.predict(X_test)
    print_eval_results(y_test, preds)
    error_per_class(y_test,preds,classes)

    # 7. Filtering
    print('FILTERING PART')
    X_out=X_train.copy()
    y_out=y_train.copy()
    for c in classes:
      print('GEN DATA FOR CLASS ', c)
      # labels are passed as a Series so the class counts are taken over the
      # label values
      X_gan=gen_data(X_train,y_train,target,set([c])) #1
      if X_gan.empty:
        # nothing was generated for this class (e.g. it is already the largest)
        continue
      y_gan=pd.Series(X_gan[target]) #2
      X_gan=X_gan.drop(columns=[target]) #3
      #-----------OR-----------#
      # to test CTGAN replace 1,2,3 with
      # ins=y_train.index[y_train==c].tolist()
      # X_gan,y_gan = run_tabgan(X_train.iloc[ins],
      #                          pd.DataFrame(y_train).iloc[ins],
      #                          X_test,pd.DataFrame(y_test),target,set([c]))
      #-------------------------#
      X_filtered,y_filtered = filter_data(X_train,y_train,X_gan,y_gan,
                                          X_test,init_error,c,y_test,classes)
      X_filtered=pd.DataFrame(X_filtered)
      y_filtered=pd.Series(np.ravel(y_filtered),name=target)
      if len(y_filtered)>0:
        X_out=pd.concat([X_out,X_filtered],ignore_index=True)
        y_out=pd.concat([y_out,y_filtered],ignore_index=True)
      # per-class check: training data plus the accepted samples of this class
      X_aug=pd.concat([X_filtered,X_train],ignore_index=True)
      y_aug=pd.concat([y_filtered,y_train],ignore_index=True)
      clf_model=get_model(X_aug,y_aug.astype('int'))
      preds = clf_model.predict(X_test)
      error_per_class(y_test,preds,classes)

    # final model on the training data plus every accepted sample
    clf_model=get_model(X_out,y_out.astype('int'))
    preds = clf_model.predict(X_test)
    error_per_class(y_test,preds,classes)

if __name__ == '__main__':
    main()

## README 



### glass dataset <br>

filename = 'glass.csv' <br>
target = 'TypeGlass' <br>
classes=[0,1,2,3,4,5] <br>
data = pd.read_csv(filename, header=0) <br>
cat_features = [] <br>
num_features = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'] <br>
data = preprocess(data, target, num_features, cat_features) <br>
X_train, X_test, y_train, y_test = split_data(data.drop(target, 1), data[target]) <br>

### dermatology dataset
filename = 'dermatology.csv' <br>
target = 'Class' <br>
classes=[1,2,3,4,5,6] <br>
data = pd.read_csv(filename, header=0) <br>
cat_features = [] <br>
num_features = ['Erythema', 'Scaling', 'Definite_borders', 'Itching',
'Koebner_phenomenon', 'Polygonal_papules', 'Follicular_papules',
'Oral_mucosal', 'Knee_and_elbow', 'Scalp_involvement', 'Family_history',
'Melanin_incontinence', 'Eosinophils', 'PNL_infiltrate', 'Fibrosis',
'Exocytosis', 'Acanthosis', 'Hyperkeratosis', 'Parakeratosis',
'Clubbing', 'Elongation', 'Thinning', 'Spongiform_pustule',
'Munro_microabcess', 'Focal_hypergranulosis', 'Granular_layer',
'Vacuolisation', 'Spongiosis', 'Saw-tooth_appearance',
'Follicular_horn_plug', 'Perifollicular_parakeratosis',
'Inflammatory_monoluclear', 'Band-like_infiltrate', 'Age']  <br>
data = preprocess(data, target, num_features, cat_features) <br>
X_train, X_test, y_train, y_test = split_data(data.drop(target, 1), data[target]) <br>

### wine dataset
filename= 'wine.csv' <br>
target = 'Class' <br>
classes=[1,2,3] <br>
data = pd.read_csv(filename, header=0) <br>
cat_features = [] <br>
num_features= ['Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium',
'TotalPhenols', 'flavanoids', 'NonflavanoidsPhenols', 'Proanthocyanins',
'ColorIntensity', 'Hue', 'OD280/OD315', 'Proline'] <br>
data = preprocess(data, target, num_features, cat_features) <br>
X_train, X_test, y_train, y_test = split_data(data.drop(target, 1), data[target]) <br>

### page-blocks
filename = 'page-blocks.csv' <br>
target = 'Class' <br>
classes=[0,1,2,3,4,5,6] <br>
data = pd.read_csv(filename, header=0) <br>
cat_features = [] <br>
num_features = ['Height', 'Lenght', 'Area', 'Eccen', 'P_black', 'P_and', 'Mean_tr','Blackpix', 'Blackand', 'Wb_trans'] <br>
data = preprocess(data, target, num_features, cat_features) <br>
X_train, X_test, y_train, y_test = split_data(data.drop(target, 1), data[target]) <br>

### internet firewall

num_features=["Bytes Sent","Bytes Received","Elapsed Time (sec)", "Bytes",'Packets',"pkts_sent","pkts_received"] <br>
cat_features = ['Source Port', 'Destination Port', 'NAT Source Port',
       'NAT Destination Port'] <br>

scaler = StandardScaler() <br>
scaler.fit(data[num_features]) <br>
data[num_features] = scaler.transform(data[num_features]) <br>
data=data.astype({'Source Port': 'object',
                  'Destination Port': 'object',
                  'NAT Source Port': 'object',
                  'NAT Destination Port': 'object',
                  }) <br>
count_enc = CountFrequencyEncoder(encoding_method="frequency", variables=cat_features) <br>
count_enc.fit(data) <br>
data = count_enc.transform(data) <br>
data.to_csv('log2_preproc.csv', index=False) <br>

filename = 'log2_preproc.csv' <br>
target = 'Action' <br>
classes=[0,1,2,3] <br>
data = pd.read_csv(filename, header=0) <br>
X_train, X_test, y_train, y_test = split_data(data.drop(target, 1), data[target]) <br>

### shuttle dataset

df_train=pd.read_csv('shuttle.trn', delimiter=' ',header=None) <br>
df_test=pd.read_csv('shuttle.tst', delimiter=' ',header=None) <br>
target=9 <br>
num_features=[0,1,2,3,4,5,6,7,8] <br>
cat_features=[] <br>
df_train=preprocess(df_train, target, num_features, cat_features) <br>
df_test=preprocess(df_test, target, num_features, cat_features) <br>
X_train, y_train = df_train.drop(9,1),df_train[9] <br>
X_test, y_test = df_test.drop(9,1),df_test[9] <br>

### gallagher

features_file = 'mobnet_features.npy' <br>
vectors = np.load(features_file) <br>
labels= np.load('mobnet_labels.npy') <br>
label_enc = preprocessing.LabelEncoder() <br>
label_enc.fit(labels) <br>
labels = label_enc.transform(labels) <br>
df=pd.DataFrame(vectors) <br>
df['Class']=labels <br>
target='Class' <br>
classes=[11,22,0,30,31,27,26,28,3,17,29,13,6,25,10,19] <br>
df=df[df.Class.isin(classes)] <br>
df.reset_index(inplace=True,drop=True) <br>
X_train, X_test, y_train, y_test = train_test_split(df.drop(target,1), df[target], test_size=0.33, random_state=42) <br>

