<a href="https://colab.research.google.com/github/ItalianPepper/deepfill/blob/master/v3_deepfill.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Clone the datawig repository - the datasets are inside its examples folder
!git clone "https://github.com/awslabs/datawig"

In [0]:
# Necessario per far funzionare datawig
!pip install mxnet

In [0]:
# Cartella per i risultati della soluzione
!mkdir result

In [0]:
# Cartella dei risultati di datawig
!mkdir result_datawig

In [0]:
!rm -r result
!rm -r result_datawig
!rm -r imputer_model

In [0]:
# Zip pronto per il download dei risultati
!zip -r "/content/result.zip" "/content/result"
!zip -r "/content/result_datawig.zip" "/content/result_datawig"

In [0]:
# Collegamento con Drive (nel caso si voglia caricare dataset da drive o salvare i risultati)
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# The y value is added to the tokenizer vocabulary but is never tokenized
# keras is used instead of tensorflow.keras -> the latter lacks the engine module needed to build the attention layer
import random
import time
import numpy as np
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
from keras import initializers as initializers, regularizers, constraints
from keras import backend as K
from keras.engine.topology import Layer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
import re
import math
import matplotlib.pyplot as plt
import os
import nltk

# Fetch the NLTK stop-word corpus only when it is not installed yet.
# nltk.data.find looks through every configured NLTK data directory and
# raises LookupError when the resource cannot be located.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

from nltk.corpus import stopwords

from typing import List

def random_split(data_frame: pd.DataFrame,
                 split_ratios: List[float] = None,
                 seed: int = 10) -> List[pd.DataFrame]:
    """
    Shuffle a data frame and cut it into consecutive partitions.

    Adapted from datawig.utils.

    Every partition receives ``int(ratio * len(data_frame))`` rows. Rows left
    over because of the rounding down (or because the ratios sum to less than
    one) are not returned.

    :param data_frame: a pandas DataFrame
    :param split_ratios: fraction of the rows for each partition;
        defaults to ``[.8, .2]``
    :param seed: seed of the random number generator used for shuffling
    :return: one DataFrame per entry of ``split_ratios``, in the same order
    """
    if split_ratios is None:
        split_ratios = [.8, .2]

    shuffled = data_frame.sample(frac=1, random_state=seed)

    # Cumulative row counts give the (exclusive) end of every partition.
    ends = np.array([int(r * len(data_frame)) for r in split_ratios]).cumsum()
    starts = [0, *ends[:-1]]

    # Positional slicing keeps each piece a DataFrame and does not depend on
    # numpy being able to split a DataFrame object.
    return [shuffled.iloc[start:end] for start, end in zip(starts, ends)]



def clean_df(df, other_unclassified_values=None):
    """
    Return a copy of ``df`` with special characters stripped from its values.

    :param df: DataFrame to clean; it is not modified in place
    :param other_unclassified_values: optional collection of values (for
        example 'None' or 'Unclassified') that are replaced by the empty
        string; ``None`` or an empty collection disables this step
    :return: the cleaned DataFrame
    """
    # Remove punctuation / special characters from every string value.
    df = df.replace(
        to_replace=r'[!@#$%^&*()_+\-=\[\]{};\':\"\\|,.<>\/?]',
        value="", regex=True)

    # Placeholder values are matched AFTER the characters above were removed,
    # so they have to be given in their already-cleaned form.
    if other_unclassified_values is not None and len(other_unclassified_values) > 0:
        df = df.replace(to_replace=other_unclassified_values, value="")

    return df


def is_y_categorical (df, y_label):
    """
    Check that the target column contains at least one repeated value.

    :param df: DataFrame holding the target column
    :param y_label: name of the target column
    :raises Exception: when every value of the column is distinct
        (this includes an empty column)
    """
    labels = df[y_label].tolist()
    distinct_labels = {label for label in labels}

    # At least one repeated value: the column can be used as a class label.
    if len(distinct_labels) != len(labels):
        return

    raise Exception("Y is not categorical!")


def fit_tokenizer(df, input_x_label, output_y_label, size_dict_words, other_unclassified_values=None):
    """
    Fit a Keras ``Tokenizer`` on the words of the input and target columns.

    :param df: DataFrame whose cells are strings
    :param input_x_label: list of input column names
    :param output_y_label: name of the target column
    :param size_dict_words: value passed to the tokenizer as ``num_words``
    :param other_unclassified_values: optional collection of placeholder
        values replaced by the empty string before fitting; ``None`` or an
        empty collection disables this step
    :return: the fitted tokenizer
    """
    all_attributes = input_x_label + [output_y_label]

    # Same clean-up as clean_df: strip special characters first ...
    df = df.replace(to_replace=r'[!@#$%^&*()_+\-=\[\]{};\':\"\\|,.<>\/?]',
               value="", regex=True)

    # ... then blank out the placeholder values, if any were given.
    if other_unclassified_values is not None and len(other_unclassified_values) > 0:
        df = df.replace(to_replace=other_unclassified_values, value="")

    # Every word becomes its own one-element "text". Rows are walked by
    # position so the function does not depend on the index labels of df.
    words = []
    for row in df[all_attributes].itertuples(index=False, name=None):
        for cell in row:
            tokens = cell.split()
            if len(tokens) > 1:
                words.extend([token] for token in tokens)
            else:
                # Cells with at most one word are stored as they are.
                words.append([cell])

    # NOTE(review): num_words presumably only limits the tokenizer's own
    # text-to-sequence helpers; code that reads tokenizer.word_index directly
    # still sees every fitted word - confirm against the Keras documentation.
    tokenizer = Tokenizer(num_words = size_dict_words,
                          filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower = True,
                          split=" ",
                          char_level=False)

    tokenizer.fit_on_texts(words)

    return tokenizer


def tokenizing_x_y(df, input_x_label, output_y_label, tokenizer):
    """
    Encode the input columns of every row as word ids and collect the targets.

    Cells are lower-cased (the target cell too). Words of the input columns
    are replaced by their entry in ``tokenizer.word_index``; English stop
    words and words missing from the vocabulary are skipped. Rows that end up
    without any word id are left out of both returned lists.

    :param df: DataFrame of strings, addressed with the row labels 0..len(df)-1
    :param input_x_label: list of input column names
    :param output_y_label: name of the target column
    :param tokenizer: fitted tokenizer exposing ``word_index``
    :return: ``(tokenized_x, tokenized_y)`` - a list of word-id lists and a
        list with the matching lower-cased target values, each wrapped in a list
    """
    english_stop_words = set(stopwords.words('english'))
    vocabulary = tokenizer.word_index
    columns = input_x_label + [output_y_label]

    encoded_rows = []
    target_rows = []

    for row_idx in range(len(df)):
        token_ids = []
        targets = []

        for column in columns:
            # A cell can hold a single word or a whole sentence.
            cell = df.at[row_idx, column].lower()

            if column not in input_x_label:
                # Target column: kept as text, never tokenized.
                targets.append(cell)
                continue

            pieces = cell.split()

            if len(pieces) > 1:
                # Stop words are removed only on the input side.
                token_ids.extend(
                    vocabulary[piece] for piece in pieces
                    if piece not in english_stop_words and piece in vocabulary)
            elif (len(pieces) == 1 and cell != " "
                  and cell in vocabulary and cell not in english_stop_words):
                # Single word: looked up with the cell text as it is.
                token_ids.append(vocabulary[cell])

        if token_ids:
            encoded_rows.append(token_ids)
            target_rows.append(targets)

    return encoded_rows, target_rows


def labeling_y(df, y_label, y_dataset):
    """
    Map target values to integer class ids.

    :param df: DataFrame whose ``y_label`` column defines the set of classes
    :param y_label: name of the target column
    :param y_dataset: sequence of rows whose first element is the target value
        to encode
    :return: ``(mapped_y, mapping_y)`` - the class id of every row of
        ``y_dataset`` (``None`` when the value is unknown) and the
        ``{label: class id}`` dictionary
    """
    # Class ids follow the order of first appearance in the column, so the
    # mapping is the same on every run.
    distinct_labels = dict.fromkeys(df[y_label].tolist())
    mapping_y = {label: idx for idx, label in enumerate(distinct_labels)}

    # The rows to encode may have been lower-cased upstream while the mapping
    # keeps the original spelling: fall back to a case-insensitive lookup.
    lowered_lookup = {}
    for label, idx in mapping_y.items():
        lowered_lookup.setdefault(str(label).lower(), idx)

    mapped_y = []
    for row in y_dataset:
        label = row[0]
        class_id = mapping_y.get(label)
        if class_id is None:
            class_id = lowered_lookup.get(str(label).lower())
        mapped_y.append(class_id)

    return mapped_y, mapping_y


class AttentionLayer(Layer):
    """
    Hierarchical attention layer as described in "Hierarchical Attention
    Networks for Document Classification" (2016) - Yang et al.
    Source: https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf

    Scores every position along axis 1 of a rank-3 input, normalises the
    scores over that axis and returns the score-weighted sum of the input.

    Originally annotated as written for the Theano backend; the code only
    calls the generic ``keras.backend`` API - TODO confirm on other backends.
    """
    def __init__(self,attention_dim=100,return_coefficients=False,**kwargs):
        # attention_dim: size of the hidden projection used to score each step.
        # return_coefficients: when True, call() also returns the weights.
        # NOTE(review): supports_masking is assigned before the base-class
        # initialiser runs - confirm the base class does not reset it.
        self.supports_masking = True
        self.return_coefficients = return_coefficients
        self.init = initializers.get('glorot_uniform') # initializer used for W, b and u
        self.attention_dim = attention_dim
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Creates all weights; only rank-3 inputs are accepted.
        # W = weight matrix, b = bias vector, u = context vector
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)),name='W')
        self.b = K.variable(self.init((self.attention_dim, )),name='b')
        self.u = K.variable(self.init((self.attention_dim, 1)),name='u')
        # Registered by hand because the variables were created with
        # K.variable instead of the layer's own weight-creation helper.
        self.trainable_weights = [self.W, self.b, self.u]

        super(AttentionLayer, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # No mask is passed on to the following layers.
        return None

    def call(self, hit, mask=None):
        # Hidden representation of every step: tanh(hit . W + b)
        uit = K.bias_add(K.dot(hit, self.W),self.b)
        uit = K.tanh(uit)

        # Unnormalised score of every step: exp(uit . u), last axis removed
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)

        # Masked steps get a zero score before normalisation.
        if mask is not None:
            ait *= K.cast(mask, K.floatx())

        # Normalise over axis 1; epsilon guards against a zero denominator.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = hit * ait

        # Weighted sum over axis 1, optionally together with the weights.
        if self.return_coefficients:
            return [K.sum(weighted_input, axis=1), ait]
        else:
            return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        # NOTE(review): the coefficient shape reported below uses
        # input_shape[-1], while `ait` in call() keeps the size of axis 1 -
        # confirm whether input_shape[1] was intended.
        if self.return_coefficients:
            return [(input_shape[0], input_shape[-1]), (input_shape[0], input_shape[-1], 1)]
        else:
            return input_shape[0], input_shape[-1]


def create_model(size_dict_words, n_features_word, max_length_row, out_classes, embedding_maxtrix = None):
    """
    Build and compile the classifier: embedding, two convolution stages, a
    bidirectional GRU, the attention layer and a softmax output.

    Attention layer reference:
    https://github.com/adsieg/Multi_Text_Classification/blob/master/%5BSupervised%5D%20%5BDL%20method%5D%20GRU_HAN.ipynb

    :param size_dict_words: input dimension of the embedding layer
    :param n_features_word: embedding size, also used as attention dimension
    :param max_length_row: length of the (padded) input sequences
    :param out_classes: number of units of the softmax output layer
    :param embedding_maxtrix: not used by this function
    :return: the compiled Keras model
    """
    # (filters, kernel size, pooling window) of every convolution stage.
    conv_stages = ((100, 3, 3), (50, 5, 5))

    token_ids = Input(shape=(max_length_row, ))

    features = Embedding(size_dict_words, n_features_word,
                         input_length=max_length_row)(token_ids)
    features = Dropout(0.2)(features)

    for n_filters, kernel, pool in conv_stages:
        features = Conv1D(filters=n_filters, kernel_size=kernel,
                          activation="relu")(features)
        features = BatchNormalization()(features)
        features = SpatialDropout1D(0.1)(features)
        features = MaxPooling1D(pool_size=pool)(features)

    features = Bidirectional(GRU(50, return_sequences=True))(features)
    features = AttentionLayer(n_features_word, False)(features)

    class_probabilities = Dense(out_classes, activation="softmax")(features)

    model = Model(token_ids, class_probabilities)
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

    return model


def train_classifier(x, y, max_length_row, n_features_word,
                     out_classes, size_dict_words, embedding_matrix=None):
    """
    Create the model, train it and save the learning curves and model diagram.

    Training runs for 3 epochs with batch size 32 and holds out 20% of the
    data for validation. The loss curves are written to
    ``./result/plot_train_val.png`` and the model diagram to
    ``./result/model_plot.png``.

    :param x: training inputs
    :param y: training targets
    :param max_length_row: length of the input sequences
    :param n_features_word: embedding size
    :param out_classes: number of output classes
    :param size_dict_words: vocabulary size given to the embedding layer
    :param embedding_matrix: not used by this function
    :return: the trained model
    """
    model = create_model(size_dict_words, n_features_word,
                         max_length_row, out_classes)
    model.summary()

    history = model.fit(x, y,
                        validation_split=0.20,
                        epochs=3,
                        batch_size=32,
                        verbose=2)

    # Training and validation loss, one point per epoch.
    plt.figure(figsize=(15,10))
    for curve in ("loss", "val_loss"):
        plt.plot(history.history[curve])
    plt.title("Learning Curves")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.legend(["train", "val"], loc="upper left")
    plt.savefig("./result/plot_train_val.png")
    plt.close()

    plot_model(model, to_file="./result/model_plot.png",
               show_shapes=True, show_layer_names=True)

    return model

def plot_dataset(df, y_label):
    """
    Plot how many rows belong to each class of the target column.

    The bar chart is saved to ``./result/dataset_plot.png`` and then shown.
    Classes appear in order of first occurrence in the column.

    :param df: DataFrame holding the target column
    :param y_label: name of the target column
    """
    from collections import Counter

    # Single pass over the column instead of one list.count() per class.
    class_counts = Counter(df[y_label].tolist())

    # Lists rather than dict views: matplotlib expects array-like arguments.
    labels = list(class_counts.keys())
    heights = list(class_counts.values())
    positions = np.arange(len(labels))

    plt.figure(figsize=(15, 10))
    plt.bar(positions, height=heights)
    plt.title("Dataset Classes Distribution")
    plt.xticks(positions, labels=labels, rotation="vertical")
    plt.savefig("./result/dataset_plot.png")
    plt.show()
    plt.close()


def plot_testing(y_true, y_pred, mapping_y):
    """
    Plot, class by class, how often each label occurs among the true and the
    predicted values.

    Both distributions are drawn as half-transparent bars on the same axes;
    the figure is saved to ``./result/hist_all_together.png`` and then shown.

    :param y_true: iterable of true labels
    :param y_pred: iterable of predicted labels
    :param mapping_y: dictionary whose keys are the class labels; its key
        order fixes the order of the bars
    """
    from collections import Counter

    class_names = list(mapping_y)

    # Single pass over each sequence; labels that never occur count as 0.
    true_counts = Counter(y_true)
    pred_counts = Counter(y_pred)

    positions = np.arange(len(class_names))

    plt.figure(figsize=(15,10))
    # Lists rather than dict views: matplotlib expects array-like arguments.
    plt.bar(positions, color="red", alpha=0.5, label="y_true",
            height=[true_counts[name] for name in class_names])
    plt.bar(positions, color="yellow", alpha=0.5, label="y_pred",
            height=[pred_counts[name] for name in class_names])
    plt.legend(loc="upper left")
    plt.title("True vs Pred")
    plt.xticks(positions, labels=class_names, rotation="vertical")
    plt.savefig("./result/hist_all_together.png")
    plt.show()
    plt.close()


def imputing_missing_values(df_test, input_x_label, output_y_label, tokenizer,
                            mapping_y, max_length_row, n_features_word, model,
                            other_unclassified_values=None, dir_out="./result/"):
    """
    Predict the target column for every row of ``df_test``.

    The frame is cleaned with ``clean_df``, its input columns are encoded
    with the tokenizer vocabulary (English stop words and unknown words are
    skipped), the sequences are padded/truncated to ``max_length_row`` and
    fed to the model. Predicted class ids are translated back to labels
    through ``mapping_y`` and stored in a new column. The result is also
    written to ``df_test_result.csv`` inside ``dir_out``.

    :param df_test: DataFrame of strings, addressed with the row labels
        0..len(df_test)-1
    :param input_x_label: list of input column names
    :param output_y_label: name of the target column
    :param tokenizer: fitted tokenizer exposing ``word_index``
    :param mapping_y: ``{label: class id}`` dictionary used during training
    :param max_length_row: sequence length expected by the model
    :param n_features_word: not used by this function
    :param model: trained model exposing ``predict``
    :param other_unclassified_values: optional collection of placeholder
        values blanked out by the cleaning step; ``None`` means none
    :param dir_out: output directory of the CSV file
    :return: ``(df_test, predicted_y_label)`` - the cleaned frame with the
        prediction column and the name of that column
    """
    # Accept None as "nothing to replace" so it can be handed to clean_df.
    if other_unclassified_values is None:
        other_unclassified_values = []

    df_test = clean_df(df_test, other_unclassified_values=other_unclassified_values)

    stop_words = set(stopwords.words('english'))
    word_index = tokenizer.word_index

    tokenized_x = []

    for i in range(0, len(df_test)):
        row_x = []

        for attribute in input_x_label:
            # A cell can hold a single word or a whole sentence.
            element = df_test.at[i, attribute].lower()
            words = element.split()

            if len(words) > 1:
                # Stop words are removed only on the input side.
                row_x.extend(word_index[word] for word in words
                             if word not in stop_words and word in word_index)
            elif len(words) == 1 and element in word_index and element not in stop_words:
                # Single word: looked up with the cell text as it is.
                row_x.append(word_index[element])

        # Rows without any known word are kept (as all-padding sequences) so
        # that predictions stay aligned with the rows of df_test.
        tokenized_x.append(row_x)

    # Align the sequences to a common length.
    padded_x = pad_sequences(tokenized_x, maxlen=max_length_row,
                             padding="post", truncating="post")
    padded_x = np.array(padded_x)
    x_test = np.reshape(padded_x, [len(padded_x), max_length_row])

    # Most probable class id for every row.
    predictions = np.argmax(model.predict(x_test), axis=1).tolist()

    # mapping_y is label -> class id; invert it to translate ids to labels.
    reverse_mapping = {v: k for k, v in mapping_y.items()}
    predictions_to_words = [reverse_mapping.get(p) for p in predictions]

    predicted_y_label = output_y_label + "predicted"
    df_test[predicted_y_label] = predictions_to_words

    # header=True writes the column names; a falsy value would omit them.
    path_out_dataset = os.path.join(dir_out, "df_test_result.csv")
    df_test.to_csv(path_out_dataset, sep=",", header=True)

    return df_test, predicted_y_label


def compute_scoring(df, y_label, y_pred_label, mapping_y, dir_out="./result/"):
    """
    Plot true versus predicted labels and write the scores to a text file.

    The weighted F1 score and the classification report are written to
    ``result_experiment.txt`` (``dir_out`` is prepended as a plain string).

    :param df: DataFrame holding both the true and the predicted column
    :param y_label: name of the column with the true labels
    :param y_pred_label: name of the column with the predicted labels
    :param mapping_y: dictionary whose keys are the class labels
    :param dir_out: prefix of the output file path
    """
    true_labels = df[y_label]
    predicted_labels = df[y_pred_label]

    plot_testing(true_labels.tolist(), predicted_labels.tolist(), mapping_y)

    report_path = dir_out+"result_experiment.txt"

    with open(report_path, "w+") as report:
        # F1 score of true vs predicted values, weighted by class support.
        weighted_f1 = f1_score(true_labels, predicted_labels, average='weighted')
        print("F1-Score Weighted:", weighted_f1, file=report)

        # Per-class precision / recall / F1 summary.
        print(classification_report(true_labels, predicted_labels), file=report)


def run(path_df, input_x_label, output_y_label, other_unclassified_values=None):
    """
    Run the whole experiment: load the CSV, train the classifier on 80% of
    the rows, predict the target column on the remaining 20% and score it.

    :param path_df: path of the CSV file (comma separated, first row is the
        header); every column is read as string
    :param input_x_label: list of input column names
    :param output_y_label: name of the target column
    :param other_unclassified_values: optional collection of values that
        semantically mean "missing" (for example "Unclassified"); they are
        blanked out during cleaning. ``None`` means no such values
    """
    start = time.time()

    # One concrete list is handed to every step that cleans the data.
    if other_unclassified_values is None:
        unclassified_values = []
    else:
        unclassified_values = list(other_unclassified_values)

    # Value passed to the tokenizer as num_words.
    size_dict_words = 20000

    # Length the token sequences are padded / truncated to.
    max_length_row = 100

    # Size of the embedding.
    n_features_word = 100

    df = pd.read_csv(path_df, sep=",", header=0, dtype="str")

    is_y_categorical(df, output_y_label)

    plot_dataset(df, output_y_label)

    df_train, df_test = random_split(df, split_ratios=[0.8, 0.2])

    # The tokenization helpers address rows by the labels 0..n-1.
    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    df_train = clean_df(df_train, other_unclassified_values=unclassified_values)

    tokenizer = fit_tokenizer(df_train, input_x_label, output_y_label,
                              size_dict_words,
                              other_unclassified_values=unclassified_values)

    tokenized_x, y_words = tokenizing_x_y(df_train, input_x_label,
                                          output_y_label, tokenizer)

    # Align the sequences to a common length.
    padded_x = pad_sequences(tokenized_x, maxlen=max_length_row,
                             padding="post", truncating="post")

    # The label -> class id mapping is built from the training partition.
    y_train, mapping_y = labeling_y(df_train, output_y_label, y_words)

    # Number of classes seen in the training partition.
    out_classes = len(mapping_y)

    # Shape the data as (rows, sequence length) and (rows, 1).
    padded_x = np.array(padded_x)
    x_train = np.reshape(padded_x, [len(padded_x), max_length_row])

    y_train = np.array(y_train)
    y_train = np.reshape(y_train, [len(y_train), 1])

    model = train_classifier(x_train, y_train, max_length_row,
                             n_features_word, out_classes, size_dict_words)

    df_pred_test, y_label_pred = imputing_missing_values(
        df_test, input_x_label, output_y_label, tokenizer,
        mapping_y, max_length_row, n_features_word, model,
        other_unclassified_values=unclassified_values)

    compute_scoring(df_pred_test, output_y_label, y_label_pred, mapping_y)

    elapsed = time.time() - start

    print("Finished in:", elapsed)
    
# Train and evaluate the classifier on the datawig example dataset: the
# "finish" column is predicted from the "title", "text" and "color" columns.
run(
    path_df="./datawig/examples/mae_train_dataset.csv",
    input_x_label=["title", "text", "color"],
    output_y_label="finish",
)


In [0]:
# Da simpleimputerintro
# WARNING: Correzione errore all'interno del file _hpo.py cambiare datawig.utils -> .utils
# Aggiungere il path completo al nome del dataset
import datawig.datawig
from datawig.datawig import SimpleImputer
from datawig.datawig.utils import random_split
from sklearn.metrics import f1_score, classification_report
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def plot_testing(y_true, y_pred, mapping_y):
    """
    Plot, class by class, how often each label occurs among the true and the
    predicted values.

    Both distributions are drawn as half-transparent bars on the same axes;
    the figure is saved to ``./result_datawig/hist_all_together.png`` and
    then shown.

    :param y_true: iterable of true labels
    :param y_pred: iterable of predicted labels
    :param mapping_y: dictionary whose keys are the class labels; its key
        order fixes the order of the bars
    """
    from collections import Counter

    class_names = list(mapping_y)

    # Single pass over each sequence; labels that never occur count as 0.
    true_counts = Counter(y_true)
    pred_counts = Counter(y_pred)

    positions = np.arange(len(class_names))

    plt.figure(figsize=(15,10))
    # Lists rather than dict views: matplotlib expects array-like arguments.
    plt.bar(positions, color="red", alpha=0.5, label="y_true",
            height=[true_counts[name] for name in class_names])
    plt.bar(positions, color="yellow", alpha=0.5, label="y_pred",
            height=[pred_counts[name] for name in class_names])
    plt.legend(loc="upper left")
    plt.title("True vs Pred")
    plt.xticks(positions, labels=class_names, rotation="vertical")
    plt.savefig("./result_datawig/hist_all_together.png")
    plt.show()
    plt.close()


def compute_scoring(df, y_label, y_pred_label, mapping_y, dir_out="./result_datawig/"):
    """
    Plot true versus predicted labels and write the scores to a text file.

    The weighted F1 score and the classification report are written to
    ``result_datawig.txt`` inside ``dir_out``.

    :param df: DataFrame holding both the true and the predicted column
    :param y_label: name of the column with the true labels
    :param y_pred_label: name of the column with the predicted labels
    :param mapping_y: dictionary whose keys are the class labels
    :param dir_out: directory the report file is written to
    """
    import os

    plot_testing(df[y_label].tolist(), df[y_pred_label].tolist(), mapping_y)

    # Honour dir_out; with the default value the path is unchanged.
    path_file_out = os.path.join(dir_out, "result_datawig.txt")

    with open(path_file_out, "w+") as result_file:
        # F1 score of true vs predicted values, weighted by class support.
        f1 = f1_score(df[y_label], df[y_pred_label], average='weighted')

        print("F1-Score Weighted:", f1, file = result_file)

        # Per-class precision / recall / F1 summary.
        print(classification_report(df[y_label], df[y_pred_label]), file = result_file)

def map_y(df, y_label):
    """
    Assign an integer id to every distinct value of the target column.

    :param df: DataFrame holding the target column
    :param y_label: name of the target column
    :return: ``{label: id}`` dictionary with ids 0..n-1, handed out in the
        iteration order of the set of distinct labels
    """
    distinct_labels = {label for label in df[y_label].tolist()}
    return dict(zip(distinct_labels, range(len(distinct_labels))))

# ------------------------------------------------------------------------------------
# Load data
# ------------------------------------------------------------------------------------
input_cols = ["title", "text", "color"]
out_col = "finish"

df = pd.read_csv('./datawig/examples/mae_train_dataset.csv')
mapping_y = map_y(df, out_col)
df_train, df_test = random_split(df, split_ratios=[0.8, 0.2])

# ------------------------------------------------------------------------------------
# Run the default SimpleImputer
# ------------------------------------------------------------------------------------
imputer = SimpleImputer(
    input_columns=input_cols,  # columns carrying information about the column to impute
    output_column=out_col,  # the column to impute values for
    output_path='./result_datawig/imputer_model',  # stores model data and metrics
)

# Fit the imputer on the training partition.
imputer.fit(train_df=df_train, num_epochs=3)

# Predict on the test partition; the result is the frame with predictions added.
predictions = imputer.predict(df_test)

# Column expected to hold the predictions (presumably "<output column>_imputed"
# is the name datawig uses - confirm against its documentation).
out_col_pred = out_col + "_imputed"

# Score true vs predicted values.
compute_scoring(predictions, out_col, out_col_pred, mapping_y)