# Packages to install on the cluster:

torch
click==8.0.4
keras
tensorflow
spacy 
scispacy
nltk 
torch

# Libraries

In [0]:
import pandas as pd
pd.options.mode.chained_assignment = None # Set warnings uit
import numpy as np
import os
import operator

import pyspark.sql.functions as F
from pyspark.sql.functions import *
from pyspark.sql.functions import col
from pyspark.sql.window import Window

import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from tqdm.keras import TqdmCallback
from tqdm import tqdm
from tqdm.notebook import tqdm

import torch
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample

import keras
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Embedding, GlobalAveragePooling1D, Input, concatenate, LSTM
from tensorflow.keras.optimizers import Adam, SGD

# NLP libraries
import spacy
import nltk
from nltk.tokenize import word_tokenize
from scispacy.abbreviation import AbbreviationDetector
import string
import tensorflow
import click
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from IPython.display import clear_output

%matplotlib inline

# Dataframes init

In [0]:
# Join the ndAttributen tables
def koppel_ndAttributen(NASA_CONTROLE):
    """Join the fixed ndoutputattributen snapshot with ``NASA_CONTROLE`` on ``gtin``.

    Relies on a notebook-level ``spark`` session being in scope.

    Args:
        NASA_CONTROLE: Spark DataFrame that has a ``gtin`` column.

    Returns:
        The joined Spark DataFrame, restricted to rows where ``glnAH`` is true.
    """
    # TODO: point this at the refreshed attribute list?
    attributes = spark.sql("select * from default._ndoutputattributen_25_3_2022")
    joined = attributes.join(NASA_CONTROLE, ["gtin"])
    nd_controle = joined.filter(joined.glnAH == True)
    # Shape of the dataframe, printed as a (rows, label, columns) tuple.
    shape = (nd_controle.count(), "Amount of columns: ", len(nd_controle.columns))
    print("Amount of rows: ", shape)
    return nd_controle

def koppel_ndAttributen_new_products(NASA_CONTROLE):
    """Join the live ndoutputattributen table with ``NASA_CONTROLE`` on ``gtin``.

    Relies on a notebook-level ``spark`` session being in scope.

    Args:
        NASA_CONTROLE: Spark DataFrame that has a ``gtin`` column.

    Returns:
        The joined Spark DataFrame, restricted to rows where ``glnAH`` is true.
    """
    # TODO: point this at the refreshed attribute list?
    attributes = spark.sql("select * from ndoutputattributen.ndoutputattributen")
    joined = attributes.join(NASA_CONTROLE, ["gtin"])
    nd_controle = joined.filter(joined.glnAH == True)
    # Shape of the dataframe, printed as a (rows, label, columns) tuple.
    shape = (nd_controle.count(), "Amount of columns: ", len(nd_controle.columns))
    print("Amount of rows: ", shape)
    return nd_controle

def koppel_preprocessed_text(df_geformateerd, DF_NLP_spark):
    """Left-join the formatted class frame onto the NLP table and return pandas.

    Relies on a notebook-level ``spark`` session being in scope.

    Args:
        df_geformateerd: pandas DataFrame; converted to a Spark DataFrame here.
        DF_NLP_spark: Spark DataFrame that forms the left side of the join.

    Returns:
        pandas DataFrame holding the de-duplicated join result.
    """
    formatted_spark = spark.createDataFrame(df_geformateerd)
    join_keys = ["gtin", "gln", "glnAH"]
    merged_spark = DF_NLP_spark.join(formatted_spark, join_keys, how="left")
    merged_spark = merged_spark.drop_duplicates()

    # Arrow-based conversion is switched off before collecting to pandas.
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")
    DF_merged = merged_spark.toPandas()

    shape = (DF_merged["gln"].count(), len(DF_merged.columns))
    print("Shape van dataframe: ", shape)
    return DF_merged
    
def fix_empty_fields(DF_merged, ATTRIBUUT):
    """Drop the rows whose ``class_num_<ATTRIBUUT>`` value is missing.

    Args:
        DF_merged: pandas DataFrame containing the ``class_num_<ATTRIBUUT>`` column.
        ATTRIBUUT: Attribute name used to build the column name.

    Returns:
        The rows of ``DF_merged`` for which the class vector is present.
    """
    column = "class_num_" + str(ATTRIBUUT)
    is_missing = DF_merged[column].isna()
    print("null entries = ", is_missing.sum())
    return DF_merged[~is_missing]


# Functions Preprocessing

In [0]:
def get_unique_classes(df, attribuut="ndTypeOfGrain"):
    """Collect the sorted set of class labels found in a ``|``-separated column.

    Args:
        df: pandas DataFrame containing the ``attribuut`` column.
        attribuut: Name of the column whose values look like ``"a|b|c"``.

    Returns:
        Sorted list of unique labels. If the column contains an empty string,
        both ``""`` and the placeholder ``"EMPTY"`` are part of the result.
        Non-string values (``None``, NaN, ...) are skipped.
    """
    raw_values = set(df[attribuut])
    if "" in raw_values:
        raw_values.add("EMPTY")

    labels = set()
    for value in raw_values:
        # Only strings can be split; missing values are skipped explicitly
        # instead of being swallowed by a bare ``except``.
        if isinstance(value, str):
            labels.update(value.split("|"))
    return sorted(labels)
 
def one_hot_encoder(df_unique, unique_items, attribuut="ndTypeOfGrain"):
    """Add one 0/1 indicator column per class label to ``df_unique``.

    A row gets 1 for a label only when that label is one of the
    ``|``-separated entries of the row's ``attribuut`` value. The previous
    implementation used a regex substring search, so e.g. ``"rice"`` also
    matched ``"wild rice"`` and labels containing regex metacharacters were
    misinterpreted.

    Args:
        df_unique: pandas DataFrame containing the ``attribuut`` column;
            modified in place.
        unique_items: Iterable of class labels; each becomes a column named
            ``str(label)``.
        attribuut: Name of the ``|``-separated source column.

    Returns:
        ``df_unique`` with the indicator columns added. Rows whose source
        value is not a string get NaN in every indicator column.
    """
    # Split every row once; None marks rows without a usable string value.
    label_sets = [
        set(value.split("|")) if isinstance(value, str) else None
        for value in df_unique[attribuut]
    ]
    for label in unique_items:
        name = str(label)
        # Integer flags (not booleans) so the columns can feed a NN model later.
        df_unique[name] = [
            np.nan if labels is None else int(name in labels)
            for labels in label_sets
        ]
    return df_unique
 
def create_overig(df_unique,ATTRIBUUT,unique_items, Overig_threshold):
    """Fold rarely occurring classes into a single ``'overig'`` (other) class.

    Classes whose indicator column holds at most ``Overig_threshold`` ones are
    treated as small: they are merged into ``'overig'`` and their columns are
    dropped. Every remaining class column (plus ``'overig'``) is replaced by
    the per-row output of a scikit-learn ``OneHotEncoder`` fitted on that
    single column.

    Args:
        df_unique: pandas DataFrame with one indicator column per entry of
            ``unique_items``; modified in place before the final drop.
        ATTRIBUUT: Unused; kept for interface compatibility.
        unique_items: Class labels / column names to examine.
        Overig_threshold: Maximum number of positive rows for a class to count
            as small.

    Returns:
        Tuple ``(df_unique, grote_classes)`` where ``grote_classes`` lists the
        large classes followed by ``'overig'``.
    """
    lijst_kleine_classes = []
    grote_classes = []
    for column in unique_items:
        if (df_unique[column] == 1).sum() <= Overig_threshold:
            lijst_kleine_classes.append(column)
        else:
            grote_classes.append(column)
    print("Kleine classes die onder overige moeten: ",lijst_kleine_classes)

    # A row is 'overig' when it carries at least one small class. Built in one
    # step so the column also exists (all zeros) when there are no small
    # classes; previously that case raised a KeyError further down.
    df_unique['overig'] = (df_unique[lijst_kleine_classes] == 1).any(axis=1).astype(int)
    print("Grotere classes: ", grote_classes)
    grote_classes.append('overig')

    encoder = OneHotEncoder(handle_unknown='ignore')
    for class_name in grote_classes:
        column = str(class_name)
        encoded = encoder.fit_transform(df_unique[[column]]).toarray().astype(int)
        # One list per row, with one entry per distinct value in the column.
        df_unique[column] = encoded.tolist()
    df_unique = df_unique.drop(columns=lijst_kleine_classes)
    return df_unique, grote_classes
  
def classes_to_array(df_unique, attribuut="ndTypeOfGrain"):
    """Summarise the class columns and attach per-row class vectors.

    Every column after the first four is treated as a class column.

    Args:
        df_unique: pandas DataFrame whose first four columns are not class
            columns; modified in place.
        attribuut: Attribute name used as suffix for the new columns.

    Returns:
        Tuple ``(df_unique, classes_available)``. ``df_unique`` gains the
        columns ``Aantal_classes`` (row-wise sum of the class columns),
        ``class_num_<attribuut>`` (row values as a list) and
        ``class_names_<attribuut>`` (the class column names).
        ``classes_available`` maps each class column to its column sum,
        ordered from largest to smallest.
    """
    # Drop the first four columns; what remains are the class columns.
    class_frame = df_unique.drop(df_unique.columns[[0, 1, 2, 3]], axis=1)
    class_soort = list(class_frame.columns)

    totals = {name: class_frame[name].sum() for name in class_soort}
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    classes_available = dict(ranked)

    df_unique['Aantal_classes'] = df_unique[class_soort].sum(axis=1)
    df_unique['class_num_' + attribuut] = df_unique[class_soort].values.tolist()
    # Every row references the same list of class names.
    df_unique['class_names_' + attribuut] = [class_soort] * len(df_unique)
    return df_unique, classes_available
 
def without_all_classes(df_unique, attribuut="ndTypeOfGrain"):
    """Return a copy holding only the id, attribute and class-summary columns.

    Args:
        df_unique: pandas DataFrame produced by the class-formatting steps.
        attribuut: Attribute name used to build the column names.

    Returns:
        Independent copy of ``df_unique`` without the per-class columns.
    """
    name = str(attribuut)
    keep = [
        'gtin', 'gln', 'glnAH', name,
        'Aantal_classes', 'class_num_' + name, 'class_names_' + name,
    ]
    return df_unique[keep].copy()
    
def save_table(df_small, ATTRIBUUT):
    """Persist ``df_small`` as table ``_<ATTRIBUUT>_ML_inc_overig`` in ``mltrainingdata``.

    Relies on a notebook-level ``spark`` session. Any existing table with the
    same name is dropped before writing.

    Args:
        df_small: pandas DataFrame to store.
        ATTRIBUUT: Attribute name (string) embedded in the table name.
    """
    spark_df = spark.createDataFrame(df_small)
    table_name = '_' + ATTRIBUUT + '_ML_inc_overig'
    spark.sql("use mltrainingdata")
    spark.sql('drop table if exists ' + table_name)
    spark_df.write.mode("overwrite").saveAsTable(table_name)

## Plot functions

In [0]:
def plot_class_distributie(df_unique, attribuut="ndTypeOfGrain"):
    """Plot a histogram of the number of classes per product and print the counts.

    Args:
        df_unique: pandas DataFrame with an ``Aantal_classes`` column.
        attribuut: Attribute name (string) shown in the x-axis label.
    """
    class_counts = df_unique['Aantal_classes']
    MAX_LENGTH = class_counts.max()
    sns.histplot(class_counts, bins=MAX_LENGTH, kde=True, element="step", color='navy')
    plt.xlabel("Amount classes in product, dataset "+attribuut)
    plt.xlim(1, MAX_LENGTH + 1)

    frequency = class_counts.value_counts()
    # Scale the y-axis by the most frequent count, with some headroom.
    plt.ylim(0, int(frequency.iloc[0]) + 100)
    print(frequency)

def distribution_of_unique_classes_2(classes_available, attribuut="ndTypeOfGrain"):
    """Print the class totals and show them as a bar chart.

    Args:
        classes_available: Mapping of class name to its total.
        attribuut: Unused; kept for interface compatibility.
    """
    print(classes_available)
    names = list(classes_available.keys())
    heights = list(classes_available.values())
    positions = range(len(classes_available))
    plt.bar(positions, heights, tick_label=names)
    plt.show()

## Create 'Others/Overige'

In [0]:
def create_overige_class(nd_controle, ATTRIBUUT,Classes_threshold, Overig_threshold):
    """Turn one attribute column into class vectors, merging small classes if needed.

    When the attribute has at least ``Classes_threshold`` distinct labels the
    small classes are folded into ``'overig'`` via ``create_overig``;
    otherwise the classes are used as-is and their distribution is plotted.

    Args:
        nd_controle: Spark DataFrame with ``gtin``, ``gln``, ``glnAH`` and the
            ``ATTRIBUUT`` column.
        ATTRIBUUT: Name of the attribute column to encode.
        Classes_threshold: Minimum number of distinct labels for the
            ``'overig'`` branch.
        Overig_threshold: Passed on to ``create_overig``.

    Returns:
        Tuple ``(df_geformateerd, multi_label_flag, classes_available)``;
        ``multi_label_flag`` is True when the ``'overig'`` branch was taken.
    """
    from sklearn.preprocessing import OneHotEncoder  # local import kept from the original cell

    df = nd_controle.toPandas()
    # Whitespace-only cells become the explicit placeholder 'EMPTY'.
    df = df.replace(r'^\s*$', 'EMPTY', regex=True)
    class_distribution = df[ATTRIBUUT].value_counts().reset_index()  # only used by a disabled debug print
    print("ATTRIBUUT: ", ATTRIBUUT)
    unique_items = get_unique_classes(df, ATTRIBUUT)

    # Split the attribute column into one indicator column per class.
    df_unique = df[["gtin", "gln", "glnAH", ATTRIBUUT]]
    if len(unique_items) >= Classes_threshold:
        multi_label_flag = True
    else:
        multi_label_flag = False
    df_geformateerd = one_hot_encoder(df_unique, unique_items, ATTRIBUUT)

    if multi_label_flag:
        df_geformateerd, _grote_classes = create_overig(
            df_geformateerd, ATTRIBUUT, unique_items, Overig_threshold
        )
        df_geformateerd, classes_available = classes_to_array(df_geformateerd, ATTRIBUUT)
    else:
        df_geformateerd, classes_available = classes_to_array(df_geformateerd, ATTRIBUUT)
        distribution_of_unique_classes_2(classes_available, ATTRIBUUT)

    null_entries = df_geformateerd["class_num_" + str(ATTRIBUUT)].isna().sum()
    print("null entries = ", null_entries)
    return df_geformateerd, multi_label_flag, classes_available

## Balance dataset if necessary

In [0]:
def balance_multi_label(UPSAMPLE_Classes, y_train, X_train, classes, Upsample_size, balance_set):
    """Upsample under-represented classes in a multi-label training split.

    For each class with fewer than ``Upsample_size`` matching rows, rows are
    drawn with replacement from ``y_train`` and the rows of ``X_train`` with
    the same index labels are appended alongside them.

    Args:
        UPSAMPLE_Classes: Column names in ``y_train`` to check.
        y_train: pandas DataFrame of targets.
        X_train: pandas DataFrame of features sharing ``y_train``'s index.
        classes: Sequence whose second element, as a string, is the value that
            marks a positive row.
        Upsample_size: Desired minimum number of positive rows per class.
        balance_set: Label printed in the banner.

    Returns:
        Tuple ``(X_train, y_train)`` including the appended rows.
    """
    print("\n----------- BALANCING ", balance_set, " -----------")
    for class_ in UPSAMPLE_Classes:
        # Only the training split is upsampled, so the same products do not
        # end up in the validation and test sets as well.
        positive_label = str(classes[1])
        minority_y = y_train.loc[y_train[class_].astype(str) == positive_label]
        samples_available = len(minority_y)
        print("\nSamples available ",class_ ,samples_available, "\nSearching for: ", positive_label)
        if not samples_available < Upsample_size:
            continue

        # Look rows up in a de-duplicated index so each label yields one row.
        unique_X = X_train[~X_train.index.duplicated(keep='first')]
        shortfall = int(Upsample_size - samples_available)
        extra_y = resample(
            minority_y,
            replace=True,          # sample with replacement
            n_samples=shortfall,   # number of rows still missing
            random_state=123,      # reproducible results
        )
        extra_X = unique_X.loc[extra_y.index.tolist()]

        print("Shape before upsampling minority class ", len(y_train), len(X_train))
        y_train = pd.concat([y_train, extra_y])
        X_train = pd.concat([X_train, extra_X])
        print("Add Extra samples Y_train | X_Train ", len(extra_y),len(extra_X))
        print("New shape ", len(y_train), len(X_train))

    return X_train, y_train

In [0]:
def balance_single_label(DF_merged, upsample_classes_list, Upsample_size):
    """Upsample, with replacement, every listed class that has too few rows.

    Args:
        DF_merged: pandas DataFrame with one indicator column per class.
        upsample_classes_list: Class column names to check.
        Upsample_size: Desired minimum number of rows per class.

    Returns:
        ``DF_merged`` with the resampled rows appended.
    """
    for class_ in upsample_classes_list:
        # Upsampling happens on the merged frame before any split.
        minority_rows = DF_merged.loc[DF_merged[class_] == 1]
        samples_available = len(minority_rows)
        print("Samples available = ",samples_available)

        if not samples_available < Upsample_size:
            continue

        extra_rows = resample(
            minority_rows,
            replace=True,                                    # sample with replacement
            n_samples=int(Upsample_size - samples_available),  # rows still missing
            random_state=123,                                # reproducible results
        )
        DF_merged = pd.concat([DF_merged, extra_rows])

    return DF_merged

In [0]:
# Under construction
def imbalance_check_single_label(classes_available, DF_merged, balance_threshold_single_label, upsample_size=None):
    """Upsample the classes whose total falls below a threshold (under construction).

    Args:
        classes_available: Mapping of class name to its total.
        DF_merged: pandas DataFrame with one indicator column per class.
        balance_threshold_single_label: Classes with a total strictly below
            this value are upsampled.
        upsample_size: Target number of rows per upsampled class. When omitted
            the notebook-level global ``upsample_single_label`` is used, which
            is what this function silently depended on before.

    Returns:
        ``DF_merged`` unchanged when no class is below the threshold,
        otherwise the frame returned by ``balance_single_label``.
    """
    upsample_classes_list = [
        name for name, total in classes_available.items()
        if total < balance_threshold_single_label
    ]
    if not upsample_classes_list:
        return DF_merged

    if upsample_size is None:
        # Backward compatible fallback to the notebook global.
        upsample_size = upsample_single_label

    print("\nDeze classes zijn laag in sample, en worden gebalanceerd: ", upsample_classes_list)
    print("Dataframe length, VOOR balancing", len(DF_merged))
    DF_merged = balance_single_label(DF_merged, upsample_classes_list, upsample_size)
    print("Dataframe length, NA balancing", len(DF_merged),"\n")
    return DF_merged

In [0]:
def equal_classes_downsample(classes_available, DF_merged):
    """Downsample every class to the size of the smallest class.

    Rows are drawn without replacement, so the result holds no artificial
    duplicates (the earlier version sampled with replacement, which both
    duplicated rows and discarded more unique rows than necessary).

    Args:
        classes_available: Mapping whose keys are the class column names.
        DF_merged: pandas DataFrame with a ``gln`` column and one indicator
            column per class.

    Returns:
        pandas DataFrame concatenating, per class, ``min_class_size`` sampled
        rows. A row that is positive for several classes may be drawn once
        per class.
    """
    print("Shape van dataframe: ",(DF_merged["gln"].count(),len(DF_merged.columns)))

    # Select each class's rows once and reuse them for sizing and sampling.
    class_frames = {
        class_: DF_merged.loc[DF_merged[class_] == 1]
        for class_ in classes_available
    }
    downsample_value = np.amin([len(frame) for frame in class_frames.values()])
    print("downsampling each class to: ", downsample_value)

    samples = []
    for class_, frame in class_frames.items():
        print(class_+" samples available: ",len(frame))
        samples.append(
            resample(
                frame,
                replace=False,                    # no duplicated rows when shrinking
                n_samples=int(downsample_value),  # size of the smallest class
                random_state=123,                 # reproducible results
            )
        )
    downsample_df = pd.concat(samples)

    print("Shape of downsampled dataframe: ", len(downsample_df), "\n")
    return downsample_df

## Natural Language Processing

In [0]:
class Natural_Language_Processing_nd:
    """Text-preprocessing pipeline from raw product text to padded token sequences.

    The steps (run in order by ``NLP_Loop``) are: concatenate the input text
    columns, lowercase, strip punctuation, remove Dutch stopwords, lemmatize
    with spaCy, tokenize with NLTK and convert to padded integer sequences
    with a Keras ``Tokenizer``.

    Relies on the notebook-level names ``spark`` and ``dbutils`` for saving.
    """

    def __init__(self, df_NLP, input_features, filename):
        """Store the inputs.

        Args:
            df_NLP: pandas DataFrame containing the text columns.
            input_features: Names of the text columns to concatenate.
            filename: Table name used by ``save_NLP_table``.
        """
        self.df_NLP = df_NLP
        self.input_features = input_features
        self.filename = filename

    def prepare_downloads(self):
        """Download the spaCy models and NLTK data, then load both spaCy pipelines.

        Returns:
            Tuple ``(nlp_dutch, nlp_en)`` of loaded spaCy pipelines.
        """
        # Use spaCy's Python API instead of the ``!python -m spacy download``
        # notebook magic so the class is valid Python outside IPython too.
        from spacy.cli import download as spacy_download

        for model_name in ("nl_core_news_lg", "en_core_web_lg"):
            spacy_download(model_name)
        nltk.download('all')
        nlp_dutch = spacy.load("nl_core_news_lg")
        nlp_en = spacy.load("en_core_web_lg")
        return nlp_dutch, nlp_en

################################### Preprocessing functions ####################################
    def empty_fields_fix(self, df_NLP):
        """Fill missing text with '' and build the ``all_text_features`` column.

        Each input feature is appended followed by a single space. Modifies
        ``df_NLP`` in place and returns it.
        """
        df_NLP['all_text_features'] = ''  # start from an empty column
        for feature in self.input_features:
            # Report how many fields are empty, since these cause errors later.
            print('Feature '+feature+' ',(df_NLP[feature].isna()).sum())
            df_NLP[feature] = df_NLP[feature].fillna('')
            df_NLP['all_text_features'] += df_NLP[feature]+' '
        return df_NLP

    def lower_case(self, df_NLP):
        """Add ``lowercase``: the lowercased ``all_text_features``."""
        df_NLP['lowercase'] = df_NLP['all_text_features'].str.lower()
        return df_NLP

    def remove_punctuation(self, df_NLP):
        """Add ``no_punctuation``: ``lowercase`` with ``string.punctuation`` removed."""
        PUNCT_TO_REMOVE = string.punctuation
        print('Punctuation that will be removed: ',PUNCT_TO_REMOVE)
        # Build the translation table once instead of once per row.
        table = str.maketrans('', '', PUNCT_TO_REMOVE)
        df_NLP["no_punctuation"] = df_NLP["lowercase"].apply(lambda text: text.translate(table))
        return df_NLP

    def stopwords_removal(self, df_NLP):
        """Add ``NO_stopwords``: ``no_punctuation`` without Dutch stopwords."""
        from nltk.corpus import stopwords
        nltk.download('stopwords')
        # A set gives constant-time membership tests for every word.
        STOPWORDS = set(stopwords.words('dutch')) | {"empty_cell"}
        df_NLP["NO_stopwords"] = df_NLP['no_punctuation'].apply(
            lambda text: " ".join(word for word in str(text).split() if word not in STOPWORDS)
        )
        return df_NLP

    def most_frequent_words(self, df_NLP):
        """Print and return the ten most common words in ``NO_stopwords``.

        Returns:
            List of ``(word, count)`` tuples as produced by
            ``Counter.most_common(10)``.
        """
        from collections import Counter
        cnt = Counter(
            word
            for text in df_NLP["NO_stopwords"].values
            for word in text.split()
        )
        most_common = cnt.most_common(10)
        print("Most common words:\n", most_common)
        return most_common

    def lemmatization(self, df_NLP, nlp_dutch, nlp_en):
        """Add ``lemmatized_DUTCH``: lemmas of ``NO_stopwords`` from the Dutch pipeline.

        ``nlp_en`` is accepted but currently unused (the English pass is
        disabled).
        """
        print("Lemmmatization, takes a while..")
        df_NLP["lemmatized_DUTCH"] = df_NLP['NO_stopwords'].apply(
            lambda text: " ".join(token.lemma_ for token in nlp_dutch(text))
        )
        print("Lemmatization done!")
        return df_NLP

    def tokenization(self, df_NLP):
        """Add ``tokenized_DUTCH``: NLTK word tokens of ``lemmatized_DUTCH``."""
        df_NLP["tokenized_DUTCH"] = df_NLP["lemmatized_DUTCH"].apply(nltk.word_tokenize)
        return df_NLP

    def text_2_sequence(self, df_NLP):
        """Convert the token lists into padded integer sequences.

        Adds ``length_tokenized`` and ``padded_seq_``; sequences are padded to
        the longest token list in the frame.

        Returns:
            Tuple ``(df_NLP, tokenizer)`` with the fitted Keras ``Tokenizer``.
        """
        tokenizer = Tokenizer()

        df_NLP['length_tokenized'] = df_NLP['tokenized_DUTCH'].str.len()
        MAX_LEN = df_NLP['length_tokenized'].max()
        print("Max length of dutch: ",MAX_LEN)
        tokenizer.fit_on_texts(df_NLP["tokenized_DUTCH"])
        sequences = tokenizer.texts_to_sequences(df_NLP["tokenized_DUTCH"])
        padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN)
        df_NLP["padded_seq_"] = padded_sequences.tolist()
        # NOTE: the equivalent Dutch+English sequence step is currently disabled.
        return df_NLP, tokenizer

    def save_NLP_table(self, df_NLP):
        """Save ``df_NLP`` as table ``self.filename`` in the ``default`` database.

        Any existing table with that name is dropped first, so the table can
        be used by the neural network later on.

        Returns:
            The Spark DataFrame that was written.
        """
        spark_df = spark.createDataFrame(df_NLP)
        spark.sql("use default")
        spark.sql('drop table if exists ' + self.filename)
        spark_df.write.mode("overwrite").saveAsTable(self.filename)
        # Report the table actually written rather than a hard-coded name.
        print("Saved as: " + str(self.filename))
        return spark_df

    def save_tokenizer(self,tokenizer):
        """Write the tokenizer's JSON to ``/FileStore/nd_MODELS/Tokenizer.json``."""
        tokenizer_json = tokenizer.to_json()
        # Overwrite explicitly: without the flag the put fails once the file exists.
        dbutils.fs.put("/FileStore/nd_MODELS/Tokenizer.json", tokenizer_json, True)

    def NLP_Loop(self, overwrite_tokenizer, overwrite_NLP_table):
        """Run the whole preprocessing pipeline on ``self.df_NLP``.

        Args:
            overwrite_tokenizer: When True, save the fitted tokenizer.
            overwrite_NLP_table: When True, save the processed frame as a table.

        Returns:
            Tuple ``(spark_df, tokenizer)``. ``spark_df`` is None when the
            table was not saved (this case used to raise UnboundLocalError).
        """
        nlp_dutch, nlp_en = self.prepare_downloads()  # nlp_en can be added if needed
        clear_output(wait=True)
        df_NLP = self.empty_fields_fix(self.df_NLP)
        df_NLP = self.lower_case(df_NLP)
        df_NLP = self.remove_punctuation(df_NLP)
        df_NLP = self.stopwords_removal(df_NLP)
        self.most_frequent_words(df_NLP)
        df_NLP = self.lemmatization(df_NLP, nlp_dutch, nlp_en)
        df_NLP = self.tokenization(df_NLP)
        df_NLP, tokenizer = self.text_2_sequence(df_NLP)

        spark_df = None
        if overwrite_NLP_table==True:
            spark_df = self.save_NLP_table(df_NLP)
        if overwrite_tokenizer==True:
            self.save_tokenizer(tokenizer)
        return spark_df, tokenizer

# Functions Training

## Correlation matrix, pass numerical features

In [0]:
def Correlation_matrix(DF_merged, ATTRIBUUT, show_nan_plot=True, show_correlation_matrix=True):
    """Prepare the numeric nutrient features and optionally plot diagnostics.

    For every nutrient column the values are cast to float, a
    ``<nutrient>_was_missing`` flag is added and nulls are replaced by the
    column mean. The three UOM columns are label-encoded into ``*_class``
    columns. ``DF_merged`` is modified in place.

    Args:
        DF_merged: pandas DataFrame with the nutrient, UOM and
            ``class_num_<ATTRIBUUT>`` columns.
        ATTRIBUUT: Attribute name; ``CLASS_<ATTRIBUUT>`` is set to the argmax
            of each class vector.
        show_nan_plot: When True, show a bar chart of null counts per nutrient.
        show_correlation_matrix: When True, show a correlation heatmap.

    Returns:
        The enriched ``DF_merged``.
    """
    # Numeric features with empty entries break the correlation table, so
    # they are imputed below.
    plt.rcParams.update({'font.size': 11})
    NUMERIEKE_nutrienten = ["FIBER","SUGAR","FAT","FASAT","CHOAVL","PROTEINE","SALT","ENER_KJ","ENER_Kcal"]
    DF_merged["CLASS_"+str(ATTRIBUUT)] = DF_merged["class_num_"+str(ATTRIBUUT)].apply(np.argmax)

    null_counts = {}
    for nutrient in NUMERIEKE_nutrienten:
        values = DF_merged[nutrient].astype(float)
        DF_merged[nutrient] = values
        is_missing = values.isna()
        null_counts[nutrient] = is_missing.sum()
        DF_merged[nutrient+"_was_missing"] = np.where(is_missing, 1, 0)
        if nutrient == "SUGAR":
            DF_merged["SUGAR_IS_ZERO"] = np.where(values == 0.0, 1, 0)
        # Replace the nulls by the column mean.
        DF_merged[nutrient] = values.fillna(values.mean())

    if show_nan_plot == True:
        fig, ax = plt.subplots(figsize =(16, 9))
        bars = ax.barh(list(null_counts.keys()), list(null_counts.values()))
        ax.set_title('Null values in nutritional attributes.\nBased on Food products.',
                     loc ='center', )
        ax.bar_label(bars, label_type='center', fontsize=20)
        print(bars, type(bars))
        plt.show()

    # Convert the string UOM features into numeric classes.
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    for uom_column in ('Fiber_UOM', 'NBQ_UOM', 'SUGAR_UOM'):
        le.fit(DF_merged[uom_column])
        DF_merged[uom_column + '_class'] = le.transform(DF_merged[uom_column])

    if show_correlation_matrix==True:
        correlation_columns = [
            "FIBER","SUGAR","FAT","FASAT","CHOAVL","PROTEINE","SALT","ENER_KJ",
            "Fiber_UOM_class","SUGAR_UOM_class","NBQ_UOM_class",
            'CLASS_'+str(ATTRIBUUT),
        ]
        df_correlatie = pd.DataFrame(DF_merged, columns=correlation_columns)
        plt.figure(figsize=(15,11))
        sns.heatmap(df_correlatie.corr(), annot=True, fmt='.2f', cmap='Blues',annot_kws={"fontsize":16})
        plt.show()
    return DF_merged

## Prepare train, test and validation data

In [0]:
def Numerical_features(ATTRIBUUT, DF_merged):
    """Pick the numerical features that belong to an attribute.

    Args:
        ATTRIBUUT: Attribute name.
        DF_merged: pandas DataFrame holding the numerical feature columns; the
            three ``*_UOM_class`` columns must always be present.

    Returns:
        A DataFrame slice of ``DF_merged`` for ``ndFiberIndex``,
        ``ndAddedSugar``, ``ndFreeOfAlcohol`` and unknown attributes; the
        plain list ``['FIBER']`` for ``ndTypeOfGrain`` and ``ndAnimalSpecies``.
        NOTE(review): the return type differs per attribute - confirm what the
        callers expect.
    """
    default_columns = ['Fiber_UOM_class', 'NBQ_UOM_class', 'SUGAR_UOM_class']
    # Evaluated up front, as before, so a missing UOM column fails for every attribute.
    default_features = DF_merged[default_columns]

    if ATTRIBUUT == "ndFiberIndex":
        return DF_merged[['FIBER', 'FIBER_was_missing', 'Fiber_UOM_class', 'PROTEINE']]
    if ATTRIBUUT == "ndAddedSugar":
        return DF_merged[['SUGAR_was_missing', 'SUGAR_UOM_class', 'SUGAR']]
    if ATTRIBUUT in ("ndTypeOfGrain", "ndAnimalSpecies"):
        return ['FIBER']
    # ndFreeOfAlcohol and every other attribute use the default UOM classes.
    return default_features

def Formateer_Features(DF_merged, ATTRIBUUT):
    """Convert the padded sequences and class vectors into float32 NumPy arrays.

    Args:
        DF_merged: pandas DataFrame with ``padded_seq_`` and
            ``class_num_<ATTRIBUUT>`` columns holding equal-length lists.
        ATTRIBUUT: Attribute name (string) selecting the target column.

    Returns:
        Tuple ``(vocab_size, X_train_text, ytrain)`` where ``vocab_size`` is
        the largest value in the sequences plus one.
    """
    def _column_as_array(column):
        # Round-trip through a torch FloatTensor, as before, to get float32.
        return torch.FloatTensor(DF_merged[column].tolist()).numpy()

    X_train_text = _column_as_array('padded_seq_')
    vocab_size = np.amax(X_train_text) + 1

    # Target
    ytrain = _column_as_array('class_num_' + ATTRIBUUT)
    print("Vocabular size: ", vocab_size)
    return vocab_size, X_train_text, ytrain

In [0]:
def split_train_test(xtrain, ytrain):
    """Split features and targets 70/15/15 into train, validation and test sets.

    Args:
        xtrain: Feature array.
        ytrain: Target array aligned with ``xtrain``.

    Returns:
        Tuple ``(Xtrain, ytrain, Xval, yval, Xtest, ytest)``.
    """
    from sklearn.model_selection import train_test_split

    # First hold out 30%, then cut that hold-out in half for validation/test.
    Xtrain, X_holdout, ytrain, y_holdout = train_test_split(
        xtrain, ytrain, test_size=0.3, random_state=42
    )
    Xval, Xtest, yval, ytest = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )
    return Xtrain, ytrain, Xval, yval, Xtest, ytest

def unzip_sets(Xtrain, Xval, Xtest):
    """Split each paired array back into its numeric and text parts.

    Column 0 (along the second axis) is taken as the numeric part and
    column 1 as the text part.

    Args:
        Xtrain, Xval, Xtest: Arrays supporting ``[:, 0]`` / ``[:, 1]`` indexing.

    Returns:
        Tuple ``(Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num,
        Xtest_text)``.
    """
    parts = []
    for paired in (Xtrain, Xval, Xtest):
        parts.append(paired[:, 0])
        parts.append(paired[:, 1])
    return tuple(parts)
  
def total_split(DF_merged, numerieke_features, ATTRIBUUT):
    """Split the merged frame 60/20/20 into train, validation and test frames.

    Args:
        DF_merged: pandas DataFrame with the id, text, sequence, class-name
            and per-class target columns.
        numerieke_features: Column selection for the numerical features.
        ATTRIBUUT: Attribute name (string) used in the column names.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test, targets)``
        where ``targets`` is the class-name list taken from the first row.
    """
    from sklearn.model_selection import train_test_split

    base_columns = ['gtin', 'gln', 'Lemmatized', ATTRIBUUT, 'padded_seq_', 'class_names_' + ATTRIBUUT]
    numeric_part = DF_merged[numerieke_features]
    X = DF_merged[base_columns].merge(numeric_part, left_index=True, right_index=True, how='inner')

    # Every row carries the same class-name list; the first one names the target columns.
    targets = DF_merged['class_names_' + ATTRIBUUT].tolist()[0]
    y = DF_merged[targets]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1
    )
    # 0.25 of the remaining 0.8 equals 0.2 of the whole.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=1
    )

    print(X_train.shape, X_test.shape, X_val.shape)
    print(y_train.shape, y_test.shape, y_val.shape)
    return X_train, X_val, X_test, y_train, y_val, y_test, targets

def to_tensor(Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text):
    """Convert the six feature arrays to float32 NumPy arrays via torch.

    Args:
        Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text:
            Array-likes exposing ``.tolist()``.

    Returns:
        The six converted arrays, in the same order as the parameters.
    """
    def _as_float_array(values):
        return torch.FloatTensor(values.tolist()).numpy()

    # Numeric parts first, then text parts, matching the original order.
    Xtrain_num, Xval_num, Xtest_num = (
        _as_float_array(part) for part in (Xtrain_num, Xval_num, Xtest_num)
    )
    Xtrain_text, Xval_text, Xtest_text = (
        _as_float_array(part) for part in (Xtrain_text, Xval_text, Xtest_text)
    )
    return Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text
  
def multiple_classes_PREP(numerical_ftrs, X_train, X_test, X_val, y_train, y_test, y_val, target):
    """Turn the split frames into model-ready NumPy arrays for one target.

    Args:
        numerical_ftrs: Column selection for the numerical features.
        X_train, X_test, X_val: pandas DataFrames with the numerical columns
            and a ``padded_seq_`` column.
        y_train, y_test, y_val: pandas DataFrames holding the target column.
        target: Name of the target column to extract.

    Returns:
        Tuple ``(X_train_num, X_test_num, X_val_num, X_train_text,
        X_test_text, X_val_text, ytrain, ytest, yval)``; the text and target
        arrays are float32.
    """
    def _as_float_array(series):
        return torch.FloatTensor(series.tolist()).numpy()

    feature_frames = (X_train, X_test, X_val)
    X_train_num, X_test_num, X_val_num = (
        frame[numerical_ftrs].to_numpy() for frame in feature_frames
    )
    X_train_text, X_test_text, X_val_text = (
        _as_float_array(frame['padded_seq_']) for frame in feature_frames
    )

    # Target
    ytrain, ytest, yval = (
        _as_float_array(frame[target]) for frame in (y_train, y_test, y_val)
    )

    return X_train_num, X_test_num, X_val_num, X_train_text, X_test_text, X_val_text, ytrain, ytest, yval
  
def single_prep(numerieke_features, DF_merged, ATTRIBUUT):
    """Build train/validation/test arrays for a single-label attribute.

    The numerical and text features are paired row by row so that one
    ``split_train_test`` call keeps them aligned, then unpaired again and
    converted with ``to_tensor``.

    Args:
        numerieke_features: List of numerical feature column names.
        DF_merged: pandas DataFrame with the feature and target columns.
        ATTRIBUUT: Attribute name selecting the target column.

    Returns:
        Tuple ``(ytrain, yval, ytest, Xtrain_num, Xtrain_text, Xval_num,
        Xval_text, Xtest_num, Xtest_text)``.
    """
    Numerical_features = DF_merged[numerieke_features]
    X_train_num = Numerical_features.to_numpy()

    vocab_size, X_train_text, ytrain = Formateer_Features(DF_merged, ATTRIBUUT)
    print("Numerical Features: ", Numerical_features.columns)
    print("Shape Numerical Features: ", X_train_num.shape)
    print("Shape Text Features: ", X_train_text.shape)
    print("Shape Targets: ", ytrain.shape)

    # Pair the two feature blocks in an explicit object array. Building it
    # with np.array(list(zip(...))) creates a ragged array whenever the two
    # blocks have different widths, which NumPy >= 1.24 rejects.
    pairs = list(zip(X_train_num, X_train_text))
    X_train = np.empty((len(pairs), 2), dtype=object)
    for row, (num_row, text_row) in enumerate(pairs):
        X_train[row, 0] = num_row
        X_train[row, 1] = text_row

    Xtrain, ytrain, Xval, yval, Xtest, ytest = split_train_test(X_train, ytrain)
    print(Xtrain.shape, Xtest.shape, Xval.shape)
    print(ytrain.shape, ytest.shape, yval.shape)

    Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text = unzip_sets(Xtrain, Xval, Xtest)
    # Convert every part to float32 arrays.
    Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text = to_tensor(
        Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text
    )
    return ytrain, yval, ytest, Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text
  
def single_prep_controle(numerieke_features, DF_merged, ATTRIBUUT):
    """Build the feature and target arrays for a control run, without splitting.

    Args:
        numerieke_features: List of numerical feature column names.
        DF_merged: pandas DataFrame with the feature and target columns.
        ATTRIBUUT: Attribute name selecting the target column.

    Returns:
        Tuple ``(X_train_num, X_train_text, ytrain, vocab_size)``.
    """
    numeric_frame = DF_merged[numerieke_features]
    X_train_num = numeric_frame.to_numpy()

    vocab_size, X_train_text, ytrain = Formateer_Features(DF_merged, ATTRIBUUT)

    report = (
        ("Numerical Features: ", numeric_frame.columns),
        ("Shape Numerical Features: ", X_train_num.shape),
        ("Shape Text Features: ", X_train_text.shape),
        ("Shape Targets: ", ytrain.shape),
    )
    for caption, value in report:
        print(caption, value)

    return X_train_num, X_train_text, ytrain, vocab_size

## Model metrics, statistics and plots

In [0]:
def predictions(model):
    """Predict on the notebook-level ``Xtest`` and compare with ``ytest``.

    Reads the globals ``Xtest`` and ``ytest``; they are not parameters.

    Args:
        model: Object exposing ``predict`` and ``evaluate``.

    Returns:
        Tuple ``(list_ytest, list_yhat, yhat2)``: argmax class indices of the
        true and the rounded predicted vectors, plus the rounded predictions.
    """
    yhat2 = np.round(model.predict(Xtest, verbose=1), 0)
    model.evaluate(Xtest, ytest)

    sample_count = len(yhat2)
    list_ytest = [np.argmax(ytest[idx]) for idx in range(sample_count)]
    list_yhat = [np.argmax(yhat2[idx]) for idx in range(sample_count)]
    return list_ytest, list_yhat, yhat2

def accuracy_report(y_test, y_pred, macro_f1_list):
    """Print the confusion matrix, accuracy and classification report.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels.
        macro_f1_list: List to which the macro-averaged F1 score is appended
            (mutated in place).

    Returns:
        ``macro_f1_list`` with the new score appended.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n**************************\n ",
          matrix, "\n**************************\n")

    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy: ", accuracy_pct)

    text_report = classification_report(y_test, y_pred)
    print("Classification report: ", text_report)

    # Second call in dict form, to extract the macro-averaged F1.
    report = classification_report(y_test, y_pred, output_dict=True)
    macro_f1_list.append(report['macro avg']['f1-score'])
    return macro_f1_list
    
def plot_confusion_matrix(DF_merged, cf_matrix, ATTRIBUUT):
    """Draw a confusion-matrix heatmap annotated with counts and percentages.

    Args:
        DF_merged: pandas DataFrame with ``class_num_<ATTRIBUUT>`` and
            ``class_names_<ATTRIBUUT>`` columns; only the first row is used.
        cf_matrix: Square NumPy confusion matrix.
        ATTRIBUUT: Attribute name shown in the title.
    """
    suffix = str(ATTRIBUUT)
    # The first row's class vector defines the number of classes.
    first_vector = DF_merged["class_num_" + suffix].tolist()[0]
    n_classes = len(first_vector)
    print("unique amount of classes: ", n_classes, first_vector)

    counts = cf_matrix.flatten()
    shares = cf_matrix.flatten() / np.sum(cf_matrix)
    labels = [
        "{0:0.0f}\n{1:.2%}".format(count, share)
        for count, share in zip(counts, shares)
    ]
    labels = np.asarray(labels).reshape(n_classes, n_classes)

    class_names = DF_merged["class_names_" + suffix].tolist()[0]

    heatmap = sns.heatmap(cf_matrix, xticklabels=class_names, yticklabels=class_names,
                          annot=labels, fmt='', cmap='Blues')
    heatmap.set(xlabel='Predicted', ylabel='True label')
    heatmap.set_title("Attribute:\n " + suffix + "\n" + str(' - '.join(class_names)))
    plt.show()
    
def plot_training(acc, val_acc, NAME='Accuracy'):
    """Plot a training curve and a validation curve over the epochs.

    Reads the notebook-level global ``ATTRIBUUT`` for the title; it is not a
    parameter.

    Args:
        acc: Per-epoch training values.
        val_acc: Per-epoch validation values.
        NAME: Metric name used in the legend, title and y-axis label.
    """
    metric = str(NAME)

    # White backgrounds with dark grey text.
    background_keys = ('figure.facecolor', 'axes.facecolor', 'savefig.facecolor')
    text_keys = ('text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color')
    plt.rcParams.update(dict.fromkeys(background_keys, '1.0'))
    plt.rcParams.update(dict.fromkeys(text_keys, '0.3'))

    epochs_range = np.arange(1, len(acc) + 1)
    plt.plot(epochs_range, acc, 'g', label='Train ' + metric)
    plt.plot(epochs_range, val_acc, 'b', label='Val ' + metric)
    plt.title('Training and Validation \n ' + metric + ' of ' + str(ATTRIBUUT))
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend()
    plt.show()
    
def predictions_2inputs(model, Xtest_num,Xtest_text, ytest):
    """Predict with a two-input model and convert one-hot rows to class indices.

    Args:
        model: Object exposing ``predict([num, text], verbose=...)`` and
            ``evaluate([num, text], y)`` (a compiled Keras model in this notebook).
        Xtest_num: Numerical test features.
        Xtest_text: Text (padded sequence) test features.
        ytest: One-hot encoded true labels, indexable by sample position.

    Returns:
        tuple: ``(list_ytest, list_yhat, yhat2)`` where ``list_ytest`` and
        ``list_yhat`` are lists with the true / predicted class index per
        sample and ``yhat2`` is the prediction matrix rounded to 0/1.
    """
    yhat = model.predict([Xtest_num, Xtest_text], verbose=1)
    yhat2 = np.round(yhat, 0)
    model.evaluate([Xtest_num, Xtest_text], ytest)

    n_samples = len(yhat)
    list_ytest = [np.argmax(ytest[idx]) for idx in range(n_samples)]
    # Take the argmax of the raw probabilities, not of the rounded ones: when
    # no class reaches 0.5 the rounded row is all zeros and argmax would
    # always report class 0 regardless of the actual most likely class.
    list_yhat = [np.argmax(yhat[idx]) for idx in range(n_samples)]
    return list_ytest, list_yhat, yhat2

def plot_multiple_training(acc, val_acc, class_names, ATTRIBUUT, type_):
    """Plot per-class training and validation curves side by side.

    Args:
        acc: Sequence of per-class training curves (one sequence of per-epoch values each).
        val_acc: Sequence of per-class validation curves, same layout as ``acc``.
        class_names: Legend labels, indexed in the same order as the curves.
        ATTRIBUUT: Attribute name shown in both subplot titles.
        type_: Metric name (string) used in the titles and as y-axis label.

    Side effects:
        Changes ``plt.rcParams`` colours globally and shows the figure.
    """
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(25,14))
    n_epochs = len(acc[0])

    # Note: set after the figure exists, exactly as before.
    plt.rcParams.update({
        'figure.facecolor': '1.0',
        'axes.facecolor': '1.0',
        'savefig.facecolor': '1.0',
        'text.color': '0.3',
        'axes.labelcolor': '0.3',
        'xtick.color': '0.3',
        'ytick.color': '0.3',
    })

    epochs_range = np.arange(1, n_epochs + 1)
    panels = (
        (ax1, acc, 'Training \n '),
        (ax2, val_acc, 'Validation \n '),
    )
    for axis, curves, title_prefix in panels:
        for position, curve in enumerate(curves):
            axis.plot(epochs_range, curve, label=str(class_names[position]))
    for axis, curves, title_prefix in panels:
        axis.set_title(title_prefix+type_+' of '+str(ATTRIBUUT))
    for axis, curves, title_prefix in panels:
        axis.set_xlabel('Epochs')
        axis.set_ylabel(type_)
    for axis, curves, title_prefix in panels:
        axis.legend()
    plt.show()
    
def plot_single_label_training(model, model_history, Xtest_num, Xtest_text, ytest, ATTRIBUUT, DF_merged):
    """Evaluate a trained single-label model and plot its diagnostics.

    Prints the recorded history keys, plots the test-set confusion matrix,
    prints the macro F1 list returned by ``accuracy_report`` and finally plots
    the accuracy and loss curves of the training run.

    Args:
        model: Trained two-input model.
        model_history: Object whose ``history`` mapping contains the keys
            ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.
        Xtest_num: Numerical test features.
        Xtest_text: Text test features.
        ytest: One-hot encoded test labels.
        ATTRIBUUT: Attribute name, forwarded to the confusion-matrix plot.
        DF_merged: DataFrame with the class columns used for the plot labels.
    """
    history = model_history.history
    print(history.keys())

    list_ytest, list_yhat, yhat2 = predictions_2inputs(model, Xtest_num, Xtest_text, ytest)

    cf_matrix = confusion_matrix(list_ytest, list_yhat)
    plot_confusion_matrix(DF_merged, cf_matrix, ATTRIBUUT)

    macro_f1_list = accuracy_report(list_ytest, list_yhat, [])
    print(macro_f1_list)

    curves = (
        ('accuracy', 'val_accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Loss'),
    )
    for train_key, val_key, metric_name in curves:
        plot_training(history[train_key], history[val_key], NAME=metric_name)

## Vocabulary Size

In [0]:
def get_vocabulary_size(DF_ATTRIBUUT):
    """Return the vocabulary size implied by the padded token sequences.

    The size is the largest value found in the ``padded_seq_`` column plus one.

    Args:
        DF_ATTRIBUUT: pandas DataFrame with a ``padded_seq_`` column holding
            equally long numeric sequences.

    Returns:
        The vocabulary size as a float32 scalar (the sequences are passed
        through a ``torch.FloatTensor``).
    """
    padded_sequences = torch.FloatTensor(DF_ATTRIBUUT['padded_seq_'].tolist()).numpy()
    vocab_size = padded_sequences.max() + 1
    print("Vocab size: ", vocab_size)
    return vocab_size

## Neural network

In [0]:
class all_models:
    """Builders for the two-input (numerical + text) Keras classifiers.

    All models share the same two input layers, embed and flatten the text
    input, concatenate it with the numerical input, and end in a softmax layer
    with one unit per column of ``ytrain``. They are compiled with SGD and
    categorical cross-entropy, tracking accuracy.

    Args:
        Xtrain_num: 2-D numerical training features; ``shape[1]`` sets the
            width of the numerical input.
        Xtrain_text: 2-D padded token sequences; ``shape[1]`` sets the width
            of the text input and the embedding ``input_length``.
        vocab_size: Embedding vocabulary size (cast to ``int``).
        ytrain: 2-D one-hot training targets; ``shape[1]`` sets the number of
            output units.
    """

    def __init__(self, Xtrain_num, Xtrain_text, vocab_size, ytrain):
        self.Xtrain_num = Xtrain_num
        self.Xtrain_text = Xtrain_text
        self.vocab_size = vocab_size
        self.ytrain = ytrain
        # Keras documents ``shape`` as a tuple, so pass explicit 1-tuples.
        self.input_NUM = Input(shape=(self.Xtrain_num.shape[1],))    # numerical features
        self.input_TEXT = Input(shape=(self.Xtrain_text.shape[1],))  # text features

    def _embed_text(self, output_dim):
        """Embed the text input with ``output_dim`` dimensions and flatten it."""
        emb = Embedding(int(self.vocab_size), output_dim=output_dim,
                        input_length=self.Xtrain_text.shape[1])(self.input_TEXT)
        return Flatten()(emb)

    def _finish(self, x, learning_rate):
        """Add the softmax head to ``x``, print the summary and compile with SGD."""
        x = Dense(self.ytrain.shape[1], activation='softmax')(x)

        model = Model(inputs=[self.input_NUM, self.input_TEXT], outputs=[x])
        model.summary()

        optimizer = SGD(learning_rate=learning_rate)
        model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def _input_dropout_model(self, hidden_units):
        """Build the variant shared by LOAD_MODEL_2 / LOAD_MODEL_3.

        Both inputs get 30% dropout before concatenation, followed by one ReLU
        layer per entry of ``hidden_units``, 30% dropout and the softmax head
        (learning rate 0.008).
        """
        emb = self._embed_text(output_dim=4)
        input_NUM = Dropout(0.3)(self.input_NUM)
        emb = Dropout(0.3)(emb)
        x = concatenate([emb, input_NUM])
        for units in hidden_units:
            x = Dense(units, activation='relu')(x)
        x = Dropout(0.3)(x)
        return self._finish(x, learning_rate=0.008)

    def LOAD_MODEL(self):
        """Return the baseline model.

        20-dimensional embedding, hidden layers of 30 and 20 ReLU units each
        followed by 50% dropout, learning rate 0.01.
        """
        fltn = self._embed_text(output_dim=20)
        x = concatenate([fltn, self.input_NUM])

        x = Dense(30, activation='relu')(x)
        x = Dropout(0.5)(x)
        x = Dense(20, activation='relu')(x)
        x = Dropout(0.5)(x)
        return self._finish(x, learning_rate=0.01)

    def LOAD_MODEL_2(self):
        """Return the input-dropout model with hidden layers of 30, 50 and 20 units."""
        return self._input_dropout_model(hidden_units=(30, 50, 20))

    def LOAD_MODEL_3(self):
        """Return the input-dropout model with hidden layers of 100, 50 and 100 units."""
        return self._input_dropout_model(hidden_units=(100, 50, 100))

## Train model multi-label or multi-class (Single Label)

In [0]:
def TRAINING_multi_label(X_train, X_val, X_test, y_train, y_val, y_test, numerical_ftrs, vocab_size, ATTRIBUUT, SAVE_MODEL, EPOCHS=30):
    """Train one ``LOAD_MODEL_3`` classifier per target column of ``y_train``.

    For every column the data is prepared with ``multiple_classes_PREP``, a
    fresh model is trained, evaluated on the test split and (optionally) saved.

    Args:
        X_train, X_val, X_test: Feature splits, forwarded to ``multiple_classes_PREP``.
        y_train, y_val, y_test: Target splits; ``y_train`` must be a pandas
            DataFrame with one column per target label.
        numerical_ftrs: Names of the numerical features to use.
        vocab_size: Embedding vocabulary size.
        ATTRIBUUT: Attribute name; used in log output and in the save path.
        SAVE_MODEL: When ``True`` each model is saved as ``.h5`` and copied to DBFS.
        EPOCHS: Number of training epochs per target (default 30, the value
            that used to be hard-coded).

    Returns:
        tuple: ``(train_accuracies, val_accuracies, train_loss, val_loss,
        class_names, model)``. The first four are lists with one per-epoch
        history per target, ``class_names`` lists the targets in training
        order and ``model`` is the model of the LAST target only (``None``
        when ``y_train`` has no columns).
    """
    train_accuracies = []
    val_accuracies = []

    train_loss = []
    val_loss = []

    class_names = []
    macro_f1_list = []
    model = None  # stays None when y_train has no columns
    print("**************** MULTI LABEL MODEL ***************\n")
    print("Save all models = ",SAVE_MODEL,"\nNumerieke features = ", numerical_ftrs, "\nAttribuut = ", ATTRIBUUT )

    # DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent.
    for target, _column in tqdm(y_train.items()):
        print('\n',target)
        Xtrain_num, Xtest_num, Xval_num, Xtrain_text, Xtest_text, Xval_text, ytrain, ytest, yval = multiple_classes_PREP(numerical_ftrs, X_train, X_test, X_val, y_train, y_test, y_val, target)

        print("X-Train | X-Validation | X-Test ## Numerical shape", Xtrain_num.shape,  Xval_num.shape,  Xtest_num.shape)
        print("X-Train | X-Validation | X-Test ## Textual shape", Xtrain_text.shape, Xval_text.shape, Xtest_text.shape)
        print("Y-Train | Y-Validation | Y-Test ## TARGET shape", ytrain.shape, yval.shape, ytest.shape)
        models = all_models(Xtrain_num, Xtrain_text, vocab_size, ytrain)
        model = models.LOAD_MODEL_3()

        model_history = model.fit([Xtrain_num,Xtrain_text], ytrain, epochs=EPOCHS, verbose=2, validation_data=([Xval_num,Xval_text], yval), shuffle=True)
        list_ytest, list_yhat, _rounded = predictions_2inputs(model, Xtest_num, Xtest_text, ytest)
        macro_f1_list = accuracy_report(list_ytest, list_yhat, macro_f1_list)

        train_accuracies.append(model_history.history['accuracy'])
        val_accuracies.append(model_history.history['val_accuracy'])
        train_loss.append(model_history.history['loss'])
        val_loss.append(model_history.history['val_loss'])

        class_names.append(target)
        if SAVE_MODEL == True:
            # Save on the driver's local disk first, then copy the file to DBFS.
            model_file = "/FileStore/nd_MODELS/"+str(ATTRIBUUT)+"/model_"+str(target)+".h5"
            model.save(model_file)
            dbutils.fs.cp("file:"+model_file, "dbfs:"+model_file)
            display(dbutils.fs.ls(model_file))
    print(class_names, macro_f1_list)
    return train_accuracies, val_accuracies, train_loss, val_loss, class_names, model

In [0]:
def TRAINING_single_label( Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text, vocab_size, EPOCHS, SAVE_MODEL, ytrain=None, yval=None, ATTRIBUUT=None):
    """Train the single-label (multi-class) ``LOAD_MODEL_2`` classifier.

    Args:
        Xtrain_num, Xtrain_text: Numerical / text training features.
        Xval_num, Xval_text: Numerical / text validation features.
        Xtest_num, Xtest_text: Test features (accepted for symmetry; not used here).
        vocab_size: Embedding vocabulary size.
        EPOCHS: Number of training epochs.
        SAVE_MODEL: When ``True`` the model is saved as ``.h5`` and copied to DBFS.
        ytrain: One-hot training targets. Defaults to the notebook global ``ytrain``.
        yval: One-hot validation targets. Defaults to the notebook global ``yval``.
        ATTRIBUUT: Attribute name used in the save path. Defaults to the
            notebook global ``ATTRIBUUT``; only needed when ``SAVE_MODEL`` is ``True``.

    Returns:
        tuple: ``(model, model_history)``.

    Raises:
        KeyError: If a target or attribute is neither passed nor defined as a global.
    """
    # This function used to read ytrain, yval and ATTRIBUUT straight from the
    # notebook globals. Explicit arguments are preferred; the fallback keeps
    # existing positional callers working.
    notebook_globals = globals()
    if ytrain is None:
        ytrain = notebook_globals['ytrain']
    if yval is None:
        yval = notebook_globals['yval']

    print("**************** SINGLE LABEL MODEL ***************\n")
    models = all_models(Xtrain_num, Xtrain_text, vocab_size, ytrain)
    model = models.LOAD_MODEL_2()
    # Start training the model (single label).
    model_history = model.fit([Xtrain_num,Xtrain_text], ytrain, epochs=EPOCHS, verbose=2, validation_data=([Xval_num,Xval_text], yval), shuffle=True, batch_size=32)

    if SAVE_MODEL == True:
        if ATTRIBUUT is None:
            ATTRIBUUT = notebook_globals['ATTRIBUUT']
        # Save on the driver's local disk first, then copy the file to DBFS.
        model_file = "/FileStore/nd_MODELS/model_"+str(ATTRIBUUT)+".h5"
        model.save(model_file)
        dbutils.fs.cp("file:"+model_file, "dbfs:"+model_file)
        display(dbutils.fs.ls(model_file))
    return model, model_history

# Functions Prediction Notebooks

In [0]:
def get_targets(DF_merged, ATTRIBUUT):
    """Return the class names of an attribute and the position of its 'EMPTY' class.

    Args:
        DF_merged: pandas DataFrame with a ``class_names_<ATTRIBUUT>`` column;
            only the first row is read.
        ATTRIBUUT: Attribute name used to build the column name.

    Returns:
        tuple: ``(index_empty, TARGETS)`` where ``TARGETS`` is the class-name
        list from the first row and ``index_empty`` is the index of the last
        ``'EMPTY'`` entry in it, or ``None`` when there is no such entry
        (this case used to raise ``UnboundLocalError``).
    """
    TARGETS = DF_merged['class_names_'+str(ATTRIBUUT)].tolist()[0]
    print("Targets: ",TARGETS)

    index_empty = None
    for index, target in enumerate(TARGETS):
        if target == 'EMPTY':
            index_empty = index
    print("Index of Empty: ", index_empty)
    return index_empty, TARGETS

In [0]:
def load_numerical_features(ATTRIBUUT):
    """Load the most recently stored numerical feature list for an attribute.

    Reads the table ``nd_models__numerical_features`` in database
    ``nieuwe_data`` and picks the row for ``ATTRIBUUT`` with the latest
    ``date``. The selection is done entirely in Spark, so only a single row is
    brought to the driver.

    Args:
        ATTRIBUUT: Value to match against the ``ndAttribuut`` column.

    Returns:
        The ``Num_Features`` value of the newest matching row; a stored
        ``['']`` is returned as an empty list.

    Raises:
        ValueError: If the table has no row for ``ATTRIBUUT``.

    Side effects:
        Switches the Spark session's current database to ``nieuwe_data``.
    """
    spark.sql("use nieuwe_data")
    latest_row = (
        spark.sql("select * from nd_models__numerical_features")
        .filter(col('ndAttribuut') == ATTRIBUUT)
        .orderBy(col('date').desc())
        .first()
    )
    if latest_row is None:
        raise ValueError("No numerical features stored for attribute: "+str(ATTRIBUUT))

    numerieke_features = latest_row['Num_Features']
    # A model trained without numerical features is stored as [''].
    if numerieke_features == ['']:
        numerieke_features = []
    print("Numerieke Features: ", numerieke_features)
    return numerieke_features

In [0]:
def predictions_to_dataframe(DF_merged, ATTRIBUUT, list_ytest, list_yhat):
    """Build a review table with true and predicted classes per product.

    Args:
        DF_merged: pandas DataFrame with the columns ``gtin``, ``gln``,
            ``Lemmatized``, ``<ATTRIBUUT>`` and ``class_names_<ATTRIBUUT>``
            (only the first row of the latter is read).
        ATTRIBUUT: Attribute name; converted with ``str`` for every column lookup.
        list_ytest: True class index per row of ``DF_merged``.
        list_yhat: Predicted class index per row of ``DF_merged``.

    Returns:
        A new DataFrame (``DF_merged`` is not modified) with the four
        identifying columns plus ``TRUE``, ``PREDICTED`` and
        ``PREDICTED Labels`` (the class name belonging to each predicted index).
    """
    attribuut_col = str(ATTRIBUUT)
    # Work on an explicit copy so the new columns never touch DF_merged.
    controle_tabel = DF_merged[['gtin','gln','Lemmatized',attribuut_col]].copy()
    labels = DF_merged['class_names_'+attribuut_col].tolist()[0]

    controle_tabel['TRUE'] = list_ytest
    controle_tabel['PREDICTED'] = list_yhat
    controle_tabel['PREDICTED Labels'] = [labels[index] for index in controle_tabel['PREDICTED']]

    # Number of products without a value for this attribute.
    print(controle_tabel[attribuut_col].isna().sum())
    return controle_tabel

In [0]:
#Create Controle Attribuut
def ControleAttribuut(df):
    """Attach NASA product information to a prediction table.

    The NASA standard table is left-joined with the trade-item Delta data
    (rows with ``__DeletedFlag == 0``) on ``gtin`` and ``gln``, restricted to
    rows where ``glnAH`` is true, and the result is left-joined with ``df`` on
    the same keys.

    Args:
        df: Spark DataFrame containing at least ``gtin`` and ``gln``.

    Returns:
        Spark DataFrame with the selected NASA columns followed by the
        remaining columns of ``df``.
    """
    join_keys = ['gtin', 'gln']

    nasa = spark.sql("select NASA_NR, GLN as gln, ARTIKEL_EAN as gtin, MERKNAAM, SCHAPSTICKER_OMSCHRIJVING, ASSGROEP_OMSCHRIJVING from StandaardTabellen.NasaStandaardTabel")

    trade_items = (
        spark.read.format('delta')
        .load('/mnt/Prd_adls/Conformed/TIMS/TradeItem/ahTradeItem/Data/')
        .filter('__DeletedFlag == 0')
        .select(col("gtin"), col("gln"), col("glnAH"))
    )

    nasa_ah = nasa.join(trade_items, join_keys, how="left")
    # glnAH is only needed for filtering, so drop it afterwards.
    nasa_ah = nasa_ah.filter(nasa_ah.glnAH == True).drop('glnAH')

    return nasa_ah.join(df, join_keys, how="left")

In [0]:
def unequal_and_equal_table(controle_tabel, ATTRIBUUT):
    """Split the review table into correct and incorrect predictions.

    Both parts are converted to Spark, joined with the un-lemmatized text from
    ``default._nd_Modelling_NLP`` and with the NASA columns added by
    ``ControleAttribuut``, restricted to rows where ``ATTRIBUUT`` is not null,
    and de-duplicated.

    Args:
        controle_tabel: pandas DataFrame with ``gtin``, ``gln``, ``TRUE``,
            ``PREDICTED`` and the ``ATTRIBUUT`` column. It is not modified.
        ATTRIBUUT: Name of the attribute column.

    Returns:
        tuple: ``(EQUAL_TABLE_spark, UNEQUAL_TABLE_spark)`` -- note that the
        table with correct predictions comes first.
    """
    NLP_unlemmatized = spark.sql("select gtin,gln, NO_stopwords as UnLemmatized from default._nd_Modelling_NLP")

    def _to_enriched_spark(subset):
        """Drop the comparison columns and add the text and NASA columns in Spark."""
        # drop() returns a new frame, so the .loc slice is never modified in place.
        table = spark.createDataFrame(subset.drop(columns=['TRUE','PREDICTED']))
        table = table.join(NLP_unlemmatized, ['gtin','gln'], how = "left")
        table = ControleAttribuut(table)
        table = table.filter(table[ATTRIBUUT].isNotNull())
        return table.drop_duplicates()

    is_equal = controle_tabel['TRUE'] == controle_tabel['PREDICTED']
    UNEQUAL_TABLE_spark = _to_enriched_spark(controle_tabel.loc[~is_equal])
    EQUAL_TABLE_spark = _to_enriched_spark(controle_tabel.loc[is_equal])
    return EQUAL_TABLE_spark, UNEQUAL_TABLE_spark

## Edit predictions, important part!

In [0]:
def predictions_to_right_format(total_test,total_pred, ATTRIBUUT, index_empty, controle_tabel, DF_merged):
    """Turn per-target predictions into one multi-hot row per product.

    ``total_test`` and ``total_pred`` hold one sequence per target; they are
    transposed so that every product gets one row with a 0/1 flag per target.
    Two clean-up rules are then applied to the predictions:

    * a row without any predicted target becomes "EMPTY only";
    * a row that predicts EMPTY together with other targets loses the EMPTY flag.

    Args:
        total_test: True 0/1 values, one sequence per target (targets x products).
        total_pred: Predicted 0/1 values in the same layout.
        ATTRIBUUT: Attribute column name (string).
        index_empty: Position of the EMPTY target within a row.
        controle_tabel: pandas DataFrame with ``gtin``, ``gln``,
            ``Lemmatized`` and ``ATTRIBUUT``; one row per product. The columns
            ``TRUE`` and ``PREDICTED`` are added to it in place.
        DF_merged: pandas DataFrame whose ``class_names_<ATTRIBUUT>`` column
            holds the target names (only the first row is read).

    Returns:
        A new DataFrame with ``gtin``, ``gln``, ``Lemmatized``, ``ATTRIBUUT``,
        ``TRUE``, ``PREDICTED`` and ``PREDICTED Labels`` (names of the flagged
        targets per product).
    """
    total_test_T = np.array(total_test).T.tolist()
    total_pred_T = np.array(total_pred).T.tolist()

    # Guard against an empty prediction set instead of failing on row 0.
    n_targets = len(total_pred_T[0]) if total_pred_T else 0
    empty_pred_dummy = [1 if index == index_empty else 0 for index in range(n_targets)]
    complete_empty_dummy = [0] * n_targets
    print("empty dummy", empty_pred_dummy)
    print("complete empty dummy", complete_empty_dummy)

    prediction_new = []
    for row_pred in total_pred_T:
        if row_pred == complete_empty_dummy:
            # Nothing predicted: mark the product as EMPTY. Every row gets its
            # own list so rows never share one mutable object.
            cleaned = list(empty_pred_dummy)
        else:
            cleaned = list(row_pred)
            # EMPTY together with a real target is contradictory: drop EMPTY.
            # Only relevant for multi-label models.
            if cleaned.count(1) > 1 and cleaned[index_empty] == 1:
                cleaned[index_empty] = 0
        prediction_new.append(cleaned)

    # Kept for backward compatibility: callers may rely on these two columns
    # being added to the DataFrame they passed in.
    controle_tabel['TRUE'] = total_test_T
    controle_tabel['PREDICTED'] = prediction_new
    # Explicit copy so the extra column below is not written to a slice.
    controle_tabel_T = controle_tabel[['gtin','gln','Lemmatized',ATTRIBUUT,'TRUE','PREDICTED']].copy()

    # Translate the flags into the corresponding target names.
    labels = DF_merged['class_names_'+ATTRIBUUT].tolist()[0]
    total_labels = []
    for row in controle_tabel_T['PREDICTED']:
        total_labels.append([labels[index] for index, flag in enumerate(row) if flag == 1])
    controle_tabel_T['PREDICTED Labels'] = total_labels
    return controle_tabel_T

# Write numerical features to a table, so we know later (during predictions) which numerical features have been used.

In [0]:
def save_numerical_features_(SAVE_MODEL, ATTRIBUUT, numerical_features):
    """Record which numerical features were used to train a model.

    When ``SAVE_MODEL`` equals ``True``, one row consisting of the attribute,
    its numerical features and today's date (Europe/Amsterdam) is appended to
    the table ``nd_MODELS__numerical_features`` in database ``nieuwe_data``.

    Args:
        SAVE_MODEL: Flag; the row is only written when it equals ``True``.
        ATTRIBUUT: Attribute name, stored in the ``ndAttribuut`` column.
        numerical_features: Feature names, stored in the ``Num_Features``
            column (presumably a list of strings -- confirm against the caller).

    Side effects:
        Switches the Spark session's current database to ``nieuwe_data`` and
        sets ``spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation``.
    """
    from datetime import datetime
    from pytz import timezone
    
    if SAVE_MODEL == True:
        spark.sql("use nieuwe_data")
        # Allow saveAsTable to create the managed table even if its location already holds files.
        spark.conf.set("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", "true")
        #spark.sql("drop table if exists nd_MODELS__numerical_features")
        
        columns = ['ndAttribuut', 'Num_Features', 'Date']
        amsterdam = timezone('Europe/Amsterdam')
        # Date only (YYYY-MM-DD), in Amsterdam local time.
        date = datetime.now(amsterdam).strftime('%Y-%m-%d')
        newRow = spark.createDataFrame([(ATTRIBUUT,numerical_features,date)], columns)
        #appended = df.union(newRow)
        #appended.show()
        # Append so that earlier entries are kept as history.
        newRow.write.mode("append").saveAsTable("nd_MODELS__numerical_features")