# Installs on cluster:

torch
click==8.0.4
keras
tensorflow
spacy 
scispacy
nltk 
torch

# Libraries

In [3]:
import pandas as pd
import os
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from pyspark.sql.functions import col


import seaborn as sns
import matplotlib.pyplot as plt

import torch
import numpy as np
from collections import Counter
from tqdm.keras import TqdmCallback
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, accuracy_score

%matplotlib inline


# Tables from Database

In [5]:
# Base trade items + nutrients + food/non-food flag.
# Read the trade-item Delta table, keeping only rows whose __DeletedFlag is 0.
basis = spark.read.format('delta').load('/mnt/Prd_adls/Conformed/TIMS/TradeItem/ahTradeItem/Data/').filter('__DeletedFlag == 0')
basis = basis.select("gtin", "gln", 'glnAH', 'sgIngredientsRequiredIndicator', 'ndSingleComposed')

# Food / non-food classification per (gtin, gln).
food = spark.sql("select gtin, gln, ndFoodNonFood from ndattribuutoutput.ndfoodnonfood")

# Nutrient values per (gtin, gln).
vezelwijzer = spark.sql("select gtin, gln, Preferred_table, FIBER, Fiber_UOM, NBQ_UOM, SUGAR, SUGAR_UOM, FAT, FASAT, CHOAVL, PROTEINE, SALT, ENER_KJ, ENER_Kcal, Volgorde_Nutrienten from standaardtabellen. Nutrienten") # NBQ_value (not part of the select list)
# Keep rows whose Volgorde_Nutrienten is '0'..'3' (compared as strings) and
# whose Preferred_table equals "Ja".
vezelwijzer = vezelwijzer.filter(((vezelwijzer.Volgorde_Nutrienten == '0') | 
                                  (vezelwijzer.Volgorde_Nutrienten == '1') | 
                                  (vezelwijzer.Volgorde_Nutrienten == '2') | 
                                  (vezelwijzer.Volgorde_Nutrienten == '3')) & 
                                 (vezelwijzer.Preferred_table == "Ja"))
# The two filter columns are no longer needed; dropping them first lets
# drop_duplicates() collapse rows that differed only in those columns.
vezelwijzer = vezelwijzer.drop('Volgorde_Nutrienten', 'Preferred_table')
vezelwijzer = vezelwijzer.drop_duplicates()

# Left joins: every trade item is kept, with nulls where food/nutrient data is absent.
df = basis.join(food, ['gtin', 'gln'], how = "left")
df = df.join(vezelwijzer, ['gtin', 'gln'], how = "left")

# NLP DataFrame (Spark): lemmatized text and padded token sequences per (gtin, gln).
DF_NLP_spark = spark.sql("select gtin, gln, lemmatized_DUTCH_EN as Lemmatized, padded_seq__EN, padded_seq_ from default._paddedseq_ingredient")

DF_NLP_spark = DF_NLP_spark.join(df, ['gtin', 'gln'], how = "left")
# Only rows with glnAH == True and ndFoodNonFood == 'Food' survive; rows that
# found no match in the left join have nulls in these columns and are removed too.
DF_NLP_spark = DF_NLP_spark.filter((DF_NLP_spark.glnAH == True) & (DF_NLP_spark.ndFoodNonFood == 'Food'))


In [6]:
# Koppel ndAttributen
def koppel_ndAttributen():
    """Join the nd-attribute snapshot table with ``NASA`` and keep ``glnAH`` rows.

    Reads ``default._ndoutputattributen_25_3_2022``, inner-joins it with the
    notebook-level ``NASA`` DataFrame on ``gtin`` and keeps the rows where
    ``glnAH`` is True. The row count (which triggers a Spark job) and the
    column count are printed.

    Returns:
        The filtered Spark DataFrame.
    """
    # TODO: replace this snapshot with the refreshed list?
    snapshot = spark.sql("select * from default._ndoutputattributen_25_3_2022")
    joined = snapshot.join(NASA, ["gtin"])
    ah_only = joined.filter(joined.glnAH == True)

    # Shape of the DataFrame; the tuple layout of the message is kept as-is.
    shape_info = (ah_only.count(), "Amount of columns: ", len(ah_only.columns))
    print("Amount of rows: ", shape_info)
    return ah_only

def koppel_preprocessed_text(df_geformateerd):
    """Left-join the formatted attribute table onto ``DF_NLP_spark`` and collect to pandas.

    Args:
        df_geformateerd: data accepted by ``spark.createDataFrame``; it must
            provide the join keys ``gtin``, ``gln`` and ``glnAH``.

    Returns:
        A pandas DataFrame with the de-duplicated join result.
    """
    formatted_sdf = spark.createDataFrame(df_geformateerd)
    # Change the table here to the attribute of interest (see notebook -> Functions).
    join_keys = ["gtin", "gln", "glnAH"]
    merged_sdf = DF_NLP_spark.join(formatted_sdf, join_keys, how="left").drop_duplicates()

    # Arrow-based conversion is switched off before collecting to pandas.
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")
    DF_merged = merged_sdf.toPandas()

    # First number is the count of non-null ``gln`` values, second the column count.
    row_count = DF_merged["gln"].count()
    column_count = len(DF_merged.columns)
    print("Shape van dataframe: ", (row_count, column_count))
    return DF_merged
    
def fix_empty_fields(DF_merged, ATTRIBUUT):
    """Drop the rows that have no value in ``class_num_<ATTRIBUUT>``.

    The number of missing entries is printed first. The input frame itself is
    not modified; a filtered selection is returned.

    Args:
        DF_merged: pandas DataFrame containing the ``class_num_<ATTRIBUUT>`` column.
        ATTRIBUUT: attribute name used to build the column name.

    Returns:
        The rows of ``DF_merged`` whose class column is not null.
    """
    class_column = "class_num_" + str(ATTRIBUUT)
    labelled_rows = DF_merged[class_column].notna()
    print("null entries = ", DF_merged[class_column].isna().sum())
    return DF_merged[labelled_rows]


# Functions

## Correlation matrix, pass numerical features

In [9]:
def Correlation_matrix(DF_merged, ATTRIBUUT, show_nan_plot=True):
    """Impute the nutrient columns, encode the unit columns and plot a correlation heatmap.

    ``DF_merged`` is modified in place (columns are overwritten and added) and
    the same object is returned. Added columns: ``CLASS_<ATTRIBUUT>`` (argmax of
    ``class_num_<ATTRIBUUT>``), ``<nutrient>_was_missing`` flags,
    ``SUGAR_IS_ZERO`` and the three ``*_UOM_class`` label encodings.

    Args:
        DF_merged: pandas DataFrame with the nutrient, unit and class columns.
        ATTRIBUUT: attribute name used to build the class column names.
        show_nan_plot: when equal to True, a bar chart with the number of
            missing values per nutrient is shown before imputation results
            are used.

    Returns:
        ``DF_merged`` with the extra columns.
    """
    # Numeric features with empty entries break the correlation table, hence
    # the imputation below.
    plt.rcParams.update({'font.size': 11})
    attribuut = str(ATTRIBUUT)
    nutrient_columns = ["FIBER","SUGAR","FAT","FASAT","CHOAVL","PROTEINE","SALT","ENER_KJ","ENER_Kcal"]
    DF_merged["CLASS_"+attribuut] = DF_merged["class_num_"+attribuut].apply(np.argmax)

    missing_counts = []
    missing_labels = []
    for column in nutrient_columns:
        DF_merged[column] = DF_merged[column].astype(float)
        missing_counts.append(DF_merged[column].isna().sum())
        missing_labels.append(column)
        # Remember which rows were empty before they are imputed.
        DF_merged[column+"_was_missing"] = np.where(DF_merged[column].isnull(), 1, 0)
        if column == "SUGAR":
            DF_merged["SUGAR_IS_ZERO"] = np.where(DF_merged[column]== 0.0, 1, 0)
        # Replace the nulls by the column mean (median would be the alternative).
        column_mean = DF_merged[column].mean()
        DF_merged[column] = DF_merged[column].fillna(column_mean)

    if show_nan_plot == True:
        fig, ax = plt.subplots(figsize =(16, 9))
        bars = ax.barh(missing_labels, missing_counts)
        ax.set_title('Null values in nutritional attributes.\nBased on Food products.',
                     loc ='center', )
        ax.bar_label(bars, label_type='center', fontsize=20)
        print(bars, type(bars))
        plt.show()

    # Convert the string unit-of-measure features into numeric classes; the
    # single encoder is refitted for every column.
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    for unit_column in ('Fiber_UOM', 'NBQ_UOM', 'SUGAR_UOM'):
        le.fit(DF_merged[unit_column])
        DF_merged[unit_column+'_class'] = le.transform(DF_merged[unit_column])

    # ENER_Kcal is imputed above but is not part of the heatmap columns.
    correlation_columns = ["FIBER","SUGAR","FAT","FASAT","CHOAVL","PROTEINE","SALT","ENER_KJ",
                           "Fiber_UOM_class","SUGAR_UOM_class","NBQ_UOM_class",'CLASS_'+attribuut]
    df_correlatie = pd.DataFrame(DF_merged, columns=correlation_columns)
    plt.figure(figsize=(15,11))
    sns.heatmap(df_correlatie.corr(), annot=True, fmt='.2f', cmap='Blues',annot_kws={"fontsize":16})
    plt.show()
    return DF_merged

## Prepare train, test and validation data

In [11]:
# Formateer Features

def Numerical_features(ATTRIBUUT, DF_merged):
    """Select the numerical features that belong to an attribute.

    Args:
        ATTRIBUUT: attribute name (e.g. ``"ndFiberIndex"``).
        DF_merged: pandas DataFrame holding the feature columns.

    Returns:
        A DataFrame slice for ``ndFiberIndex``, ``ndAddedSugar``,
        ``ndFreeOfAlcohol`` and any unlisted attribute; the plain list
        ``['FIBER']`` for ``ndTypeOfGrain`` and ``ndAnimalSpecies``.

    NOTE(review): the return type differs per attribute (DataFrame vs. list of
    column names) - confirm what the callers expect before unifying it.
    """
    uom_class_columns = ['Fiber_UOM_class','NBQ_UOM_class','SUGAR_UOM_class']
    # The default selection is evaluated for every attribute, so the three
    # *_UOM_class columns must always be present.
    selection = DF_merged[uom_class_columns]

    # Which numerical features go with which attribute?
    if ATTRIBUUT == "ndFiberIndex":
        selection = DF_merged[['FIBER','FIBER_was_missing','Fiber_UOM_class','PROTEINE']]
    elif ATTRIBUUT == "ndAddedSugar":
        selection = DF_merged[['SUGAR_was_missing','SUGAR_UOM_class','SUGAR']]
    elif ATTRIBUUT == "ndFreeOfAlcohol":
        selection = DF_merged[uom_class_columns]
    elif ATTRIBUUT == "ndTypeOfGrain":
        selection = ['FIBER']
    elif ATTRIBUUT == "ndAnimalSpecies":
        selection = ['FIBER']
    return selection

def Formateer_Features(DF_merged, ATTRIBUUT):
    """Turn the padded text sequences and the class vectors into float32 arrays.

    Args:
        DF_merged: pandas DataFrame with ``padded_seq__EN`` and
            ``class_num_<ATTRIBUUT>`` columns holding equally sized lists.
        ATTRIBUUT: attribute name (string) used to build the target column name.

    Returns:
        ``(vocab_size, X_train_text, ytrain)`` where ``vocab_size`` is the
        largest value in the text array plus one (a float32 scalar) and the
        other two are float32 NumPy arrays.
    """
    def _column_to_float32(column):
        # list -> torch float tensor -> NumPy array
        return torch.FloatTensor(column.tolist()).numpy()

    X_train_text = _column_to_float32(DF_merged['padded_seq__EN'])
    vocab_size = np.amax(X_train_text)+1

    # Target
    ytrain = _column_to_float32(DF_merged['class_num_'+ATTRIBUUT])
    print("Vocabular size: ", vocab_size)
    return vocab_size, X_train_text, ytrain

In [12]:
def split_train_test(xtrain, ytrain):
    """Split features and targets into 70% train, 15% validation and 15% test.

    A 30% hold-out is taken first and then halved; both splits use
    ``random_state=42``.

    Returns:
        ``(Xtrain, ytrain, Xval, yval, Xtest, ytest)``.
    """
    from sklearn.model_selection import train_test_split

    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        xtrain, ytrain, test_size=0.3, random_state = 42)
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state = 42)
    return x_fit, y_fit, x_val, y_val, x_test, y_test

def unzip_sets(Xtrain, Xval, Xtest):
    """Separate each zipped set into its numerical and its text part.

    Index 0 along the second axis is taken as the numerical part and index 1
    as the text part.

    Returns:
        ``(Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text)``.
    """
    parts = []
    for zipped in (Xtrain, Xval, Xtest):
        parts.append(zipped[:,0])  # numerical part
        parts.append(zipped[:,1])  # text part
    return tuple(parts)

def to_tensor(Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text):
    """Convert the six feature arrays to float32 NumPy arrays via torch.

    Every argument must offer ``.tolist()``; the resulting nested list is fed
    through ``torch.FloatTensor`` and turned back into a NumPy array.

    Returns:
        The converted arrays in the same order as the parameters.
    """
    def _as_float32(values):
        return torch.FloatTensor(values.tolist()).numpy()

    # Numerical parts first, then the text parts.
    Xtrain_num, Xval_num, Xtest_num = [
        _as_float32(part) for part in (Xtrain_num, Xval_num, Xtest_num)]
    Xtrain_text, Xval_text, Xtest_text = [
        _as_float32(part) for part in (Xtrain_text, Xval_text, Xtest_text)]
    return  Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text
  
def multiple_classes_PREP(numerical_ftrs, X_train, X_test, X_val, y_train, y_test, y_val, target):
    """Build the model inputs and targets for one target column.

    Args:
        numerical_ftrs: column selection applied to the three feature frames.
        X_train, X_test, X_val: pandas DataFrames containing ``numerical_ftrs``
            and a ``padded_seq__EN`` column.
        y_train, y_test, y_val: pandas DataFrames containing ``target``.
        target: name of the target column.

    Returns:
        ``(X_train_num, X_test_num, X_val_num, X_train_text, X_test_text,
        X_val_text, ytrain, ytest, yval)``; the numerical parts come straight
        from ``to_numpy()``, the text parts and targets are float32 arrays.
    """
    def _float32_array(column):
        return torch.FloatTensor(column.tolist()).numpy()

    feature_frames = (X_train, X_test, X_val)
    target_frames = (y_train, y_test, y_val)

    X_train_num, X_test_num, X_val_num = [
        frame[numerical_ftrs].to_numpy() for frame in feature_frames]
    X_train_text, X_test_text, X_val_text = [
        _float32_array(frame['padded_seq__EN']) for frame in feature_frames]

    # Target
    ytrain, ytest, yval = [_float32_array(frame[target]) for frame in target_frames]

    return X_train_num, X_test_num, X_val_num, X_train_text, X_test_text, X_val_text, ytrain, ytest, yval
  
def single_prep(numerieke_features, DF_merged, ATTRIBUUT):
    """Prepare train/validation/test inputs and targets for a single-label model.

    Args:
        numerieke_features: list of numerical feature column names.
        DF_merged: pandas DataFrame with the feature, text and class columns.
        ATTRIBUUT: attribute name used to locate ``class_num_<ATTRIBUUT>``.

    Returns:
        ``(ytrain, yval, ytest, Xtrain_num, Xtrain_text, Xval_num, Xval_text,
        Xtest_num, Xtest_text)``; the X arrays are float32 (see ``to_tensor``).
    """
    Numerical_features = DF_merged[numerieke_features]
    X_train_num = Numerical_features.to_numpy()

    vocab_size, X_train_text, ytrain = Formateer_Features(DF_merged, ATTRIBUUT)
    print("Numerical Features: ", Numerical_features.columns)
    print("Shape Numerical Features: ", X_train_num.shape)
    print("Shape Text Features: ", X_train_text.shape)
    print("Shape Targets: ", ytrain.shape)

    # Split row positions instead of a zipped (num, text) array. Zipping rows of
    # different widths produces a ragged array, which NumPy >= 1.24 refuses to
    # build. The shuffle depends only on the sample count and random_state, so
    # the selected rows are the same as when splitting the data itself.
    row_positions = np.arange(len(X_train_num))
    idx_train, ytrain, idx_val, yval, idx_test, ytest = split_train_test(row_positions, ytrain)
    print(idx_train.shape, idx_test.shape, idx_val.shape)
    print(ytrain.shape, ytest.shape, yval.shape)

    # Pick the rows for every subset and convert them to float32 arrays.
    Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text = to_tensor(
        X_train_num[idx_train], X_train_text[idx_train],
        X_train_num[idx_val], X_train_text[idx_val],
        X_train_num[idx_test], X_train_text[idx_test])
    return ytrain, yval, ytest, Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text
  
def single_prep_controle(numerieke_features, DF_merged, ATTRIBUUT):
    """Prepare the complete data set (no splitting) for a control run.

    Args:
        numerieke_features: list of numerical feature column names.
        DF_merged: pandas DataFrame with the feature, text and class columns.
        ATTRIBUUT: attribute name used to locate ``class_num_<ATTRIBUUT>``.

    Returns:
        ``(X_train_num, X_train_text, ytrain, vocab_size)``.
    """
    selected = DF_merged[numerieke_features]
    X_train_num = selected.to_numpy()
    vocab_size, X_train_text, ytrain = Formateer_Features(DF_merged, ATTRIBUUT)

    overview = (
        ("Numerical Features: ", selected.columns),
        ("Shape Numerical Features: ", X_train_num.shape),
        ("Shape Text Features: ", X_train_text.shape),
        ("Shape Targets: ", ytrain.shape),
    )
    for caption, value in overview:
        print(caption, value)

    return X_train_num, X_train_text, ytrain, vocab_size

## Model metrics, statistics and plots

In [14]:
def predictions(model, Xtest=None, ytest=None):
    """Predict on the test set and return true and predicted class indices.

    Args:
        model: object with Keras-style ``predict`` and ``evaluate`` methods.
        Xtest: test inputs. When omitted, the notebook-level ``Xtest`` is used
            (the original behaviour).
        ytest: one-hot test targets. When omitted, the notebook-level
            ``ytest`` is used.

    Returns:
        ``(list_ytest, list_yhat, yhat2)``: true class indices, predicted
        class indices and the predictions rounded to 0 decimals.

    Raises:
        NameError: if an argument is omitted and no global of that name exists.
    """
    notebook_scope = globals()
    if Xtest is None:
        if "Xtest" not in notebook_scope:
            raise NameError("name 'Xtest' is not defined")
        Xtest = notebook_scope["Xtest"]
    if ytest is None:
        if "ytest" not in notebook_scope:
            raise NameError("name 'ytest' is not defined")
        ytest = notebook_scope["ytest"]

    yhat = model.predict(Xtest, verbose=1)
    yhat2 = np.round(yhat, 0)
    model.evaluate(Xtest, ytest)

    # The predicted class is the argmax of the raw probabilities. Taking it on
    # the rounded values maps every row whose largest probability is <= 0.5 to
    # an all-zero vector and therefore always to class 0.
    list_ytest = [np.argmax(ytest[idx]) for idx in range(len(yhat))]
    list_yhat = [np.argmax(yhat[idx]) for idx in range(len(yhat))]
    return list_ytest, list_yhat, yhat2

def accuracy_report(y_test, y_pred):
    """Print the confusion matrix, the accuracy (in percent) and the classification report.

    Args:
        y_test: true class labels.
        y_pred: predicted class labels.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n**************************\n ",
          matrix, "\n**************************\n")

    accuracy_pct = accuracy_score(y_test, y_pred)*100
    print("Accuracy: ", accuracy_pct)

    report = classification_report(y_test, y_pred)
    print("Classification report: ", report)
    
def plot_confusion_matrix(DF_merged, cf_matrix, ATTRIBUUT):
    """Draw an annotated confusion-matrix heatmap for one attribute.

    Every cell shows the absolute count and its share of all samples. The
    class names for the axes and the title are read from the first row of
    ``class_names_<ATTRIBUUT>``; the number of classes from the first row of
    ``class_num_<ATTRIBUUT>``.

    Note: this definition shadows ``plot_confusion_matrix`` imported from
    ``sklearn.metrics`` at the top of the notebook.

    Args:
        DF_merged: pandas DataFrame with the two class columns.
        cf_matrix: square NumPy confusion matrix whose side equals the number
            of classes.
        ATTRIBUUT: attribute name.
    """
    attribuut = str(ATTRIBUUT)
    unique_classes = DF_merged["class_num_"+attribuut].tolist()[0]
    n_classes = len(unique_classes)
    print("unique amount of classes: ",n_classes, unique_classes)

    cell_values = cf_matrix.flatten()
    group_counts = ["{0:0.0f}".format(value) for value in cell_values]
    group_percentages = ["{0:.2%}".format(value) for value in
                         cell_values/np.sum(cf_matrix)]

    # One "<count>\n<percentage>" annotation per cell.
    labels = [f"{v1}\n{v2}" for v1, v2 in
              zip(group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(n_classes, n_classes)

    class_names = DF_merged["class_names_"+attribuut].tolist()[0]

    s = sns.heatmap(cf_matrix, xticklabels=class_names, yticklabels=class_names, annot=labels, fmt='', cmap='Blues')
    s.set(xlabel='Predicted', ylabel='True label')

    s.set_title("Attribute:\n "+attribuut+"\n"+str(' - '.join(class_names)))
    plt.show()
    
def plot_training(acc, val_acc, NAME='Accuracy', ATTRIBUUT=None):
    """Plot a training and a validation curve against the epoch number.

    Args:
        acc: per-epoch training values.
        val_acc: per-epoch validation values.
        NAME: metric name used in the legend, title and y-axis label.
        ATTRIBUUT: attribute name shown in the title. When omitted, the
            notebook-level ``ATTRIBUUT`` is used (the original behaviour).

    Raises:
        NameError: if ``ATTRIBUUT`` is omitted and no such global exists.
    """
    if ATTRIBUUT is None:
        notebook_scope = globals()
        if "ATTRIBUUT" not in notebook_scope:
            raise NameError("name 'ATTRIBUUT' is not defined")
        ATTRIBUUT = notebook_scope["ATTRIBUUT"]

    EPOCH = len(acc)
    # These rcParams changes are global and stay in effect for later plots.
    for param in ['figure.facecolor', 'axes.facecolor', 'savefig.facecolor']:
        plt.rcParams[param] = '1.0'  # white background
    for param in ['text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color']:
        plt.rcParams[param] = '0.3'  # dark grey
    epochs_range = np.arange(1,EPOCH+1)
    plt.plot(epochs_range, acc, 'g', label='Train '+str(NAME))
    plt.plot(epochs_range, val_acc, 'b', label='Val '+str(NAME))
    plt.title('Training and Validation \n '+str(NAME)+' of '+str(ATTRIBUUT))
    plt.xlabel('Epochs')
    plt.ylabel(str(NAME))
    plt.legend()
    plt.show()
    
def predictions_2inputs(model, Xtest_num,Xtest_text, ytest):
    """Predict with a two-input model and return true and predicted class indices.

    Args:
        model: object with Keras-style ``predict`` and ``evaluate`` methods
            that accepts ``[Xtest_num, Xtest_text]`` as input.
        Xtest_num: numerical test features.
        Xtest_text: text test features.
        ytest: one-hot test targets.

    Returns:
        ``(list_ytest, list_yhat, yhat2)``: true class indices, predicted
        class indices and the predictions rounded to 0 decimals.
    """
    model_inputs = [Xtest_num, Xtest_text]
    yhat = model.predict(model_inputs, verbose=1)
    yhat2 = np.round(yhat, 0)
    model.evaluate(model_inputs, ytest)

    # The predicted class is the argmax of the raw probabilities. Taking it on
    # the rounded values maps every row whose largest probability is <= 0.5 to
    # an all-zero vector and therefore always to class 0.
    list_ytest = [np.argmax(ytest[idx]) for idx in range(len(yhat))]
    list_yhat = [np.argmax(yhat[idx]) for idx in range(len(yhat))]
    return list_ytest, list_yhat, yhat2

def plot_multiple_training(acc, val_acc, class_names, ATTRIBUUT):
    """Plot per-class training and validation accuracy curves side by side.

    Args:
        acc: sequence of per-epoch training accuracies, one entry per class.
        val_acc: sequence of per-epoch validation accuracies, one entry per class.
        class_names: legend label per class, indexed like ``acc`` / ``val_acc``.
        ATTRIBUUT: attribute name shown in both titles.
    """
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(25,14))
    EPOCH = len(acc[0])

    # Global style: light backgrounds ('1.0') and '0.3' grey text and ticks.
    style_settings = (
        ('1.0', ['figure.facecolor', 'axes.facecolor', 'savefig.facecolor']),
        ('0.3', ['text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color']),
    )
    for colour, params in style_settings:
        for param in params:
            plt.rcParams[param] = colour

    epochs_range = np.arange(1,EPOCH+1)
    # Left panel: training curves, right panel: validation curves.
    for axis, curves in ((ax1, acc), (ax2, val_acc)):
        for index, curve in enumerate(curves):
            axis.plot(epochs_range, curve, label=str(class_names[index]))

    ax1.set_title('Training \n Accuracy of '+str(ATTRIBUUT))
    ax2.set_title('Validation \n Accuracy of '+str(ATTRIBUUT))
    for axis in (ax1, ax2):
        axis.set_xlabel('Epochs')
        axis.set_ylabel('Accuracy')
    ax1.legend()
    ax2.legend()
    plt.show()
    
def plot_single_label_training(model, model_history, Xtest_num, Xtest_text, ytest, ATTRIBUUT, DF_merged):
    """Evaluate a trained single-label model and plot its results.

    Prints the recorded history keys, predicts on the test set, shows the
    confusion matrix and the accuracy report, and finally plots the accuracy
    and loss curves.

    Args:
        model: trained two-input model.
        model_history: object whose ``history`` dict holds ``accuracy``,
            ``val_accuracy``, ``loss`` and ``val_loss``.
        Xtest_num, Xtest_text, ytest: test inputs and one-hot targets.
        ATTRIBUUT: attribute name, passed to the confusion-matrix plot.
        DF_merged: pandas DataFrame with the class columns of the attribute.
    """
    history = model_history.history
    print(history.keys())

    list_ytest, list_yhat, yhat2 = predictions_2inputs(model, Xtest_num, Xtest_text, ytest)

    matrix = confusion_matrix(list_ytest, list_yhat)
    plot_confusion_matrix(DF_merged, matrix, ATTRIBUUT)
    accuracy_report(list_ytest, list_yhat)

    curves = (
        ('accuracy', 'val_accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Loss'),
    )
    for train_key, val_key, title in curves:
        plot_training(history[train_key], history[val_key],NAME=title)

## Vocabulary Size

In [16]:
def get_vocabulary_size(DF_ATTRIBUUT):
    vocab = DF_ATTRIBUUT['padded_seq__EN'].tolist()
    vocab = torch.FloatTensor(vocab)
    vocab = vocab.numpy()
    vocab_size = np.amax(vocab)+1
    print("Vocab size: ", vocab_size)
    return vocab_size

## Neural network

In [18]:
import keras
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Embedding, GlobalAveragePooling1D, Input, concatenate, LSTM
from tensorflow.keras.optimizers import Adam, SGD

def LOAD_MODEL(Xtrain_num, Xtrain_text, vocab_size, ytrain):
    """Build and compile the two-input classifier (numerical + text features).

    The text input goes through a 20-dimensional embedding that is flattened
    and concatenated with the numerical input, followed by
    Dense(30) - Dropout(0.5) - Dense(20) - Dropout(0.5) and a softmax layer
    with ``ytrain.shape[1]`` units. Compiled with SGD (learning rate 0.01)
    and categorical cross-entropy; the model summary is printed.

    Args:
        Xtrain_num: 2-D array of numerical features; only its width is used.
        Xtrain_text: 2-D array of padded token ids; only its width is used.
        vocab_size: number of embedding rows (converted with ``int``).
        ytrain: 2-D one-hot target array; only its width is used.

    Returns:
        The compiled Keras model.
    """
    # Shapes are passed as 1-tuples: a bare int is not accepted by every
    # Keras version.
    input_NUM = Input(shape=(Xtrain_num.shape[1],)) # Numerical features
    input_TEXT = Input(shape=(Xtrain_text.shape[1],)) # Text features

    emb = Embedding(int(vocab_size), output_dim=20,input_length=Xtrain_text.shape[1])(input_TEXT)
    fltn = Flatten()(emb)
    x = concatenate([fltn, input_NUM])

    x = Dense(30, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(ytrain.shape[1], activation='softmax')(x)

    model = Model(inputs=[input_NUM , input_TEXT], outputs=[x])
    model.summary()

    optimizer = SGD(learning_rate=0.01)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy',metrics=['accuracy'])
    return model
  
def LOAD_MODEL_2(Xtrain_num, Xtrain_text, vocab_size, ytrain):
    """Build and compile the second two-input classifier variant.

    The text input goes through a 4-dimensional embedding that is flattened;
    both branches get Dropout(0.3) before they are concatenated, followed by
    Dense(30) - Dense(50) - Dense(20) - Dropout(0.3) and a softmax layer with
    ``ytrain.shape[1]`` units. Compiled with SGD (learning rate 0.008) and
    categorical cross-entropy; the model summary is printed.

    Args:
        Xtrain_num: 2-D array of numerical features; only its width is used.
        Xtrain_text: 2-D array of padded token ids; only its width is used.
        vocab_size: number of embedding rows (converted with ``int``).
        ytrain: 2-D one-hot target array; only its width is used.

    Returns:
        The compiled Keras model.
    """
    # Shapes are passed as 1-tuples: a bare int is not accepted by every
    # Keras version.
    input_NUM = Input(shape=(Xtrain_num.shape[1],)) # Numerical features
    input_TEXT = Input(shape=(Xtrain_text.shape[1],)) # Text features

    emb = Embedding(int(vocab_size), output_dim=4,input_length=Xtrain_text.shape[1])(input_TEXT)
    emb = Flatten()(emb)
    # Keep the dropout output under its own name: `input_NUM` must stay the
    # Input tensor handed to Model(inputs=...), otherwise the model would start
    # at the dropout output and the dropout layer would not be part of it.
    num_dropped = Dropout(0.3)(input_NUM)
    emb = Dropout(0.3)(emb)
    x = concatenate([emb, num_dropped])
    x = Dense(30, activation='relu')(x)
    x = Dense(50, activation='relu')(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(ytrain.shape[1], activation='softmax')(x)

    model = Model(inputs=[input_NUM , input_TEXT], outputs=[x])
    model.summary()

    optimizer = SGD(learning_rate=0.008)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy',metrics=['accuracy'])
    return model

def LOAD_MODEL_3(Xtrain_num, Xtrain_text, vocab_size, ytrain):
    """Build and compile the third (widest) two-input classifier variant.

    The text input goes through a 4-dimensional embedding that is flattened;
    both branches get Dropout(0.3) before they are concatenated, followed by
    Dense(100) - Dense(50) - Dense(100) - Dropout(0.3) and a softmax layer
    with ``ytrain.shape[1]`` units. Compiled with SGD (learning rate 0.008)
    and categorical cross-entropy; the model summary is printed.

    Args:
        Xtrain_num: 2-D array of numerical features; only its width is used.
        Xtrain_text: 2-D array of padded token ids; only its width is used.
        vocab_size: number of embedding rows (converted with ``int``).
        ytrain: 2-D one-hot target array; only its width is used.

    Returns:
        The compiled Keras model.
    """
    # Shapes are passed as 1-tuples: a bare int is not accepted by every
    # Keras version.
    input_NUM = Input(shape=(Xtrain_num.shape[1],)) # Numerical features
    input_TEXT = Input(shape=(Xtrain_text.shape[1],)) # Text features

    emb = Embedding(int(vocab_size), output_dim=4,input_length=Xtrain_text.shape[1])(input_TEXT)
    emb = Flatten()(emb)
    # Keep the dropout output under its own name: `input_NUM` must stay the
    # Input tensor handed to Model(inputs=...), otherwise the model would start
    # at the dropout output and the dropout layer would not be part of it.
    num_dropped = Dropout(0.3)(input_NUM)
    emb = Dropout(0.3)(emb)
    x = concatenate([emb, num_dropped])
    x = Dense(100, activation='relu')(x)
    x = Dense(50, activation='relu')(x)
    x = Dense(100, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(ytrain.shape[1], activation='softmax')(x)

    model = Model(inputs=[input_NUM , input_TEXT], outputs=[x])
    model.summary()

    optimizer = SGD(learning_rate=0.008)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy',metrics=['accuracy'])
    return model

## Train model: multi-label or multi-class (single label)

In [20]:
def total_split(DF_merged, numerieke_features, ATTRIBUUT):
    """Split the merged frame into 60% train, 20% validation and 20% test.

    The target column names are taken from the first row of
    ``class_names_<ATTRIBUUT>``; ``y`` consists of those columns of
    ``DF_merged``. Both splits use ``random_state=1``.

    Args:
        DF_merged: pandas DataFrame with id, text, attribute, feature and
            per-class target columns.
        numerieke_features: numerical feature columns to add to ``X``.
        ATTRIBUUT: attribute name (string).

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, targets)``.
    """
    from sklearn.model_selection import train_test_split

    class_names_column = 'class_names_'+ATTRIBUUT
    base_part = DF_merged[['gtin','gln','Lemmatized',ATTRIBUUT,'padded_seq__EN',class_names_column]]
    numeric_part = DF_merged[numerieke_features]
    X = base_part.merge(numeric_part, left_index=True, right_index=True, how='inner')

    targets = DF_merged[class_names_column].tolist()[0]
    y = DF_merged[targets]

    # 20% test first, then 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2).
    X_rest, X_test, y_rest, y_test = train_test_split(X, y,
                                                      test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest,
                                                      test_size=0.25, random_state=1)

    print(X_train.shape, X_test.shape, X_val.shape)
    print(y_train.shape, y_test.shape, y_val.shape)
    return  X_train, X_val, X_test, y_train, y_val, y_test, targets

In [21]:
def TRAINING_multi_label(X_train, X_val, X_test, y_train, y_val, y_test, numerical_ftrs, vocab_size, ATTRIBUUT, SAVE_MODEL):
    """Train one model per target column (multi-label setup).

    For every column of ``y_train`` the inputs are prepared with
    ``multiple_classes_PREP``, a ``LOAD_MODEL_3`` network is trained for 30
    epochs, evaluated on the test set and optionally saved.

    Args:
        X_train, X_val, X_test: feature DataFrames.
        y_train, y_val, y_test: target DataFrames with one column per class.
        numerical_ftrs: numerical feature columns.
        vocab_size: embedding vocabulary size.
        ATTRIBUUT: attribute name; used in the log output and the model path.
        SAVE_MODEL: when equal to True, every model is saved as ``.h5`` and
            copied to DBFS.

    Returns:
        ``(train_accuracies, val_accuracies, class_names, model)`` where
        ``model`` is the last trained model, or None when ``y_train`` has no
        columns.
    """
    # Only TqdmCallback is imported at the top of the notebook, so the progress
    # wrapper is imported here.
    from tqdm import tqdm

    train_accuracies = []
    val_accuracies = []
    class_names = []
    model = None  # stays None when there is nothing to train
    print("**************** MULTI LABEL MODEL ***************\n")
    print("Save all models = ",SAVE_MODEL,"\nNumerieke features = ", numerical_ftrs, "\nAttribuut = ", ATTRIBUUT )

    # DataFrame.items() works on every pandas version; iteritems() was removed in 2.0.
    for (target, data) in tqdm(y_train.items()):
        print('\n',target)
        Xtrain_num, Xtest_num, Xval_num, Xtrain_text, Xtest_text, Xval_text, ytrain, ytest, yval = multiple_classes_PREP(numerical_ftrs, X_train, X_test, X_val, y_train, y_test, y_val, target)

        print(Xtrain_num.shape, Xtest_num.shape, Xval_num.shape, Xtrain_text.shape, Xtest_text.shape, Xval_text.shape)
        print(ytrain.shape, ytest.shape, yval.shape)
        model = LOAD_MODEL_3(Xtrain_num, Xtrain_text, vocab_size, ytrain)

        model_history = model.fit([Xtrain_num,Xtrain_text], ytrain, epochs=30, verbose=2, validation_data=([Xval_num,Xval_text], yval), shuffle=True)
        list_ytest, list_yhat, yhat2 = predictions_2inputs(model, Xtest_num, Xtest_text, ytest)
        accuracy_report(list_ytest, list_yhat)

        train_accuracies.append(model_history.history['accuracy'])
        val_accuracies.append(model_history.history['val_accuracy'])
        class_names.append(target)
        if SAVE_MODEL == True:
            # Save on the driver's local disk first, then copy the file to DBFS.
            model_path = "/FileStore/nd_MODELS_publicatie/"+str(ATTRIBUUT)+"/model_"+str(target)+".h5"
            model.save(model_path)
            dbutils.fs.cp("file:"+model_path, "dbfs:"+model_path)
    return train_accuracies, val_accuracies, class_names, model

In [22]:
def TRAINING_single_label( Xtrain_num, Xtrain_text, Xval_num, Xval_text, Xtest_num, Xtest_text, vocab_size, EPOCHS, SAVE_MODEL, ytrain=None, yval=None, ATTRIBUUT=None):
    """Train a single-label (multi-class) ``LOAD_MODEL_2`` network.

    Args:
        Xtrain_num, Xtrain_text: training inputs.
        Xval_num, Xval_text: validation inputs.
        Xtest_num, Xtest_text: accepted for call compatibility; not used here.
        vocab_size: embedding vocabulary size.
        EPOCHS: number of training epochs.
        SAVE_MODEL: when equal to True, the model is saved as ``.h5``, copied
            to DBFS and the DBFS listing is displayed.
        ytrain: one-hot training targets. When omitted, the notebook-level
            ``ytrain`` is used (the original behaviour).
        yval: one-hot validation targets. When omitted, the notebook-level
            ``yval`` is used.
        ATTRIBUUT: attribute name for the model file name; only needed when
            saving. When omitted, the notebook-level ``ATTRIBUUT`` is used.

    Returns:
        ``(model, model_history)``.

    Raises:
        NameError: if a needed value is omitted and no global of that name exists.
    """
    notebook_scope = globals()

    def _argument_or_global(name, value):
        # An explicit argument wins; otherwise fall back to the notebook global.
        if value is not None:
            return value
        if name not in notebook_scope:
            raise NameError(f"name '{name}' is not defined")
        return notebook_scope[name]

    ytrain = _argument_or_global("ytrain", ytrain)
    yval = _argument_or_global("yval", yval)

    print("**************** SINGLE LABEL MODEL ***************\n")
    model = LOAD_MODEL_2(Xtrain_num, Xtrain_text, vocab_size, ytrain)
    # Start training the model (single label).
    model_history = model.fit([Xtrain_num,Xtrain_text], ytrain, epochs=EPOCHS, verbose=2, validation_data=([Xval_num,Xval_text], yval), shuffle=True, batch_size=32)

    if SAVE_MODEL == True:
        ATTRIBUUT = _argument_or_global("ATTRIBUUT", ATTRIBUUT)
        # Save on the driver's local disk first, then copy the file to DBFS.
        model_path = "/FileStore/nd_MODELS_publicatie/model_"+str(ATTRIBUUT)+".h5"
        model.save(model_path)
        dbutils.fs.cp("file:"+model_path, "dbfs:"+model_path)
        display(dbutils.fs.ls(model_path))
    return model, model_history