# Word embeddings and neural network

In [2]:
import os
import re
import pandas as pd
import numpy as np
import string
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, cohen_kappa_score, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import spacy
from gensim.models import KeyedVectors

import keras
from keras import callbacks
from keras.models import Sequential, Input
from keras.layers import Dense, Bidirectional
from keras.layers import Dropout, SpatialDropout1D
from keras.layers import LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Embedding, Flatten, Concatenate
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import LearningRateScheduler as LRS
from keras.models import Model

In [3]:
def makeDirIfNotExists(dir):
    """Create the directory ``dir`` unless it already exists.

    Missing parent directories are created as well, and an existing
    directory is left untouched.

    Args:
        dir: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # isdir() + mkdir() and also handles nested paths.
    os.makedirs(dir, exist_ok=True)
        
def load_all_embeddings(path, filename):
    """Load a pre-trained word-embedding file into a word -> vector lookup.

    Args:
        path: Directory prefix; it is concatenated directly with
            ``filename`` (so it must end with a path separator).
        filename: File path relative to ``path``. When its first
            '/'-separated component is 'word2vec' the file is read as a
            binary word2vec file; otherwise it is read as UTF-8 text with
            one "word v1 v2 ..." entry per line.

    Returns:
        A gensim ``KeyedVectors`` object for word2vec files, otherwise a
        dict mapping each word (str) to a 1-D float32 NumPy array.
    """
    if filename.split('/')[0] == 'word2vec':
        return KeyedVectors.load_word2vec_format(path + filename, binary=True)

    embeddings_index = {}
    with open(path + filename, encoding="utf8") as f:
        for line_number, line in enumerate(f):
            fields = line.split(maxsplit=1)
            if len(fields) < 2:
                # Blank line or a word without coefficients: nothing to store.
                continue
            word, coefs = fields
            # fastText ``.vec`` files start with a "<n_words> <dim>" header;
            # without this check it would be stored as a word whose vector
            # has a single element.
            if line_number == 0 and word.isdigit() and coefs.strip().isdigit():
                continue
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
    return embeddings_index

def load_embedding(filename, embeddings_index, tokenizer):
    """Build the embedding matrix for the tokenizer's vocabulary.

    Row ``i`` of the matrix holds the pre-trained vector of the word whose
    index in ``tokenizer.word_index`` is ``i``. Rows that are never assigned
    (words missing from the embedding, and any index no word maps to) stay
    all-zero.

    Reference:
    https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

    Args:
        filename: Embedding file name. The dimension is parsed from it: the
            last '_'-separated token of the second-to-last '.'-separated
            component, minus its final character
            (e.g. 'fasttext/crawl-2M_300d.vec' -> 300). A first
            '/'-separated component equal to 'word2vec' selects item-access
            lookup instead of ``.get``.
        embeddings_index: Word -> vector lookup for that file.
        tokenizer: Object exposing a ``word_index`` dict (word -> int index).

    Returns:
        Tuple ``(num_tokens, percentage_misses, misses_words, embedding_dim,
        embedding_matrix)`` where
            num_tokens: ``len(tokenizer.word_index) + 1`` (matrix rows).
            percentage_misses: Percentage of vocabulary words not found in
                the embedding, rounded to 2 decimals (0.0 when the
                vocabulary is empty).
            misses_words: List of the words not found in the embedding.
            embedding_dim: Vector size parsed from ``filename``.
            embedding_matrix: Array of shape (num_tokens, embedding_dim).
    """
    num_tokens = len(tokenizer.word_index) + 1
    embedding_dim = int(filename.split('.')[-2].split('_')[-1][0:-1])
    hits = 0
    misses = 0
    misses_words = []

    # Loop-invariant: decide the lookup strategy once.
    is_word2vec = filename.split('/')[0] == 'word2vec'

    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if is_word2vec:
            try:
                embedding_vector = embeddings_index[word]
            except KeyError:
                # Only a missing word counts as a miss; other errors propagate.
                embedding_vector = None
        else:
            embedding_vector = embeddings_index.get(word)

        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
            misses_words.append(word)

    total = hits + misses
    # Guard against an empty vocabulary (would otherwise divide by zero).
    percentage_misses = round((misses * 100) / total, 2) if total else 0.0

    return num_tokens, percentage_misses, misses_words, embedding_dim, embedding_matrix

def prepare_data(max_nb_words, max_sequence_length, filters, lower, X_train_df, Y_train_df, X_dev_df, Y_dev_df, X_test_df, Y_test_df):
    """Tokenize the three text splits and pad them to a common length.

    The tokenizer vocabulary is fitted on the training texts only and then
    applied to the validation and test texts.
    https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer

    Args:
        max_nb_words: Maximum number of (most frequent) words to keep, i.e.
            the vocabulary size; passed as ``num_words``.
        max_sequence_length: Maximum number of tokens per text; passed as
            ``maxlen`` to ``pad_sequences``.
        filters: String whose characters are filtered out of the texts.
        lower: Whether to convert the texts to lowercase.
        X_train_df, X_test_df: Text collections exposing ``.values``.
        X_dev_df: Text collection; may expose ``.values`` or be a plain
            sequence of texts (e.g. a one-element list).
        Y_train_df, Y_dev_df, Y_test_df: Labels, returned unchanged. They
            must expose ``.shape`` because the shapes are printed.

    Returns:
        Tuple ``(X_train, Y_train, X_dev, Y_dev, X_test, Y_test, tokenizer)``
        with the X arrays tokenized and padded.
    """
    tokenizer = Tokenizer(num_words=max_nb_words, filters=filters, lower=lower, split=' ', char_level=False, oov_token=None)
    tokenizer.fit_on_texts(X_train_df.values)  # builds the vocabulary

    def to_padded(texts):
        # Texts -> integer sequences -> fixed-length array.
        return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_sequence_length)

    X_train = to_padded(X_train_df.values)
    Y_train = Y_train_df

    # Only the ``.values`` lookup is guarded, so a genuine tokenisation error
    # is no longer swallowed and silently retried.
    try:
        dev_texts = X_dev_df.values
    except AttributeError:
        print('One subject in validation')
        dev_texts = X_dev_df
    X_dev = to_padded(dev_texts)
    Y_dev = Y_dev_df

    X_test = to_padded(X_test_df.values)
    Y_test = Y_test_df

    print('\nShape of training: X =', X_train.shape,' Y =',Y_train.shape)
    print('Shape of validation: X =', X_dev.shape,' Y =',Y_dev.shape)
    print('Shape of test: X =', X_test.shape,' Y =',Y_test.shape)

    return X_train, Y_train, X_dev, Y_dev, X_test, Y_test, tokenizer
    
def load_model(embedding_dim, num_tokens, embedding_matrix, input_length=None):
    """Build and compile the BiLSTM classifier on top of fixed embeddings.

    Architecture: non-trainable Embedding initialised with
    ``embedding_matrix`` -> Bidirectional LSTM (128 units, dropout 0.2,
    returning sequences) -> GlobalMaxPooling1D -> Dense(64, relu) ->
    Dense(2, softmax). Compiled with binary cross-entropy, Adam and the
    accuracy metric.

    Args:
        embedding_dim: Size of each embedding vector.
        num_tokens: Number of rows of the embedding matrix.
        embedding_matrix: Pre-trained weights for the Embedding layer.
        input_length: Length of the padded input sequences. When omitted,
            the function falls back to the module-level ``X_train`` (the
            original behaviour), which must then already be defined.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    if input_length is None:
        # Backward-compatible fallback to the global the original relied on.
        input_length = X_train.shape[1]

    model = Sequential()
    model.add(Embedding(num_tokens, embedding_dim, weights=[embedding_matrix], input_length=input_length, trainable=False))
    model.add(Bidirectional(LSTM(128,return_sequences=True,dropout=0.2)))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64,activation='relu'))
    model.add(Dense(2,activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def compute_metrics(y_true, y_pred):
    """Compute classification metrics from ground truth and predictions.

    If per-subject labels are passed, the metrics are per subject.
    Class 0 is the negative class (non-AD, control) and class 1 the positive
    class (AD, dementia).
    https://towardsdatascience.com/should-i-look-at-precision-recall-or-specificity-sensitivity-3946158aace1

    Args:
        y_true: Ground-truth class indices (0 or 1).
        y_pred: Predicted class indices (0 or 1).

    Returns:
        Tuple ``(acc, kappa, f1score, cm, sensitivity, specificity,
        precision)``: accuracy in percent (2 decimals), Cohen's kappa,
        F1 score, the 2x2 confusion matrix, sensitivity (= recall),
        specificity and precision (ratios rounded to 3 decimals). A ratio
        whose denominator is zero is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios (zero denominator) are reported as 0.0.
        return numerator / denominator if denominator else 0.0

    acc = round(accuracy_score(y_true, y_pred)*100,2)
    kappa = round(cohen_kappa_score(y_true,y_pred),3)

    # Fixing the labels keeps the matrix 2x2 even when only one class occurs
    # in y_true/y_pred, so the unpacking below cannot fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    recall = _ratio(tp, tp + fn)  # = sensitivity
    specificity = _ratio(tn, tn + fp)
    precision = _ratio(tp, tp + fp)
    # F1 is computed from the unrounded precision/recall so that rounding
    # error is not compounded.
    f1score = round(_ratio(2 * precision * recall, precision + recall), 3)

    sensitivity = round(recall, 3)
    specificity = round(specificity, 3)
    precision = round(precision, 3)

    return acc, kappa, f1score, cm, sensitivity, specificity, precision

def save_test_results(path_save,identifier,y_true,y_pred,df_test):
    """Write the per-subject test predictions to a CSV file.

    The file ``<path_save><identifier>_test_results.csv`` gets one row per
    prediction with the columns 'user', 'predicted', 'true' and 'errores'
    ('acierto' when the prediction matches the truth, 'fallo' otherwise).

    Args:
        path_save: Output path prefix; concatenated directly with the
            file name.
        identifier: Value prepended (via ``str``) to the file name.
        y_true: Ground-truth labels, in the same order as ``y_pred``.
        y_pred: Predicted labels.
        df_test: DataFrame with a 'file' column holding the subject of each
            prediction, read by position (row order must match ``y_pred``).
    """
    users = df_test['file'].values
    # Rows are collected first and the frame built once: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    rows = [
        {'user': users[i],
         'predicted': pred,
         'true': true,
         'errores': 'acierto' if pred == true else 'fallo'}
        for i, (pred, true) in enumerate(zip(y_pred, y_true))
    ]
    df_errors = pd.DataFrame(rows, columns=['user','predicted','true','errores'])
    df_errors.to_csv(path_save+str(identifier)+'_test_results.csv', index = False)
    
def print_evaluations(path_save, identifier, history, save = False):
    """Plot training vs. validation loss and accuracy for one training run.

    Two figures are drawn (loss first, then accuracy), each with the
    training curve as a red dashed line and the validation curve as a solid
    blue line. Each figure is shown and, when ``save`` is true, written to
    ``<path_save><identifier>_loss.jpeg`` / ``<path_save><identifier>_acc.jpeg``.
    All figures are closed at the end.

    Args:
        path_save: Output path prefix; concatenated directly with the
            file name.
        identifier: Value prepended (via ``str``) to the file names.
        history: Object whose ``history`` dict holds the per-epoch lists
            'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
        save: Whether to also write the figures to disk.
    """
    curves = history.history
    # Number of epochs, as a 1-based x axis.
    epoch_axis = range(1, len(curves['loss']) + 1)

    # (training key, validation key, legend entries, y label, file suffix)
    panels = (
        ('loss', 'val_loss',
         ['Loss Entrenamiento', 'Loss Validacion'], 'Loss', '_loss.jpeg'),
        ('accuracy', 'val_accuracy',
         ['Accuracy Entrenamiento', 'Accuracy Validacion'], 'Accuracy', '_acc.jpeg'),
    )

    for train_key, val_key, legend_entries, y_label, suffix in panels:
        plt.figure()
        plt.plot(epoch_axis, curves[train_key], 'r--')
        plt.plot(epoch_axis, curves[val_key], 'b-')
        plt.legend(legend_entries)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        if save:
            plt.savefig(path_save + str(identifier) + suffix)
        plt.show()

    plt.close('all')

In [2]:
# Load every selected pre-trained embedding once and keep it in memory,
# keyed by its file name, reporting how long the loading took.
path = 'pretrained_embeddings/'
my_embeddings = ['fasttext/crawl-2M_300d.vec']

print('[INFO] Loading embeddings...')
start_emb = time.time()
all_embeddings = {}
for filename in my_embeddings:
    all_embeddings[filename] = load_all_embeddings(path, filename)
    embeddings_index = all_embeddings[filename]
end_emb = time.time()
# Elapsed time is reported in minutes.
print('       Duration:', round((end_emb - start_emb) / 60, 2), 'min')

In [3]:
# Final test experiment.
# For every transcription variant: build the train/test data, train the model
# 25 times from scratch, evaluate each run on the test partition, then
# combine the 25 runs by majority vote and store all metrics.
save = True
MAX_SEQUENCE_LENGTH = 250
MAX_NB_WORDS = None
batch_size = 10
epochs = 30
filename = 'fasttext/crawl-2M_300d.vec'
# The backslash is escaped explicitly ('\]' is an invalid escape sequence);
# the resulting string is unchanged.
filters = '#$%&()*+-/:;<=>@[\\]^_`{|}~' #excluded: !¡ ¿? . ,
lower = True

path = '0-publication/results_embeddings/'

try_transcriptions = ['manual_Punc','manual_noPunc','manual_pauses','ASR_wav2vec2_noPunc','ASR_whisper_Punc','ASR_whisper_noPunc','ASR_whisper_pauses']

# The spreadsheet is the same for every variant, so it is read only once;
# each iteration works on a renamed copy.
df_raw = pd.read_excel('0-publication/transcripts_ADReSS-IS2020_selected_v3.xlsx')

df_results = pd.DataFrame()
results_rows = []  # one majority-vote summary row per transcription variant
start = time.time()
cleaning_training = False
for identifier, selected_text in enumerate(try_transcriptions):
    print('============== ', selected_text,'- clean:',cleaning_training,' ==============')
    path_save = path + selected_text + '_' + 'cleaning'+str(cleaning_training) + '_TESTING_FINAL/'
    makeDirIfNotExists(path_save)

    # (1) Load transcriptions: the selected variant becomes the 'text' column
    df_all = df_raw.rename(columns = {selected_text:'text', 'label':'group','user':'file'})

    if cleaning_training:
        subjects_drop = ['S083', 'S093', 'S095', 'S101', 'S103', 'S104', 'S139', 'S151'] # In training, mmse>23 & label=1
        df = df_all[~df_all['file'].isin(subjects_drop)].copy()
        df.reset_index(inplace=True, drop=True)
    else:
        df = df_all.copy()

    df_train = df.loc[df.partition == 'train'].copy()
    df_train.drop(columns='partition',inplace=True)
    df_train = df_train.sample(frac=1, random_state=13) # shuffle
    df_train.reset_index(drop=True, inplace=True)
    X_train_df, Y_train_df = df_train['text'], pd.get_dummies(df_train['group']).values

    df_test = df.loc[df.partition == 'test'].copy()
    df_test.drop(columns='partition',inplace=True)
    df_test = df_test.sample(frac=1, random_state=13) # shuffle
    df_test.reset_index(drop=True, inplace=True)
    X_test_df, Y_test_df = df_test['text'], pd.get_dummies(df_test['group']).values

    # prepare_data expects a validation split; the first training sample is
    # passed as a placeholder and its outputs are discarded.
    X_dev_df, Y_dev_df = [X_train_df[0]], np.array([Y_train_df[0]])
    X_train, Y_train, _, _, X_test, Y_test, tokenizer = prepare_data(MAX_NB_WORDS,MAX_SEQUENCE_LENGTH,
                                                                    filters, lower,
                                                                    X_train_df,Y_train_df, X_dev_df, Y_dev_df, X_test_df,Y_test_df)

    num_tokens, misses, misses_words, embedding_dim, embedding_matrix = load_embedding(filename, all_embeddings[filename], tokenizer)

    # True class index of every test subject; identical for all repetitions.
    y_true = [np.argmax(a) for a in Y_test]

    # (2) Train and test the model, repeated 25 times
    df_iteraciones = pd.DataFrame()  # one column of test predictions per repetition
    df_results_it = pd.DataFrame()
    it_rows = []                     # one metrics row per repetition
    for zz in range(25):
        model = load_model(embedding_dim, num_tokens, embedding_matrix)

        if save:
            with open(path_save+str(zz)+'_modelsummary.txt', 'w') as f:
                model.summary(print_fn=lambda x: f.write(x + '\n'))

        # NOTE(review): the test partition is passed as validation data. No
        # callbacks are used, so it only feeds the logged val_* curves —
        # confirm this is intended.
        history = model.fit(X_train, Y_train,
                        epochs=epochs, batch_size=batch_size,
                        validation_data=(X_test,Y_test),
                        verbose = 0)

        # Save history of the model
        num_epochs = range(1, len(history.history['loss']) + 1)
        train_loss = history.history['loss']
        train_acc = history.history['accuracy']
        val_loss = history.history['val_loss']
        val_acc = history.history['val_accuracy']
        data = np.array([num_epochs,train_loss,train_acc,val_loss,val_acc])
        df_history = pd.DataFrame(data.T, columns = ['epochs','train_loss','train_acc','val_loss','val_acc'])
        # Honour the global ``save`` flag instead of always writing the plots.
        print_evaluations(path_save, zz, history, save = save)

        predictions = model.predict(X_test)
        y_pred = [np.argmax(a) for a in predictions]
        df_iteraciones[zz] = y_pred

        if save:
            df_history.to_csv(path_save+str(zz)+'_history.csv',index=False)
            model.save(path_save+str(zz)+'_model.h5')
            save_test_results(path_save,zz,y_true,y_pred,df_test)

        test_acc, test_kappa, test_f1score, test_cm, test_sensitivity, test_specificity, test_precision = compute_metrics(y_true, y_pred)
        # Subjects misclassified in this repetition.
        errors = [user for user, t, p in zip(df_test.file.values, y_true, y_pred) if t != p]

        new_row =  {'text': selected_text,'it':zz,
                   'test_acc':test_acc, 'test_kappa':test_kappa, 'test_f1score':test_f1score, 'test_cm':test_cm,
                   'test_sensitivity':test_sensitivity, 'test_specificity':test_specificity, 'test_precision':test_precision,
                   'errors_test':errors}
        # DataFrame.append was removed in pandas 2.0: collect the rows and
        # rebuild the frame. It is rewritten after every repetition so that
        # partial results survive an interrupted run.
        it_rows.append(new_row)
        df_results_it = pd.DataFrame(it_rows)
        df_results_it.to_csv(path_save+'it_metrics.csv')

    # (3) Majority vote over the repetitions. It is computed before the
    # 'voting'/'true' columns are added so that only predictions are counted.
    voting = []
    for row in df_iteraciones.to_numpy():
        valores = list(row)
        voting.append(max(set(valores), key = valores.count))
    df_iteraciones['voting'] = voting
    df_iteraciones['true'] = y_true
    df_iteraciones.to_csv(path_save+'it_predictions.csv')

    y_true = df_iteraciones['true'].values
    y_pred = df_iteraciones['voting'].values
    test_acc, test_kappa, test_f1score, test_cm, test_sensitivity, test_specificity, test_precision = compute_metrics(y_true, y_pred)
    # Subjects misclassified by the majority vote.
    errors = [user for user, t, p in zip(df_test.file.values, y_true, y_pred) if t != p]

    new_row =  {'text': selected_text,
               'test_acc':test_acc, 'test_kappa':test_kappa, 'test_f1score':test_f1score, 'test_cm':test_cm,
               'test_sensitivity':test_sensitivity, 'test_specificity':test_specificity, 'test_precision':test_precision,
               'errors_test':errors}
    results_rows.append(new_row)
    df_results = pd.DataFrame(results_rows)
    df_results.to_csv('0-publication/results_embeddings/mean_tests.csv')
    df_results.to_excel('0-publication/results_embeddings/mean_tests.xlsx',index=False)

    # Elapsed time since the start of the whole experiment, in minutes.
    end = time.time()
    print('\n** Duration:',round((end - start)/60,2),'min')