### Imports:

In [2]:
from gensim.models import KeyedVectors

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from keras.preprocessing.text import Tokenizer, one_hot
from keras.utils import to_categorical, pad_sequences
from keras.preprocessing.text import hashing_trick, text_to_word_sequence

from random import shuffle
from pickle import dump, load
from numpy import array
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint

import matplotlib.pyplot as plt


### Model:

In [7]:
def generate_X_Y_data(df, test_size=0.3, random_state=None):
    """Flatten a DataFrame into (value, column-name) samples and split them.

    Every cell of ``df`` becomes one sample whose label is the name of the
    column it came from. Samples are generated row by row, in column order.

    Args:
        df: DataFrame whose column names act as the class labels.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the X lists hold
        ``str(cell)`` for each sampled cell and the y lists hold the
        matching column labels.
    """
    column_names = df.columns.tolist()
    samples = [
        (cell, column)
        for _, record in df.iterrows()
        for cell, column in zip(record.values, column_names)
    ]
    cells = [cell for cell, _ in samples]
    targets = [column for _, column in samples]

    train_cells, test_cells, y_train, y_test = train_test_split(
        cells, targets, test_size=test_size, random_state=random_state
    )

    # Stringify after the split so every sample is text regardless of the
    # cell's original type.
    return list(map(str, train_cells)), list(map(str, test_cells)), y_train, y_test

def label_encoding(y_train, y_test):
    """Encode both label sets with a ``LabelEncoder`` fitted on ``y_train``.

    Args:
        y_train: Training labels; the encoder is fitted on these only.
        y_test: Test labels, transformed with the same fitted encoder.

    Returns:
        ``(encoded_y_train, encoded_y_test)``.
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

def convert_to_padded(tokenizer_, docs, max_length):
    """Turn texts into integer sequences padded to ``max_length``.

    Args:
        tokenizer_: Object exposing ``texts_to_sequences``.
        docs: Texts to convert.
        max_length: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with padding applied at the end
        (``padding='post'``).
    """
    return pad_sequences(
        tokenizer_.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )

def samples_encoding(X_train, X_test, tokenizer_, max_length):
    """Fit the tokenizer on the training texts and pad both splits.

    The tokenizer is fitted in place on ``X_train`` only; ``X_test`` is
    encoded with the vocabulary learned from the training texts.

    Args:
        X_train: Training texts.
        X_test: Test texts.
        tokenizer_: Tokenizer to fit (mutated by this call).
        max_length: Target sequence length passed to ``convert_to_padded``.

    Returns:
        ``(padded_X_train, padded_X_test)``.
    """
    tokenizer_.fit_on_texts(X_train)
    # Training split is converted first, then the test split.
    padded_train, padded_test = (
        convert_to_padded(tokenizer_, texts, max_length)
        for texts in (X_train, X_test)
    )
    return padded_train, padded_test

def prepare_embedding(embedding_model, tokenizer_):
    """Build an embedding matrix aligned with the tokenizer's word index.

    Row ``i`` holds the vector of the word whose index is ``i`` in
    ``tokenizer_.word_index``. Words missing from ``embedding_model`` and
    row 0 (no word is written there by this function) stay all zeros.

    Args:
        embedding_model: Mapping-like object supporting ``word in model``
            and ``model[word]``. If it exposes a ``vector_size`` attribute,
            that value is used as the embedding dimension.
        tokenizer_: Object exposing a ``word_index`` dict of word -> index.

    Returns:
        A ``numpy`` array of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    word_index = tokenizer_.word_index
    # The dimension used to be hard-coded to 100, which broke for any other
    # embedding size. Take it from the model and keep 100 only as the
    # fallback for objects that do not advertise their vector size.
    embedding_dim = getattr(embedding_model, 'vector_size', 100)
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        if word in embedding_model:
            embedding_matrix[i] = embedding_model[word]
    return embedding_matrix

In [12]:
def make_model(qtd_labels, max_length, vocab_size, embedding_matrix):
    """Assemble the (uncompiled) BiLSTM -> LSTM -> Dense classifier.

    Args:
        qtd_labels: Number of output classes (size of the softmax layer).
        max_length: Length of the padded input sequences.
        vocab_size: Number of rows of the embedding layer.
        embedding_matrix: Pre-trained weights loaded into the frozen
            embedding layer; its second dimension sets the embedding size.

    Returns:
        A ``Sequential`` model that still needs to be compiled.
    """
    model = Sequential()
    # Frozen embedding layer initialised from the pre-trained matrix.
    model.add(Embedding(vocab_size, embedding_matrix.shape[1], input_length=max_length, trainable=False, weights=[embedding_matrix]))
    model.add(Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.1, dropout=0.1), 'concat'))
    model.add(Dropout(0.5))
    model.add(LSTM(128, return_sequences=False, recurrent_dropout=0.1, dropout=0.1))
    model.add(Dropout(0.5))
    # The former ``input_shape=(max_length)`` was dropped: it passed a bare
    # int (not a tuple) and an input shape has no meaning on a layer that is
    # not the first one in the model.
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(qtd_labels, activation='softmax'))
    return model



def plot_training_history(history):
    """Plot training/validation loss and accuracy side by side.

    Args:
        history: Object whose ``history`` attribute is a dict containing the
            keys ``'loss'``, ``'val_loss'``, ``'accuracy'`` and
            ``'val_accuracy'``.
    """
    # (training key, validation key, training label, validation label, y label)
    panels = (
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Loss'),
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Accuracy'),
    )

    plt.figure(figsize=(12, 4))
    for position, (train_key, val_key, train_label, val_label, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
    plt.tight_layout()
    plt.show()



# Load the dataset and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier export -- confirm).
df = pd.read_csv('undersampled_df.csv').drop(columns='Unnamed: 0')

def train_model(df, save_path, num_epochs=20, num_batch_size=32,
                embedding_path=r'..\embeddings\Word2Vec\cbow_s100.txt'):
    """Train the text classifier on ``df`` and checkpoint the best model.

    The DataFrame is flattened into samples labelled by column name, the
    texts are tokenized and padded, a frozen embedding layer is initialised
    from the word2vec file, and the model is trained with the test split
    used as validation data. The training curves are plotted at the end.

    Args:
        df: DataFrame whose columns are the classes.
        save_path: Where ``ModelCheckpoint`` stores the model with the
            lowest validation loss.
        num_epochs: Number of training epochs.
        num_batch_size: Batch size used by ``model.fit``.
        embedding_path: Path of the word2vec-format text file to load.
            Defaults to the location that was previously hard-coded.
    """
    qtd_labels = len(df.columns)
    X_train, X_test, y_train, y_test = generate_X_Y_data(df)
    y_train, y_test = label_encoding(y_train, y_test)

    tokenizer_ = Tokenizer()

    # NOTE(review): this is the length in characters of the longest training
    # string, not its token count, so sequences are likely padded well past
    # what is needed. Left unchanged because it fixes the saved model's
    # input length.
    max_length = len(max(X_train, key=len))

    padded_X_train, padded_X_test = samples_encoding(X_train, X_test, tokenizer_, max_length)

    # The vocabulary only exists after samples_encoding has fitted the
    # tokenizer; reading it earlier always produced a vocab_size of 1.
    vocab_size = len(tokenizer_.word_index) + 1

    embedding_model = KeyedVectors.load_word2vec_format(embedding_path)
    embedding_matrix = prepare_embedding(embedding_model, tokenizer_)

    # Build the model once (a duplicate call used to discard its result).
    model = make_model(qtd_labels, max_length, vocab_size, embedding_matrix)

    checkpoint = ModelCheckpoint(save_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
    # Labels come out of LabelEncoder as integer class ids, not one-hot
    # vectors, so the sparse variant of the loss is required.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(padded_X_train, y_train, validation_data=(padded_X_test, y_test), epochs=num_epochs, batch_size=num_batch_size, callbacks=[checkpoint])

    # No final model.save(save_path): it would overwrite the best checkpoint
    # written by ModelCheckpoint with the last-epoch weights.

    plot_training_history(history)
    