In [1]:
#Import libraries for loading pre-trained word vectors
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors

#Import libraries for loading and pre-processing the dataset
import pandas as pd
import numpy as np
import emoji
import re

#Avoid warnings from libraries
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
#Global variables

#Embeddings
path_wordvectors = '../pre-trained embeddings/spanish/'
name_wordvectors = 'cc_es_300.vec'
type_wordvectors = 'fasttext'
wv_dimension     = 300

#Dataset
path_ds     = '../data/spanish/'
name_ds     = 'MEX-A3T/'
validation  = False
language    = 'sp'

In [3]:
#load dataset / index: 0 = train, 1 = test, 2 = validation
def load_data(path, name, validation = False):#loads the CSV files from datasets
    d_train = pd.read_csv(path + name + "ds_train.csv")
    d_test  = pd.read_csv(path + name + "ds_test.csv")
    if validation == True:
        d_validation = pd.read_csv(path + name + "_validation.csv")
        return [d_train, d_test, d_validation]
    return [d_train, d_test]

dataset = load_data(path_ds, name_ds, validation)

In [4]:
print(dataset[0].head(5))

                                                text  label
0  Soy el Clint Eastwood de los Puentes de Madiso...      0
1  Actualmente ya pasó de moda la pucha joto, aho...      0
2  ¿Es cierto esto? Y no me refiero a lo que dijo...      0
3  Vuela pega y esquiva... la neta está de la ver...      0
4  Mejor puto disfraz de la noche!!!! 👊👊👊Por terc...      0


In [5]:
#Pre-process text on dataset
def remove_consecutive_characters(sentence):
    string_len = len(sentence)
    clipped_sentence = ''
    queued_character = ''
    for index, character in enumerate(sentence):
        if index == 0:
            clipped_sentence += character
            queued_character = character
        elif character.isalnum() or character == ' ':
            clipped_sentence += character
            queued_character = character
        else:
            if queued_character is not character:
                clipped_sentence += character
                queued_character = character
    return clipped_sentence

def clean_text(sentence, language):
    #Convert instance to string
    sentence = str(sentence)
    
    #All text to lowecase
    sentence = sentence.lower()
    
    #Remove all consecutive special characters (non-alphanumeric)
    sentence = remove_consecutive_characters(sentence)
    
    #separate special characters
    sentence = re.sub(r":", " : ", sentence)
    sentence = re.sub(r",", " , ", sentence)
    sentence = re.sub(r"\.", " . ", sentence)
    sentence = re.sub(r"!", " ! ", sentence)
    sentence = re.sub(r"¡", " ¡ ", sentence)
    sentence = re.sub(r"“", " “ ", sentence)
    sentence = re.sub(r"”", " ” ", sentence)
    sentence = re.sub(r"\(", " ( ", sentence)
    sentence = re.sub(r"\)", " ) ", sentence)
    sentence = re.sub(r"\?", " ? ", sentence)
    sentence = re.sub(r"\¿", " ¿ ", sentence)
    
    #Split all emojis
    emoji_list = emoji.emoji_lis(sentence)
    for index, _emoji in enumerate(emoji_list):
        location = _emoji['location'] + (index * 2)
        sentence = sentence[0:location] + ' ' + sentence[location:location+1] + ' ' + sentence[location+1:]
    
    #Substituting multiple spaces with single space
    sentence = re.sub(r'\s+', ' ', sentence, flags=re.I)
    
    tokens = sentence.split()
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

def preprocess_dataset(dataset, language):
    for data in dataset:
        for i, row in data.iterrows():
            if row['text'].strip() != '':
                sentence = clean_text(row['text'], language)
                data.at[i, 'text'] = sentence
    return dataset

dataset = preprocess_dataset(dataset, language)

In [6]:
print(dataset[0].head(5))

                                                text  label
0  soy el clint eastwood de los puentes de madiso...      0
1  actualmente ya pasó de moda la pucha joto , ah...      0
2  ¿ es cierto esto ? y no me refiero a lo que di...      0
3  vuela pega y esquiva . la neta está de la verg...      0
4  mejor puto disfraz de la noche ! 👊 👊 👊 por ter...      0


In [7]:
#Load pre-trained word embeddings
class load_wordvectors():#loads the word vectors
    def __init__(self, path, name, type_wv):
        self.type_wv = type_wv
        self.path    = path
        self.name    = name
        
    def load(self):#returns the loaded word vectors
        if self.type_wv == "custom":
            model = FastText.load(self.path + self.name)
        elif self.type_wv == "fasttext":
            model = KeyedVectors.load_word2vec_format(self.path+self.name, binary=False)
        elif self.type_wv == "glove":
            model = KeyedVectors.load_word2vec_format(self.path+self.name, binary=False)
        return model

    def tokenize(self, text):#return the tokenizer and the vocabulary size
        vocab_set   = list(set().union(text))
        tokenizer   = Tokenizer()
        tokenizer.fit_on_texts(vocab_set)
        vocabulary_sz 	= len(tokenizer.word_index) + 1
        return tokenizer, vocabulary_sz

    def build_embedding_matrix(self, vocabulary_sz, wv_dimension, tokenizer, model):
        embedding_matrix = np.zeros((vocabulary_sz, wv_dimension))
        for word, i in tokenizer.word_index.items():
            if word in model.wv:
                embedding_matrix[i] = model.wv[word]
        return embedding_matrix

wv_model      = load_wordvectors(path_wordvectors, name_wordvectors, type_wordvectors)
word_vectors  = wv_model.load()

In [8]:
#Tokenize the train set
tokenizer, vocabulary_sz = wv_model.tokenize(dataset[0]['text'])

In [9]:
#Build the embedding matrix
embedding_matrix = wv_model.build_embedding_matrix(vocabulary_sz, wv_dimension, tokenizer, word_vectors)

In [10]:
#dnn variables
seq_len            = 40
learning_rate      = 0.001
word_encoding_dim  = 64
n_classes          = 2
dropout_rate       = 0.15

In [None]:
#Normalize the text and labels for the neural network
class normalize_data():#normalize the text and labels
    def __init__(self, dataset):
        self.dataset = dataset
    def create_label_vector(self, labels, classes):
        vector = np.zeros((labels.shape[0], classes))
        for instance, label in enumerate(labels):
            vector[instance][label] = 1
        return vector
    def normalize(self, tokenizer, n_classes, seq_len):
        n_partition = len(self.dataset)
        if n_partition >= 2:
            y_train = self.create_label_vector(self.dataset[0]['label'], n_classes)
            y_test  = self.create_label_vector(self.dataset[1]['label'], n_classes)
            x_train = pad_sequences(tokenizer.texts_to_sequences(self.dataset[0]['text']), maxlen=seq_len)
            x_test  = pad_sequences(tokenizer.texts_to_sequences(self.dataset[1]['text']), maxlen=seq_len)
        if n_partition == 3:
            y_validation = self.create_label_vector(self.dataset[2]['label'], n_classes)
            x_validation = pad_sequences(tokenizer.texts_to_sequences(self.dataset[2]['text']), maxlen=seq_len)
            return x_train, y_train, x_test, y_test, x_validation, y_validation
        return x_train, y_train, x_test, y_test

normalized_data = normalize_data(dataset)
x_train, y_train, x_test, y_test = normalized_data.normalize(tokenizer, n_classes, seq_len)

In [None]:
from keras.layers import Dense, Input, Embedding, GRU, Bidirectional, concatenate, GlobalMaxPooling1D
from keras.layers import Activation, Dense, Conv1D, MaxPooling1D, Dropout, AveragePooling1D, GlobalAveragePooling1D
from keras.models import Model, load_model

from keras import initializers, layers
from keras.optimizers import *
from keras import regularizers, initializers, constraints
from keras.engine.topology import Layer, InputSpec
from keras import backend as K

from keras.callbacks import Callback
from sklearn import metrics

#Attention Class
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint,
                                 trainable=True)
        
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint,
                                     trainable=True)
        else:
            self.b = None
            
        self.context = self.add_weight(
                                    name='context_vector', shape=(self.step_dim, 1),
                                    initializer= self.init,
                                    regularizer=self.b_regularizer,
                                    constraint=self.b_constraint,
                                    trainable=True)

        self.built = True

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        e = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            e += self.b
        e = K.tanh(e)
        
        similarity = K.dot(e, self.context)
        a = K.exp(similarity)

        #Avoid NaN's with the addition of a very small positive number.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        
        #Weighted sum
        c = K.sum(a * x, axis=1)
        return c

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim

#create class for the dnn handler
class deep_neural_network_handler():
    def __init__(self, seq_len, vocabulary_sz, wv_dimension, embedding_matrix, learning_rate, word_encoding_dim, n_classes):
        self.seq_len 			= seq_len
        self.vocabulary_sz 		= vocabulary_sz
        self.wv_dimension 		= wv_dimension
        self.embedding_matrix 	= embedding_matrix
        self.learning_rate 		= learning_rate
        self.word_encoding_dim  = word_encoding_dim
        self.n_classes 			= n_classes
    def get_metrics(self):
        if self.n_classes > 2:
            custom_loss      = 'categorical_crossentropy'
            custom_accuracy  = 'categorical_accuracy'
        else:
            custom_loss      = 'binary_crossentropy'
            custom_accuracy  = 'accuracy'
        return custom_loss, custom_accuracy
    def model_test(self, model, test_partition):
        prediction = model.predict(test_partition)
        test_prediction = []
        for instance in prediction:
            custom_list = instance.tolist()
            test_prediction.append(custom_list.index(max(custom_list)))
        return test_prediction
    def model_test_report(self, model, test_partition, y_test, precision_digits):
        test_prediction = self.model_test(model, test_partition)
        report          = metrics.classification_report(y_test, test_prediction, digits=precision_digits)
        return report
    def gru_architecture(self, rnn_dropout_rate=0.15):
        instance_input       = Input(shape=(self.seq_len,), dtype='int32')
        embedded_sequences   = Embedding(self.vocabulary_sz, self.wv_dimension, weights = [self.embedding_matrix], 
                                    input_shape=(self.seq_len,), trainable=False)(instance_input)
        dropout              = Dropout(rnn_dropout_rate)(embedded_sequences)
        gru_uno              = Bidirectional(GRU(units=self.word_encoding_dim, return_sequences=True))(dropout)
        gru_dos              = Bidirectional(GRU(units=int(self.word_encoding_dim), return_sequences=False))(gru_uno)
        dropout              = Dropout(rnn_dropout_rate)(gru_dos)
        dense                = Dense(128, activation='relu')(dropout)
        dropout              = Dropout(rnn_dropout_rate)(dense)
        prediction           = Dense(self.n_classes, activation='softmax')(dropout)

        custom_loss, custom_accuracy = self.get_metrics()

        model                = Model(instance_input, prediction)

        model.compile(loss        = custom_loss,
                        optimizer     = Adam(lr = self.learning_rate),
                        metrics       = [custom_accuracy])
        return model
    def gru_att_architecture(self, rnn_dropout_rate=0.15):
        instance_input       = Input(shape=(self.seq_len,), dtype='int32')
        embedded_sequences   = Embedding(self.vocabulary_sz, self.wv_dimension, weights = [self.embedding_matrix], 
                                    input_shape=(self.seq_len,), trainable=False)(instance_input)
        dropout              = Dropout(rnn_dropout_rate)(embedded_sequences)
        gru_uno              = Bidirectional(GRU(units=self.word_encoding_dim, return_sequences=True))(dropout)
        gru_dos              = Bidirectional(GRU(units=int(self.word_encoding_dim), return_sequences=True))(gru_uno)
        attention            = Attention(self.seq_len)(gru_dos)
        dropout              = Dropout(rnn_dropout_rate)(attention)
        dense                = Dense(128, activation='relu')(dropout)
        dropout              = Dropout(rnn_dropout_rate)(dense)
        prediction           = Dense(self.n_classes, activation='softmax')(dropout)

        custom_loss, custom_accuracy = self.get_metrics()

        model                = Model(instance_input, prediction)

        model.compile(loss        = custom_loss,
                        optimizer     = Adam(lr = self.learning_rate),
                        metrics       = [custom_accuracy])
        return model


class validation_metrics(Callback):
    def __init__(self):
        super(validation_metrics, self).__init__()
    def on_train_begin(self, logs={}):
        self.f1_macro 		= []
        self.patience 		= 5
        self.max_f1_macro 	= 0.0
        self.negative_count = 0
    def on_epoch_end(self, epoch, logs={}):
        if(len(self.validation_data) == 6):
            val_predict = (np.asarray(self.model.predict([self.validation_data[0],self.validation_data[1],self.validation_data[2]]))).round()
            val_targ = self.validation_data[3]
        else:
            val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
            val_targ = self.validation_data[1]
        score = metrics.f1_score(val_targ, val_predict, average='macro')
        self.f1_macro.append(score)
        if self.max_f1_macro < score:
            print("THE MACRO AVG F1 SCORE IMPROVED FROM: " + str(self.max_f1_macro) + " TO: " + str(score))
            self.max_f1_macro = score
            self.negative_count = 0
            self.model.save('trained models/current_model.hdf5')
        else:
            print("THE MACRO AVG F1 SCORE DID'T IMPROVE: " + str(self.max_f1_macro))
            self.negative_count += 1
        if self.negative_count == self.patience:
            self.model.stop_training = True

In [None]:
#Create the deep neural network instance
dnn_handler = deep_neural_network_handler(seq_len, vocabulary_sz, wv_dimension, embedding_matrix, learning_rate, word_encoding_dim, n_classes)

In [None]:
#Create GRU DNN
gru_model = dnn_handler.gru_architecture(0.15)
gru_model.summary()


#Train GRU DNN model
c_metrics = validation_metrics()

history = gru_model.fit(x                = x_train,
                        y                = y_train,
                        epochs           = 25,
                        batch_size       = 32,
                        verbose          = 1,
                        validation_split = 0.15,
                        callbacks        = [c_metrics])

#load trained weights
gru_model.load_weights('trained models/current_model.hdf5')

#test GRU NN model
report = dnn_handler.model_test_report(gru_model, x_test, dataset[1]['label'], 5)
print(report)

In [None]:
#Create GRU+ATT DNN
gru_model = dnn_handler.gru_att_architecture(0.15)
gru_model.summary()


#Train GRU DNN model
c_metrics = validation_metrics()

history = gru_model.fit(x                = x_train,
                        y                = y_train,
                        epochs           = 25,
                        batch_size       = 32,
                        verbose          = 1,
                        validation_split = 0.10,
                        callbacks        = [c_metrics])

#load trained weights
gru_model.load_weights('trained models/current_model.hdf5')

#test GRU NN model
report = dnn_handler.model_test_report(gru_model, x_test, dataset[1]['label'], 5)
print(report)