In [50]:
import warnings
import nltk
from sklearn.metrics import f1_score, accuracy_score, hamming_loss, make_scorer, fbeta_score, multilabel_confusion_matrix,\
    average_precision_score, precision_score, recall_score
import numpy as np
import time
from utilities.preprocess import Preproccesor
from utilities.attention_layer import Attention
from utilities.helping_functions import create_embedding_matrix
from keras.preprocessing.text import Tokenizer
from keras.layers.embeddings import Embedding
from keras.models import Sequential, clone_model, model_from_json
from keras.optimizers import Adam
from keras import Input, Model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.engine import Layer
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D, Bidirectional, Dense, LSTM, Conv1D, MaxPooling1D, Dropout, concatenate, Flatten, add, Conv2D
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
pd.set_option('max_colwidth', 400)


def average_precision_wrapper(y, y_pred, view):
    """Compute sklearn's average precision for sparse or dense predictions.

    Args:
        y: Ground-truth labels, forwarded unchanged to ``average_precision_score``.
        y_pred: Predicted scores. May be a sparse matrix (anything exposing
            ``toarray()``) or an already-dense array-like.
        view: Averaging mode, forwarded as the ``average`` argument
            (e.g. ``'macro'`` or ``'micro'``).

    Returns:
        Whatever ``average_precision_score`` returns for the given inputs.
    """
    # Densify only when the input actually is sparse; dense arrays have no
    # ``toarray`` and previously made this wrapper raise AttributeError.
    dense_pred = y_pred.toarray() if hasattr(y_pred, "toarray") else y_pred
    return average_precision_score(y, dense_pred, average=view)

In [51]:
# Scorer objects built from sklearn metrics.
# Hamming loss is an error measure, so it is registered as lower-is-better.
hamm_scorer = make_scorer(
    hamming_loss,
    greater_is_better=False,
)
# F-beta scorer with beta=2.
# NOTE(review): fbeta_score is left on its default `average` setting here —
# confirm that this is what is wanted for the multi-label targets in this notebook.
ftwo_scorer = make_scorer(
    fbeta_score,
    beta=2,
)

In [52]:
# Download the NLTK 'wordnet' and 'stopwords' packages.
# Presumably these are required by the project's preprocessing utilities — verify.
nltk.download('wordnet')
nltk.download('stopwords') 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
# Load the data set through the project's Preproccesor helper.
# Per the original author's note: `yt` holds the continuous label values and `y` the binary ones.
# The meaning of the two boolean flags is defined in utilities.preprocess (not shown here) — TODO confirm.
X, yt, y = Preproccesor.load_multi_label_data(True, False)
# Names of the eight labels — presumably in the same order as the columns of `y`; verify against the loader.
label_names = ["violence","directed_vs_generalized","gender","race","national_origin","disability","religion","sexual_orientation"]

In [None]:
import zipfile
# Download the FastText crawl-300d-2M vector archive (a ~1.4 GB zip per the log below)
# into the current working directory.
!wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip'

--2021-04-13 15:40:47--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2021-04-13 15:42:52 (11.7 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]



In [None]:
# Download the GloVe 42B 300-d vector archive (a ~1.7 GB zip per the log below)
# into the current working directory.
!wget 'http://nlp.stanford.edu/data/glove.42B.300d.zip'

--2021-04-13 15:42:52--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2021-04-13 15:42:52--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2021-04-13 15:42:52--  http://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]
Sav

In [None]:
# Unpack both downloaded embedding archives into the current working directory,
# printing each archive's member list once it has been extracted.
for archive_path in ("/content/crawl-300d-2M.vec.zip", "/content/glove.42B.300d.zip"):
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall()
        print(zip_ref.filelist)

# Drop the helper names so they do not linger in the notebook namespace.
del zip_ref, archive_path

[<ZipInfo filename='crawl-300d-2M.vec' compress_type=deflate filemode='-rw-r--r--' file_size=4514687127 compress_size=1523784963>]
[<ZipInfo filename='glove.42B.300d.txt' compress_type=deflate filemode='-rw-rw-r--' file_size=5025028820 compress_size=1877800207>]


In [None]:
# Delete the two zip archives now that their contents have been extracted.
!rm '/content/crawl-300d-2M.vec.zip'
!rm '/content/glove.42B.300d.zip'

In [40]:
# Locations of the pretrained word-vector files.
# NOTE(review): the extraction cell unpacks into the current working directory, while these
# paths point inside /content/embeddings — confirm the files are moved there before use.
embedding_path1 = "/content/embeddings/crawl-300d-2M.vec" #FastText
embedding_path2 = "/content/embeddings/glove.42B.300d.txt" #Glove 300d
# NOTE(review): embed_size is reassigned to 150 in the cross-validation cell further down.
embed_size = 300

In [41]:
# Shuffled, seeded 10-fold StratifiedKFold splitter.
# NOTE(review): the cross-validation cell further down builds its own MultilabelStratifiedKFold
# and does not appear to use `folds` or `n_fold` — confirm whether this cell is still needed.
n_fold = 10
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=7)

In [42]:
def my_hamming_loss(y_true, y_pred):
    """Per-sample Hamming loss for multi-label outputs, built from Keras backend ops.

    The previous version referenced an undefined ``diff`` (NameError), called
    sklearn's ``hamming_loss`` on backend tensors and discarded the result, and
    printed its arguments. This version computes the quantity directly with
    backend operations so it can run inside a Keras graph.

    Args:
        y_true: Tensor of binary (0/1) ground-truth labels, labels on the last axis.
        y_pred: Tensor of predicted scores with the same shape; a score of 0.5 or
            more counts as a positive prediction.

    Returns:
        Tensor with the last axis reduced: the fraction of labels per sample whose
        thresholded prediction differs from the ground truth.

    Note:
        The 0.5 threshold is not differentiable, so this is meant to be used as a
        monitoring metric rather than as a training objective.
    """
    y_true = K.cast(y_true, dtype='float32')
    y_pred = K.cast(y_pred, dtype='float32')
    # Binarise the scores with the same >= 0.5 rule the evaluation loop uses.
    y_hat = K.cast(K.greater_equal(y_pred, 0.5), dtype='float32')
    # 1.0 where prediction and truth disagree, 0.0 where they agree.
    diff = K.cast(K.not_equal(y_true, y_hat), dtype='float32')

    return K.mean(diff, axis=-1)

In [44]:
# Binary Relevance
def build_model1(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a stacked-BiLSTM text classifier with eight sigmoid outputs.

    Architecture: frozen Embedding -> SpatialDropout1D(0.4) -> two
    Bidirectional LSTM(150) layers -> concatenation of Attention and global
    max pooling -> Dense(1024) -> Dropout -> Dense(512) -> Dropout ->
    Dense(128) -> Dense(8, sigmoid).

    Args:
        X_train, y_train: Training inputs and targets passed to ``fit``.
        X_valid, y_valid: Validation data used for early stopping and checkpointing.
        max_len: Input sequence length.
        max_features: Vocabulary size; the Embedding has ``max_features + 1`` rows.
        embed_size: The Embedding output dimension is ``embed_size * 2``.
        embedding_matrix: Initial (frozen) Embedding weights; assumed to have shape
            ``(max_features + 1, embed_size * 2)`` — TODO confirm.
        lr, lr_d: Learning rate and decay handed to ``Adam``.
        spatial_dr, dense_units, conv_size, dr: Accepted but not used by this
            function; the dropout rates and layer widths are hard-coded.
        patience: EarlyStopping patience on ``val_loss``.
        fold_id: Used to name the checkpoint file.

    Returns:
        A compiled model sharing the trained layers, with the weights of the
        best ``val_loss`` checkpoint loaded.

    Side effects:
        Writes ``best_model_fold_{fold_id}.hdf5`` and ``model1.png``.
    """
    from keras.utils import plot_model

    weights_path = f"best_model_fold_{fold_id}.hdf5"

    # ---- network graph ----
    text_input = Input(shape=(max_len,), name='main_input')
    embedded = Embedding(
        max_features + 1,
        embed_size*2,
        input_length=max_len,
        weights=[embedding_matrix],
        trainable=False,
    )(text_input)
    sequence = SpatialDropout1D(0.4)(embedded)
    for _ in range(2):
        sequence = Bidirectional(LSTM(150, return_sequences=True))(sequence)
    pooled = concatenate([
        Attention(max_len)(sequence),
        GlobalMaxPooling1D()(sequence),
    ])
    for units in (1024, 512):
        pooled = Dense(units, activation='selu')(pooled)
        pooled = Dropout(0.4)(pooled)
    features = Dense(128, activation='selu')(pooled)
    label_probs = Dense(8, activation='sigmoid')(features)

    def _compile(net):
        # Both model handles are compiled with identical settings.
        net.compile(loss="binary_crossentropy", optimizer=Adam(
            lr=lr, decay=lr_d), metrics=['binary_accuracy'])

    model = Model(inputs=[text_input], outputs=label_probs)
    _compile(model)
    plot_model(model, to_file='model1.png')

    # Second handle over the same layers; it is the one that receives the best weights.
    model2 = Model(inputs=[text_input], outputs=label_probs)

    # ---- training with early stopping, keeping the best val_loss checkpoint ----
    checkpoint_cb = ModelCheckpoint(
        weights_path, monitor="val_loss", verbose=1, save_best_only=True, mode="min")
    early_stop_cb = EarlyStopping(
        monitor="val_loss", mode="min", patience=patience)
    model.fit(
        X_train, y_train,
        batch_size=16,
        epochs=50,
        validation_data=(X_valid, y_valid),
        verbose=1,
        callbacks=[early_stop_cb, checkpoint_cb],
    )

    model2.load_weights(weights_path)
    _compile(model2)
    return model2

In [45]:
# Classifier Chains
def build_model2(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a stacked-BiLSTM text classifier whose head is a chain of eight links.

    The trunk is: frozen Embedding -> SpatialDropout1D(0.5) -> two
    Bidirectional LSTM(150) layers -> concatenation of Attention and global
    max pooling -> Dense(1024) -> Dropout -> Dense(512) -> Dropout -> Dense(128).
    On top of it, eight ``Dense(1, selu)`` links are chained: each link sees
    the 128-unit features plus the outputs of all earlier links. The eight
    link outputs are concatenated, passed through Dropout(0.5) and mapped to
    the final ``Dense(8, sigmoid)`` output.

    Args:
        X_train, y_train: Training inputs and targets passed to ``fit``.
        X_valid, y_valid: Validation data used for early stopping and checkpointing.
        max_len: Input sequence length.
        max_features: Vocabulary size; the Embedding has ``max_features + 1`` rows.
        embed_size: The Embedding output dimension is ``embed_size * 2``.
        embedding_matrix: Initial (frozen) Embedding weights; assumed to have shape
            ``(max_features + 1, embed_size * 2)`` — TODO confirm.
        lr, lr_d: Learning rate and decay handed to ``Adam``.
        spatial_dr, dense_units, conv_size, dr: Accepted but not used by this
            function; the dropout rates and layer widths are hard-coded.
        patience: EarlyStopping patience on ``val_loss``.
        fold_id: Used to name the checkpoint file.

    Returns:
        A compiled model sharing the trained layers, with the weights of the
        best ``val_loss`` checkpoint loaded.

    Side effects:
        Writes ``best_model_fold_{fold_id}.hdf5`` and ``model2.png``.
    """
    from keras.utils import plot_model

    n_labels = 8
    weights_path = f"best_model_fold_{fold_id}.hdf5"

    # ---- shared trunk ----
    text_input = Input(shape=(max_len,), name='main_input')
    embedded = Embedding(
        max_features + 1,
        embed_size*2,
        input_length=max_len,
        weights=[embedding_matrix],
        trainable=False,
    )(text_input)
    sequence = SpatialDropout1D(0.5)(embedded)
    for _ in range(2):
        sequence = Bidirectional(LSTM(150, return_sequences=True))(sequence)
    pooled = concatenate([
        Attention(max_len)(sequence),
        GlobalMaxPooling1D()(sequence),
    ])
    for units in (1024, 512):
        pooled = Dense(units, activation='selu')(pooled)
        pooled = Dropout(0.5)(pooled)
    chain_state = Dense(128, activation='selu')(pooled)

    # ---- chain: every link after the first also receives the previous link's output ----
    link_outputs = []
    for _ in range(n_labels):
        if link_outputs:
            chain_state = concatenate([chain_state, link_outputs[-1]])
        link_outputs.append(Dense(1, activation='selu')(chain_state))

    merged = concatenate(link_outputs)
    merged = Dropout(0.5)(merged)
    output_layer = Dense(n_labels, activation='sigmoid')(merged)

    def _compile(net):
        # Both model handles are compiled with identical settings.
        net.compile(loss="binary_crossentropy", optimizer=Adam(
            lr=lr, decay=lr_d), metrics=['binary_accuracy', 'categorical_accuracy'])

    model = Model(inputs=[text_input], outputs=output_layer)
    _compile(model)
    plot_model(model, to_file='model2.png')

    # Second handle over the same layers; it is the one that receives the best weights.
    model2 = Model(inputs=[text_input], outputs=output_layer)

    # ---- training with early stopping, keeping the best val_loss checkpoint ----
    checkpoint_cb = ModelCheckpoint(
        weights_path, monitor="val_loss", verbose=1, save_best_only=True, mode="min")
    early_stop_cb = EarlyStopping(
        monitor="val_loss", mode="min", patience=patience)
    model.fit(
        X_train, y_train,
        batch_size=32,
        epochs=50,
        validation_data=(X_valid, y_valid),
        verbose=1,
        callbacks=[early_stop_cb, checkpoint_cb],
    )

    model2.load_weights(weights_path)
    _compile(model2)
    return model2

In [46]:
# Install the package that provides the `iterstrat` module imported in the next cell.
# NOTE(review): the install is unpinned — consider pinning a version for reproducibility.
!pip install iterative-stratification

In [49]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# ---- Experiment configuration ----
max_features = 50000   # tokenizer vocabulary cap; the models use max_features + 1 embedding rows
max_len = 150          # sequences are padded/truncated to this length
embed_size = 150       # build_model2 uses embed_size * 2 as the Embedding output dimension
embma = 1              # forwarded to create_embedding_matrix — presumably selects the embedding set; verify
name = "Mixed"         # row label in the results file (truncated to 7 characters)
n_splits = 10

# Metrics collected per fold, in the column order used by the results file.
metric_names = [
    'test_F1_example',
    'test_F1_macro',
    'test_F1_micro',
    'test_precision_example',
    'test_precision_macro',
    'test_precision_micro',
    'test_recall_example',
    'test_recall_macro',
    'test_recall_micro',
    'test_average_precision_macro',
    'test_average_precision_micro',
    'test_Accuracy',
    'test_Hamm',
]
scores = {metric: [] for metric in metric_names}
cm = []
save_ys = []
save_yt = []

# shuffle=True is required for the seed to be accepted: scikit-learn's K-fold base
# class rejects a random_state when shuffling is disabled.
mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
fold_n = 0

for train_index, test_index in mskf.split(X, y):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]

    # Fit the vocabulary on the training split only, then encode and pad both splits.
    # NOTE(review): oov_token=True registers the boolean True as the OOV key in
    # word_index — confirm create_embedding_matrix copes with a non-string key.
    tk = Tokenizer(lower=True, filters='',
                   num_words=max_features, oov_token=True)
    tk.fit_on_texts(X_train)
    train_tokenized = tk.texts_to_sequences(X_train)
    valid_tokenized = tk.texts_to_sequences(X_valid)
    X_train = pad_sequences(train_tokenized, maxlen=max_len)
    X_valid = pad_sequences(valid_tokenized, maxlen=max_len)
    embedding_matrix = create_embedding_matrix(embma, tk, max_features)

    model = build_model2(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix,lr=1e-3, lr_d=0, spatial_dr=0.1, dense_units=128, conv_size=128, dr=0.1, patience=10, fold_id=fold_n)

    fold_n = fold_n + 1

    # Predicted probabilities, and their 0/1 version thresholded at 0.5.
    # Vectorised so the loop no longer rebinds the global `yt` loaded earlier.
    yT = model.predict(X_valid)
    y_preds = (np.asarray(yT) >= 0.5).astype(int)

    # Threshold-based metrics use the binarised predictions.
    for suffix, avg in (('example', 'samples'), ('macro', 'macro'), ('micro', 'micro')):
        scores[f'test_F1_{suffix}'].append(
            f1_score(y_valid, y_preds, average=avg))
        scores[f'test_precision_{suffix}'].append(
            precision_score(y_valid, y_preds, average=avg))
        scores[f'test_recall_{suffix}'].append(
            recall_score(y_valid, y_preds, average=avg))

    # Average precision is a ranking metric: it must see the raw probabilities,
    # not the thresholded 0/1 labels, otherwise the ranking information is lost.
    for avg in ('macro', 'micro'):
        scores[f'test_average_precision_{avg}'].append(
            average_precision_score(y_valid, yT, average=avg))

    scores['test_Accuracy'].append(accuracy_score(y_valid, y_preds))
    scores['test_Hamm'].append(hamming_loss(y_valid, y_preds))

# Mean of every metric over the folds that actually ran, formatted to 4 decimals.
fold_means = ['%.4f' % (sum(scores[metric]) / len(scores[metric]))
              for metric in metric_names]

# Append one row to the results file; the context manager closes it even on error.
with open("../results/setZ.txt", "a+") as f:
    f.write("{:<7} | {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} \n".format(
        str(name)[:7], *fold_means))

