<a href="https://colab.research.google.com/github/Hebruwu/NLP_2023/blob/main/Part2_SI/Semeval2020_11_SI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SemEval-2020 Task 11 (Task SI)
## Author: Ritvik Prabhu

Below, we install the libraries needed to train the model.

In [None]:
# !pip install transformers
# !pip install pytorch-crf
# !pip install seqeval[gpu]

Follow the instructions here: https://propaganda.qcri.org/semeval2020-task11/index.html

Download the datasets as a zip file and store it in the notebook environment

In [None]:
# !unzip datasets.zip

Import the necessary libraries

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import Counter
import pandas as pd
import re
import nltk
from sklearn.metrics import classification_report, f1_score


Ensure that the GPU is available to train on

In [None]:
# List the GPU devices TensorFlow reports, then print how many there are.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


Num GPUs Available:  1


The dataset is large, so the preprocessing step takes a very long time. To work around this, we preprocess the data once and store the result in a file that the training cells read. For that reason, the next few cells are commented out: they belong to that one-off preprocessing step. We include the preprocessed data in the repo to avoid any delays during evaluation.

Below, we preprocess the data by converting it to lower case and tagging each word of each article using the BIO tagging scheme.

In [None]:
# # Function to preprocess text
# def preprocess_text(text):
#     text = text.lower()
#     text = re.sub(r'[^a-z0-9\s]', '', text)
#     return text


# Builds, for every article of `set_type`, the list of cleaned words and a
# parallel list of BIO tags ('O', 'B-PROP', 'I-PROP'); returns both in a dict.
#
# NOTE(review): `start`/`end` are read from the labels file, while the position
# compared against them, len(' '.join(words[:i])), is measured in the cleaned,
# single-space-joined text produced by preprocess_text(...).split(). Once
# characters have been removed these two coordinate systems presumably no
# longer agree -- confirm whether the label offsets refer to the raw article.
# NOTE(review): for i > 0, len(' '.join(words[:i])) is the index of the space
# that precedes word i, not of the word's first character.
# NOTE(review): the join is recomputed for every word of every label line,
# which is quadratic in the article length per label.
# def load_labeled_passages(set_type):
#     text = []
#     bio_tagged = []

#     train_files = os.listdir(f"datasets/{set_type}-articles/")
#     train_file_prefixes = [x.replace(".txt", "") for x in train_files]
#     del train_files

#     articles_with_labels = []
#     for prefix in train_file_prefixes:
#         article_file_name = f"datasets/{set_type}-articles/{prefix}.txt"
#         labels_file_name = f"datasets/{set_type}-labels-task-si/{prefix}.task-si.labels"

#         with open(article_file_name, "r") as article_file:
#             with open(labels_file_name, "r") as labels_file:
#                 article = article_file.read()
#                 words = preprocess_text(article).split()
#                 bio_text = ['O']*len(words)
#                 text.append(words)
#                 article_id_added = False
#                 for line in labels_file:
#                     id, start, end = line.split()
#                     start = int(start)
#                     end = int(end)
#                     for i, word in enumerate(words):
#                         # Check if the start of the word is within the start and end character offsets
#                         if start <= len(' '.join(words[:i])) < end:
#                             bio_text[i] = 'B-PROP' if len(' '.join(words[:i])) == start else 'I-PROP'
#                 bio_tagged.append(bio_text)
#     return {"text": text,
#             "bio_tagged": bio_tagged,}


We now load the data and preprocess them

In [None]:
# train_data = load_labeled_passages("train")
# test_data = load_labeled_passages("dev")
# df_train = pd.DataFrame(train_data)
# df_test = pd.DataFrame(test_data)
# df_train.head()

We create a pair of dictionaries that map each BIO tag to its integer id and each id back to its tag.

In [None]:
# Tag -> integer id used in the exported files and as the model's class index.
labels_to_ids = {"O": 0, "B-PROP": 1, "I-PROP": 2}
# Reverse map, derived from the forward map so the two cannot drift apart.
# Entries are inserted in ascending id order, so `list(ids_to_labels.values())`
# lists the tag names in the same order as the sorted class ids.
ids_to_labels = {
    tag_id: tag
    for tag, tag_id in sorted(labels_to_ids.items(), key=lambda item: item[1])
}

Below, we export the preprocessed data to a file so that it can be reloaded quickly later.

In [None]:
# def export_to_file(export_file_path, data):
#     with open(export_file_path, "w") as f:
#         for index, row in data.iterrows():
#             ner_tags = row["bio_tagged"]
#             tokens = row["text"]
#             ner_tags_ids = [labels_to_ids[tag] for tag in ner_tags]
#             if len(tokens) > 0:
#                 f.write(
#                     str(len(tokens))
#                     + "\t"
#                     + "\t".join(tokens)
#                     + "\t"
#                     + "\t".join(map(str, ner_tags_ids))
#                     + "\n"
#                 )


# export_to_file("./data/train.txt", df_train)
# export_to_file("./data/val.txt", df_test)

Below, we build our own vocabulary from the training data so that the text is tokenized consistently. We limit the vocabulary to 20K unique tokens.

In [None]:
# all_tokens = sum(df_train["text"].to_list(), [])
# all_tokens_array = np.array(list(map(str.lower, all_tokens)))
# counter = Counter(all_tokens_array)

# Number of distinct BIO tags; passed to NERModel as the width of its output layer.
num_tags = len(ids_to_labels)
# Cap on the number of distinct tokens; NERModel is later built with vocab_size + 1.
vocab_size = 20000

# We only take (vocab_size - 2) most commons words from the training data since
# the `StringLookup` class uses 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token
# vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# import pickle

# file_path = './data/vocabulary.pkl'

# # Save the vocabulary list to a file using pickle
# with open(file_path, 'wb') as file:
#     pickle.dump(vocabulary, file)


We import the vocabulary data and preprocessed data that we just saved

In [None]:
import pickle

# Pickled vocabulary list produced by the (commented-out) preprocessing cell.
# NOTE(review): that cell writes to './data/vocabulary.pkl' -- confirm the file
# has been copied next to the notebook before running this cell.
file_path = 'vocabulary.pkl'

# Only unpickle files you created yourself: pickle can run code while loading.
with open(file_path, 'rb') as vocabulary_file:
    loaded_vocabulary = pickle.load(vocabulary_file)

# Layer that turns token strings into integer ids using the loaded vocabulary.
lookup_layer = layers.StringLookup(vocabulary=loaded_vocabulary)

In [None]:
# Read the preprocessed files line by line. Each line is one tab-separated
# record (token count, tokens, tag ids) that map_record_to_training_data parses.
train_data = tf.data.TextLineDataset("train.txt")
val_data = tf.data.TextLineDataset("val.txt")

This is an example of one of the lines of the preprocessed data

In [None]:
# Materialise the first raw line of the training file and show it.
first_record = list(train_data.take(1).as_numpy_iterator())
print(first_record)

[b'287\tnimesh\tpatel\tstandup\troutine\tcut\tshort\tdue\tto\tuncomfortable\tjokes\tnimesh\tpatel\ta\tcomedian\tknown\tfor\tbeing\tthe\tfirst\tindianamerican\twriter\tfor\tsaturday\tnight\tlive\thad\this\tstandup\troutine\tat\tcultureshock\tcut\tshort\tearlier\ttonight\tdue\tto\tuncomfortable\tjokes\tcultureshock\tan\tevent\thosted\tby\tthe\tasian\tamerican\talliance\tis\ta\tcharity\tperformance\tshowcase\tthat\taims\tto\tprovide\ta\tspace\tto\tcelebrate\tasian\tamerican\texpression\tpatel\twas\tone\tof\tthe\tmain\tevents\tpromoted\tbeforehand\thowever\this\tjokes\tquickly\tprogressed\tto\tuncomfortable\tterritory\tincluding\tone\tabout\ta\tgay\tblack\tman\twho\tlives\tin\this\tneighborhood\tand\thow\tit\tmade\tme\trealize\tthat\tbeing\tgay\tis\tdefinitely\tnot\ta\tchoice\tbecause\tno\tone\twants\tto\tbe\tgay\tand\tblack\tthe\ttension\tin\tthe\troom\tincreased\tas\tpatel\ttold\tmore\tjokes\tin\tthis\tvein\tuntil\torganizers\tof\tthe\tevent\twent\tup\ton\tstage\tto\tstop\thim\tciting\ta

Below, we define the Transformer block, the token-and-position embedding layer, and the custom NER model.

In [None]:
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    The output of each of the two sub-layers goes through dropout, is added
    back to that sub-layer's input, and is then layer-normalised.

    Args:
        embed_dim: Passed as ``key_dim`` to the attention layer and used as the
            output width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden ReLU layer of the feed-forward network.
        rate: Dropout rate used after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        hidden_layer = layers.Dense(ff_dim, activation="relu")
        output_layer = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden_layer, output_layer])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Run the block on ``inputs``; ``training`` switches the dropouts on."""
        # Attention sub-layer: the sequence attends to itself.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        # Feed-forward sub-layer applied to the normalised residual.
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Adds a learned position embedding to a learned token embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        """Embed ``inputs``; positions are counted along its last axis."""
        # assumes inputs holds integer token ids shaped (batch, seq) -- TODO confirm
        sequence_length = tf.shape(inputs)[-1]
        position_vectors = self.pos_emb(
            tf.range(start=0, limit=sequence_length, delta=1)
        )
        token_vectors = self.token_emb(inputs)
        return token_vectors + position_vectors


In [None]:
class NERModel(keras.Model):
    """Token tagger: embedding -> one transformer block -> dense -> softmax.

    The model returns, for every input position, a softmax distribution over
    ``num_tags`` classes. Because the final layer already applies softmax, a
    loss attached to this model must treat its output as probabilities, not
    logits.

    Args:
        num_tags: Number of output classes per token.
        vocab_size: Number of rows in the token-embedding table.
        maxlen: Number of rows in the position-embedding table.
            NOTE(review): the default is ``len(loaded_vocabulary)``, evaluated
            once when this class is defined. That is a vocabulary size rather
            than a sequence length -- confirm this is intended.
        embed_dim: Embedding width.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Width of the hidden dense layers.
    """

    def __init__(
        self, num_tags, vocab_size, maxlen=len(loaded_vocabulary), embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        """Return per-token class probabilities for a batch of token ids."""
        x = self.embedding_layer(inputs)
        # Forward `training` explicitly, as is done for the dropout layers
        # below, so the block's own dropouts visibly follow the same mode.
        x = self.transformer_block(x, training=training)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x


We convert the text to a format suitable to pass to the model

In [None]:
def map_record_to_training_data(record):
    """Split one preprocessed line into its tokens and integer tag ids.

    The line is tab-separated: the first field is the token count, followed by
    that many tokens, followed by the tag ids.

    Args:
        record: Scalar string tensor holding one line of the data file.

    Returns:
        A ``(tokens, tags)`` pair: a string tensor of tokens and an int64
        tensor of tag ids.
    """
    fields = tf.strings.split(record, sep="\t")
    token_count = tf.strings.to_number(fields[0], out_type=tf.int32)
    # Tags start right after the count field and the `token_count` tokens.
    first_tag = token_count + 1
    tokens = fields[1:first_tag]
    tags = tf.strings.to_number(fields[first_tag:], out_type=tf.int64)
    return tokens, tags


def lowercase_and_convert_to_ids(tokens):
    """Lower-case ``tokens`` and map them to ids with the global ``lookup_layer``."""
    return lookup_layer(tf.strings.lower(tokens))


# Records differ in length, which is why batches are formed with
# `padded_batch` instead of a plain `batch`.
batch_size = 1


def _prepare_dataset(lines):
    """Parse raw lines, convert tokens to ids and group them into padded batches."""
    parsed = lines.map(map_record_to_training_data)
    encoded = parsed.map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    return encoded.padded_batch(batch_size)


train_dataset = _prepare_dataset(train_data)
val_dataset = _prepare_dataset(val_data)

ner_model = NERModel(num_tags, vocab_size + 1, embed_dim=32, num_heads=4, ff_dim=64)

We compile and train the model below

In [None]:
optimizer = keras.optimizers.Adam(learning_rate=1e-5, clipvalue=1.0)
# NERModel's final Dense layer uses a softmax activation, so the loss receives
# probabilities, not logits. `from_logits=True` would apply softmax a second
# time (Keras warns about exactly this), hence `from_logits=False`.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

ner_model.compile(optimizer=optimizer, loss=loss)
ner_model.fit(train_dataset, epochs=20)

Epoch 1/20


  output, from_logits = _get_logits(


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7bcfc165e830>

We test the model's performance on the validation data

In [None]:
# Make predictions on the validation dataset
val_predictions = ner_model.predict(val_dataset)

# Flatten predictions (arg-max class per token) and true labels to 1-D arrays
flat_val_predictions = np.concatenate([np.argmax(pred, axis=-1).flatten() for pred in val_predictions])
flat_val_labels = np.concatenate([y.numpy().flatten() for _, y in val_dataset])

# Every label needs exactly one prediction; fail early with a clear message.
assert flat_val_predictions.shape == flat_val_labels.shape, (
    f"prediction/label length mismatch: "
    f"{flat_val_predictions.shape} vs {flat_val_labels.shape}"
)

# Pass the class ids explicitly and build the names from those same ids, so each
# row of the report is guaranteed to carry the right tag name regardless of the
# insertion order of `ids_to_labels`.
label_ids = sorted(ids_to_labels)
class_report = classification_report(
    flat_val_labels,
    flat_val_predictions,
    labels=label_ids,
    target_names=[ids_to_labels[label_id] for label_id in label_ids],
    output_dict=True,
    zero_division=0,
)

# Print the macro-averaged metrics from the report
category = 'macro avg'
metrics = class_report[category]
print(f'Category: {category}')
print(f'Precision: {metrics["precision"]:.4f}')
print(f'Recall: {metrics["recall"]:.4f}')
print(f'F1 Score: {metrics["f1-score"]:.4f}')
print(f'Support: {metrics["support"]}')
print('-' * 40)

Category: macro avg
Precision: 0.2997
Recall: 0.3333
F1 Score: 0.3156
Support: 57395
----------------------------------------


We save the model below and store it as a zip file for easy transportation of the model

In [None]:
# Persist the trained model under `model_save_path`.
model_save_path = 'SI_task_model'
ner_model.save(model_save_path)

# Persist the lookup layer's vocabulary next to it as a pickle file.
lookup_layer_save_path = 'lookup_layer'
with open(lookup_layer_save_path, 'wb') as vocabulary_out:
    saved_vocabulary = lookup_layer.get_vocabulary()
    pickle.dump(saved_vocabulary, vocabulary_out)

In [None]:
import os
import zipfile


def zip_folder(folder_path, zip_path):
    """Archive every file under ``folder_path`` into the zip file ``zip_path``.

    Files are found recursively, stored under paths relative to
    ``folder_path``, and compressed with DEFLATE.

    Args:
        folder_path: Directory whose files are archived.
        zip_path: Archive to create; an existing file is overwritten.

    Returns:
        The number of files written to the archive.

    Raises:
        FileNotFoundError: If ``folder_path`` is not an existing directory.
    """
    # os.walk() yields nothing for a missing directory, which would otherwise
    # produce an empty archive without any error.
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f'Folder to zip not found: "{folder_path}"')

    files_written = 0
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_path))
                files_written += 1
    return files_written


folder_to_zip = 'SI_task_model'
zip_file_name = 'SI_task_model_files.zip'
if os.path.isdir(folder_to_zip):
    zip_folder(folder_to_zip, zip_file_name)
    print(f'The folder "{folder_to_zip}" has been zipped into "{zip_file_name}".')
else:
    # Report the problem instead of claiming success for an empty archive.
    print(f'The folder "{folder_to_zip}" does not exist; nothing was zipped.')


The folder "SI_task_model" has been zipped into "SI_task_model_files.zip".
