# Variables to set

`standard_load_dir`: Relative save folder of the standard setup model

`main_load_dir`: Relative save folder of the main model in the hierarchical setup

`sub_load_dir`: Relative save folder of the sub model in the hierarchical setup

In [24]:
# Relative folders of the saved checkpoints read by the model-loading cells further down.
# Standard setup: a single classifier loaded with 3 labels.
standard_load_dir = './Saved Models/RoBERTa-standard'
# Hierarchical setup, main model: loaded with 2 labels; its prediction decides whether the sub model runs.
main_load_dir = './Saved Models/RoBERTa-main'
# Hierarchical setup, sub model: loaded with 2 labels; only evaluated when the main model predicts class 1.
sub_load_dir = './Saved Models/RoBERTa-sub'

# Data Preprocessing (Data Filtering, Token Classification, Cleaning, Normalization)

## Import required libraries, download necessary data, open and save additional files

In [2]:
import re
import contractions
import simplemma
import emoji
import pandas as pd
import json
import os
import csv
import unicodedata
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def _read_lines(path):
    """Return the non-blank lines of the text file at ``path``.

    The file is opened with a context manager so the handle is released even
    if reading fails. Lines are split on '\n' and returned unmodified; lines
    that are empty or whitespace-only are dropped, because an empty keyword
    would later be compiled into a regex that matches at every position.
    """
    with open(path, 'r') as handle:
        return [line for line in handle.read().split('\n') if line.strip()]


# Tagalog stop words dictionary
tl_stopwords_list = _read_lines('./preprocessing_files/stopwords-tl.txt')

# Keyword lists, one file per label category
candidate_keywords = _read_lines('./preprocessing_files/candidate_keywords.txt')
party_keywords = _read_lines('./preprocessing_files/party_keywords.txt')
supporter_keywords = _read_lines('./preprocessing_files/supporter_keywords.txt')
related_keywords = _read_lines('./preprocessing_files/related_keywords.txt')

# Phrases that contain a keyword but must NOT be labeled
not_keywords = _read_lines('./preprocessing_files/not_keywords.txt')

# Candidate keywords first, then party keywords: label_keywords() decides the
# label from a keyword's index in this list, so the order matters.
keywords = [keyword.lower().strip() for keyword in candidate_keywords + party_keywords]
keywords

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['manny',
 'pacquiao',
 'mannypacquiao',
 'bongbong',
 'marcos',
 'bongbongmarcos',
 'faisal',
 'mangondato',
 'faisalmangondato',
 'panfilo',
 'lacson',
 'panfilolacson',
 'norberto',
 'gonzales',
 'norbertogonzales',
 'leody',
 'de guzman',
 'leodydeguzman',
 'guzman',
 'ernesto',
 'abella',
 'ernestoabella',
 'isko',
 'moreno',
 'iskomoreno',
 'leni',
 'robredo',
 'lenirobredo',
 'jose',
 'montemayor',
 'josemontemayor',
 'yorme',
 'lbm',
 'vyvym',
 'bybym',
 'babym',
 'veeveem',
 'beebeem',
 '88m',
 'partido lakas ng masa',
 'plm',
 'partido demokratiko sosyalista ng pilipinas',
 'pdsp',
 'katipunan ng kamalayang kayumanggi',
 'partido federal ng pilipinas',
 'pfp',
 'dpp',
 'democratic party of the philippines',
 'aksyon demokratiko',
 'aksyon',
 'promdi',
 'unity',
 'uniteam',
 'rosas',
 'pinklawan']

## The following are functions required for Data Preprocessing

In [3]:
# fix common tagalog contractions
def decontracted(phrase):
    """Expand common Tagalog contractions and shortened spellings in ``phrase``.

    The substitutions are applied in order; the more specific apostrophe forms
    (e.g. "'to") run before the shorter ones (e.g. "'t") so the latter do not
    pre-empt them.

    Args:
        phrase: Text to normalize.

    Returns:
        The text with the listed contractions expanded.
    """
    substitutions = (
        # specific (seen frequently in tweet comments)
        (r"ika'y", "ikaw ay"),
        (r"ba't", "bakit"),
        # Applied inside words as well (e.g. "kanya" -> "kaniya").
        (r"nya", "niya"),
        # Whole word only: a bare substring match would turn the already
        # expanded "ninyo" into "nininyo" and "inyo" into "ininyo".
        (r"\bnyo\b", "ninyo"),
        (r"kundi", "kung hindi"),
        # Whole word only, and the neighbouring spaces are kept; the previous
        # pattern " musta " consumed both spaces and glued the words together.
        (r"\bmusta\b", "kumusta"),
        # general (starts with ') [https://en.wiktionary.org/wiki/Category:Tagalog_contractions]
        (r"'yo", " iyo"),
        (r"'to", " ito"),
        (r"'di", " hindi"),
        (r"'no", " ano"),
        (r"'t", " at"),
        (r"'y", " ay"),
        (r"'ng", " ang"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [4]:
# collapses runs of the same consecutive emoji into one
def remove_duplicate_emoji(orig_str):
    """Collapse each run of an identical, directly repeated emoji to one emoji.

    Characters are examined one at a time and treated as emoji when they are
    keys of ``emoji.EMOJI_DATA``. Any non-emoji character ends the current
    run, so the same emoji appearing again later in the text is kept.
    (Previously the last-seen emoji was never reset, which also deleted
    non-consecutive repeats.)

    Args:
        orig_str: Text that may contain emoji characters.

    Returns:
        The text with consecutive duplicate emoji removed.
    """
    prev_emoji = None
    kept_chars = []
    for c in orig_str:
        if c in emoji.EMOJI_DATA:
            if prev_emoji == c:
                continue
            prev_emoji = c
        else:
            # A non-emoji character breaks the run.
            prev_emoji = None
        kept_chars.append(c)
    return "".join(kept_chars)

In [5]:
# strips everything but the uppercase label from links/mentions/hashtags that contain one
def remove_chars_keep_uppercase(string):
    """Reduce links, mentions and hashtags containing uppercase letters to those letters.

    Every ``http(s)://``, ``@`` or ``#`` token that contains at least one
    uppercase letter is located; inside each such token every run of
    non-uppercase characters is replaced by a single space, and all
    occurrences of the token in the text are replaced by the result.
    """
    token_regex = re.compile(r'(https?://\w*[A-Z]\w*|@\w*[A-Z]\w*|#\w*[A-Z]\w*)')
    non_upper_run = re.compile('[^A-Z]+')
    for token in token_regex.findall(string):
        string = string.replace(token, non_upper_run.sub(' ', token))
    return string

In [6]:
# collapses a label repeated back-to-back into a single label
def remove_duplicate_uppercase(text):
    """Collapse whitespace-separated repeats of the same all-uppercase word into one."""
    repeated_label = re.compile(r'\b([A-Z]+)(?:\s+\1)+\b')
    return repeated_label.sub(lambda hit: hit.group(1), text)

In [7]:
# puts a space in front of every label so adjacent labels do not stick together
def add_whitespace_to_keywords(text):
    """Insert a single space before each occurrence of a label word.

    The labels handled are CANDIDATE, RELATED, SUPPORTER and PARTY. The space
    is added unconditionally, even when one is already present.
    """
    label_names = ('CANDIDATE', 'RELATED', 'SUPPORTER', 'PARTY')
    label_regex = re.compile('|'.join(label_names))
    return label_regex.sub(lambda hit: ' ' + hit.group(0), text)

In [8]:
# Removes the run of characters glued directly after a capital letter
def remove_extra_trailing_chars(text):
    """Delete the characters that directly follow an uppercase letter.

    The deleted run consists of anything that is not an uppercase letter, a
    digit or whitespace (so lowercase letters and punctuation go; digits and
    spaces stop the run and are kept).
    """
    return re.sub(r'(?<=[A-Z])[^A-Z\d\s]+', '', text)

In [9]:
# label the keywords based on what category it belongs to
def label_keywords(comment):
    """Replace every known keyword found in ``comment`` with its category label.

    Reads the module-level lists ``keywords``, ``not_keywords``,
    ``candidate_keywords``, ``party_keywords`` and ``supporter_keywords``.
    The label is chosen from the keyword's position in ``keywords``, so that
    list must be the concatenation candidate + party + supporter + related,
    in that order.

    Args:
        comment: Text to label.

    Returns:
        The text with keyword matches replaced by CANDIDATE, PARTY, SUPPORTER
        or RELATED.
    """
    for index, keyword in enumerate(keywords):
        # The keyword 'ong' is only matched when preceded by a space
        # (presumably to avoid hits inside longer words - confirm).
        if keyword == 'ong':
            keyword = ' ong'
        isKeyword = True
        # Keyword is matched literally (escaped) and case-insensitively.
        compiled = re.compile(re.escape(keyword), re.IGNORECASE)
        # Skip this keyword when the comment contains an exclusion phrase that
        # itself contains the keyword. Both checks are case-sensitive.
        for not_keyword in not_keywords:
            if keyword in not_keyword and not_keyword in comment:
                isKeyword = False
                break
        # label based on order of priority (position of the keyword in `keywords`)
        if isKeyword:
            if index < len(candidate_keywords): 
                comment = compiled.sub("CANDIDATE", comment)
                # NOTE(review): `[CANDIDATE]` and `[es]` are character classes, so this
                # matches any one of the letters C/A/N/D/I/T/E followed by 'e' or 's',
                # or an 's' at the very end of the text. It may have been meant to
                # detect the literal label followed by a plural suffix - confirm intent.
                if re.search(r'[CANDIDATE][es]|[s]$', comment):
                    # NOTE(review): this reuses the keyword pattern whose matches were
                    # just replaced above, so it will usually find nothing - verify.
                    comment = compiled.sub("RELATED", comment)
            elif index < (len(candidate_keywords) + len(party_keywords)):
                comment = compiled.sub("PARTY", comment)
            elif index < (len(candidate_keywords) + len(party_keywords) + len(supporter_keywords)):
                comment = compiled.sub("SUPPORTER", comment)
            else:
                # Everything past the supporter range is treated as a related keyword.
                comment = compiled.sub("RELATED", comment)
    return comment

In [10]:
# drops English or Tagalog stop words from a text
def remove_stopwords(text, language='english'):
    """Remove stop words of the given language from ``text``.

    Args:
        text: Whitespace-separated text.
        language: 'english' (NLTK stop word list) or 'tagalog' (the list
            loaded into ``tl_stopwords_list``).

    Returns:
        The remaining words joined by single spaces. For any other language a
        message is printed and ``text`` is returned unchanged.
    """
    if language not in ('english', 'tagalog'):
        print('Unsupported language')
        return text

    if language == 'english':
        stop_words = set(stopwords.words('english'))
    else:
        stop_words = tl_stopwords_list

    kept_words = []
    for word in text.split():
        # Comparison is case-insensitive; the original casing is kept in the output.
        if word.lower() in stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

In [11]:
# [corner case] separates labels from neighbouring text and textualized emojis from each other
def add_space(string):
    """Insert spaces at lowercase/digit/colon <-> uppercase boundaries and between '::'.

    A space is inserted wherever a lowercase letter, digit or ':' touches an
    uppercase letter (in either order); afterwards every '::' becomes ': :'
    so that back-to-back ':name:' emoji codes are separated.
    """
    boundary = r'(?<=[a-z0-9:])(?=[A-Z])|(?<=[A-Z])(?=[a-z0-9:])'
    spaced = re.sub(boundary, ' ', string)
    return spaced.replace('::', ': :')
# [corner case] used in fixer only
def remove_hashtag_word(string):
    """Drop the word attached to (or following) each HASHTAG token.

    'HASHTAG', optional whitespace and the next run of non-whitespace
    characters are replaced by just 'HASHTAG'.
    """
    return re.compile(r"HASHTAG\s*\S+").sub("HASHTAG", string)

### Phase 1 and Phase 2 Functions For Data Preprocessing Stages

In [12]:
# Punctuation deleted by the pipeline; ':' and '#' are not part of this set.
punctuations = '!"$%&\'()*+,-./;<=>?@[\\]^_`{|}~'
# Translation table mapping every punctuation character to the empty string.
transtab = str.maketrans({symbol: "" for symbol in punctuations})
# Shared WordNet lemmatizer instance used by preprocess_phase_1_and_2.
lemmatizer = WordNetLemmatizer()
# Used for the example app: canonicalizes and lowercases the text, labels keywords,
# tokenizes links/mentions/hashtags, expands contractions and removes stop words.
def preprocess_phase_1_and_2(current_text):
    """Run the phase 1 and phase 2 preprocessing steps on one raw text.

    The order of the steps matters: the text is lowercased first, so after
    keyword labeling the only uppercase words are the inserted labels
    (CANDIDATE, PARTY, ...) and tokens (LINK, MENTION, HASHTAG), which the
    later uppercase-based helpers rely on.

    Args:
        current_text: Raw input text.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    # Canonicalize: decompose (NFKD) and drop every non-ASCII character.
    # NOTE(review): this also drops emoji characters, so the two emoji steps
    # below cannot see any emoji - confirm this matches how the training data
    # was prepared before changing it.
    current_text = unicodedata.normalize('NFKD', current_text).encode('ascii', 'ignore').decode('utf-8')
    # Convert to lower case
    current_text = current_text.lower()
    # Label keywords (replace them with their uppercase category label)
    current_text = label_keywords(current_text)
    # Reduce links/mentions/hashtags that contain a label to just the label
    current_text = remove_chars_keep_uppercase(current_text)
    # Replace links with the LINK token
    current_text = re.sub(r'http\S+', 'LINK', current_text)
    # Replace mentions with the MENTION token (raw string: '\S' is not a valid str escape)
    current_text = re.sub(r'@\S+', 'MENTION', current_text)
    # Replace hashtags with the HASHTAG token
    current_text = re.sub(r'#\S+', 'HASHTAG', current_text)
    # Reduce duplicated emojis
    current_text = remove_duplicate_emoji(current_text)
    # Emojis to text
    current_text = emoji.demojize(current_text)
    # Add space around labels/tokens
    current_text = add_whitespace_to_keywords(current_text)
    current_text = add_space(current_text)
    # Remove extra spaces
    current_text = re.sub(r'\s+', ' ', current_text.strip())
    # Remove characters glued to the end of a label
    current_text = remove_extra_trailing_chars(current_text)
    # Remove duplicate consecutive labels
    current_text = remove_duplicate_uppercase(current_text)
    # Expand Tagalog contractions
    current_text = decontracted(current_text)
    # Expand English contractions
    current_text = contractions.fix(current_text)
    # Fix elongated words: 3+ repeats of a character are cut down to 2
    current_text = re.sub(r'(\w)(\1{2,})', r'\1\1', current_text)
    # Lemmatization for english only
    # NOTE(review): the whole sentence is passed as one string; the lemmatizer
    # presumably expects a single word, in which case multi-word text passes
    # through unchanged. Left as-is because changing it would alter the model
    # input - confirm against the training pipeline.
    current_text = lemmatizer.lemmatize(current_text)
    # Remove punctuations
    current_text = current_text.translate(transtab)
    # Remove English stop words
    current_text = remove_stopwords(current_text, language='english')
    # Remove Tagalog stop words
    current_text = remove_stopwords(current_text, language='tagalog')
    # Remove extra spaces
    current_text = " ".join(current_text.split())
    return current_text

In [13]:
#example
pre_text = "AkO NA nga BA sir BongBong?"
# Build the full keyword list from its four sources instead of appending to the
# existing `keywords`, so re-running this cell does not keep growing the list.
# Order is candidate, party, supporter, related: label_keywords() picks the
# label from a keyword's index. Only the first two groups are lowercased and
# stripped, as before.
keywords = (
    [keyword.lower().strip() for keyword in candidate_keywords + party_keywords]
    + supporter_keywords
    + related_keywords
)

#### The process below is for Phase 1 and 2 Data Preprocessing

In [14]:
# Run the example sentence through the preprocessing pipeline and display the cleaned text.
result = preprocess_phase_1_and_2(pre_text)

result

'nga ba sir CANDIDATE'

# RoBERTa Predictor

## Load saved models

In [15]:
# Core
import numpy as np
import pandas as pd
import os

# Dataset prepration
#from transformers import TFAutoModel, AutoModel
from transformers import AutoTokenizer, Trainer
from datasets import Dataset

# Model, hyperparameter search, evaluation
import torch
from transformers import BertPreTrainedModel, TrainingArguments
from transformers.models.roberta.modeling_roberta import (
    RobertaClassificationHead,
    RobertaConfig,
    RobertaModel,
)
from torch.nn import CrossEntropyLoss
import evaluate

from IPython.display import clear_output

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Prepare variables for indexing
# NOTE(review): `classes` is not referenced by the other cells shown here, and its order
# (Explicit, Implicit, Non-abusive) differs from the index-to-name mapping printed by the
# prediction loop (0 = Non-Abusive, 1 = Explicit, 2 = Implicit) - confirm which is correct.
classes = ["Explicit", "Implicit", "Non-abusive"]
# Column names that format_dataset() drops from a loaded DataFrame.
labels = ["E1", "E2", "E3", "I1", "I2", "I3", "I4", "I5", "I6", "I7"]

In [17]:
def load_csv(dataset, use_stopwords):
    """Load one dataset split from ``./Data/<variant>/<dataset>.csv``.

    Args:
        dataset: Split name; one of "train", "validate" or "test".
        use_stopwords: ``True`` reads from the "With Stopwords" folder,
            ``False`` from "Without Stopwords".

    Returns:
        The split as a pandas DataFrame.

    Raises:
        ValueError: If ``dataset`` is not a known split.
        TypeError: If ``use_stopwords`` is not a bool.
    """
    # ValueError/TypeError are subclasses of Exception, so callers that caught
    # the previous bare Exception keep working.
    if dataset not in ("train", "validate", "test"):
        raise ValueError("Invalid split.")
    if not isinstance(use_stopwords, bool):
        raise TypeError("Stop words must be specified in boolean.")

    # Local name chosen so it does not shadow the imported `stopwords` corpus.
    variant_dir = "With Stopwords" if use_stopwords else "Without Stopwords"
    return pd.read_csv(f"./Data/{variant_dir}/{dataset}.csv")

In [18]:
def format_dataset(df):
    """Turn a loaded DataFrame into a tokenized dataset in torch format.

    The "Text" and "Class" columns are renamed to "text" and "labels" and the
    columns listed in the module-level ``labels`` are dropped. The input
    DataFrame is left untouched (the previous in-place rename/drop made a
    second call on the same frame fail).

    Args:
        df: DataFrame with "Text" and "Class" columns plus the columns named
            in ``labels``.

    Returns:
        The tokenized dataset with the "text" column removed and its format
        set to "torch".
    """
    formatted_df = (
        df
        .rename(columns={"Text": "text", "Class": "labels"})
        .drop(columns=labels)
    )

    dataset = Dataset.from_pandas(formatted_df)

    # Convert pd labels to huggingface ClassLabels for stratifying
    dataset = dataset.class_encode_column("labels")

    dataset = dataset.map(tokenize_function, batched=True)

    # Convert datasets to pytorch format
    dataset = dataset.remove_columns(["text"])
    dataset.set_format("torch")

    return dataset

In [19]:
# Variables
# Identifier handed to AutoTokenizer.from_pretrained (presumably a Hugging Face Hub
# repository id - confirm). The classification models themselves are loaded from the
# *_load_dir folders instead.
load_dir = "jcblaise/roberta-tagalog-base"

# Get tokenizer from repository; model_max_length=256 is the maximum length that
# tokenize_function's padding="max_length" / truncation=True presumably apply - confirm.
tokenizer = AutoTokenizer.from_pretrained(load_dir, model_max_length=256)

def tokenize_function(data):
    """Tokenize the "text" entry of ``data`` with the module-level tokenizer.

    Padding to the maximum length and truncation are both requested.
    """
    texts = data["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

In [20]:
# Prepare classification head for pretrained RoBERTa
class RobertaAbusiveClassification(BertPreTrainedModel):
    """RoBERTa encoder followed by a RobertaClassificationHead.

    The number of output classes is taken from ``config.num_labels``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute logits and, when labels are given, the cross-entropy loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional target class indices. When omitted no loss is
                computed, which allows label-free inference.

        Returns:
            A tuple ``(loss, logits, *extra)`` when ``labels`` is given,
            otherwise ``(logits, *extra)``. ``extra`` is whatever the encoder
            returned after its first two outputs.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        # Keep any additional encoder outputs after the first two.
        outputs = (logits,) + outputs[2:]

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [21]:
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Side effect: the predicted class indices are stored in the module-level
    ``predictions`` variable, which the prediction loop reads after calling
    ``Trainer.evaluate``.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        ``{"f1": <weighted F1>}``.
    """
    global predictions

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Only F1 is reported, so only that metric is loaded; the accuracy,
    # precision and recall metrics were previously loaded on every call and
    # never used.
    f1_metric = evaluate.load("f1")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]

    return {"f1":f1}

In [22]:
# Set configurations
standard_num_labels=3
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA. The later model cells reuse `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Standard setup: one classifier loaded with 3 labels from standard_load_dir.
standard_config = RobertaConfig.from_pretrained(standard_load_dir, num_labels=standard_num_labels)
standard_model = RobertaAbusiveClassification.from_pretrained(standard_load_dir, config=standard_config)
standard_model.to(device)

# The Trainer is only used to run evaluation in the prediction loop.
standard_training_args = TrainingArguments(
    output_dir= 'predict',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to = "none")

standard_trainer = Trainer(
    model = standard_model,
    args = standard_training_args,
    compute_metrics = compute_metrics)

standard_model.eval()

RobertaAbusiveClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [25]:
# Hierarchical setup, main model: loaded with 2 labels from main_load_dir.
num_labels = 2

main_config = RobertaConfig.from_pretrained(
    main_load_dir,
    num_labels=num_labels,
)
main_model = RobertaAbusiveClassification.from_pretrained(
    main_load_dir,
    config=main_config,
)
main_model.to(device)

# The Trainer is only used to run evaluation in the prediction loop.
main_training_args = TrainingArguments(
    output_dir='predict',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to="none",
)

main_trainer = Trainer(
    model=main_model,
    args=main_training_args,
    compute_metrics=compute_metrics,
)

main_model.eval()

RobertaAbusiveClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [26]:
# Hierarchical setup, sub model: loaded with 2 labels from sub_load_dir.
num_labels = 2

sub_config = RobertaConfig.from_pretrained(
    sub_load_dir,
    num_labels=num_labels,
)
sub_model = RobertaAbusiveClassification.from_pretrained(
    sub_load_dir,
    config=sub_config,
)
sub_model.to(device)

# The Trainer is only used to run evaluation in the prediction loop.
sub_training_args = TrainingArguments(
    output_dir='predict',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to="none",
)

sub_trainer = Trainer(
    model=sub_model,
    args=sub_training_args,
    compute_metrics=compute_metrics,
)

sub_model.eval()

RobertaAbusiveClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

## Prediction loop

In [27]:
# Interactive loop: read a text, preprocess it, and print the prediction of the
# standard (3-class) model and of the hierarchical (main -> sub) models.
while True:
    print("Input 'exit' to stop.")
    intext = input()
    # Accept the exit command regardless of surrounding whitespace or casing.
    if intext.strip().lower() == "exit":
        break
    clear_output(wait=True)

    # Preprocess text (message is printed before the work it announces)
    print("Preprocessing text...")
    text = [preprocess_phase_1_and_2(intext)]

    # Wrap the text in a one-row dataset. The label -1 is a placeholder: the
    # true class is unknown, but a "labels" column is still supplied.
    testing_df = pd.DataFrame([[x, -1] for x in text], columns=["text", "labels"])
    testing_set = Dataset.from_pandas(testing_df)
    testing_set = testing_set.class_encode_column("labels")
    testing_set_tokenized = testing_set.map(tokenize_function, batched=True)

    testing_set_tokenized = testing_set_tokenized.remove_columns(["text"])
    testing_set_tokenized.set_format("torch")

    # Each evaluate() call runs compute_metrics, which stores the predicted
    # class indices in the module-level `predictions`; read it right after
    # every call, before the next one overwrites it.

    # Get predictions from standard setup
    print("Predicting with standard setup...")
    standard_trainer.evaluate(testing_set_tokenized)
    standard_prediction = int(predictions[0])

    # Get predictions from main hierarchical model
    print("Predicting with hierarchical setup...")
    main_trainer.evaluate(testing_set_tokenized)
    main_prediction = int(predictions[0])

    # Get predictions from sub hierarchical model (only when main says class 1);
    # reset first so a value from a previous iteration can never be reused.
    sub_prediction = None
    if main_prediction == 1:
        sub_trainer.evaluate(testing_set_tokenized)
        sub_prediction = int(predictions[0])

    print(f'\nInput text\t\t: {intext}')
    print(f'Preprocessed text\t: {text[0]}')
    print("\n##### MULTICLASS CLASSIFIER RESULT #####")
    if standard_prediction == 0:
        print("\tNon-Abusive")
    elif standard_prediction == 1:
        print("\tExplicitly Abusive")
    elif standard_prediction == 2:
        print("\tImplicitly Abusive")
    print('\n')

    print("##### HIERARCHICAL CLASSIFIER RESULT #####")
    if main_prediction == 0:
        print("\tNon-Abusive")
    elif main_prediction == 1:
        print("\tAbusive")
        if sub_prediction == 0:
            print("\t> Explicitly Abusive")
        elif sub_prediction == 1:
            print("\t> Implicitly Abusive")
    print('\n')

Preprocessing text...


                                                                                                                       

Predicting with standard setup...




Predicting with hierarchical setup...



Input text		: mga walang tulog for leni
Preprocessed text	: tulog CANDIDATE

##### MULTICLASS CLASSIFIER RESULT #####
	Non-Abusive


##### HIERARCHICAL CLASSIFIER RESULT #####
	Non-Abusive


Input 'exit' to stop.
exit
