In [None]:
import subprocess

# Installation on Google Colab
import sys

try:
    # Imported only to detect whether the notebook is running on Colab.
    import google.colab
except ImportError:
    # Not on Colab: nothing is installed here.
    pass
else:
    # Run pip through the kernel's own interpreter so the packages are
    # installed into the environment this notebook imports from, and raise
    # if the installation fails instead of continuing silently.
    subprocess.run(
        [
            sys.executable, '-m', 'pip', 'install',
            'skorch', 'transformers', 'joblib', 'calamanCy', 'accelerate',
        ],
        check=True,
    )

In [None]:
# Mount Google Drive at /content/drive; later cells read files from under that path.
# NOTE(review): this import is not guarded like the install cell's, so this
# cell only runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import joblib

In [None]:
from sklearn.pipeline import Pipeline
from transformers import BertForSequenceClassification
from torch import nn, device, cuda, optim
from skorch import NeuralNetClassifier
from skorch.hf import HuggingfacePretrainedTokenizer
from skorch.callbacks import Checkpoint, LoadInitState, ProgressBar

# Device used by the learner: the GPU when CUDA is available, otherwise the CPU.
# The GPU is preferred because it is faster and can handle greater
# quantities of data.
_device = device("cuda" if cuda.is_available() else "cpu")

# mBERT checkpoint name on Huggingface.
_model_name = "bert-base-multilingual-uncased"

# Build the mBERT sequence-classification model directly on `_device`.
# The checkpoint is downloaded from Huggingface if it is not already on the
# current system. `device_map` already places the whole model on the device,
# so no additional `.to(_device)` call is needed afterwards.
_model = BertForSequenceClassification.from_pretrained(
    _model_name,
    device_map=_device,
)

class BertModel(nn.Module):
    """
    Pytorch module that wraps the module-level mBERT model.

    The forward pass hands the token ids and attention mask to mBERT and
    returns only the `logits` attribute of its output, which is what the
    classifier needs.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Every instance wraps the same module-level `_model` object.
        self.bert = _model

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return outputs.logits

# Tokenizer made specifically for mBERT. It is loaded from `_model_name` so
# the tokenizer and the model always refer to the same checkpoint.
BertTokenizer = HuggingfacePretrainedTokenizer(_model_name)

# Cross-entropy loss over the class logits returned by BertModel
# (single-label classification). Author's rationale: it was chosen over
# BCELoss so that the output shape is uniform with the other learners.
Criterion = nn.CrossEntropyLoss

# Adam optimizer, commonly used in text classification problems.
Optimizer = optim.Adam

# Checkpoint saves training progress under 'train_bert', monitoring the
# 'train_loss_best' history entry, and is configured with load_best=True.
checkpoint = Checkpoint(
    monitor='train_loss_best',
    dirname='train_bert',
    load_best=True,
)

# Callback that loads the state saved by `checkpoint`.
load_state = LoadInitState(checkpoint)

progress_bar = ProgressBar()

# The mBERT neural network together with its criterion, optimizer,
# device and callbacks.
BertNet = NeuralNetClassifier(
    BertModel,
    criterion=Criterion,
    optimizer=Optimizer,
    device=_device,
    callbacks=[
        checkpoint,
        load_state,
        progress_bar,
    ],
    train_split=None, # Fixes numpy.exceptions.AxisError in training
                      # Anyways, data is assumed to be already split
)

# Pipeline for mBERT (tokenizer followed by the network).
# Import this for the ensemble.
BertPipeline = Pipeline([
    ('tokenizer', BertTokenizer),
    ('bert', BertNet),
])

# Set the tokenizer's maximum length so that the output shape is processable
# by the ensemble. The trailing semicolon keeps the notebook from echoing
# the value returned by set_params as the cell output.
BertPipeline.set_params(
    tokenizer__max_length=255,
);


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'\nSetting parameters of the tokenizer in the mBERT pipeline\nso that the output shape is processable by the ensemble.\n'

In [None]:
def load_model(model_name: str):
    """
    Load a persisted model with joblib.

    Parameters:
        model_name: path of the file to load.

    Returns:
        The object returned by `joblib.load`, or None (after printing an
        error message) when the file does not exist.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    try:
        loaded_model = joblib.load(model_name)
    except FileNotFoundError:
        print("ERROR: Model not found")
        return None
    return loaded_model

# Directory on the mounted Google Drive that holds the three trained base
# learners; change it here instead of editing every path below.
MODELS_DIR = '/content/drive/MyDrive/School/Thesis - Hate Speech/Models/For 100%'

# Each learner is None if its file is missing (load_model prints an error).
learner_bayes = load_model(f'{MODELS_DIR}/Bayes.pkl')
learner_lstm = load_model(f'{MODELS_DIR}/LSTM.pkl')
learner_bert = load_model(f'{MODELS_DIR}/mBERT.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[38;5;4mℹ Installing 'tl_calamancy_md-0.1.0' from
https://huggingface.co/ljvmiranda921/tl_calamancy_md/resolve/main/tl_calamancy_md-any-py3-none-any.whl...[0m


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# Echo the learner loaded from Bayes.pkl to inspect it (None if loading failed).
learner_bayes

In [None]:
# Echo the learner loaded from LSTM.pkl to inspect it (None if loading failed).
learner_lstm

In [None]:
# Echo the learner loaded from mBERT.pkl to inspect it (None if loading failed).
learner_bert

In [None]:
import numpy as np
import pandas as pd
import torch
import math

def shuffle_data_frame(data_frame, seed=0):
    """
    Return a reproducibly shuffled copy of the 'text' and 'label' columns.

    Parameters:
        data_frame: frame with 'text' and 'label' columns.
        seed: seed of the random number generator that orders the rows.
            The default of 0 reproduces the previously hard-coded behaviour.

    Returns:
        A new DataFrame holding only 'text' and 'label', with the rows in
        shuffled order and a fresh 0..n-1 index.
    """
    text = list(data_frame['text'])
    label = list(data_frame['label'])

    # Shuffle a list of row positions with a seeded generator so that the
    # same seed always yields the same order.
    indices = list(range(len(label)))
    np.random.default_rng(seed=seed).shuffle(indices)

    # Rebuild both columns by reading the original rows in shuffled order.
    return pd.DataFrame({
        'text': [text[index] for index in indices],
        'label': [label[index] for index in indices],
    })


def _split_label_rows(label_rows: pd.DataFrame, train_fraction: float):
    """Cut one label's rows into a leading train slice and a trailing test slice."""
    row_count = len(label_rows)
    train_count = math.ceil(row_count * train_fraction)

    train_rows = label_rows[0:train_count]
    test_rows = label_rows[train_count:row_count]

    assert len(train_rows) + len(test_rows) == row_count
    return train_rows, test_rows


def get_train_test_split(data_frame: pd.DataFrame, test_size: float, labels=(0, 1)):
    """
    Makes a stratified train test split.
    This aims to preserve the distribution between classes.

    The frame is shuffled, then the rows of every label in `labels` are cut
    separately: the first ceil(n * (1 - test_size)) rows of that label go to
    the train part and the remaining rows go to the test part. Both parts
    are shuffled again before being returned.

    Parameters:
        data_frame: frame with 'text' and 'label' columns.
        test_size: fraction of each label's rows placed in the test part;
            must be between 0 and 1 inclusive.
        labels: label values to stratify over, in the order their rows are
            concatenated. Defaults to the binary labels (0, 1).

    Returns:
        A tuple (train text, test text, train label, test label).

    Raises:
        ValueError: if `test_size` is outside [0, 1]. Previously the
            function printed a message and returned None, which made the
            caller's tuple unpacking fail with an unrelated error.
        AssertionError: if some rows carry a label that is not in `labels`.
    """
    if not (1 >= test_size >= 0):
        raise ValueError(f'test_size must be between 0 and 1, got {test_size!r}')

    data_frame = shuffle_data_frame(data_frame)

    data_frame_length = len(data_frame)
    train_size = 1 - test_size

    train_parts = []
    test_parts = []
    for label_value in labels:
        label_rows = data_frame[data_frame['label'] == label_value]
        train_rows, test_rows = _split_label_rows(label_rows, train_size)
        train_parts.append(train_rows)
        test_parts.append(test_rows)

    combined_train = pd.concat(train_parts)
    combined_test = pd.concat(test_parts)

    # Every row must have been assigned to exactly one part.
    assert len(combined_train) + len(combined_test) == data_frame_length, (
        "some rows have a label that is not listed in `labels`"
    )

    shuffled_train = shuffle_data_frame(combined_train)
    shuffled_test = shuffle_data_frame(combined_test)

    assert len(shuffled_train) + len(shuffled_test) == data_frame_length

    return (
        shuffled_train['text'],
        shuffled_test['text'],
        shuffled_train['label'],
        shuffled_test['label'],
    )

def seed_random_number_generators(seed=0):
    """
    Seed the torch (CPU and CUDA) and numpy global random number generators
    with the same value, then print a confirmation message.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    print("Random number generators seeded.")

def read_csv_file(filename: str) -> pd.DataFrame:
    """
    Read the first two columns of a CSV file into a DataFrame.

    Parameters:
        filename: path of the CSV file; lines are expected to end in '\\n'.

    Returns:
        The loaded DataFrame (a success message is printed first).

    Raises:
        FileNotFoundError: if the file does not exist. The error message is
            still printed, but the exception is re-raised instead of calling
            exit(1), so the caller can see and handle the failure.
    """
    try:
        data = pd.read_csv(filename, lineterminator='\n', usecols=range(2))
    except FileNotFoundError:
        print("ERROR: File not found")
        raise
    print("CSV file read successfully!")
    return data

# Seed torch and numpy once, with the default seed of 0, before any data is processed.
seed_random_number_generators()

Random number generators seeded.


In [None]:
def concatenate_predict_proba(bayes, lstm, bert, inputs: list):
    """
    Run `predict_proba` on the same inputs with the three learners, in the
    order bayes, lstm, bert, and stack the three results into one array
    whose first axis indexes the learner.
    """
    learners = (bayes, lstm, bert)
    return np.array([learner.predict_proba(inputs) for learner in learners])

In [None]:
# Load the dataset from Google Drive; read_csv_file keeps only the first two CSV columns.
dataset = read_csv_file('/content/drive/MyDrive/School/Thesis - Hate Speech/Data/datasetall.csv')

# Echo the frame for a quick visual check.
dataset

CSV file read successfully!


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1
28457,Amnesia. In my whole life wala pa ako nakasala...,1
28458,Kontrabida na ilang beses na tinalo at obvious...,1
28459,Yung antagonist laging kailangang sobrang sama...,1


In [None]:
# Fraction of each class held out from the training part.
TEST_SIZE = 0.2

# Stratified split of the dataset into a training part and a hold-out part.
X_train, X_test, y_train, y_test = get_train_test_split(dataset, TEST_SIZE)

In [None]:
# Echo the training texts produced by the split.
X_train

0         [USERNAME] Palangga ka man sang mga taga Baco...
1                      Who dafuq is Jose Montemayor Jr.???
2        Di na nakakatuwa yung mukha ni Mar Roxas sa TV...
3                      national elections. | via[USERNAME]
4        Binay will be staring in a movie called "The D...
                               ...                        
22764    "Kala ko wala andito pala si Marcos."*pertaini...
22765    sie ~ [USERNAME]Marcos Magnanakaw Marcos Dikta...
22766                    If Mar is BatMarBinay is Bane-ay.
22767    to my moots im sorry in not sorry for flooding...
22768                                Uunlad tayo kay Binay
Name: text, Length: 22769, dtype: object

In [None]:
# Echo the training labels produced by the split.
y_train

0        0
1        0
2        1
3        0
4        1
        ..
22764    0
22765    1
22766    1
22767    1
22768    0
Name: label, Length: 22769, dtype: int64

In [None]:
# Fraction of each class in the hold-out part that goes to the final test
# set; the rest becomes the validation set.
VAL_SPLIT = 0.5

# Re-derive the hold-out part from `dataset` instead of reading the
# `X_test` / `y_test` globals that this cell overwrites. The split is
# deterministic, so this yields the same hold-out rows as the earlier split,
# and re-running this cell no longer shrinks the test set each time.
holdout_split = get_train_test_split(dataset, TEST_SIZE)
X_holdout, y_holdout = holdout_split[1], holdout_split[3]

# The "train" outputs of this second split are used as the validation set.
X_val, X_test, y_val, y_test = get_train_test_split(
  pd.DataFrame({
    'text': X_holdout,
    'label': y_holdout,
  }),
  VAL_SPLIT,
)

In [None]:
# Echo the validation texts.
X_val

0                    let leni lead Jessica Soho nterviews
1                       Oy Let Leni Lead daw sabi ni tomi
2                                           Dapat Si Leni
3       [USERNAME]and[USERNAME] Beautiful girlWise too...
4       [USERNAME] TO DONATE GCash Primitiva C TRANSPA...
                              ...                        
2842                                        bobong marcos
2843                                    My kakampink rice
2844                    Poor Binay. Too desperate. Hekhek
2845               Yup correlated to IQ Marcos Magnanakaw
2846    Lesbros and sissies look o. Hahahaha Let Leni ...
Name: text, Length: 2847, dtype: object

In [None]:
# Echo the validation labels.
y_val

0       0
1       0
2       0
3       0
4       0
       ..
2842    1
2843    0
2844    1
2845    1
2846    0
Name: label, Length: 2847, dtype: int64

In [None]:
# Echo the test texts that remain after the validation split.
X_test

0       Chika natin mga nagawa ni[USERNAME]ha pati sa ...
1                 glad that my man is kakampink Leni Kiko
2       Ayan daw mga nagawatalo pa DPWHPakivalidate mg...
3                Rizalito david is for morality ahahahaha
4       Its our choice and our right whom to vote for ...
                              ...                        
2840    Ayaw na nilang lagyan ng mukha ni Binay yung T...
2841    Im one of the Thank you VP[USERNAME]for the in...
2842    [USERNAME] lugaw ka putang ina mo pag nakaupo ...
2843    you know whats a bad joke? mar's shady mrt dea...
2844    Robredo leads Marcos snubs advertising on Face...
Name: text, Length: 2845, dtype: object

In [None]:
# Echo the test labels that remain after the validation split.
y_test

0       0
1       0
2       0
3       0
4       0
       ..
2840    1
2841    0
2842    1
2843    1
2844    0
Name: label, Length: 2845, dtype: int64

In [None]:
# Stack the three base learners' predicted probabilities for every training
# text; the first axis indexes the learner (bayes, lstm, bert).
# NOTE(review): if the base learners were fitted on these same training
# texts, the probabilities here are optimistic inputs for a meta-learner --
# confirm how the loaded models were trained.
results = concatenate_predict_proba(
  learner_bayes,
  learner_lstm,
  learner_bert,
  X_train,
)

# Echo the stacked array for inspection.
results

array([[[1.00000000e+00, 3.83098716e-12],
        [4.10748793e-01, 5.89251207e-01],
        [2.76716117e-04, 9.99723284e-01],
        ...,
        [1.59771886e-02, 9.84022811e-01],
        [1.11059607e-01, 8.88940393e-01],
        [3.34887545e-01, 6.65112455e-01]],

       [[3.11595261e-01, 6.88404799e-01],
        [1.83765247e-01, 8.16234767e-01],
        [3.24293710e-02, 9.67570603e-01],
        ...,
        [6.17692024e-02, 9.38230813e-01],
        [1.73013601e-02, 9.82698560e-01],
        [5.22954524e-01, 4.77045417e-01]],

       [[9.76286471e-01, 2.37135142e-02],
        [2.03698382e-01, 7.96301663e-01],
        [1.80314723e-02, 9.81968582e-01],
        ...,
        [1.81948692e-01, 8.18051279e-01],
        [9.93827637e-03, 9.90061760e-01],
        [9.78958905e-01, 2.10411288e-02]]])

In [None]:
# Keep only the last probability column (index 1) from every learner while
# preserving the trailing axis: (learners, samples, 2) -> (learners, samples, 1).
# Presumably column 1 is the probability of label 1 -- confirm against the
# learners' class order.
sliced_results = results[:, :, 1:]

# Drop the trailing axis and put samples first, giving one row per sample
# and one column per learner: (learners, samples, 1) -> (samples, learners).
transposed_results = sliced_results[:, :, 0].T

transposed_results

array([[3.83098716e-12, 6.88404799e-01, 2.37135142e-02],
       [5.89251207e-01, 8.16234767e-01, 7.96301663e-01],
       [9.99723284e-01, 9.67570603e-01, 9.81968582e-01],
       ...,
       [9.84022811e-01, 9.38230813e-01, 8.18051279e-01],
       [8.88940393e-01, 9.82698560e-01, 9.90061760e-01],
       [6.65112455e-01, 4.77045417e-01, 2.10411288e-02]])

In [None]:
# Validation split: stack the base learners' probabilities, keep column 1
# of each, and arrange them as one row per sample, one column per learner.
val_results = concatenate_predict_proba(
    learner_bayes,
    learner_lstm,
    learner_bert,
    X_val,
)
sliced_val = val_results[:, :, 1:]
transposed_val = sliced_val[:, :, 0].T

# Test split: the same transformation as for the validation split.
test_results = concatenate_predict_proba(
    learner_bayes,
    learner_lstm,
    learner_bert,
    X_test,
)
sliced_test = test_results[:, :, 1:]
transposed_test = sliced_test[:, :, 0].T

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# Inverse regularisation strength of the meta-learner; also used to name
# the saved model file.
LR_C = 0.1

logistic_regression = LogisticRegression(
    C=LR_C,
)

# Fit the meta-learner on the base learners' probabilities for the training texts.
logistic_regression.fit(transposed_results, y_train)


def report_metrics(y_true, y_predicted):
    """Print and return (accuracy, recall, precision, F1) for one data split."""
    accuracy = accuracy_score(y_true, y_predicted)
    recall = recall_score(y_true, y_predicted)
    precision = precision_score(y_true, y_predicted)
    f1 = f1_score(y_true, y_predicted)
    print(f"Accuracy: {accuracy}\nRecall: {recall}\nPrecision: {precision}\nF1-score: {f1}")
    return accuracy, recall, precision, f1


# scikit-learn does not use torch autograd, so no torch.inference_mode()
# context is needed around these predictions.
y_pred = logistic_regression.predict(transposed_val)
accuracy, recall, precision, f1 = report_metrics(y_val, y_pred)

test_y_pred = logistic_regression.predict(transposed_test)
test_accuracy, test_recall, test_precision, test_f1 = report_metrics(y_test, test_y_pred)

# The original file name interpolated `i`, which is never defined in this
# notebook and raised NameError on a fresh run. The regularisation strength
# is used instead. NOTE(review): `i` was presumably a leftover loop variable
# from a search over C values -- confirm the intended file name.
joblib.dump(logistic_regression, f'/content/drive/MyDrive/School/Colab/lr/C_{LR_C}.pkl', compress=True)