In [1]:
!pip install simpletransformers

Collecting simpletransformers
  Obtaining dependency information for simpletransformers from https://files.pythonhosted.org/packages/16/c8/20d7eede93e320c0746c01d2205bdbeb388c236247244b9428e743a96de9/simpletransformers-0.64.3-py3-none-any.whl.metadata
  Downloading simpletransformers-0.64.3-py3-none-any.whl.metadata (42 kB)
     ---------------------------------------- 0.0/42.3 kB ? eta -:--:--
     ---------------------------------------- 42.3/42.3 kB 2.0 MB/s eta 0:00:00
Collecting transformers>=4.31.0 (from simpletransformers)
  Obtaining dependency information for transformers>=4.31.0 from https://files.pythonhosted.org/packages/98/46/f6a79f944d5c7763a9bc13b2aa6ac72daf43a6551f5fb03bccf0a9c2fec1/transformers-4.33.3-py3-none-any.whl.metadata
  Downloading transformers-4.33.3-py3-none-any.whl.metadata (119 kB)
     ---------------------------------------- 0.0/119.9 kB ? eta -:--:--
     -------------------------------------- 119.9/119.9 kB 6.9 MB/s eta 0:00:00
Collecting datasets (fro

In [None]:
!pip install torch

In [2]:
import torch

import math
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

ModuleNotFoundError: No module named 'torch'

In [3]:
# Read the OLID (train / test) and HASOC (train) CSV files and discard the "id"
# column straight after loading. The trailing notes are the author's row counts.
OLIDv1_train_df = pd.read_csv("Datasets/olid-train-small.csv").drop(columns="id")  # length 5852
hasoc_train_df = pd.read_csv("Datasets/hasoc-train.csv").drop(columns="id")  # length 5852
OLIDv1_test_df = pd.read_csv("Datasets/olid-test.csv").drop(columns="id")  # length 860

In [4]:
from transformers import BertTokenizer

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("GroNLP/hateBERT")


def preprocess_text(text, max_length=128):
    """Tokenize a single text into fixed-length model inputs.

    Args:
        text: The raw string to encode.
        max_length: Length every encoded sequence is truncated or padded to.
            Defaults to 128, the value this notebook used before it became a
            parameter.

    Returns:
        A tuple ``(input_ids, attention_mask, segment_ids)`` taken from the
        tokenizer's ``input_ids``, ``attention_mask`` and ``token_type_ids``
        outputs respectively.
    """
    # `padding="max_length"` is the supported spelling of the deprecated
    # `pad_to_max_length=True` flag: pad every sequence up to `max_length`.
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,  # wrap the sequence in [CLS] ... [SEP]
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # token_type_ids are the segment ids; they are returned under the name the
    # rest of the notebook uses ('segment_ids').
    return tokens['input_ids'], tokens['attention_mask'], tokens['token_type_ids']

# Apply preprocessing to every row of the test and training dataframes. Each call
# returns (input_ids, attention_mask, segment_ids); zip(*...) regroups those
# per-row tuples into one sequence per output column.
for frame in (OLIDv1_test_df, OLIDv1_train_df, hasoc_train_df):
    frame['input_ids'], frame['attention_mask'], frame['segment_ids'] = zip(*frame['text'].apply(preprocess_text))

# The raw 'text' column is deliberately kept on OLIDv1_test_df. A later cell reads
# OLIDv1_test_df['text'], and simpletransformers' eval_model looks up the 'text'
# and 'labels' columns by name; dropping 'text' here broke both.



In [5]:
def _attach_text(source_df, features, labels):
    """Return text + token features + labels for the rows selected in `features`.

    simpletransformers reads the 'text' and 'labels' columns by name. When a
    frame has no 'text' column it falls back to treating column 0 as the text
    and column 1 as the labels, which here would be 'input_ids' and
    'attention_mask'. Carrying the raw text along avoids that fallback.
    """
    return pd.concat([source_df.loc[features.index, ['text']], features, labels], axis=1)


# Split the OLIDv1_train_df into input features and labels
X_olid = OLIDv1_train_df[['input_ids', 'attention_mask', 'segment_ids']]
Y_olid = OLIDv1_train_df['labels']

# Split the hasoc_train_df into input features and labels
X_hasoc = hasoc_train_df[['input_ids', 'attention_mask', 'segment_ids']]
Y_hasoc = hasoc_train_df['labels']

# Split the OLIDv1_train_df dataset into a training set and a validation set
X_train_olid, X_val_olid, Y_train_olid, Y_val_olid = train_test_split(X_olid, Y_olid, test_size=0.2, random_state=42)

train_olid = _attach_text(OLIDv1_train_df, X_train_olid, Y_train_olid)
val_olid = _attach_text(OLIDv1_train_df, X_val_olid, Y_val_olid)

# Split the hasoc_train_df dataset into a training set and a validation set
X_train_hasoc, X_val_hasoc, Y_train_hasoc, Y_val_hasoc = train_test_split(X_hasoc, Y_hasoc, test_size=0.2, random_state=42)

train_hasoc = _attach_text(hasoc_train_df, X_train_hasoc, Y_train_hasoc)
val_hasoc = _attach_text(hasoc_train_df, X_val_hasoc, Y_val_hasoc)




In [6]:
def train_model(train_df, val_df, learning_rate=4e-5, use_cuda=None):
    """Fine-tune GroNLP/hateBERT as a 2-label classifier and return the model.

    Args:
        train_df: Training data handed to ``ClassificationModel.train_model``.
        val_df: Data handed to ``train_model`` as ``eval_df``; used for the
            evaluation performed during training.
        learning_rate: Optimizer learning rate. The previous hard-coded value
            of 1e-2 is orders of magnitude above the range normally used to
            fine-tune BERT models (roughly 2e-5 to 5e-5), so the default is
            now 4e-5; pass a value explicitly to override it.
        use_cuda: Whether to run on a GPU. ``None`` (default) uses CUDA only
            when ``torch.cuda.is_available()`` reports a device, instead of
            unconditionally requesting it and failing on CPU-only machines.

    Returns:
        The trained ``ClassificationModel`` instance.
    """
    if use_cuda is None:
        use_cuda = torch.cuda.is_available()

    model_args = {
        "learning_rate": learning_rate,
        "overwrite_output_dir": True,
        "num_train_epochs": 2,
        "train_batch_size": 128,
        "eval_batch_size": 128,
        "evaluate_during_training": True,
        "save_eval_checkpoints": True,
        "save_model_every_epoch": False,
        "use_early_stopping": True,
        "early_stopping_patience": 3,
        "early_stopping_delta": 0.1,
    }

    hatebert = ClassificationModel(
        "bert",
        "GroNLP/hateBERT",
        num_labels=2,
        args=model_args,
        use_cuda=use_cuda,
    )
    hatebert.train_model(train_df, eval_df=val_df)
    return hatebert

In [7]:
def plot_losses(training_log):
    """Draw the 'train_loss' and 'eval_loss' series of `training_log` on one figure.

    Args:
        training_log: Mapping-like object (e.g. a DataFrame) indexable by the
            'train_loss' and 'eval_loss' keys.
    """
    curves = (
        ('train_loss', 'training Loss'),
        ('eval_loss', 'validation Loss'),
    )
    for column, legend_text in curves:
        plt.plot(training_log[column], label=legend_text)

    plt.title('Training and validation loss over epochs')
    plt.xlabel('Evaluation Steps')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [8]:
def calculate_confusion_matrix(true_labels, predictions):
    """Compute, plot and return the confusion matrix for class ids 0 and 1.

    Args:
        true_labels: Ground-truth class ids.
        predictions: Predicted class ids, aligned with `true_labels`.

    Returns:
        The confusion matrix produced by ``sklearn.metrics.confusion_matrix``
        with rows and columns ordered as class 0, class 1.
    """
    class_ids = [0, 1]
    class_labels = [f" {class_id}" for class_id in class_ids]  # " 0", " 1"

    # Pin the label set so the matrix is always 2x2. Without `labels=`, sklearn
    # infers the classes from the data and returns a smaller matrix when one
    # class never occurs, which no longer lines up with the two tick labels.
    cm = confusion_matrix(true_labels, predictions, labels=class_ids)

    plt.figure(figsize=(len(class_labels), len(class_labels)))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()
    return cm

In [9]:
def calculate_classifier_report(true_labels, predictions):
    """Return sklearn's ``classification_report`` for the given labels.

    Args:
        true_labels: Ground-truth class ids.
        predictions: Predicted class ids, aligned with `true_labels`.

    Returns:
        Whatever ``classification_report(true_labels, predictions)`` returns.
    """
    report = classification_report(true_labels, predictions)
    return report

In [11]:

# Fine-tune on the OLID training split, evaluating on the OLID validation split
hatebert = train_model(train_df=train_olid, val_df=val_olid)

# Load the training log from the outputs directory and plot the loss curves
training_log = pd.read_csv("outputs/training_progress_scores.csv")
plot_losses(training_log)

# Evaluate the trained model on the OLID test set
result, model_outputs, wrong_predictions = hatebert.eval_model(OLIDv1_test_df)

# True labels, and predicted class = index of the largest model output per row
true_labels = OLIDv1_test_df['labels']
predictions = np.argmax(model_outputs, axis=1)

# Confusion matrix (computed and plotted), then the classification report
confusion_mat = calculate_confusion_matrix(true_labels, predictions)
class_report = calculate_classifier_report(true_labels, predictions)
print("Classification Report:", class_report, sep="\n")


ValueError: ignored

In [None]:
# (Scratch cell cleared: it contained only the undefined name `asas`, which raised
# NameError and stopped "Run All" at this point.)

In [None]:
# Assumes the test data has already been tokenized into the 'input_ids',
# 'attention_mask' and 'segment_ids' columns of OLIDv1_test_df.

# Collect the columns of interest into a dictionary keyed by column name
eval_columns = ('text', 'labels', 'input_ids', 'attention_mask', 'segment_ids')
test_data = {column: OLIDv1_test_df[column] for column in eval_columns}

# Build the evaluation DataFrame from that dictionary; it carries exactly the
# columns listed in `eval_columns`
test_df = pd.DataFrame(test_data)

# Evaluate the model on the test set
result, model_outputs, wrong_predictions = hatebert.eval_model(test_df)


In [None]:
# NOTE(review): `test_dataloader` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. TODO: define it or remove the cell.
test_dataloader

In [None]:
# NOTE(review): `test_dataloader` is not assigned anywhere in the visible notebook,
# so this call raises NameError on a fresh kernel. Other cells pass a DataFrame to
# eval_model. TODO: confirm what was intended here, or remove the cell.
result, model_outputs, wrong_predictions = hatebert.eval_model(test_dataloader)


In [None]:
from sklearn.metrics import confusion_matrix

# Evaluate the model on the OLID test set.
# This cell previously called `preprocess_test_data` and passed `test_dataloader`,
# neither of which is defined in this notebook, and read `test_olid_df`, which is
# only created in a later cell (as OLIDv1_test_df without an 'id' column). It now
# uses OLIDv1_test_df directly so it runs in top-to-bottom order.
result, model_outputs, wrong_predictions = hatebert.eval_model(OLIDv1_test_df)

# Get the predicted labels (binary classification): class 1 only when its output
# is strictly larger than the output for class 0
predicted_labels = [1 if prob[1] > prob[0] else 0 for prob in model_outputs]

# Get the true labels from the test data
true_labels = OLIDv1_test_df['labels'].tolist()

# Calculate confusion matrix
cm = confusion_matrix(true_labels, predicted_labels)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tick labels for the two classes (assumed to be 'Negative' and 'Positive')
labels = ['Negative', 'Positive']

# Draw the confusion matrix `cm` as an annotated heatmap and label the axes
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
heatmap_ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()

In [None]:
# Usage
# X_train_hasoc / X_val_hasoc hold the tokenized feature columns, not the raw text,
# so the text/labels frames are rebuilt from hasoc_train_df using the row indices
# selected by train_test_split.
train_hasoc_df = hasoc_train_df.loc[X_train_hasoc.index, ['text', 'labels']]
val_hasoc_df = hasoc_train_df.loc[X_val_hasoc.index, ['text', 'labels']]

# 'id' is already removed when the CSV is loaded; errors='ignore' keeps this line
# from raising KeyError in that case while still dropping the column if present.
test_olid_df = OLIDv1_test_df.drop(columns='id', errors='ignore')


# Training the model
hatebert = train_model(train_hasoc_df, val_hasoc_df)

# Load the training log
training_log = pd.read_csv("outputs/training_progress_scores.csv")

# Plot the losses
plot_losses(training_log)

# Evaluate the model on the OLID test set
result, model_outputs, wrong_predictions = hatebert.eval_model(test_olid_df)

# Get predictions and true labels
predictions = np.argmax(model_outputs, axis=1)
true_labels = test_olid_df['labels']

# Calculate confusion matrix and plot it
confusion_mat = calculate_confusion_matrix(true_labels, predictions)

# Calculate classification report
class_report = calculate_classifier_report(true_labels, predictions)
print("Classification Report:")
print(class_report)
