In [1]:
# Import necessary libraries
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import numpy as np
import pandas as pd
import evaluate

In [2]:
# Define model and tokenizer
model_name = "bert-base-multilingual-cased"  # mBERT model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Load the training data (e.g., English hate speech data)
dataset = load_dataset("hate_speech18", "english")

In [4]:
# Tokenize the training and validation data
def tokenize_data(example):
    return tokenizer(example["text"], truncation=True, padding='max_length', max_length=128)

In [5]:
# Split the English data into training and validation sets
train_test_split = dataset['train'].train_test_split(test_size=0.2)
train_data = train_test_split['train'].map(tokenize_data, batched=True)
val_data = train_test_split['test'].map(tokenize_data, batched=True)

# Prepare datasets for PyTorch
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/8755 [00:00<?, ? examples/s]

Map:   0%|          | 0/2189 [00:00<?, ? examples/s]

In [6]:
# Define training arguments (single definition)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the Trainer for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)



In [7]:

# Train the model on English data
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3207,0.350757
2,0.3041,0.357048
3,0.302,0.338056


TrainOutput(global_step=3285, training_loss=0.3511268999660033, metrics={'train_runtime': 1141.1239, 'train_samples_per_second': 23.017, 'train_steps_per_second': 2.879, 'total_flos': 1727652967257600.0, 'train_loss': 0.3511268999660033, 'epoch': 3.0})

In [8]:
# Load the HASOC 2019 Hindi test data for zero-shot evaluation
hasoc_test = pd.read_csv('data/hindi_dataset/hasoc2019_hi_test_gold_2919.tsv', sep='\t')

# Preprocess the Hindi test data
hasoc_test = hasoc_test[['text', 'task_1']]
label_mapping = {'HOF': 1, 'NOT': 0}
hasoc_test['label'] = hasoc_test['task_1'].map(label_mapping)
hasoc_test = hasoc_test.drop(columns=['task_1'])

# Convert the test data into a Hugging Face dataset
hasoc_test_dataset = Dataset.from_pandas(hasoc_test)

# Tokenize the Hindi test data
hasoc_test_dataset = hasoc_test_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=128), batched=True)
hasoc_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/1318 [00:00<?, ? examples/s]

In [11]:
# Load the evaluation metric using the evaluate library
metric = evaluate.load("f1")

# Define a function to compute metrics during evaluation
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    f1 = metric.compute(predictions=predictions, references=labels, average='macro')
    accuracy = np.mean(predictions == labels)
    return {"f1": f1['f1'], "accuracy": accuracy}

training_args = TrainingArguments(
    output_dir="./dummy_output",
    save_strategy="no",
    per_device_eval_batch_size=8,
    # logging_dir="./logs",
    # logging_steps=10,
)

# Initialize the Trainer for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=hasoc_test_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
# Evaluate the trained model on the Hindi test data without further training
results = trainer.evaluate(eval_dataset=hasoc_test_dataset)

# Print the evaluation results
print(f"Zero-Shot Evaluation Results on Hindi Data using mBERT: {results}")

Zero-Shot Evaluation Results on Hindi Data using mBERT: {'eval_loss': 1.0631043910980225, 'eval_model_preparation_time': 0.0067, 'eval_f1': 0.35105859182668636, 'eval_accuracy': 0.5409711684370258, 'eval_runtime': 12.6608, 'eval_samples_per_second': 104.101, 'eval_steps_per_second': 13.032}
