In [None]:
# Import necessary libraries
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)
from datasets import load_dataset, Dataset
import numpy as np
import pandas as pd
import evaluate

In [None]:
# Define model and tokenizer
model_name = "bert-base-multilingual-cased"  # mBERT model

# Seed the Python, NumPy and PyTorch RNGs *before* the model is built: the
# sequence-classification head placed on top of the pretrained encoder is
# randomly initialised, so without a fixed seed every kernel restart would
# start fine-tuning from different weights.
SEED = 42
set_seed(SEED)

tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=2 -> binary classification head; only label ids 0 and 1 are valid.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Fetch the English hate-speech corpus used for fine-tuning.
# NOTE(review): confirm that "english" is a valid configuration name for
# "hate_speech18" in the installed `datasets` version.
dataset = load_dataset(path="hate_speech18", name="english")

In [None]:
# Shared tokenization routine for every split fed to the model
def tokenize_data(example):
    """Encode the ``"text"`` field of ``example`` with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    The tokenizer's output is returned unchanged.
    """
    encode_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(example["text"], **encode_options)

In [None]:
# Split the English data into training and validation sets

# The classifier was created with num_labels=2, so only label ids 0 and 1 are
# valid targets. Per its dataset card, hate_speech18 also contains "idk/skip"
# and "relation" classes (ids 2 and 3); passing those to a 2-way head makes the
# loss computation fail. The filter is a no-op if the data is already binary.
english_binary = dataset['train'].filter(lambda example: example['label'] in (0, 1))

# Fixed seed so the train/validation partition is identical on every re-run.
train_test_split = english_binary.train_test_split(test_size=0.2, seed=42)
train_data = train_test_split['train'].map(tokenize_data, batched=True)
val_data = train_test_split['test'].map(tokenize_data, batched=True)

# Prepare datasets for PyTorch: expose only the model inputs and the label
# as torch tensors.
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Fine-tuning configuration (single definition)
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Training schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Run evaluation on the validation split once per epoch.
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
)

# One Trainer drives both fine-tuning and the later evaluation calls
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_data,
    train_dataset=train_data,
)

In [None]:

# Train the model on English data.
# Fine-tunes `model` on `train_data` using `training_args`; the call is the
# last expression in the cell, so its return value is shown as the cell output.
trainer.train()

In [None]:
# Load the HASOC 2019 Hindi test data for zero-shot evaluation
hasoc_raw = pd.read_csv('data/hindi_dataset/hasoc2019_hi_test_gold_2919.tsv', sep='\t')

# Preprocess the Hindi test data: map the task_1 tags onto the binary label ids
label_mapping = {'HOF': 1, 'NOT': 0}
mapped_labels = hasoc_raw['task_1'].map(label_mapping)

# Series.map yields NaN for any value missing from label_mapping, which would
# silently turn the label column into floats. Fail loudly instead.
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unexpected = sorted(hasoc_raw.loc[unmapped_mask, 'task_1'].astype(str).unique())
    raise ValueError(
        f"Unexpected task_1 values in HASOC test data: {unexpected}; "
        f"expected only {sorted(label_mapping)}"
    )

# Build a fresh two-column frame rather than assigning into a slice of the raw
# frame (which triggers pandas' SettingWithCopyWarning).
hasoc_test = pd.DataFrame({
    'text': hasoc_raw['text'],
    'label': mapped_labels.astype('int64'),
})

# Convert the test data into a Hugging Face dataset (no pandas index column)
hasoc_test_dataset = Dataset.from_pandas(hasoc_test, preserve_index=False)

# Tokenize the Hindi test data with the same routine used for the English
# splits, so the tokenization settings cannot drift apart.
hasoc_test_dataset = hasoc_test_dataset.map(tokenize_data, batched=True)
hasoc_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Load the evaluation metric using the evaluate library.
# `metric` is a notebook-level global that compute_metrics reads when it runs.
metric = evaluate.load("f1")

# Metric callback used when scoring model predictions
def compute_metrics(pred):
    """Score a batch of predictions with macro-F1 and accuracy.

    Args:
        pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with keys ``"f1"`` (macro-averaged F1 from the loaded metric)
        and ``"accuracy"`` (fraction of predictions equal to the labels).
    """
    logits, labels = pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    macro_f1 = metric.compute(
        predictions=predicted_classes, references=labels, average='macro'
    )['f1']
    hit_rate = np.mean(predicted_classes == labels)
    return {"f1": macro_f1, "accuracy": hit_rate}

In [None]:
# Evaluate the trained model on the Hindi test data without further training.
# The Trainer was built before compute_metrics existed, so it has no metric
# function attached and evaluate() would report only loss and runtime figures.
# Attach the function here so macro-F1 and accuracy appear in the results.
trainer.compute_metrics = compute_metrics
results = trainer.evaluate(eval_dataset=hasoc_test_dataset)

# Print the evaluation results
print(f"Zero-Shot Evaluation Results on Hindi Data using mBERT: {results}")