In [None]:
pip install accelerate


In [None]:
pip install transformers[torch]


In [16]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import numpy as np

In [2]:
# Load the tokenizer and model for XLM-Roberta
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Load the training data (e.g., English HatEval)
dataset = load_dataset("hate_speech18", "english")  # Replace with your dataset

In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'user_id', 'subforum_id', 'num_contexts', 'label'],
        num_rows: 10944
    })
})


In [5]:
# Tokenize the data
def tokenize_data(example):
    return tokenizer(example["text"], truncation=True, padding='max_length')

In [6]:
train_test_split = dataset['train'].train_test_split(test_size=0.2)

# Get the new training and validation data
train_data = train_test_split['train'].map(tokenize_data, batched=True)
val_data = train_test_split['test'].map(tokenize_data, batched=True)

Map:   0%|          | 0/8755 [00:00<?, ? examples/s]

Map:   0%|          | 0/2189 [00:00<?, ? examples/s]

In [7]:
# Prepare the datasets for PyTorch
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)



In [9]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3757,0.352421
2,0.4162,0.363305
3,0.4042,0.479198


TrainOutput(global_step=3285, training_loss=0.3559436602135227, metrics={'train_runtime': 6466.7394, 'train_samples_per_second': 4.062, 'train_steps_per_second': 0.508, 'total_flos': 6910611869030400.0, 'train_loss': 0.3559436602135227, 'epoch': 3.0})

DatasetNotFoundError: Dataset 'hasoc_2019' doesn't exist on the Hub or cannot be accessed.

In [11]:
import pandas as pd

# Load the datasets
hasoc_test = pd.read_csv('data/hindi_dataset/hasoc2019_hi_test_gold_2919.tsv', sep='\t')
hindi_data = pd.read_csv('data/hindi_dataset/hindi_dataset.tsv', sep='\t')

# Display the first few rows and columns of each dataset
print("HASOC 2019 Test Data Sample:")
print(hasoc_test.head())
print("\nHASOC 2019 Test Data Columns:")
print(hasoc_test.columns)

print("\nHindi Dataset Sample:")
print(hindi_data.head())
print("\nHindi Dataset Columns:")
print(hindi_data.columns)

HASOC 2019 Test Data Sample:
         text_id                                               text task_1  \
0  hasoc_hi_5061  वक्त, इन्सान और इंग्लैंड का मौसम आपको कभी भी ध...    NOT   
1  hasoc_hi_2090  #कांग्रेस के इस #कमीने की #करतूत को देखिए देश ...    HOF   
2  hasoc_hi_2960  पाकिस्तान को फेकना था फेका गया। जो हार कर भी द...    HOF   
3   hasoc_hi_864  जो शब्द तूम आज किसी और औरत के लिए यूज कर रहे व...    NOT   
4    hasoc_hi_54  नेता जी हम समाजवादी सिपाही हमेशा आपके साथ है आ...    NOT   

  task_2 task_3  
0   NONE   NONE  
1   OFFN    TIN  
2   OFFN    TIN  
3   NONE   NONE  
4   NONE   NONE  

HASOC 2019 Test Data Columns:
Index(['text_id', 'text', 'task_1', 'task_2', 'task_3'], dtype='object')

Hindi Dataset Sample:
         text_id                                               text task_1  \
0  hasoc_hi_5556  बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...    NOT   
1  hasoc_hi_5648  सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...    HOF   
2   hasoc_hi_164  तुम जैसे हरामिय

In [12]:
import pandas as pd
from datasets import Dataset

# Load the datasets
hasoc_test = pd.read_csv('data/hindi_dataset/hasoc2019_hi_test_gold_2919.tsv', sep='\t')
hindi_data = pd.read_csv('data/hindi_dataset/hindi_dataset.tsv', sep='\t')

# Select only 'text' and 'task_1' columns for hate speech detection
hasoc_test = hasoc_test[['text', 'task_1']]
hindi_data = hindi_data[['text', 'task_1']]

# Map 'HOF' to 1 and 'NOT' to 0 for easier numerical processing
label_mapping = {'HOF': 1, 'NOT': 0}
hasoc_test['label'] = hasoc_test['task_1'].map(label_mapping)
hindi_data['label'] = hindi_data['task_1'].map(label_mapping)

# Drop the original 'task_1' column now that we have the numerical labels
hasoc_test = hasoc_test.drop(columns=['task_1'])
hindi_data = hindi_data.drop(columns=['task_1'])

# Display the first few rows to verify
print(hasoc_test.head())
print(hindi_data.head())

# Convert to Hugging Face dataset format for compatibility with the model
hasoc_test_dataset = Dataset.from_pandas(hasoc_test)
hindi_dataset = Dataset.from_pandas(hindi_data)


                                                text  label
0  वक्त, इन्सान और इंग्लैंड का मौसम आपको कभी भी ध...      0
1  #कांग्रेस के इस #कमीने की #करतूत को देखिए देश ...      1
2  पाकिस्तान को फेकना था फेका गया। जो हार कर भी द...      1
3  जो शब्द तूम आज किसी और औरत के लिए यूज कर रहे व...      0
4  नेता जी हम समाजवादी सिपाही हमेशा आपके साथ है आ...      0
                                                text  label
0  बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...      0
1  सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...      1
2  तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...      1
3  बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...      0
4  चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...      0


In [13]:
hasoc_test_dataset = hasoc_test_dataset.map(tokenize_data, batched=True)
hasoc_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/1318 [00:00<?, ? examples/s]

In [19]:
pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [21]:
from transformers import XLMRobertaTokenizer, Trainer, TrainingArguments
import numpy as np
import evaluate  # New library for loading metrics

# Load the pre-trained tokenizer (replace 'xlm-roberta-base' with your model if different)
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

# Define a function to tokenize the text data
def tokenize_data(example):
    return tokenizer(example['text'], truncation=True, padding='max_length', max_length=128)

# Tokenize the test dataset
hasoc_test_dataset = hasoc_test_dataset.map(tokenize_data, batched=True)

# Set format for PyTorch tensors for compatibility with Hugging Face models
hasoc_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Load your trained model (replace 'path_to_your_model' with your actual model path)
from transformers import XLMRobertaForSequenceClassification

model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base")

# Define the evaluation metric using the evaluate library
metric = evaluate.load("f1")

# Define a function to compute metrics during evaluation
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    f1 = metric.compute(predictions=predictions, references=labels, average='macro')
    accuracy = np.mean(predictions == labels)
    return {"f1": f1['f1'], "accuracy": accuracy}

# Define training arguments for evaluation
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the Trainer for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=hasoc_test_dataset,
    compute_metrics=compute_metrics,
)

# Evaluate the model on the Hindi test data
results = trainer.evaluate()

print(f"Evaluation Results: {results}")


Map:   0%|          | 0/1318 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Evaluation Results: {'eval_loss': 0.7001206278800964, 'eval_model_preparation_time': 0.001, 'eval_f1': 0.31461258450338014, 'eval_accuracy': 0.4590288315629742, 'eval_runtime': 12.679, 'eval_samples_per_second': 103.951, 'eval_steps_per_second': 6.546}


In [None]:
# Evaluate the model on the Hindi dataset
results = trainer.evaluate(eval_dataset=hindi_test_data)
print(f"Zero-shot evaluation results on Hindi dataset: {results}")