In [None]:
!pip install transformers datasets torch scikit-learn wandb

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting wandb
  Downloading wandb-0.17.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collect

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split

In [None]:
# Tokenizer for the same distilroberta-base checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Load the hate/offensive speech dataset; it ships train, validation and test splits.
dataset = load_dataset("badmatr11x/hate-offensive-speech")

Downloading readme:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.49M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/193k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/198k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51070 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2838 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2837 [00:00<?, ? examples/s]

In [None]:
from datasets import load_dataset_builder
# The builder is used in the next cell to inspect the dataset's feature schema.
ds_builder = load_dataset_builder("badmatr11x/hate-offensive-speech")

In [None]:
# Display the column schema: an int64 `label` and a string `tweet`.
ds_builder.info.features

{'label': Value(dtype='int64', id=None),
 'tweet': Value(dtype='string', id=None)}

In [None]:
def preprocess_function(examples):
    """Tokenize the ``tweet`` column of a batch of examples.

    Truncation is enabled so over-long tweets are cut to the tokenizer's
    limit; no padding is requested here.
    """
    tweets = examples["tweet"]
    encoded = tokenizer(tweets, truncation=True)
    return encoded

In [None]:
# Tokenize every split; batched=True hands preprocess_function batches of rows
# rather than one row at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/51070 [00:00<?, ? examples/s]

Map:   0%|          | 0/2838 [00:00<?, ? examples/s]

Map:   0%|          | 0/2837 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding
# Padding collator built from the tokenizer; it is passed to the Trainer below
# to assemble batches (preprocess_function itself does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Shuffle the train and validation splits with a fixed seed so the row order
# is the same on every run.
train_dataset = tokenized_dataset["train"].shuffle(seed=107)
validation_dataset = tokenized_dataset["validation"].shuffle(seed=107)

In [None]:
import os
from getpass import getpass

import wandb

# Never hardcode the W&B API key in the notebook: reuse a key already present
# in the environment, otherwise prompt for it without echoing it to the output.
# NOTE: any key that was ever committed in this cell must be treated as leaked
# and revoked from the W&B account settings.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")

# Entity and project the Trainer's W&B reporting logs to.
os.environ["WANDB_ENTITY"] = "tirath-bhathawala"
os.environ["WANDB_PROJECT"] = "IPD"

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification metrics for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores (the argmax over axis 1 is taken as the
            predicted class) and ``labels`` the true class ids.

    Returns:
        dict: The micro-averaged ``overall-*`` scores (unchanged keys) plus
        macro-averaged ``macro-f1`` / ``macro-precision`` / ``macro-recall``.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    output = {
        "overall-f1": f1_score(labels, predictions, average='micro'),
        "overall-accuracy": accuracy_score(labels, predictions),
        "overall-precision": precision_score(labels, predictions, average='micro'),
        "overall-recall": recall_score(labels, predictions, average='micro'),
    }

    # For single-label multiclass data, micro-averaged precision, recall and F1
    # all collapse to accuracy, so the four values above are always identical.
    # Macro averaging weights every class equally and therefore exposes weak
    # minority-class performance. zero_division=0 keeps a class that was never
    # predicted from raising warnings (it simply scores 0).
    macro_scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in macro_scorers:
        output[f"macro-{metric_name}"] = scorer(
            labels, predictions, average="macro", zero_division=0
        )

    return output

In [None]:
# Class index <-> readable class name; both maps are passed to the model below.
id2label = dict(enumerate(("HATE-SPEECH", "OFFENSIVE-LANGUAGE", "NEITHER")))
# The reverse mapping is derived from id2label so the two cannot drift apart.
label2id = {name: index for index, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# distilroberta-base with a 3-class sequence-classification head. The
# classifier weights are newly initialised (see the warning printed below),
# so the model has to be fine-tuned before it is useful for prediction.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base", num_labels=3, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    output_dir="speech-multiclassifier-run-2",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # reloaded when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging
    report_to="wandb",
)

# Trainer wiring: model + arguments, the shuffled train/validation splits,
# and the tokenizer/collator/metric helpers defined in earlier cells.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)



In [None]:
# Run fine-tuning, then close the active W&B run.
trainer.train()
wandb.finish()

Epoch,Training Loss,Validation Loss,Overall-f1,Overall-accuracy,Overall-precision,Overall-recall
1,0.1678,0.156204,0.953136,0.953136,0.953136,0.953136
2,0.1522,0.155314,0.953136,0.953136,0.953136,0.953136
3,0.1157,0.169474,0.953488,0.953488,0.953488,0.953488


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,▁▁█
eval/overall-accuracy,▁▁█
eval/overall-f1,▁▁█
eval/overall-precision,▁▁█
eval/overall-recall,▁▁█
eval/runtime,▆█▁
eval/samples_per_second,▃▁█
eval/steps_per_second,▃▁█
train/epoch,▁▁▁▂▂▃▃▃▃▄▄▄▅▅▆▆▆▆▇▇████
train/global_step,▁▁▁▂▂▃▃▃▃▄▄▄▅▅▆▆▆▆▇▇████

0,1
eval/loss,0.16947
eval/overall-accuracy,0.95349
eval/overall-f1,0.95349
eval/overall-precision,0.95349
eval/overall-recall,0.95349
eval/runtime,5.6944
eval/samples_per_second,498.388
eval/steps_per_second,31.259
total_flos,2641161396896352.0
train/epoch,3.0


In [None]:
# Tokenized test split, shuffled with a fixed seed, for the final evaluation.
test_dataset = tokenized_dataset["test"].shuffle(seed=23)

In [None]:
# Second Trainer around the same model, created without TrainingArguments or
# datasets.
# NOTE(review): `tester` is not used in the cells visible here — the prediction
# cell calls `trainer.predict(...)` instead; confirm which one is intended.
tester = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Switch the model to evaluation mode; the cell output is the module's repr.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [None]:
# Start a separate W&B run named "test-run" in the same project/entity.
# NOTE(review): no matching wandb.finish() is visible after this cell — confirm
# the run is closed somewhere.
wandb.init(project="IPD", entity="tirath-bhathawala", name="test-run")

In [None]:
# Keep the full PredictionOutput (predictions, label_ids, metrics) in a
# variable for later analysis, but display only the metrics dict instead of
# dumping the raw logit arrays into the cell output.
test_output = trainer.predict(test_dataset)
test_output.metrics

PredictionOutput(predictions=array([[-1.7839005, -3.683505 ,  6.256647 ],
       [-1.3621508,  4.3065248, -3.3359818],
       [ 1.4687214, -4.3079734,  3.8835793],
       ...,
       [-2.0086772, -3.8533688,  6.54445  ],
       [-1.9411983, -3.9037862,  6.5279875],
       [-1.8904849, -3.914018 ,  6.55604  ]], dtype=float32), label_ids=array([2, 1, 2, ..., 2, 2, 2]), metrics={'test_loss': 0.18947246670722961, 'test_overall-f1': 0.9481847021501586, 'test_overall-accuracy': 0.9481847021501586, 'test_overall-precision': 0.9481847021501586, 'test_overall-recall': 0.9481847021501586, 'test_runtime': 6.0136, 'test_samples_per_second': 471.766, 'test_steps_per_second': 29.6})

In [None]:
# Pick the GPU when one is available, otherwise the CPU, and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [None]:
from transformers import pipeline

# Saved checkpoint to serve from (absolute Colab path).
# NOTE(review): checkpoint-9576 looks like the final-step checkpoint
# (3 epochs x ceil(51070 / 16) steps). With load_best_model_at_end=True the
# in-memory `model` evaluated above may come from a different checkpoint —
# confirm this is the one intended for inference.
checkpoint_dir = "/content/speech-multiclassifier-run-2/checkpoint-9576"

# pipeline() device index: 0 selects the first GPU, -1 the CPU.
device_index = 0 if torch.cuda.is_available() else -1

classifier = pipeline("text-classification", model=checkpoint_dir, device=device_index)

In [None]:
# Smoke test: a harmless sentence, classified as NEITHER in the recorded output.
test_string = "tirath can't be bothered to write the assignment"
result = classifier(test_string)
print(result)

[{'label': 'NEITHER', 'score': 0.9989138841629028}]


In [None]:
# Smoke test: an input the classifier is expected to flag (HATE-SPEECH in the
# recorded output).
test_string = "are jews even humans?"
result = classifier(test_string)
print(result)

[{'label': 'HATE-SPEECH', 'score': 0.9786828756332397}]


In [None]:
# Archive the whole checkpoint directory for download. As the listing below
# shows, this includes optimizer/scheduler state, not only the model weights.
!zip -r '/content/speech-multiclassifier-run-2/checkpoint-9576'.zip '/content/speech-multiclassifier-run-2/checkpoint-9576'

  adding: content/speech-multiclassifier-run-2/checkpoint-9576/ (stored 0%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/merges.txt (deflated 53%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/config.json (deflated 50%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/tokenizer_config.json (deflated 76%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/tokenizer.json (deflated 72%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/model.safetensors (deflated 7%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/optimizer.pt (deflated 30%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/vocab.json (deflated 59%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/trainer_state.json (deflated 73%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/scheduler.pt (deflated 56%)
  adding: content/speech-multiclassifier-run-2/checkpoint-9576/training_args.bin (deflate