In [1]:
import sys

import numpy as np

import torch
from torch.utils.data import DataLoader

from transformers import DistilBertTokenizer

import pickle as pkl

sys.path.insert(0, '..')

from src.data_collection import get_data
from src.models import (
    HateDataset, DistilBERTMultiClass, get_distil_hyperparams, predict_distilbert,
    predict_bert_tweet_roberta, prepare_dataloaders, get_bertweet_hyperparams,
    get_roberta_hyperparams
)

In [2]:
hate_speech_ucb = get_data()

device = torch.device("cuda:0") if torch.cuda.is_available() else "cpu"

N_SAMPLES = 100

hate_speech_sample = hate_speech_ucb[:N_SAMPLES]#.sample(N_SAMPLES, random_state=585)
N_CLASSES = hate_speech_sample["hatespeech"].nunique()

Fetching data...


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd
Reusing dataset parquet (C:\Users\UTKARSH\.cache\huggingface\datasets\parquet\ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

Processing...
Done!


In [3]:
BERTWEET_HYPERPARAMS = get_bertweet_hyperparams()

bertweet_model_path = BERTWEET_HYPERPARAMS["MODEL_PATH"]

with open(bertweet_model_path, "rb") as f:
    bertweet_large_model = pkl.load(f)

BERTWEET_MODEL = BERTWEET_HYPERPARAMS["MODEL_NAME"]
batch_size = BERTWEET_HYPERPARAMS["BATCH_SIZE"]

bertweet_roberta_dataloader = prepare_dataloaders(hate_speech_sample, BERTWEET_MODEL, batch_size)
bertweet_outputs = predict_bert_tweet_roberta(bertweet_large_model, bertweet_roberta_dataloader, device)
bertweet_outputs = np.argmax(bertweet_outputs, axis = 1)

In [4]:
DISTIL_HYPERPARAMS = get_distil_hyperparams()

distil_model_path = DISTIL_HYPERPARAMS["MODEL_PATH"]
distil_vocab_path = DISTIL_HYPERPARAMS["VOCAB_PATH"]

DISTIL_TOKENIZER = DistilBertTokenizer.from_pretrained(distil_vocab_path)
DISTIL_MAX_LEN = DISTIL_HYPERPARAMS["MAX_LEN"]

distil_params = DISTIL_HYPERPARAMS["TEST_PARAMS"]
distil_dataset = HateDataset(hate_speech_sample, DISTIL_TOKENIZER, DISTIL_MAX_LEN)
distil_dataloader = DataLoader(distil_dataset, **distil_params)

distil_model = DistilBERTMultiClass(n_classes=N_CLASSES)
distil_model.load_state_dict(torch.load(distil_model_path))

distil_outputs = predict_distilbert(distil_model, distil_dataloader, device)
distil_outputs = np.argmax(distil_outputs, axis=1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLU

In [5]:
ROBERTA_HYPERPARAMS = get_roberta_hyperparams()

roberta_model_path = ROBERTA_HYPERPARAMS["MODEL_PATH"]

with open(roberta_model_path, "rb") as f:
    roberta_model = pkl.load(f)
roberta_outputs = predict_bert_tweet_roberta(roberta_model, bertweet_roberta_dataloader, device)
roberta_outputs = np.argmax(roberta_outputs, axis=1)

In [6]:
hate_speech_sample["RoBERTa_pred"] = roberta_outputs
hate_speech_sample["DistilBERT_pred"] = distil_outputs
hate_speech_sample["BERTweet_Large_pred"] = bertweet_outputs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_speech_sample["RoBERTa_pred"] = roberta_outputs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_speech_sample["DistilBERT_pred"] = distil_outputs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_speech_sample["BERTweet_Large_pred"] = bertweet_outputs


### Errors - RoBERTa

In [7]:
hate_speech_sample[
    hate_speech_sample["hatespeech"] != hate_speech_sample["RoBERTa_pred"]
]

Unnamed: 0,text,hatespeech,RoBERTa_pred,DistilBERT_pred,BERTweet_Large_pred
7,""" F those ugly idiots "" "" I cant stand useless...",0,1,0,0
10,""" I got women tryna sneak me through there doo...",1,0,1,0
14,""" if I get pregnant by a nigga in Mexico, ima ...",0,1,0,1
30,"""A nigga really speak his mind when he off dem...",0,1,0,1
35,"""After Donald Trump was elected president, Tay...",1,0,1,0
66,"""Bang outside, I nutted on god"" ok 21 go off?",1,0,0,0
68,"""Because she's ugly. And terrible. And I hate ...",1,0,1,0
72,"""Bet I have a bigger dick than your boyfriend!...",1,0,1,0
76,"""Bitches talking about ""you ruined my self est...",0,1,0,1
79,"""Boycott this seditious, sickular Islamist. He...",1,0,1,1


### Errors - DistilBERT

In [8]:
hate_speech_sample[
    hate_speech_sample["hatespeech"] != hate_speech_sample["DistilBERT_pred"]
]

Unnamed: 0,text,hatespeech,RoBERTa_pred,DistilBERT_pred,BERTweet_Large_pred
24,"""3 in one special: bad bitch, dumb bitch and ...",1,1,0,1
66,"""Bang outside, I nutted on god"" ok 21 go off?",1,0,0,0
67,"""Because I am a lunatic whore pimped out by ki...",1,1,0,1


### Errors - BERTweet

In [9]:
hate_speech_sample[
    hate_speech_sample["hatespeech"] != hate_speech_sample["BERTweet_Large_pred"]
]

Unnamed: 0,text,hatespeech,RoBERTa_pred,DistilBERT_pred,BERTweet_Large_pred
10,""" I got women tryna sneak me through there doo...",1,0,1,0
14,""" if I get pregnant by a nigga in Mexico, ima ...",0,1,0,1
30,"""A nigga really speak his mind when he off dem...",0,1,0,1
35,"""After Donald Trump was elected president, Tay...",1,0,1,0
66,"""Bang outside, I nutted on god"" ok 21 go off?",1,0,0,0
68,"""Because she's ugly. And terrible. And I hate ...",1,0,1,0
72,"""Bet I have a bigger dick than your boyfriend!...",1,0,1,0
76,"""Bitches talking about ""you ruined my self est...",0,1,0,1
87,"""Business groups"" don't give a damn about Amer...",0,1,0,1
92,"""Capped"" ""hard asab""🤣🤣 like this Shit ain't be...",0,1,0,1
