In [1]:
import sys

import numpy as np

import torch
from torch.utils.data import DataLoader

from transformers import DistilBertTokenizer

from tqdm import tqdm

sys.path.insert(0, '..')
from src.data_collection import get_data
from src.models import HateDataset, DistilBERTMultiClass, get_distil_hyperparams

In [2]:
hate_speech_ucb = get_data()
HYPERPARAMS = get_distil_hyperparams()

model_path = HYPERPARAMS["MODEL_PATH"]
vocab_path = HYPERPARAMS["VOCAB_PATH"]

TOKENIZER = DistilBertTokenizer.from_pretrained(vocab_path)
MAX_LEN = HYPERPARAMS["MAX_LEN"]
device = HYPERPARAMS["DEVICE"]

N_SAMPLES = 20

Fetching data...


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd
Reusing dataset parquet (C:\Users\UTKARSH\.cache\huggingface\datasets\parquet\ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

Processing...
Done!




In [3]:
hate_speech_sample = hate_speech_ucb[:N_SAMPLES]#.sample(N_SAMPLES, random_state=585)
hate_speech_sample

Unnamed: 0,text,hatespeech
0,! thank u! im transmasc and generally present ...,0
1,!Go fuck yourself faggot!,1
2,!flair [I love women and minorities],0
3,!flair [death to all niggers and gays],1
4,""" 'convoluted' genealogy of Jesus""; was that c...",0
5,""" *gulhfg* that's the sound of your mom suckin...",1
6,""" Did you notice a sign out front that said de...",1
7,""" F those ugly idiots "" "" I cant stand useless...",0
8,""" Fuck you niggas ion give a fuck no more nigg...",1
9,""" Happy Independence Day to all my fellow sout...",0


In [4]:
dataset = HateDataset(hate_speech_sample, TOKENIZER, MAX_LEN)

In [5]:
sample_params = HYPERPARAMS["TEST_PARAMS"]

sample_loader = DataLoader(dataset, **sample_params)

In [6]:
N_CLASSES = hate_speech_sample["hatespeech"].nunique()

In [7]:
distil_model = DistilBERTMultiClass(n_classes=N_CLASSES)
distil_model.load_state_dict(torch.load(model_path))
distil_model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBERTMultiClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_

In [8]:
def predict(model, loader):
    model.eval()
    fin_outputs = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(loader)):
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs

In [9]:
distil_outputs = predict(distil_model, sample_loader)
distil_outputs = np.argmax(distil_outputs, axis=1)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
20it [00:02,  9.75it/s]


In [10]:
hate_speech_sample["DistilBERT_pred"] = distil_outputs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hate_speech_sample["DistilBERT_pred"] = distil_outputs


In [11]:
hate_speech_sample

Unnamed: 0,text,hatespeech,DistilBERT_pred
0,! thank u! im transmasc and generally present ...,0,0
1,!Go fuck yourself faggot!,1,1
2,!flair [I love women and minorities],0,0
3,!flair [death to all niggers and gays],1,1
4,""" 'convoluted' genealogy of Jesus""; was that c...",0,0
5,""" *gulhfg* that's the sound of your mom suckin...",1,1
6,""" Did you notice a sign out front that said de...",1,1
7,""" F those ugly idiots "" "" I cant stand useless...",0,0
8,""" Fuck you niggas ion give a fuck no more nigg...",1,1
9,""" Happy Independence Day to all my fellow sout...",0,0
