## Imports

In [65]:
import numpy as np
import pandas as pd

## Load dataset

In [66]:
df = pd.read_csv('offensive_tweet_dataset/labeled_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [67]:
## Check for any null values
df.isnull().values.any()

False

Relevant columns: class (label) & tweet (data)  
class is labeled as:
- 0 if hate speech
- 1 if offensive
- 2 if neither

## Drop unused columns

In [68]:
df = df[['class', 'tweet']]
df

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies


## Split dataset

In [69]:
from sklearn.model_selection import train_test_split
X = df['tweet']
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

def check_ratio(feat_df: pd.DataFrame, df: pd.DataFrame, header: str) -> None:
    print(header + ' : {0} ({1:0.2f}%)'.format(len(feat_df), (len(feat_df)/len(df)) * 100.0))

# Verify split ratios
print('{0:0.2f}% in training set'.format((len(X_train)/len(df.index)) * 100))
print('{0:0.2f}% in test set'.format((len(X_test)/len(df.index)) * 100))
print('')
check_ratio(df.loc[df['class'] == 0], df.index, 'Original Hate Speech')
check_ratio(df.loc[df['class'] == 1], df.index, 'Original Offensive')
check_ratio(df.loc[df['class'] == 2], df.index, 'Original Neither')
print('')
check_ratio(y_train[y_train[:] == 0], y_train, 'Training Hate Speech')
check_ratio(y_train[y_train[:] == 1], y_train, 'Training Offensive')
check_ratio(y_train[y_train[:] == 2], y_train, 'Training Neither')
print('')
check_ratio(y_test[y_test[:] == 0], y_test, 'Test Hate Speech')
check_ratio(y_test[y_test[:] == 1], y_test, 'Test Offensive')
check_ratio(y_test[y_test[:] == 2], y_test, 'Test Neither')

80.00% in training set
20.00% in test set

Original Hate Speech : 1430 (5.77%)
Original Offensive : 19190 (77.43%)
Original Neither : 4163 (16.80%)

Training Hate Speech : 1140 (5.75%)
Training Offensive : 15358 (77.46%)
Training Neither : 3328 (16.79%)

Test Hate Speech : 290 (5.85%)
Test Offensive : 3832 (77.30%)
Test Neither : 835 (16.84%)


## Tokenization

In [70]:
from transformers import RobertaTokenizer, BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [71]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [77]:
# tokens = tokenizer.batch_encode_plus(
#     X_train.values,
#     max_length=256,
#     padding=True,
#     return_tensors='pt'
# )

tokens = tokenizer.batch_encode_plus(
    X_train.values,
    return_attention_mask=True,
    truncation=True,
    max_length=256,
    # pad_to_max_length=True,
    padding='max_length',
    return_tensors='pt'
)

# test_tokens = tokenizer.batch_encode_plus(
#     X_test.values,
#     max_length=280,
#     padding=True,
#     return_tensors='pt'
# )

In [79]:
print(type(X_train.values))

<class 'numpy.ndarray'>


In [None]:
dataset = TensorDataset(
    torch.tensor(tokens['input_ids']),
    torch.tensor(tokens['attention_mask']),
    torch.tensor(y_train.values)
)

print(y_train.values)

print(torch.tensor(tokens['input_ids']).size())
print(torch.tensor(tokens['attention_mask']).size())
print(torch.tensor(y_train.values).size())
# test_ds = TensorDataset(
#     torch.tensor(test_tokens['input_ids']),
#     torch.tensor(test_tokens['attention_mask']),
#     torch.tensor(y_test.values)
# )

[0 2 1 ... 1 1 1]
torch.Size([19826, 256])
torch.Size([19826, 256])
torch.Size([19826])


  torch.tensor(tokens['input_ids']),
  torch.tensor(tokens['attention_mask']),
  print(torch.tensor(tokens['input_ids']).size())
  print(torch.tensor(tokens['attention_mask']).size())


## Building DataLoader

In [None]:
dataloader = DataLoader(
    dataset,
    batch_size=32,
    sampler=RandomSampler(dataset)
)

## Initializing Model

In [None]:
from transformers import RobertaConfig, RobertaForSequenceClassification, BertForSequenceClassification

# config = RobertaConfig(
#     vocab_size=tokenizer.vocab_size,
#     max_position_embeddings=258,
#     hidden_size=768,
#     num_attention_heads=12,
#     num_hidden_layers=12,
#     type_vocab_size=3
# )

# model = RobertaForSequenceClassification(config)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3, output_attentions=False, output_hidden_states=False)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3, output_attentions=False, output_hidden_states=False)

## Training

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(3, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [None]:
from transformers import AdamW
# model.train()
# AdamW optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)



In [None]:
from tqdm import tqdm
epochs = 5

# for epoch in range(epochs):
#     # setup loop with TQDM and dataloader
#     loop = tqdm(dataloader, leave=False)
#     for batch in loop:
#         # initialize calculated gradients (from prev step)
#         optimizer.zero_grad()
#         # pull all tensor batches required for training
#         batch = tuple(b.to(device) for b in batch)

#         inputs = {
#             'input_ids':      batch[0],
#             'attention_mask': batch[1],
#             'labels':         batch[2],
#         }
#         # input_ids = batch['input_ids'].to(device)
#         # attention_mask = batch['attention_mask'].to(device)
#         # labels = batch['labels'].to(device)
#         # input_ids = torch.tensor(tokens['input_ids']),
#         # attention_mask = torch.tensor(tokens['attention_mask']),
#         # labels = torch.tensor(y_train.values)
#         # process
#         outputs = model(**inputs)
#         # outputs = model(input_ids, attention_mask=attention_mask,
#         #                 labels=labels)
#         # extract loss
#         loss = outputs.loss
#         # calculate loss for every parameter that needs grad update
#         loss.backward()
#         # update parameters
#         optimizer.step()
#         # print relevant info to progress bar
#         loop.set_description(f'Epoch {epoch}')
#         loop.set_postfix(loss=loss.item())

# model.save_pretrained('./trained_models')



for epoch in tqdm(range(1, epochs)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        # loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
        
#     tqdm.write(f'\nEpoch {epoch}')
    
#     loss_train_avg = loss_train_total/len(dataloader_train)            
#     tqdm.write(f'Training loss: {loss_train_avg}')
    
#     val_loss, predictions, true_vals = evaluate(dataloader_val)
#     val_f1 = f1_score_func(predictions, true_vals)
#     tqdm.write(f'Validation loss: {val_loss}')
#     tqdm.write(f'F1 Score (Weighted): {val_f1}')

torch.save(model.state_dict(), f'finetuned_BERT.model')

  0%|          | 0/4 [00:43<?, ?it/s]


IndexError: Target 2 is out of bounds.

In [None]:
from transformers import Trainer, TrainingArguments
from torch import nn

training_args = TrainingArguments(
    output_dir="./trained",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    save_steps=500,
    save_total_limit=2,
    seed=1
)

class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.get("labels")
            # forward pass
            outputs = model(**inputs)
            logits = outputs.get("logits")
            # compute custom loss (suppose one has 3 labels with different weights)
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 1.0, 3.0]))
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss
            
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataloader
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 620
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  0%|          | 0/200 [00:00<?, ?it/s]

TypeError: 'DataLoader' object is not subscriptable