## Imports

In [15]:
import numpy as np
import pandas as pd

## Load dataset

In [16]:
df = pd.read_csv('offensive_tweet_dataset/labeled_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [17]:
## Check for any null values
df.isnull().values.any()

False

Relevant columns: class (label) & tweet (data)  
class is labeled as:
- 0 if hate speech
- 1 if offensive
- 2 if neither

## Drop unused columns

In [18]:
df = df[['class', 'tweet']]
df

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies


## Split dataset

In [19]:
from sklearn.model_selection import train_test_split
X = df['tweet']
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

def check_ratio(feat_df: pd.DataFrame, df: pd.DataFrame, header: str) -> None:
    print(header + ' : {0} ({1:0.2f}%)'.format(len(feat_df), (len(feat_df)/len(df)) * 100.0))

# Verify split ratios
print('{0:0.2f}% in training set'.format((len(X_train)/len(df.index)) * 100))
print('{0:0.2f}% in test set'.format((len(X_test)/len(df.index)) * 100))
print('')
check_ratio(df.loc[df['class'] == 0], df.index, 'Original Hate Speech')
check_ratio(df.loc[df['class'] == 1], df.index, 'Original Offensive')
check_ratio(df.loc[df['class'] == 2], df.index, 'Original Neither')
print('')
check_ratio(y_train[y_train[:] == 0], y_train, 'Training Hate Speech')
check_ratio(y_train[y_train[:] == 1], y_train, 'Training Offensive')
check_ratio(y_train[y_train[:] == 2], y_train, 'Training Neither')
print('')
check_ratio(y_test[y_test[:] == 0], y_test, 'Test Hate Speech')
check_ratio(y_test[y_test[:] == 1], y_test, 'Test Offensive')
check_ratio(y_test[y_test[:] == 2], y_test, 'Test Neither')

80.00% in training set
20.00% in test set

Original Hate Speech : 1430 (5.77%)
Original Offensive : 19190 (77.43%)
Original Neither : 4163 (16.80%)

Training Hate Speech : 1140 (5.75%)
Training Offensive : 15358 (77.46%)
Training Neither : 3328 (16.79%)

Test Hate Speech : 290 (5.85%)
Test Offensive : 3832 (77.30%)
Test Neither : 835 (16.84%)


## Tokenization

In [20]:
from transformers import RobertaTokenizer, BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [21]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [22]:
tokens = tokenizer.batch_encode_plus(
    X_train.values,
    return_attention_mask=True,
    truncation=True,
    max_length=256,
    padding='max_length',
    return_tensors='pt'
)

In [23]:
print(type(X_train.values))

<class 'numpy.ndarray'>


In [24]:
dataset = TensorDataset(
    torch.tensor(tokens['input_ids']),
    torch.tensor(tokens['attention_mask']),
    torch.tensor(y_train.values)
)

print(torch.tensor(tokens['input_ids']).size())
print(torch.tensor(tokens['attention_mask']).size())
print(torch.tensor(y_train.values).size())

torch.Size([19826, 256])
torch.Size([19826, 256])
torch.Size([19826])


  
  This is separate from the ipykernel package so we can avoid doing imports until
  import sys
  


## Building DataLoader

In [25]:
dataloader = DataLoader(
    dataset,
    batch_size=32,
    sampler=RandomSampler(dataset)
)

## Initializing Model

In [26]:
from transformers import RobertaConfig, RobertaForSequenceClassification, BertForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3, output_attentions=False, output_hidden_states=False)

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

## Training

In [27]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [28]:
from transformers import AdamW
# AdamW optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)



In [29]:
from tqdm import tqdm

model.train()

epochs = 5
for epoch in range(epochs):
    # Setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=False)
    for batch in loop:
        # Initialize gradients
        optimizer.zero_grad()
        # Pull all tensor batches required for training
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids':      batch[0],
            'attention_mask': batch[1],
            'labels':         batch[2],
        }
        outputs = model(**inputs)
        # Calculate loss
        loss = outputs.loss
        loss.backward()
        # Update model parameters
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

model.save_pretrained('./trained_models')

