# BERT Model: Real Or Not?

Reference for code (code has been slightly modified for our purposes): https://medium.com/@aniruddha.choudhury94/part-2-bert-fine-tuning-tutorial-with-pytorch-for-text-classification-on-the-corpus-of-linguistic-18057ce330e1

# Google Colab Setup

In [None]:
# Fail fast unless TensorFlow can see a GPU.
# In Google Colab: Edit -> Notebook settings -> Hardware accelerator -> 'GPU'.
import tensorflow as tf

device_name = tf.test.gpu_device_name()

# An empty device name means no GPU was found, so stop here with a clear hint.
if not device_name:
  raise SystemError('Run this in Google Colab with GPU hardware accelerator active')

print('GPU: {}'.format(device_name))

GPU: /device:GPU:0


In [None]:
# Selects the torch device that batches (and the model) are moved to later on.
import torch
if torch.cuda.is_available():
  device = torch.device('cuda')
  print('Using:', torch.cuda.get_device_name(0))
else:
  # Without this branch `device` would be undefined and every later
  # `.to(device)` call would die with a NameError instead of a clear message.
  device = torch.device('cpu')
  print('Using: CPU (no CUDA device visible to torch)')

Using: Tesla T4


In [None]:
# Downloads Kaggle dataset
# https://www.kaggle.com/general/51898
import os

if not os.path.exists('./sample_submission.csv'):
  os.environ['KAGGLE_USERNAME'] = 'sadiec'
  os.environ['KAGGLE_KEY'] = '4229579af4a23afdcc918693cc59282e'
  !kaggle competitions download -c nlp-getting-started

In [None]:
# To get PyTorch working, I had to install these pinned CPU builds (see the linked issue):
#https://github.com/pytorch/pytorch/issues/19406#issuecomment-581178550
#pip install torch==1.2.0+cpu torchvision==0.4.0+cpu -f https://download.pytorch.org/whl/torch_stable.html

# Importing Data

In [None]:
import pandas as pd

# Columns shared by both CSV files; the training file additionally carries the label.
feature_columns = ['id','keyword','location','text']

# header=0 together with names= replaces the header row found in each file.
train_df = pd.read_csv('./train.csv', header=0, names=feature_columns + ['target'])
test_df = pd.read_csv('./test.csv', header=0, names=feature_columns)

# Show a few random training rows as a sanity check.
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target
2643,3795,destruction,,RSS: Russia begins mass destruction of illegal...,1
2880,4140,drought,Ashxjonespr@gmail.com,Thought it was a drought @_ASHJ? http://t.co/V...,1
2314,3323,demolished,Beautiful British Columbia,They absolutely demolished the sounders from s...,0
847,1227,blizzard,Ideally under a big tree,That horrible moment when u open up the dryer ...,0
7573,10824,wrecked,"Denton, Texas",Had an awesome time gettin wrecked at bowling ...,0


In [None]:
# Pull the raw tweet text (and labels, for the training set) out as arrays.
train_tweets = train_df['text'].values
train_targets = train_df['target'].values

test_tweets = test_df['text'].values

# First example and its label, each on its own line.
print(train_tweets[0], train_targets[0], sep='\n')

# Both arrays must have the same length (one label per tweet).
print(train_tweets.shape, train_targets.shape, sep='\n')

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
1
(7613,)
(7613,)


# Tokenizing

In [None]:
!pip install transformers
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)



In [None]:
# Every tweet is truncated/padded to this many token ids.
MAX_LENGTH = 80

def tokenize_tweets(tweets, max_length=MAX_LENGTH):
  """Encode each tweet into a fixed-length list of BERT token ids.

  Args:
    tweets: iterable of raw tweet strings.
    max_length: length every encoded sequence is truncated or padded to.

  Returns:
    A list with one list of `max_length` token ids per tweet, in input order.
  """
  # padding='max_length' replaces the deprecated pad_to_max_length=True and
  # produces the same fixed-length, zero-padded sequences.
  return [tokenizer.encode(tweet,
                           add_special_tokens=True,
                           truncation=True,
                           padding='max_length',
                           max_length=max_length)
          for tweet in tweets]

# Tokenizing Training Data
train_tokenized = tokenize_tweets(train_tweets)

print('Tweet:', train_tweets[0])
print('Tokenized:',train_tokenized[0])

# Tokenizing Testing Data
test_tokenized = tokenize_tweets(test_tweets)

Tweet: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Tokenized: [101, 2256, 15616, 2024, 1996, 3114, 1997, 2023, 1001, 8372, 2089, 16455, 9641, 2149, 2035, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Attention Masks

In [None]:
def build_attention_masks(tokenized_sequences, pad_token_id=0):
  """Build attention masks that mark real tokens as 1 and padding as 0.

  Args:
    tokenized_sequences: iterable of token-id lists (already padded).
    pad_token_id: id used for padding; the tokenized output above pads with 0.

  Returns:
    A list of masks, one per sequence, each the same length as its sequence.
  """
  return [[int(token != pad_token_id) for token in tokenized]
          for tokenized in tokenized_sequences]

# Creating attention masks for training data
train_attention_masks = build_attention_masks(train_tokenized)

# Creating attention masks for testing data
test_attention_masks = build_attention_masks(test_tokenized)

print(train_tweets[0])
print(train_attention_masks[0])

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Training/Testing Split

In [None]:
# TODO: Use solely training data to train model and classify unlabeled testing data
# Splitting data to use in preliminary analysis of BERT Model
from sklearn.model_selection import train_test_split

# Split token ids, attention masks and targets in ONE call so the three stay
# row-aligned by construction, instead of relying on two separate calls
# happening to share the same random_state and test_size.
(train_inputs, prelim_test_inputs,
 train_masks, prelim_test_masks,
 train_labels, prelim_test_targets) = train_test_split(train_tokenized,
                                                       train_attention_masks,
                                                       train_targets,
                                                       random_state = 2018,
                                                       test_size = 0.1)

# Spot-check one example: token ids, label and mask should belong together.
print(train_inputs[0])
print(train_labels[0])
print(train_masks[0])

[101, 4151, 10231, 1030, 2332, 8029, 2705, 16147, 2683, 2683, 2026, 3042, 2074, 9913, 1012, 5292, 3270, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
0
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Tensor Creation

In [None]:
# Convert the training split (token ids, labels, attention masks) to tensors.
train_inputs, train_labels, train_masks = map(
    torch.tensor, (train_inputs, train_labels, train_masks))

# The preliminary hold-out split is currently not converted. To enable it:
#prelim_test_inputs = torch.tensor(prelim_test_inputs)
#prelim_test_labels = torch.tensor(prelim_test_targets)
#prelim_test_masks = torch.tensor(prelim_test_masks)

In [None]:
# Tensors for the unlabeled Kaggle test set: token ids and their attention masks.
test_inputs, test_masks = (torch.tensor(rows)
                           for rows in (test_tokenized, test_attention_masks))

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Mini-batch size; 16 and 32 are the candidates noted for tuning (TODO).
batch_size = 32

# Training slice: each item is an (input ids, attention mask, label) triple,
# drawn in random order every epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data,
                              batch_size=batch_size,
                              sampler=train_sampler)

# Preliminary slice: disabled, kept as a string literal (it is echoed as the cell output).
"""
prelim_test_data = TensorDataset(prelim_test_inputs, prelim_test_masks, prelim_test_labels)
prelim_test_sampler = RandomSampler(prelim_test_data)
prelim_test_dataloader = DataLoader(prelim_test_data,
                              sampler=prelim_test_sampler,
                              batch_size=batch_size)"""

'\nprelim_test_data = TensorDataset(prelim_test_inputs, prelim_test_masks, prelim_test_labels)\nprelim_test_sampler = RandomSampler(prelim_test_data)\nprelim_test_dataloader = DataLoader(prelim_test_data,\n                              sampler=prelim_test_sampler,\n                              batch_size=batch_size)'

# BERT Setup

In [None]:
# Loads pretrained BERT with a fresh 2-way classification head and places it on
# the same device the batches are sent to.
# pip install tensorflow
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                     num_labels=2,
                                                     output_attentions=False,
                                                     output_hidden_states=False)
# Use the shared `device` instead of a hardcoded .cuda() so the model and the
# tensors moved with .to(device) can never end up on different devices.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# Learning and Optimization

In [None]:
# AdamW over every model parameter, with a small fine-tuning learning rate.
optimizer = AdamW(params=model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
from transformers import get_linear_schedule_with_warmup

# One optimizer step is taken per batch, so the schedule spans
# (number of epochs) x (batches per epoch) steps, with no warmup phase.
epochs = 4
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                            num_training_steps=total_steps,
                                            num_warmup_steps=0)

# Accuracy Function

In [None]:
import numpy as np

def accuracy(classifications, targets):
  """Return the fraction of rows whose highest-scoring class equals the target.

  Args:
    classifications: 2-D array-like of per-class scores, one row per example.
    targets: numpy array of true class indices (any shape; it is flattened).

  Returns:
    Number of matching predictions divided by the number of targets.
  """
  predicted = np.asarray(classifications).argmax(axis=1).reshape(-1)
  expected = targets.reshape(-1)
  matches = np.sum(predicted == expected)
  return matches / expected.size

# Training

In [None]:
import random

# Seed every random number generator in play so runs are repeatable.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, in order.
loss_values = []
num_batches = len(train_dataloader)

for epoch_i in range(epochs):
    print('=====Epoch {:} / {:} ====='.format(epoch_i+1, epochs))
    total_loss = 0
    model.train() # enable training-mode behaviour

    for step, batch in enumerate(train_dataloader):
        # Progress line every 40 batches (but not for the very first one).
        if step > 0 and step % 40 == 0:
            print('Batch {:>5,} of {:>5,}.'.format(step, num_batches))

        # Each batch is (input ids, attention mask, labels); move all to the device.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Labels are passed in, and the first output element is used as the loss.
        outputs = model(b_input_ids,
                        token_type_ids = None,
                        attention_mask = b_input_mask,
                        labels = b_labels)
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the weights, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / num_batches
    loss_values.append(avg_train_loss)
    print('Average Training Loss: {0:.2f}'.format(avg_train_loss))

=====Epoch 1 / 4 =====
Batch    40 of   215.
Batch    80 of   215.
Batch   120 of   215.
Batch   160 of   215.
Batch   200 of   215.
Average Training Loss: 0.45
=====Epoch 2 / 4 =====
Batch    40 of   215.
Batch    80 of   215.
Batch   120 of   215.
Batch   160 of   215.
Batch   200 of   215.
Average Training Loss: 0.34
=====Epoch 3 / 4 =====
Batch    40 of   215.
Batch    80 of   215.
Batch   120 of   215.
Batch   160 of   215.
Batch   200 of   215.
Average Training Loss: 0.26
=====Epoch 4 / 4 =====
Batch    40 of   215.
Batch    80 of   215.
Batch   120 of   215.
Batch   160 of   215.
Batch   200 of   215.
Average Training Loss: 0.21



# Preliminary Analysis

In [None]:
# NOTE: this whole cell is a single string literal, i.e. the evaluation code
# below is disabled and is only echoed back as the cell's output.
# It iterates over `prelim_test_dataloader`, whose construction is likewise
# disabled (kept as a string) in the DataLoader cell, so enabling this cell
# requires enabling the preliminary tensors/dataloader first.
# What the disabled code does: switches the model to eval mode, runs each
# hold-out batch without gradients, and averages the per-batch `accuracy`.
"""model.eval()
eval_loss, eval_accuracy = 0,0
nb_eval_steps, nb_eval_examples = 0,0

for batch in prelim_test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids = None,
                        attention_mask = b_input_mask)
    
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    tmp_eval_accuracy = accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print('Accuracy: {0:.2f}'.format(eval_accuracy/nb_eval_steps))"""

"model.eval()\neval_loss, eval_accuracy = 0,0\nnb_eval_steps, nb_eval_examples = 0,0\n\nfor batch in prelim_test_dataloader:\n    batch = tuple(t.to(device) for t in batch)\n    b_input_ids, b_input_mask, b_labels = batch\n    \n    with torch.no_grad():\n        outputs = model(b_input_ids,\n                        token_type_ids = None,\n                        attention_mask = b_input_mask)\n    \n    logits = outputs[0].detach().cpu().numpy()\n    label_ids = b_labels.to('cpu').numpy()\n    \n    tmp_eval_accuracy = accuracy(logits, label_ids)\n    \n    eval_accuracy += tmp_eval_accuracy\n    nb_eval_steps += 1\n\nprint('Accuracy: {0:.2f}'.format(eval_accuracy/nb_eval_steps))"

# Kaggle Submission

In [None]:
# Runs the fine-tuned model over the unlabeled Kaggle test tweets.

# Switch to evaluation mode so dropout is disabled during inference.
model.eval()

# Fix: the inputs must come from the TEST set. The original moved `train_inputs`
# here and paired them with `test_masks`, which have a different number of rows.
test_inputs_gpu = test_inputs.to(device)
test_masks_gpu = test_masks.to(device)

# Predict in mini-batches rather than one forward pass over the whole test set,
# which keeps peak GPU memory bounded.
logit_batches = []
with torch.no_grad():
  for start in range(0, len(test_inputs_gpu), batch_size):
    stop = start + batch_size
    # Pass the mask by keyword so it cannot be bound to the wrong argument.
    batch_outputs = model(test_inputs_gpu[start:stop],
                          token_type_ids = None,
                          attention_mask = test_masks_gpu[start:stop])
    # With no labels passed, element 0 is taken to be the logits (this matches
    # how the disabled evaluation code reads `outputs[0]`).
    logit_batches.append(batch_outputs[0].detach().cpu())

test_logits = torch.cat(logit_batches, dim=0)

# Keep `outputs` indexable the same way as before (`outputs[0]` -> logits).
outputs = (test_logits,)

# Predicted class per test tweet: index of the larger of the two logits.
test_predictions = test_logits.argmax(dim=1).numpy()

RuntimeError: ignored