In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU device (if any) it can see.
device_name = tf.test.gpu_device_name()

# Abort early unless the first GPU is the one reported.
if device_name != '/device:GPU:0':
  raise SystemError('ERROR')

print('GPU: {}'.format(device_name))

GPU: /device:GPU:0


In [None]:
import torch

# Pick the device that batches are moved to later in the notebook.
# `device` is always bound: without the CPU fallback a machine with no CUDA
# would reach the training cell and fail with a NameError on `device`.
if torch.cuda.is_available():
  device = torch.device('cuda')
  print('Using:', torch.cuda.get_device_name(0))
else:
  device = torch.device('cpu')
  print('Using: CPU')

Using: Tesla P100-PCIE-16GB


In [None]:
import os
import zipfile

import wget

# Download the CoLA archive once; skip when it is already on disk.
if not os.path.exists('./cola_public_1.1.zip'):
  wget.download('https://nyu-mll.github.io/CoLA/cola_public_1.1.zip', './cola_public_1.1.zip')

# Extract into the working directory once. Using the standard library instead
# of the `!unzip` shell escape removes the dependency on an external binary
# and keeps the cell valid Python.
if not os.path.exists('./cola_public/'):
  with zipfile.ZipFile('./cola_public_1.1.zip') as archive:
    archive.extractall('.')

##### Reference for code: https://medium.com/@aniruddha.choudhury94/part-2-bert-fine-tuning-tutorial-with-pytorch-for-text-classification-on-the-corpus-of-linguistic-18057ce330e1



In [None]:
#https://github.com/pytorch/pytorch/issues/19406#issuecomment-581178550
#pip install torch==1.2.0+cpu torchvision==0.4.0+cpu -f https://download.pytorch.org/whl/torch_stable.html

# Importing Data

In [None]:
import pandas as pd

# Both TSV files are header-less and share the same four columns, so the
# parsing options are declared once and reused for each read.
read_opts = dict(delimiter='\t',
                 header=None,
                 names=['sentence_source', 'label', 'label_notes', 'sentence'])

# Training split.
df = pd.read_csv('./cola_public/raw/in_domain_train.tsv', **read_opts)
print('Number of tuples: {:,}'.format(df.shape[0]))

# In-domain dev split, kept in `df_test`.
df_test = pd.read_csv('./cola_public/raw/in_domain_dev.tsv', **read_opts)
print('Number of tuples: {:,}'.format(df_test.shape[0]))

# Show a random handful of rows as the cell output.
df_test.sample(10)

Number of tuples: 8,551
Number of tuples: 527


Unnamed: 0,sentence_source,label,label_notes,sentence
107,r-67,1,,Maxwell is quite a doctor.
360,c_13,1,,In the classroom John put the book on the table.
381,c_13,1,,The children admire their mother.
302,ks08,1,,How did you guess that he fixed the computer?
30,bc01,1,,Water bubbled up out of the kettle.
452,sks13,0,*,Mary wonders that Bill will come.
290,ks08,1,,"George has spent a lot of money, hasn't he?"
493,ad03,1,,It's Anson that I like
143,l-93,1,,We pulled free.
46,bc01,1,,It is nice to go abroad.


In [None]:
# Pull the raw sentence / label columns out of each frame as arrays.
sentences, labels = df['sentence'].values, df['label'].values
sentences_test, labels_test = df_test['sentence'].values, df_test['label'].values

# Tokenizing

In [None]:
from transformers import BertTokenizer

# Every sentence is truncated / padded to exactly this many token ids.
MAX_LEN = 64

# One tokenizer instance serves both the training and the test sentences.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)


def encode_sentences(sentence_list, max_length=MAX_LEN):
    """Encode each sentence into a fixed-length list of BERT token ids.

    Args:
        sentence_list: iterable of raw sentence strings.
        max_length: length every encoded sequence is truncated / padded to.

    Returns:
        A list with one list of token ids per input sentence.
    """
    return [tokenizer.encode(sent,
                             add_special_tokens=True,
                             truncation=True,
                             # `padding='max_length'` is the supported
                             # replacement for the deprecated
                             # `pad_to_max_length=True` flag.
                             padding='max_length',
                             max_length=max_length)
            for sent in sentence_list]


input_ids = encode_sentences(sentences)
print('Original:', sentences[0])
print('Token IDs:', input_ids[0])

input_ids_test = encode_sentences(sentences_test)
print('Original:', sentences_test[0])
print('Token IDs:', input_ids_test[0])

Original: Our friends won't buy this analysis, let alone the next one we propose.
Token IDs: [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Original: The sailors rode the breeze clear of the rocks.
Token IDs: [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Attention Masks

In [None]:
# A mask entry is 1 where the token id is positive and 0 where it is not
# (the padded positions in the printed examples are id 0).
attention_masks = [[int(token_id > 0) for token_id in seq] for seq in input_ids]
attention_masks_test = [[int(token_id > 0) for token_id in seq] for seq in input_ids_test]

print(attention_masks[0])
print(attention_masks_test[0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Training/Test Creation

In [None]:
from sklearn.model_selection import train_test_split

# Split token ids, attention masks and labels in ONE call so the three stay
# row-aligned by construction, instead of relying on two separate calls that
# happen to share the same random_state. 10% of the rows go to validation.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(input_ids,
                                                     attention_masks,
                                                     labels,
                                                     random_state = 2018,
                                                     test_size = 0.1)
print(train_inputs[0])
print(train_masks[0])

[101, 2002, 2939, 1996, 3328, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Tensors

In [None]:
import torch

# Training / validation splits -> tensors (names are rebound in place).
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

# Held-out test data -> tensors.
test_inputs = torch.tensor(input_ids_test)
test_masks = torch.tensor(attention_masks_test)
test_labels = torch.tensor(labels_test)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order, validation batches in sequence.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=batch_size)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=batch_size)

# BERT Model

In [None]:
# pip install tensorflow
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Pretrained BERT encoder with a 2-label classification head; attention
# weights and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                     num_labels=2,
                                                     output_attentions=False,
                                                     output_hidden_states=False)

# Move the model to the same `device` the batches are sent to, rather than
# hard-coding CUDA. `.to()` returns the module, so the cell still displays it.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Materialise the (name, tensor) pairs so they can be sliced by position.
params = list(model.named_parameters())
print('The BERT Model has {:} parameters'.format(len(params)))

# (banner, slice of named parameters) for each group that gets printed.
sections = (
    ('=====Embedding Layer=====', params[0:5]),
    ('=====First Transformer=====', params[5:21]),
    ('=====Output Layer=====', params[-4:]),
)

for banner, group in sections:
    print(banner)
    for name, tensor in group:
        # Left-aligned parameter name, right-aligned shape tuple.
        print('{:<55} {:>12}'.format(name, str(tuple(tensor.size()))))

The BERT Model has 201 parameters
=====Embedding Layer=====
bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)
=====First Transformer=====
bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (768,)
bert.encoder.la

# Learning and Optimization

In [None]:
from transformers import get_linear_schedule_with_warmup

# One scheduler step is taken per training batch, so the schedule length is
# (number of batches per epoch) * (number of epochs).
epochs = 4
total_steps = len(train_dataloader) * epochs

optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

# Linear schedule over the whole run, with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)

# Training

In [None]:
# To test accuracy
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of integer labels; flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

# Formatting times
import time
import datetime

def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: duration in seconds; rounded to the nearest whole second.

    Returns:
        The string form of the corresponding ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [None]:
import random

# Seed Python, NumPy and torch (CPU and every CUDA device) RNGs.
# Note: this happens after the model was created in an earlier cell.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, in order.
loss_values = []

for epoch_i in range(epochs):
    print('=====Epoch {:} / {:} ====='.format(epoch_i+1, epochs))
    t0 = time.time()   # epoch start time
    total_loss = 0
    model.train()      # switch the model to training mode

    for step, batch in enumerate(train_dataloader):
        # Progress report every 40 batches (but not before the first batch).
        if step != 0 and step % 40 == 0:
            elapsed = format_time(time.time() - t0)
            print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Batch layout: (input ids, attention mask, labels).
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Labels are passed in, so the first output element is the loss.
        outputs = model(b_input_ids,
                        token_type_ids = None,
                        attention_mask = b_input_mask,
                        labels = b_labels)
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)
    print('Average Training Loss: {0:.2f}'.format(avg_train_loss))
    print('Training Epoch Took: {:}'.format(format_time(time.time() - t0)))

=====Epoch 1 / 4 =====
Batch    40 of   241. Elapsed: 0:00:08.
Batch    80 of   241. Elapsed: 0:00:16.
Batch   120 of   241. Elapsed: 0:00:25.
Batch   160 of   241. Elapsed: 0:00:33.
Batch   200 of   241. Elapsed: 0:00:41.
Batch   240 of   241. Elapsed: 0:00:49.
Average Training Loss: 0.48
Training Epoch Took: 0:00:49
=====Epoch 2 / 4 =====
Batch    40 of   241. Elapsed: 0:00:08.
Batch    80 of   241. Elapsed: 0:00:16.
Batch   120 of   241. Elapsed: 0:00:24.
Batch   160 of   241. Elapsed: 0:00:33.
Batch   200 of   241. Elapsed: 0:00:41.
Batch   240 of   241. Elapsed: 0:00:49.
Average Training Loss: 0.30
Training Epoch Took: 0:00:49
=====Epoch 3 / 4 =====
Batch    40 of   241. Elapsed: 0:00:08.
Batch    80 of   241. Elapsed: 0:00:16.
Batch   120 of   241. Elapsed: 0:00:24.
Batch   160 of   241. Elapsed: 0:00:33.
Batch   200 of   241. Elapsed: 0:00:41.
Batch   240 of   241. Elapsed: 0:00:49.
Average Training Loss: 0.19
Training Epoch Took: 0:00:49
=====Epoch 4 / 4 =====
Batch    40 of   

# Validation

In [None]:
t0 = time.time()
model.eval()  # switch the model to evaluation mode

# Accuracy is accumulated per example, not per batch: averaging per-batch
# accuracies over-weights a final batch that is smaller than the others.
nb_eval_correct, nb_eval_examples = 0, 0
nb_eval_steps = 0

for batch in validation_dataloader:
    # Batch layout: (input ids, attention mask, labels).
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids = None,
                        attention_mask = b_input_mask)

    # No labels were passed, so the first output element holds the logits.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions = np.argmax(logits, axis=1).flatten()
    nb_eval_correct += int(np.sum(predictions == label_ids.flatten()))
    nb_eval_examples += len(label_ids)
    nb_eval_steps += 1

# Fraction of validation examples classified correctly.
eval_accuracy = nb_eval_correct / nb_eval_examples

print('Accuracy: {0:.2f}'.format(eval_accuracy))
print('Validation elapsed time: {:}'.format(format_time(time.time() - t0)))

Accuracy: 0.83
Validation elapsed time: 0:00:02


In [None]:
import plotly.express as px

# One row per epoch: the frame index is the epoch number, 'Loss' its average loss.
f = pd.DataFrame(loss_values, columns=['Loss'])

fig = px.line(f, x=f.index, y=f['Loss'])
fig.update_layout(
    title='Training Loss',
    xaxis_title='Epoch',
    yaxis_title='Loss',
)
fig.show()