### COLA - Corpus of Linguistic Acceptability

Each example is a sequence of words annotated with whether it is a grammatical English sentence.

In [70]:
import tensorflow as tf
import tensorflow_datasets
from transformers import *

batch_size_train = 32
batch_size_val = 32
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/cola')

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='cola')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='cola')
train_dataset = train_dataset.shuffle(100).batch(batch_size_train).repeat(2)
valid_dataset = valid_dataset.batch(batch_size_val)

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/Users/shivangi/tensorflow_datasets/glue/cola/0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from /Users/shivangi/tensorflow_datasets/glue/cola/0.0.2


Number of elements: 1042


In [74]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
learning_rate = 2e-5 #1e-5 2e-5 3e-5 5e-5
epsilon = 1e-08
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [75]:
for steps, _ in enumerate(train_dataset):
    pass

print(f'Number of elements: {steps}')

Number of elements: 535


In [76]:
epochs = 1 #3
steps_per_epoch = steps
history = model.fit(train_dataset, epochs=epochs, steps_per_epoch= steps_per_epoch,
                    validation_data=valid_dataset, validation_steps=7)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
  1/535 [..............................] - ETA: 0s - loss: 1.2760 - accuracy: 0.2188

KeyboardInterrupt: 

training completed

### save and test the model

In [59]:
import os
def create_folder(folder_name):
    path = os.getcwd()
    path = path + '/' + folder_name
    try:
        os.mkdir(path)
    except OSError:
        print("Creation of the directory {} failed. Already exists".format(path))
    else:
        print("Successfully created the directory {}".format(path))

In [60]:
create_folder('save_cola')
# Load the TensorFlow model in PyTorch for inspection
model.save_pretrained('./save_cola/')
pytorch_model = BertForSequenceClassification.from_pretrained('./save_cola/', from_tf=True)

Creation of the directory /Users/shivangi/Documents/Dissertation/BERT/save_cola failed. Already exists


In [61]:
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
sentence_0 = "This research was consistent with his findings."
sentence_1 = "something laugher something"
inputs_1 = tokenizer.encode_plus(sentence_0, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_1, add_special_tokens=True, return_tensors='pt')

pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()

print (pred_1, pred_2)

0 0


In [57]:
print(inputs_1)

{'input_ids': tensor([[ 101, 1188, 1844, 1108, 8080, 1114, 1117, 9505,  119,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


The `tokenizer.encode_plus` function combines multiple steps for us:

1. Split the sentence into tokens.
2. Add the special `[CLS]` and `[SEP]` tokens.
3. Map the tokens to their IDs.
4. Pad or truncate all sentences to the same length.
5. Create the attention masks which explicitly differentiate real tokens from `[PAD]` tokens.

