### Bert finetuning on MRPC data
Pytorch + TensorFlow 2.0

https://github.com/huggingface/transformers#quick-tour-tf-20-training-and-pytorch-interoperability

Paraphrasing task

In [1]:
import tensorflow as tf
import tensorflow_datasets
from transformers import *
%load_ext tensorboard
import datetime

In [2]:
batch_size_train = 32
batch_size_val = 64
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/mrpc')

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/Users/shivangi/tensorflow_datasets/glue/mrpc/0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from /Users/shivangi/tensorflow_datasets/glue/mrpc/0.0.2


In [3]:
# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
train_dataset = train_dataset.shuffle(100).batch(batch_size_train).repeat(2)
valid_dataset = valid_dataset.batch(batch_size_val)

In [4]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule

learning_rate = 3e-5 #1e-5 2e-5 3e-5 5e-5
epsilon = 1e-08

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [5]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [11]:
# Train and evaluate using tf.keras.Model.fit()

# history = model.fit(train_dataset, epochs=3, steps_per_epoch=115,
#                     validation_data=valid_dataset, validation_steps=7)
epochs = 1 #3
train_data_size = 3668
steps_per_epoch = int(train_data_size / batch_size_train)
history = model.fit(train_dataset, epochs=epochs, steps_per_epoch=steps_per_epoch,
                    validation_data=valid_dataset, validation_steps=7,
                   callbacks=[tensorboard_callback])



In [12]:
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 73941), started 0:09:30 ago. (Use '!kill 73941' to kill it.)

### end of training

In [26]:
# Load the TensorFlow model in PyTorch for inspection
model.save_pretrained('./save/')
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)

In [35]:
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
sentence_0 = "This is wrong."
sentence_1 = "that was right."
sentence_2 = "This is incorrect."
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')

pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()

print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")

sentence_1 is not a paraphrase of sentence_0
sentence_2 is a paraphrase of sentence_0


## trial and errors

In [7]:
import torch
from thop import profile

In [10]:
inputs = torch.randn(1, 1)
inputs = torch.tensor(inputs).to(torch.int64)
macs, params = profile(pytorch_model,(inputs,))

[91m[WARN] Cannot find rule for <class 'torch.nn.modules.sparse.Embedding'>. Treat it as zero Macs and zero Params.[00m
[91m[WARN] Cannot find rule for <class 'torch.nn.modules.normalization.LayerNorm'>. Treat it as zero Macs and zero Params.[00m
[INFO] Register zero_ops() for <class 'torch.nn.modules.dropout.Dropout'>.
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertEmbeddings'>. Treat it as zero Macs and zero Params.[00m
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertSelfAttention'>. Treat it as zero Macs and zero Params.[00m
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertSelfOutput'>. Treat it as zero Macs and zero Params.[00m
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertAttention'>. Treat it as zero Macs and zero Params.[00m
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertInt

  


In [11]:
print(macs, params)

171052032.0 85609730.0


In [12]:
model_new = BertModel.from_pretrained('bert-base-cased')

In [13]:
inputs = torch.randn(1, 1)
inputs = torch.tensor(inputs).to(torch.int64)
macs, params = profile(model_new,(inputs,))

[91m[WARN] Cannot find rule for <class 'torch.nn.modules.sparse.Embedding'>. Treat it as zero Macs and zero Params.[00m
[91m[WARN] Cannot find rule for <class 'torch.nn.modules.normalization.LayerNorm'>. Treat it as zero Macs and zero Params.[00m
[INFO] Register zero_ops() for <class 'torch.nn.modules.dropout.Dropout'>.
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertEmbeddings'>. Treat it as zero Macs and zero Params.[00m
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertSelfAttention'>. Treat it as zero Macs and zero Params.[00m
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertSelfOutput'>. Treat it as zero Macs and zero Params.[00m
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertAttention'>. Treat it as zero Macs and zero Params.[00m
[91m[WARN] Cannot find rule for <class 'transformers.modeling_bert.BertInt

  


In [14]:
print(macs,params)

85524480.0 85608192.0


In [91]:
l = [module for module in model_new.modules() if type(module) != torch.nn.Sequential]

In [126]:
# l

In [122]:
params = list(model_new.named_parameters())

In [123]:
total = 0
for i in range(len(params)):
    #print (params[i][0], len(params[i][1]))
    total += len(params[i][1])
print (total)

235334
