# Comparing TensorFlow (original) and PyTorch models

You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the weights of the last layer on a simple example (in `input.txt`) but both models returns all the hidden layers so you can check every stage of the model.

To run this notebook, follow these instructions:
- make sure that your Python environment has both TensorFlow and PyTorch installed,
- download the original TensorFlow implementation,
- download a pre-trained TensorFlow model as indicaded in the TensorFlow implementation readme,
- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.

If needed change the relative paths indicated in this notebook (at the beggining of Sections 1 and 2) to point to the relevent models and code.

In [1]:
import os
os.chdir('../')

## 1/ TensorFlow code

In [2]:
pwd

'/dfs/scratch0/zjian/bert-pretraining/src/bert-pretraining/third_party/pytorch-pretrained-BERT'

In [3]:
original_tf_inplem_dir = "../bert/"
model_dir = "../../../../data/bert/uncased_L-12_H-768_A-12/"
# model_dir = "/tmp/pretraining_output/"

vocab_file = model_dir + "vocab.txt"
bert_config_file = model_dir + "bert_config.json"
init_checkpoint = model_dir + "bert_model.ckpt"

input_file = "./samples/input.txt"
max_seq_length = 128

In [4]:
import importlib.util
import sys

spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/extract_features.py')
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
sys.modules['extract_features_tensorflow'] = module
sys.path.append('../bert')
from extract_features_tensorflow import *

W0627 18:42:18.194700 139768221259520 __init__.py:308] Limited tf.compat.v2.summary API due to missing TensorBoard installation.


In [5]:
layer_indexes = list(range(12))
bert_config = modeling.BertConfig.from_json_file(bert_config_file)
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file, do_lower_case=True)
examples = read_examples(input_file)

features = convert_examples_to_features(
    examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

W0627 18:42:31.361926 139768221259520 deprecation_wrapper.py:119] From ../bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

W0627 18:42:31.517952 139768221259520 deprecation_wrapper.py:119] From ../bert//extract_features.py:285: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.



In [6]:
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    master=None,
    tpu_config=tf.contrib.tpu.TPUConfig(
        num_shards=1,
        per_host_input_for_training=is_per_host))

model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=init_checkpoint,
    layer_indexes=layer_indexes,
    use_tpu=False,
    use_one_hot_embeddings=False)

# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    predict_batch_size=1)

input_fn = input_fn_builder(
    features=features, seq_length=max_seq_length)

W0627 18:42:33.141581 139768221259520 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0627 18:42:33.143906 139768221259520 estimator.py:1984] Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7f1e1f3efb70>) includes params argument, but params are not passed to Estimator.
W0627 18:42:33.146666 139768221259520 estimator.py:1811] Using temporary folder as model directory: /tmp/tmpnbs1syqb
W0627 18:42:33.148229 139768221259520 tpu_context.py:750] Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.)
W0627 18:42:33.149096 139768221259520 tpu_context.py:211] eval_on_tpu ig

In [7]:
tensorflow_all_out = []
for result in estimator.predict(input_fn, yield_single_examples=True):
    unique_id = int(result["unique_id"])
    feature = unique_id_to_feature[unique_id]
    output_json = collections.OrderedDict()
    output_json["linex_index"] = unique_id
    tensorflow_all_out_features = []
    # for (i, token) in enumerate(feature.tokens):
    all_layers = []
    for (j, layer_index) in enumerate(layer_indexes):
        print("extracting layer {}".format(j))
        layer_output = result["layer_output_%d" % j]
        layers = collections.OrderedDict()
        layers["index"] = layer_index
        layers["values"] = layer_output
        all_layers.append(layers)
    tensorflow_out_features = collections.OrderedDict()
    tensorflow_out_features["layers"] = all_layers
    tensorflow_all_out_features.append(tensorflow_out_features)

    output_json["features"] = tensorflow_all_out_features
    tensorflow_all_out.append(output_json)

W0627 18:43:00.101773 139768221259520 deprecation_wrapper.py:119] From ../bert/modeling.py:171: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

W0627 18:43:00.105035 139768221259520 deprecation_wrapper.py:119] From ../bert/modeling.py:409: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.

W0627 18:43:00.142245 139768221259520 deprecation_wrapper.py:119] From ../bert/modeling.py:490: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.

W0627 18:43:00.209266 139768221259520 deprecation.py:323] From ../bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0627 18:43:02.532649 139768221259520 deprecation_wrapper.py:119] From ../bert//extract_features.py:173: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variab

extracting layer 0
extracting layer 1
extracting layer 2
extracting layer 3
extracting layer 4
extracting layer 5
extracting layer 6
extracting layer 7
extracting layer 8
extracting layer 9
extracting layer 10
extracting layer 11


In [8]:
print(len(tensorflow_all_out))
print(len(tensorflow_all_out[0]))
print(tensorflow_all_out[0].keys())
print("number of tokens", len(tensorflow_all_out[0]['features']))
print("number of layers", len(tensorflow_all_out[0]['features'][0]['layers']))
tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape

1
2
odict_keys(['linex_index', 'features'])
number of tokens 1
number of layers 12


(128, 768)

In [9]:
tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)

In [10]:
print(tensorflow_outputs[0])

[[ 0.10810544  0.00736203 -0.14134324 ...  0.08043151  0.07175563
   0.0031992 ]
 [-0.00526232  0.6327945  -0.2985075  ...  0.10594425  0.09061253
  -0.76824725]
 [-0.3182612  -0.8120704   0.15033704 ... -0.1900597   0.15686822
   0.12246863]
 ...
 [ 0.09414048 -0.33054894  0.61384857 ...  0.43929374 -0.3086228
   0.06017733]
 [ 0.01996002 -0.37984183  0.49045902 ...  0.45061845 -0.21570973
  -0.05887301]
 [ 0.15295641 -0.2668718   0.49672574 ...  0.7504021  -0.5253611
  -0.10960616]]


## 2/ PyTorch code

In [11]:
os.chdir('./examples')

In [12]:
import extract_features
import pytorch_pretrained_bert as ppb
from extract_features import *

In [13]:
pwd

'/dfs/scratch0/zjian/bert-pretraining/src/bert-pretraining/third_party/pytorch-pretrained-BERT/examples'

In [14]:
init_checkpoint_pt = "/dfs/scratch0/zjian/bert-pretraining/data/bert/uncased_L-12_H-768_A-12/"

In [15]:
device = torch.device("cpu")
model = ppb.BertModel.from_pretrained(init_checkpoint_pt)
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=

In [16]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)

model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=

In [17]:
layer_indexes = list(range(12))

pytorch_all_out = []
for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:
    print(input_ids)
    print(input_mask)
    print(example_indices)
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)

    all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)

    for b, example_index in enumerate(example_indices):
        feature = features[example_index.item()]
        unique_id = int(feature.unique_id)
        # feature = unique_id_to_feature[unique_id]
        output_json = collections.OrderedDict()
        output_json["linex_index"] = unique_id
        all_out_features = []
        # for (i, token) in enumerate(feature.tokens):
        all_layers = []
        for (j, layer_index) in enumerate(layer_indexes):
            print("layer", j, layer_index)
            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
            layer_output = layer_output[b]
            layers = collections.OrderedDict()
            layers["index"] = layer_index
            layer_output = layer_output
            layers["values"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]
            all_layers.append(layers)

            out_features = collections.OrderedDict()
            out_features["layers"] = all_layers
            all_out_features.append(out_features)
        output_json["features"] = all_out_features
        pytorch_all_out.append(output_json)

tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,  2001,
          1037, 13997, 11510,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [18]:
print(len(pytorch_all_out))
print(len(pytorch_all_out[0]))
print(pytorch_all_out[0].keys())
print("number of tokens", len(pytorch_all_out))
print("number of layers", len(pytorch_all_out[0]['features'][0]['layers']))
print("hidden_size", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))
pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape

1
2
odict_keys(['linex_index', 'features'])
number of tokens 1
number of layers 12
hidden_size 128


(128, 768)

In [19]:
pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)
print(pytorch_outputs[0].shape)
print(pytorch_outputs[1].shape)

(128, 768)
(128, 768)


In [20]:
print(tensorflow_outputs[0].shape)
print(tensorflow_outputs[1].shape)

(128, 768)
(128, 768)


In [24]:
print(tensorflow_outputs[11])

[[-0.62740695  0.192312   -0.75341856 ... -1.0650556   0.58133197
   0.5707067 ]
 [-0.76054615 -0.32701385 -0.36266565 ...  0.01773436  0.60588366
  -0.2489675 ]
 [ 0.21669555 -0.5682614  -0.30355218 ... -0.10413647  0.41515136
   0.02198731]
 ...
 [-0.39765483 -0.39336842 -0.05269168 ...  0.19409186  0.51028097
  -0.25353226]
 [-0.43120837 -0.27328053  0.08999324 ...  0.2122698   0.4447497
  -0.24691598]
 [-0.47570807 -0.39823085  0.12587935 ...  0.3550038   0.38457102
  -0.24922228]]


In [23]:
print(pytorch_outputs[11])

[[-0.62811095  0.19321427 -0.75184983 ... -1.0645778   0.58155036
   0.56932104]
 [-0.7641178  -0.3281739  -0.3601332  ...  0.01801311  0.6059512
  -0.25005627]
 [ 0.21434069 -0.5701855  -0.30415305 ... -0.10235173  0.41666248
   0.02177819]
 ...
 [-0.39697227 -0.3921217  -0.05266615 ...  0.19272889  0.5086826
  -0.2545995 ]
 [-0.43031502 -0.27161762  0.08972219 ...  0.210671    0.44224805
  -0.24768041]
 [-0.47492576 -0.39643568  0.12562543 ...  0.3533985   0.38219833
  -0.25067812]]


## 3/ Comparing the standard deviation on the last layer of both models

In [24]:
import numpy as np

In [25]:
print('shape tensorflow layer, shape pytorch layer, standard deviation')
print('\n'.join(list(str((np.array(tensorflow_outputs[i]).shape,
                          np.array(pytorch_outputs[i]).shape, 
                          np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0)))) for i in range(12))))

shape tensorflow layer, shape pytorch layer, standard deviation
((128, 768), (128, 768), 0.00021029184)
((128, 768), (128, 768), 0.00055583025)
((128, 768), (128, 768), 0.00068541203)
((128, 768), (128, 768), 0.0008927335)
((128, 768), (128, 768), 0.001315971)
((128, 768), (128, 768), 0.0016274694)
((128, 768), (128, 768), 0.0021441837)
((128, 768), (128, 768), 0.0024197593)
((128, 768), (128, 768), 0.0026458544)
((128, 768), (128, 768), 0.0028913843)
((128, 768), (128, 768), 0.0030688304)
((128, 768), (128, 768), 0.001419331)
