# Comparing TensorFlow (original) and PyTorch models

You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the weights of the last layer on a simple example (in `input.txt`) but both models returns all the hidden layers so you can check every stage of the model.

To run this notebook, follow these instructions:
- make sure that your Python environment has both TensorFlow and PyTorch installed,
- download the original TensorFlow implementation,
- download a pre-trained TensorFlow model as indicaded in the TensorFlow implementation readme,
- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.

If needed change the relative paths indicated in this notebook (at the beggining of Sections 1 and 2) to point to the relevent models and code.

In [1]:
import os
os.chdir('../')

In [2]:
import tensorflow as tf

W0702 15:12:10.009658 140411700147968 __init__.py:308] Limited tf.compat.v2.summary API due to missing TensorBoard installation.


## 1/ TensorFlow code

In [3]:
original_tf_inplem_dir = "../bert/"
model_dir = "/tmp/pretraining_output_test/"

vocab_file = model_dir + "vocab.txt"
bert_config_file = model_dir + "bert_config.json"
init_checkpoint = model_dir + "model.ckpt-20"

input_file = "./samples/input.txt"
max_seq_length = 128

In [4]:
import importlib.util
import sys

spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/extract_features.py')
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
sys.modules['extract_features_tensorflow'] = module
sys.path.append('../bert')
from extract_features_tensorflow import *

In [5]:
# with tf.variable_scope("test", dtype=tf.float64):
layer_indexes = list(range(12))
bert_config = modeling.BertConfig.from_json_file(bert_config_file)
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file, do_lower_case=True)
examples = read_examples(input_file)

features = convert_examples_to_features(
    examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

W0702 15:12:11.568553 140411700147968 deprecation_wrapper.py:119] From /dfs/scratch0/zjian/bert-pretraining/src/bert-pretraining/third_party/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

W0702 15:12:11.719705 140411700147968 deprecation_wrapper.py:119] From ../bert//extract_features.py:295: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.



In [6]:
# with tf.variable_scope("test", dtype=tf.float64):
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    master=None,
    tpu_config=tf.contrib.tpu.TPUConfig(
        num_shards=1,
        per_host_input_for_training=is_per_host))

model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=init_checkpoint,
    layer_indexes=layer_indexes,
    use_tpu=False,
    use_one_hot_embeddings=False)

# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    predict_batch_size=1)

input_fn = input_fn_builder(
    features=features, seq_length=max_seq_length)

W0702 15:12:12.838428 140411700147968 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0702 15:12:12.840993 140411700147968 estimator.py:1984] Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7fb3e9b9f9d8>) includes params argument, but params are not passed to Estimator.
W0702 15:12:12.845199 140411700147968 estimator.py:1811] Using temporary folder as model directory: /tmp/tmpsj8as44d
W0702 15:12:12.847528 140411700147968 tpu_context.py:750] Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.)
W0702 15:12:12.848565 140411700147968 tpu_context.py:211] eval_on_tpu ig

In [7]:
# with tf.variable_scope("test", dtype=tf.float64):
tensorflow_all_out = []
for result in estimator.predict(input_fn, yield_single_examples=True):
    unique_id = int(result["unique_id"])
    feature = unique_id_to_feature[unique_id]
    output_json = collections.OrderedDict()
    output_json["linex_index"] = unique_id
    tensorflow_all_out_features = []
    # for (i, token) in enumerate(feature.tokens):
    all_layers = []
    for (j, layer_index) in enumerate(layer_indexes):
        print("extracting layer {}".format(j))
        layer_output = result["layer_output_%d" % j]
        layers = collections.OrderedDict()
        layers["index"] = layer_index
        layers["values"] = layer_output
        all_layers.append(layers)
    tensorflow_out_features = collections.OrderedDict()
    tensorflow_out_features["layers"] = all_layers
    tensorflow_all_out_features.append(tensorflow_out_features)

    output_json["features"] = tensorflow_all_out_features
    tensorflow_all_out.append(output_json)

W0702 15:12:13.994565 140411700147968 deprecation_wrapper.py:119] From /dfs/scratch0/zjian/bert-pretraining/src/bert-pretraining/third_party/bert/modeling.py:171: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

W0702 15:12:13.997931 140411700147968 deprecation_wrapper.py:119] From /dfs/scratch0/zjian/bert-pretraining/src/bert-pretraining/third_party/bert/modeling.py:409: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.

W0702 15:12:14.035680 140411700147968 deprecation_wrapper.py:119] From /dfs/scratch0/zjian/bert-pretraining/src/bert-pretraining/third_party/bert/modeling.py:490: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.

W0702 15:12:14.103726 140411700147968 deprecation.py:323] From /dfs/scratch0/zjian/bert-pretraining/src/bert-pretraining/third_party/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a

bert/embeddings/word_embeddings:0
bert/embeddings/token_type_embeddings:0
bert/embeddings/position_embeddings:0
bert/embeddings/LayerNorm/beta:0
bert/embeddings/LayerNorm/gamma:0
bert/encoder/layer_0/attention/self/query/kernel:0
bert/encoder/layer_0/attention/self/query/bias:0
bert/encoder/layer_0/attention/self/key/kernel:0
bert/encoder/layer_0/attention/self/key/bias:0
bert/encoder/layer_0/attention/self/value/kernel:0
bert/encoder/layer_0/attention/self/value/bias:0
bert/encoder/layer_0/attention/output/dense/kernel:0
bert/encoder/layer_0/attention/output/dense/bias:0
bert/encoder/layer_0/attention/output/LayerNorm/beta:0
bert/encoder/layer_0/attention/output/LayerNorm/gamma:0
bert/encoder/layer_0/intermediate/dense/kernel:0
bert/encoder/layer_0/intermediate/dense/bias:0
bert/encoder/layer_0/output/dense/kernel:0
bert/encoder/layer_0/output/dense/bias:0
bert/encoder/layer_0/output/LayerNorm/beta:0
bert/encoder/layer_0/output/LayerNorm/gamma:0
bert/encoder/layer_1/attention/self/que

W0702 15:12:18.118626 140411700147968 deprecation.py:323] From /lfs/1/zjian/anaconda2/envs/bert-pretraining/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py:1354: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


attention output  [[590593.062 590593.062 590593.062 ... 590593.062 590593.062 590593.062]
 [590593.062 590593.062 590593.062 ... 590593.062 590593.062 590593.062]
 [590593.062 590593.062 590593.062 ... 590593.062 590593.062 590593.062]
 ...
 [590593.062 590593.062 590593.062 ... 590593.062 590593.062 590593.062]
 [590593.062 590593.062 590593.062 ... 590593.062 590593.062 590593.062]
 [590593.062 590593.062 590593.062 ... 590593.062 590593.062 590593.062]] [[1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]] [128 768]
attention output 2  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] [[1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]] [1 1 1 ... 1 1 1] [1 1 1 ... 1 1 1]
intermediate output  [[0.841192 0.841192 0.841192 ... 0.841192 0.841192 0.841192]
 [0.841192 0.841192 0.841192 .

attention output 2  [[2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]
 ...
 [2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]] [[2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]
 ...
 [2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]] [1 1 1 ... 1 1 1] [1 1 1 ... 1 1 1]
intermediate output  [[1537 1537 1537 ... 1537 1537 1537]
 [1537 1537 1537 ... 1537 1537 1537]
 [1537 1537 1537 ... 1537 1537 1537]
 ...
 [1537 1537 1537 ... 1537 1537 1537]
 [1537 1537 1537 ... 1537 1537 1537]
 [1537 1537 1537 ... 1537 1537 1537]]
layer output  [[2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]
 ...
 [2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]
 [2 2 2 ... 2 2 2]]
attention output  [[1180417.12 1180417.12 1180417.12 ... 1180417.12 1180417.12 1180417.12]
 [1180417.12 1180417.12 1180417.12 ... 1180417.12 1180417.12 1180417.12]
 [1180417.12 1180417.12 1180417.12 ... 1180417.12 1180417.12 1180417.12]
 ...
 [1180417.12 1180417.12 1180417.12 ... 1180417.12 1180417.12 1180417.12]
 [1180

In [8]:
print(len(tensorflow_all_out))
print(len(tensorflow_all_out[0]))
print(tensorflow_all_out[0].keys())
print("number of tokens", len(tensorflow_all_out[0]['features']))
print("number of layers", len(tensorflow_all_out[0]['features'][0]['layers']))
tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape

1
2
odict_keys(['linex_index', 'features'])
number of tokens 1
number of layers 12


(128, 768)

In [9]:
tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)

In [10]:
print(tensorflow_outputs[0])

[[2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 ...
 [2. 2. 2. ... 2. 2. 2.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [30]:
# def layer_norm(input_tensor, name=None):
#   """Run layer normalization on the last dimension of the tensor."""
#   return tf.contrib.layers.layer_norm(
#       inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)

tf.reset_default_graph()
input_tensor = tf.constant(value=0.5, shape=(128, 276))
output_tensor = tf.contrib.layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope="double_test")
assign_ops = []
tvars = []
for tvar in tf.trainable_variables():
    assign_ops.append(tf.assign(tvar, tf.ones_like(tvar)))
    tvars.append(tvar)
    
print(tf.trainable_variables())


with tf.compat.v1.Session() as sess:
    sess.run(assign_ops)
    res = sess.run(tvars)
    print(res[0].shape)
    res = sess.run(output_tensor)
    print(res.shape, res)
    


[<tf.Variable 'double_test/beta:0' shape=(276,) dtype=float32_ref>, <tf.Variable 'double_test/gamma:0' shape=(276,) dtype=float32_ref>]
(276,)
(128, 276) [[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]]


## 2/ PyTorch code

In [12]:
os.chdir('./examples')

In [13]:
import extract_features
import pytorch_pretrained_bert as ppb
from extract_features import *

In [14]:
init_checkpoint_pt = "/tmp/pretraining_output_test/"

In [15]:
device = torch.device("cpu")
model = ppb.BertModel.from_pretrained(init_checkpoint_pt)
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=

In [16]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)

model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=

In [17]:
layer_indexes = list(range(12))

pytorch_all_out = []
for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:
    print(input_ids)
    print(input_mask)
    print(example_indices)
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)

    all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)

    for b, example_index in enumerate(example_indices):
        feature = features[example_index.item()]
        unique_id = int(feature.unique_id)
        # feature = unique_id_to_feature[unique_id]
        output_json = collections.OrderedDict()
        output_json["linex_index"] = unique_id
        all_out_features = []
        # for (i, token) in enumerate(feature.tokens):
        all_layers = []
        for (j, layer_index) in enumerate(layer_indexes):
            print("layer", j, layer_index)
            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
            layer_output = layer_output[b]
            layers = collections.OrderedDict()
            layers["index"] = layer_index
            layer_output = layer_output
            layers["values"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]
            all_layers.append(layers)

            out_features = collections.OrderedDict()
            out_features["layers"] = all_layers
            all_out_features.append(out_features)
        output_json["features"] = all_out_features
        pytorch_all_out.append(output_json)

tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,  2001,
          1037, 13997, 11510,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

layer done  tensor([[[2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.],
         ...,
         [2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.],
         [2., 2., 2.,  ..., 2., 2., 2.]]], grad_fn=<AddBackward0>)
attention output  tensor([[[1180417.1250, 1180417.1250, 1180417.1250,  ..., 1180417.1250,
          1180417.1250, 1180417.1250],
         [1180417.1250, 1180417.1250, 1180417.1250,  ..., 1180417.1250,
          1180417.1250, 1180417.1250],
         [1180417.1250, 1180417.1250, 1180417.1250,  ..., 1180417.1250,
          1180417.1250, 1180417.1250],
         ...,
         [1180417.1250, 1180417.1250, 1180417.1250,  ..., 1180417.1250,
          1180417.1250, 1180417.1250],
         [1180417.0000, 1180417.0000, 1180417.0000,  ..., 1180417.0000,
          1180417.0000, 1180417.0000],
         [1180417.0000, 1180417.0000, 1180417.0000,  ..., 1180417.0000,
          1180417.0000, 1180417.0000]]], gr

In [18]:
print(len(pytorch_all_out))
print(len(pytorch_all_out[0]))
print(pytorch_all_out[0].keys())
print("number of tokens", len(pytorch_all_out))
print("number of layers", len(pytorch_all_out[0]['features'][0]['layers']))
print("hidden_size", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))
pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape

1
2
odict_keys(['linex_index', 'features'])
number of tokens 1
number of layers 12
hidden_size 128


(128, 768)

In [19]:
pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)
print(pytorch_outputs[0].shape)
print(pytorch_outputs[1].shape)

(128, 768)
(128, 768)


In [20]:
print(tensorflow_outputs[0].shape)
print(tensorflow_outputs[1].shape)

(128, 768)
(128, 768)


In [21]:
print(tensorflow_outputs[0])
# for i in range(tensorflow_outputs[0].shape[0]):
#     print(tensorflow_outputs[0][i])

[[2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 ...
 [2. 2. 2. ... 2. 2. 2.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [22]:
print(pytorch_outputs[0])

[[2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 ...
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]]


In [23]:
for i in range(11):
    print(i, tensorflow_outputs[i])
    print(i, pytorch_outputs[i])
    print(i, tensorflow_outputs[i] - pytorch_outputs[i])

0 [[2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 ...
 [2. 2. 2. ... 2. 2. 2.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0 [[2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 ...
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]]
0 [[ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  0.  0.  0.]
 [-2. -2. -2. ... -2. -2. -2.]
 [-2. -2. -2. ... -2. -2. -2.]]
1 [[2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 ...
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]]
1 [[1.000000e+00 1.000000e+00 1.000000e+00 ... 1.000000e+00 1.000000e+00
  1.000000e+00]
 [1.000000e+00 1.000000e+00 1.000000e+00 ... 1.000000e+00 1.000000e+00
  1.000000e+00]
 [1.000000e+00 1.000000e+00 1.000000e+00 ... 1.000000e+00 1.000000e+00
  1.000000e+00]
 ...
 [1.000000e+00 1.000000e+00 1.000000e+00 ... 1.000000e+00 1.0000

In [24]:
print(tensorflow_outputs[0] - pytorch_outputs[0])
print(tensorflow_outputs[1] - pytorch_outputs[1])
print(tensorflow_outputs[11] - pytorch_outputs[11])

[[ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  0.  0.  0.]
 [-2. -2. -2. ... -2. -2. -2.]
 [-2. -2. -2. ... -2. -2. -2.]]
[[1.       1.       1.       ... 1.       1.       1.      ]
 [1.       1.       1.       ... 1.       1.       1.      ]
 [1.       1.       1.       ... 1.       1.       1.      ]
 ...
 [1.       1.       1.       ... 1.       1.       1.      ]
 [1.999998 1.999998 1.999998 ... 1.999998 1.999998 1.999998]
 [1.999998 1.999998 1.999998 ... 1.999998 1.999998 1.999998]]
[[1.       1.       1.       ... 1.       1.       1.      ]
 [1.       1.       1.       ... 1.       1.       1.      ]
 [1.       1.       1.       ... 1.       1.       1.      ]
 ...
 [1.       1.       1.       ... 1.       1.       1.      ]
 [1.999998 1.999998 1.999998 ... 1.999998 1.999998 1.999998]
 [1.999998 1.999998 1.999998 ... 1.999998 1.999998 1.999998]]


## 3/ Comparing the standard deviation on the last layer of both models

In [25]:
import numpy as np

In [26]:
print('shape tensorflow layer, shape pytorch layer, standard deviation')
print('\n'.join(list(str((np.array(tensorflow_outputs[i]).shape,
                          np.array(pytorch_outputs[i]).shape, 
                          np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0)))) for i in range(12))))

shape tensorflow layer, shape pytorch layer, standard deviation
((128, 768), (128, 768), 0.25)
((128, 768), (128, 768), 1.0897245)
((128, 768), (128, 768), 0.0)
((128, 768), (128, 768), 1.0897245)
((128, 768), (128, 768), 0.0)
((128, 768), (128, 768), 1.0897245)
((128, 768), (128, 768), 0.0)
((128, 768), (128, 768), 1.0897245)
((128, 768), (128, 768), 0.0)
((128, 768), (128, 768), 1.0897245)
((128, 768), (128, 768), 0.0)
((128, 768), (128, 768), 1.0897245)
