# BERT: As one of Autoencoding Language Models 

In [4]:
import pandas as pd
imdb_df = pd.read_csv("IMDB Dataset.csv")
reviews = imdb_df.review.to_string(index=None) 
with open("corpus.txt", "w",encoding='utf-8') as f: 
    f.writelines(reviews) 

In [9]:
imdb_df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [1]:
with open("corpus.txt", "r") as f: 
    text=f.readlines() 

In [2]:
text

[' One of the other reviewers has mentioned that ...\n',
 ' A wonderful little production. <br /><br />The...\n',
 ' I thought this was a wonderful way to spend ti...\n',
 " Basically there's a family where a little boy ...\n",
 ' Petter Mattei\'s "Love in the Time of Money" is...\n',
 ' Probably my all-time favorite movie, a story o...\n',
 ' I sure would like to see a resurrection of a u...\n',
 ' This show was an amazing, fresh & innovative i...\n',
 ' Encouraged by the positive comments about this...\n',
 ' If you like original gut wrenching laughter yo...\n',
 ' Phil the Alien is one of those quirky films wh...\n',
 ' I saw this movie when I was about 12 when it c...\n',
 " So im not a big fan of Boll's work but then ag...\n",
 ' The cast played Shakespeare.<br /><br />Shakes...\n',
 ' This a fantastic movie of three prisoners who ...\n',
 ' Kind of drawn in by the erotic scenes, only to...\n',
 ' Some films just simply should not be remade. T...\n',
 ' This movie made it into one

In [7]:
from tokenizers import BertWordPieceTokenizer
bert_wordpiece_tokenizer = BertWordPieceTokenizer() 
bert_wordpiece_tokenizer.train("corpus.txt") 

In [8]:
bert_wordpiece_tokenizer.get_vocab()

{'princess': 5136,
 'stupidly': 15897,
 'express': 3012,
 '##ilyn': 12285,
 'circl': 16886,
 'finds': 3069,
 'heading': 16175,
 '##eday': 11710,
 'fever': 4354,
 'losing': 6946,
 'december': 6523,
 '##ety': 8852,
 'unemployed': 17922,
 '##omat': 11749,
 'bits': 6375,
 'apartment': 9412,
 'mercenary': 17797,
 '##astimil': 14917,
 'conclude': 17813,
 'greystoke': 18010,
 'enthusi': 4863,
 'sm': 811,
 'chucky': 5365,
 'daniel': 3963,
 'sweeney': 7093,
 'broadway': 3342,
 'eleph': 6413,
 '1999': 3712,
 'coll': 3232,
 '##abe': 15117,
 'unh': 8037,
 'tol': 5817,
 'dizz': 14919,
 'aardman': 11220,
 '##ilation': 11703,
 'theme': 3333,
 'shi': 7963,
 'thoug': 4260,
 'godzilla': 9140,
 'oft': 8808,
 'enthralling': 13716,
 'revival': 16467,
 '81': 13858,
 'watch': 225,
 'prepared': 7692,
 'hamlet': 3984,
 'iden': 16082,
 'trashed': 12800,
 'american': 947,
 'indiana': 16668,
 'malefique': 13233,
 'cujo': 13025,
 'produced': 2212,
 'gein': 9142,
 'helena': 16154,
 'plankton': 18200,
 'sour': 7846,

In [9]:
!mkdir tokenizer
bert_wordpiece_tokenizer.save_model("tokenizer")

['tokenizer/vocab.txt']

In [10]:
tokenizer = BertWordPieceTokenizer.from_file("tokenizer/vocab.txt")

In [11]:
tokenized_sentence = tokenizer.encode("Oh it works just fine")

In [12]:
tokenized_sentence.tokens

['[CLS]', 'oh', 'it', 'works', 'just', 'fine', '[SEP]']

In [13]:
tokenized_sentence = tokenizer.encode("ohoh i thougt it might be workingg well")

In [14]:
from transformers import BertTokenizerFast 
tokenizer = BertTokenizerFast.from_pretrained("tokenizer") 

In [15]:
from transformers import LineByLineTextDataset 
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="corpus.txt", block_size=128) 



In [16]:
from transformers import DataCollatorForLanguageModeling 
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15) 

In [17]:
from transformers import TrainingArguments 
training_args = TrainingArguments(output_dir="BERT", overwrite_output_dir=True, num_train_epochs=1, per_device_train_batch_size=128) 

In [18]:
from transformers import BertConfig, BertForMaskedLM 
bert = BertForMaskedLM(BertConfig()) 

In [19]:
from transformers import Trainer 
trainer = Trainer(model=bert, args=training_args, data_collator=data_collator, train_dataset=dataset) 

In [20]:
trainer.train()

***** Running training *****
  Num examples = 50022
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 391


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=391, training_loss=5.378278727421675, metrics={'train_runtime': 283.7543, 'train_samples_per_second': 176.286, 'train_steps_per_second': 1.378, 'total_flos': 812585139730200.0, 'train_loss': 5.378278727421675, 'epoch': 1.0})

In [21]:
trainer.save_model("MyBERT")

Saving model checkpoint to MyBERT
Configuration saved in MyBERT/config.json
Model weights saved in MyBERT/pytorch_model.bin


In [22]:
from transformers import BertConfig 
BertConfig() 

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [23]:
tiny_bert_config = BertConfig(max_position_embeddings=512, hidden_size=128, num_attention_heads=2, num_hidden_layers=2, intermediate_size=512) 
tiny_bert_config 

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [24]:
tiny_bert = BertForMaskedLM(tiny_bert_config) 
trainer = Trainer(model=tiny_bert, args=training_args, data_collator=data_collator, train_dataset=dataset) 
trainer.train() 

***** Running training *****
  Num examples = 50022
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 391


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=391, training_loss=8.895718235493925, metrics={'train_runtime': 21.5776, 'train_samples_per_second': 2318.234, 'train_steps_per_second': 18.121, 'total_flos': 32771457490200.0, 'train_loss': 8.895718235493925, 'epoch': 1.0})

In [25]:
from transformers import TFBertModel, BertTokenizerFast 
bert = TFBertModel.from_pretrained("bert-base-uncased") 
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") 
bert.layers 

https://huggingface.co/bert-base-uncased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp_jskfvy4


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…

storing https://huggingface.co/bert-base-uncased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
creating metadata file for /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…

storing https://huggingface.co/bert-base-uncased/resolve/main/tf_model.h5 in cache at /root/.cache/huggingface/transformers/775efbdc2152093295bc5824dee96da82a5f3c1f218dfface1b8cef3094bdf8f.c719a806caef7d36ec0185f14b3b5fa727d919f924abe35622b4b7147bfbb8c7.h5
creating metadata file for /root/.cache/huggingface/transformers/775efbdc2152093295bc5824dee96da82a5f3c1f218dfface1b8cef3094bdf8f.c719a806caef7d36ec0185f14b3b5fa727d919f924abe35622b4b7147bfbb8c7.h5
loading weights file https://huggingface.co/bert-base-uncased/resolve/main/tf_model.h5 from cache at /root/.cache/huggingface/transformers/775efbdc2152093295bc5824dee96da82a5f3c1f218dfface1b8cef3094bdf8f.c719a806caef7d36ec0185f14b3b5fa727d919f924abe35622b4b7147bfbb8c7.h5





Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transform

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…

storing https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
creating metadata file for /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp83epubcm





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…

storing https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
creating metadata file for /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4





https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpgk2_ycuj


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…

storing https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
creating metadata file for /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https




[<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7f5b9fd316d0>]

In [26]:
tokenized_text = tokenizer.batch_encode_plus(["hello how is it going with you","lets test it"], return_tensors="tf", max_length=256, truncation=True, pad_to_max_length=True) 
bert(tokenized_text) 



TFBaseModelOutputWithPooling([('last_hidden_state',
                               <tf.Tensor: shape=(2, 256, 768), dtype=float32, numpy=
                               array([[[ 1.00471288e-01,  6.77022934e-02, -8.33591744e-02, ...,
                                        -4.93304461e-01,  1.16539642e-01,  2.26646975e-01],
                                       [ 3.23624432e-01,  3.70718002e-01,  6.14686370e-01, ...,
                                        -6.27267480e-01,  3.79082561e-01,  7.05312043e-02],
                                       [ 1.99534193e-01, -8.75509918e-01, -6.47860616e-02, ...,
                                        -1.28080100e-02,  3.07651967e-01, -2.07310896e-02],
                                       ...,
                                       [-6.53299540e-02,  1.19045913e-01,  5.76846719e-01, ...,
                                        -2.95459926e-01,  2.49742977e-02,  1.13964222e-01],
                                       [-2.64715403e-01, -7.863832

In [27]:
from tensorflow import keras 
import tensorflow as tf 
max_length = 256 
tokens = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32) 
masks = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32) 
embedding_layer = bert.layers[0]([tokens,masks])[0][:,0,:] 
dense = tf.keras.layers.Dense(units=2, activation="softmax")(embedding_layer) 
model = keras.Model([tokens,masks],dense) 

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [28]:
tokenized = tokenizer.batch_encode_plus(["hello how is it going with you","hello how is it going with you"], return_tensors="tf", max_length= max_length, truncation=True, pad_to_max_length=True) 



In [29]:
model([tokenized["input_ids"],tokenized["attention_mask"]]) 

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.56051165, 0.43948835],
       [0.56051165, 0.43948835]], dtype=float32)>

In [30]:
model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"]) 
model.summary() 

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None, 768)          0           bert[0][0]                   

In [31]:
model.layers[2].trainable = False 

In [32]:
import pandas as pd 
imdb_df = pd.read_csv("IMDB Dataset.csv") 
reviews = list(imdb_df.review) 
tokenized_reviews = tokenizer.batch_encode_plus(reviews, return_tensors="tf", max_length=max_length, truncation=True, pad_to_max_length=True) 

import numpy as np 
train_split = int(0.8 * len(tokenized_reviews["attention_mask"])) 
train_tokens = tokenized_reviews["input_ids"][:train_split] 
test_tokens = tokenized_reviews["input_ids"][train_split:] 
train_masks = tokenized_reviews["attention_mask"][:train_split] 
test_masks = tokenized_reviews["attention_mask"][train_split:] 
sentiments = list(imdb_df.sentiment) 
labels = np.array([[0,1] if sentiment == "positive" else [1,0] for sentiment in sentiments]) 
train_labels = labels[:train_split] 
test_labels = labels[train_split:] 



In [None]:
model.fit([train_tokens,train_masks],train_labels, epochs=5)

Epoch 1/5
