In [2]:
from transformers import BertConfig, BertModel

# building the config
config = BertConfig()

# building the model from config
model = BertModel(config)

# since there is no defined checkpoint, model is created from a random values

In [3]:
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
model = BertModel.from_pretrained("bert-base-cased")
# model loaded from a defined checkpoint

Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 89.4kB/s]
Downloading model.safetensors: 100%|██████████| 436M/436M [00:18<00:00, 23.8MB/s] 


In [7]:
model.save_pretrained("./models")

# will save the model instance to the directory specified

In [8]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased") # this loads the bert tokenizer to break word into tokens understandable by model

print(tokenizer("Using a transformer network is simple"))

Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 4.13kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 3.89MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 746kB/s]

{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}





In [9]:
tokenizer.save_pretrained("./models/") # saving a tokenizer

('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/vocab.txt',
 './models/added_tokens.json')

In [10]:
tokens = tokenizer.tokenize("Using a transformer network is simple") # tokenizing the string into token
print(tokens)

['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']


In [12]:
ids = tokenizer.convert_tokens_to_ids(tokens) # converting tokens to ids
print(ids)

[7993, 170, 11303, 1200, 2443, 1110, 3014]


In [13]:
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize("I've been waiting for a something in my life")))

[146, 112, 1396, 1151, 2613, 1111, 170, 1380, 1107, 1139, 1297]


In [16]:
# decoding a token list into a string

decoded_string = tokenizer.decode([  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037,  2242,  1999,
          2026,  2166,   102])
print(decoded_string)

decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)

[CLS] 正 國 themselvesine search hours 月 entered Japan largest previous [SEP]
Using a transformer network is simple
