### Import

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification

### BERT base

In [None]:
# Hugging Face Hub id of the English, lower-cased BERT base checkpoint.
base_model_checkpoint = "bert-base-uncased"

# Tokenizer that matches the checkpoint's vocabulary.
base_tokenizer = AutoTokenizer.from_pretrained(
    base_model_checkpoint,
)

# BERT encoder with a sequence-classification head configured for two labels.
bert_base_model = BertForSequenceClassification.from_pretrained(
    base_model_checkpoint,
    num_labels=2,
)

In [8]:
# parameters
# numel() is the element count of one tensor; summing over all tensors gives the
# model size. The second sum keeps only tensors that will receive gradients.
total_params = sum(p.numel() for p in bert_base_model.parameters())
trainable_params = sum(p.numel() for p in bert_base_model.parameters() if p.requires_grad)
print(f"Total number of parameters: {total_params}")
print(f"trainable parameters: {trainable_params}")

# model structure
# One line per learnable tensor: its dotted name and its shape.
for name, param in bert_base_model.named_parameters():
    print(f"Layer: {name} | Size: {param.shape}")

# layers
# Dotted names of every submodule; the root module is yielded first with an
# empty name, so the first printed line is blank.
for name, module in bert_base_model.named_modules():
    print(name)

Total nuber of parameters: 109483778
trainable parameters: 109483778
Layer: bert.embeddings.word_embeddings.weight | Size: torch.Size([30522, 768])
Layer: bert.embeddings.position_embeddings.weight | Size: torch.Size([512, 768])
Layer: bert.embeddings.token_type_embeddings.weight | Size: torch.Size([2, 768])
Layer: bert.embeddings.LayerNorm.weight | Size: torch.Size([768])
Layer: bert.embeddings.LayerNorm.bias | Size: torch.Size([768])
Layer: bert.encoder.layer.0.attention.self.query.weight | Size: torch.Size([768, 768])
Layer: bert.encoder.layer.0.attention.self.query.bias | Size: torch.Size([768])
Layer: bert.encoder.layer.0.attention.self.key.weight | Size: torch.Size([768, 768])
Layer: bert.encoder.layer.0.attention.self.key.bias | Size: torch.Size([768])
Layer: bert.encoder.layer.0.attention.self.value.weight | Size: torch.Size([768, 768])
Layer: bert.encoder.layer.0.attention.self.value.bias | Size: torch.Size([768])
Layer: bert.encoder.layer.0.attention.output.dense.weight | Siz

In [17]:
# Invert the tokenizer vocabulary (token -> id) into an id -> token lookup table.
# A single pass over the vocabulary covers every entry, so no second, partial
# pass over the first items is needed.
inverse_vocab_base = {
    token_id: token for token, token_id in base_tokenizer.vocab.items()
}

# Show a contiguous slice of the id space to see which tokens live there.
for index in range(2130, 2160):
    print(f"Token: {inverse_vocab_base[index]}, ID: {index}")


Token: even, ID: 2130
Token: get, ID: 2131
Token: head, ID: 2132
Token: ..., ID: 2133
Token: didn, ID: 2134
Token: ##ly, ID: 2135
Token: team, ID: 2136
Token: american, ID: 2137
Token: because, ID: 2138
Token: de, ID: 2139
Token: ##l, ID: 2140
Token: born, ID: 2141
Token: united, ID: 2142
Token: film, ID: 2143
Token: since, ID: 2144
Token: still, ID: 2145
Token: long, ID: 2146
Token: work, ID: 2147
Token: south, ID: 2148
Token: us, ID: 2149
Token: became, ID: 2150
Token: any, ID: 2151
Token: high, ID: 2152
Token: again, ID: 2153
Token: day, ID: 2154
Token: family, ID: 2155
Token: see, ID: 2156
Token: right, ID: 2157
Token: man, ID: 2158
Token: eyes, ID: 2159


In [18]:
text = "Hello, how are you doing today?"

# tokenization: split the sentence into sub-word tokens, then map each token
# to its id in the tokenizer vocabulary
tokens = base_tokenizer.tokenize(text)
token_ids = base_tokenizer.convert_tokens_to_ids(tokens)

# One print call, one line per item.
print(
    f"Original text: {text}",
    f"Tokens: {tokens}",
    f"Tokens (ID-s): {token_ids}",
    sep="\n",
)
print("Vocab Length: ", len(base_tokenizer.vocab))

Original text: Hello, how are you doing today?
Tokens: ['hello', ',', 'how', 'are', 'you', 'doing', 'today', '?']
Tokens (ID-s): [7592, 1010, 2129, 2024, 2017, 2725, 2651, 1029]
Vocab Length:  30522


### BERTić

In [None]:
# Hugging Face Hub id of the BERTić checkpoint.
bertic_model_checkpoint = "classla/bcms-bertic"

# Sequence-classification model built from the checkpoint, configured for two labels.
bertic_model = BertForSequenceClassification.from_pretrained(
    bertic_model_checkpoint,
    num_labels=2,
)

# Tokenizer that matches the checkpoint's vocabulary.
bertic_tokenizer = AutoTokenizer.from_pretrained(
    bertic_model_checkpoint,
)

In [19]:
# model structure
# Report the dotted name and shape of every learnable tensor in the BERTić model.
for layer_name, weights in bertic_model.named_parameters():
    print("Sloj: {} | Veličina: {}".format(layer_name, weights.shape))

Sloj: bert.embeddings.word_embeddings.weight | Veličina: torch.Size([32000, 768])
Sloj: bert.embeddings.position_embeddings.weight | Veličina: torch.Size([512, 768])
Sloj: bert.embeddings.token_type_embeddings.weight | Veličina: torch.Size([2, 768])
Sloj: bert.embeddings.LayerNorm.weight | Veličina: torch.Size([768])
Sloj: bert.embeddings.LayerNorm.bias | Veličina: torch.Size([768])
Sloj: bert.encoder.layer.0.attention.self.query.weight | Veličina: torch.Size([768, 768])
Sloj: bert.encoder.layer.0.attention.self.query.bias | Veličina: torch.Size([768])
Sloj: bert.encoder.layer.0.attention.self.key.weight | Veličina: torch.Size([768, 768])
Sloj: bert.encoder.layer.0.attention.self.key.bias | Veličina: torch.Size([768])
Sloj: bert.encoder.layer.0.attention.self.value.weight | Veličina: torch.Size([768, 768])
Sloj: bert.encoder.layer.0.attention.self.value.bias | Veličina: torch.Size([768])
Sloj: bert.encoder.layer.0.attention.output.dense.weight | Veličina: torch.Size([768, 768])
Sloj: b

In [20]:
# Take one snapshot of the (token, id) pairs so the lookup table and the preview
# below are guaranteed to come from the same data.
bertic_vocab_items = list(bertic_tokenizer.vocab.items())

# id -> token lookup table covering the whole vocabulary.
inverse_vocab_bertic = {token_id: token for token, token_id in bertic_vocab_items}

# Preview the first 30 entries in the order the tokenizer returns them
# (not sorted by id). They are already in the lookup table, so this loop only prints.
for token, token_id in bertic_vocab_items[:30]:
    print("ID tokena: ", token_id, "token: ", token)

ID tokena:  3234 token:  zdrav
ID tokena:  19613 token:  ##ји
ID tokena:  16502 token:  drva
ID tokena:  6046 token:  prijave
ID tokena:  20221 token:  1943
ID tokena:  6861 token:  ##lacija
ID tokena:  5072 token:  ##UP
ID tokena:  21825 token:  ##dum
ID tokena:  29964 token:  Konvencije
ID tokena:  31878 token:  košarkaš
ID tokena:  30060 token:  njemačkim
ID tokena:  5142 token:  listopada
ID tokena:  27636 token:  prekrasna
ID tokena:  31558 token:  luksuz
ID tokena:  17515 token:  ##znije
ID tokena:  8590 token:  Bila
ID tokena:  31966 token:  učesnike
ID tokena:  21273 token:  razmišljaju
ID tokena:  5200 token:  cijena
ID tokena:  18250 token:  All
ID tokena:  28435 token:  evropskog
ID tokena:  22391 token:  statistika
ID tokena:  2940 token:  ##mina
ID tokena:  30706 token:  privukao
ID tokena:  26594 token:  otje
ID tokena:  7139 token:  rezultata
ID tokena:  27189 token:  čitatelji
ID tokena:  2682 token:  ##vijek
ID tokena:  203 token:  ď
ID tokena:  14493 token:  prehrani


In [23]:
# Show two slices of the id space with one loop: the first ten ids, then the
# ids 2130-2149.
for index in [*range(10), *range(2130, 2150)]:
    print(f"Token: {inverse_vocab_bertic[index]}, ID: {index}")

Token: [PAD], ID: 0
Token: [UNK], ID: 1
Token: [CLS], ID: 2
Token: [SEP], ID: 3
Token: [MASK], ID: 4
Token: !, ID: 5
Token: ", ID: 6
Token: #, ID: 7
Token: $, ID: 8
Token: %, ID: 9
Token: 201, ID: 2130
Token: ##đu, ID: 2131
Token: ##zo, ID: 2132
Token: godine, ID: 2133
Token: ##zu, ID: 2134
Token: ##nta, ID: 2135
Token: be, ID: 2136
Token: može, ID: 2137
Token: us, ID: 2138
Token: ##đa, ID: 2139
Token: ##sno, ID: 2140
Token: ##zna, ID: 2141
Token: ##liko, ID: 2142
Token: stra, ID: 2143
Token: Sa, ID: 2144
Token: ##tni, ID: 2145
Token: ##skog, ID: 2146
Token: ##bu, ID: 2147
Token: lju, ID: 2148
Token: ##šta, ID: 2149


In [24]:
# Number of entries in the BERTić tokenizer vocabulary.
print("Length: ", len(bertic_tokenizer.vocab))

Length:  32000


## Comparing the BERT base and BERTić tokenizers

In [29]:
def comparе_tokens_for_word(text):
    """Print how the BERTić and BERT base tokenizers each split ``text``.

    The original text is printed first. Then, for the BERTić tokenizer and
    afterwards for the BERT base tokenizer, three lines follow: the checkpoint
    name, the list of tokens, and the list of matching vocabulary ids.

    Relies on the notebook-level names ``bertic_tokenizer``,
    ``bertic_model_checkpoint``, ``base_tokenizer`` and ``base_model_checkpoint``.

    Args:
        text: The string to tokenize with both tokenizers.

    Returns:
        None. The comparison is written to standard output.
    """

    def encode(tokenizer):
        # Tokenize ``text`` and look up the id of every resulting token.
        pieces = tokenizer.tokenize(text)
        return pieces, tokenizer.convert_tokens_to_ids(pieces)

    # tokens - bertic
    bertic_pieces, bertic_ids = encode(bertic_tokenizer)
    print(f"Original text: {text}")
    print(
        bertic_model_checkpoint,
        f"tokens: {bertic_pieces}",
        f"tokens (ID-s): {bertic_ids}",
        sep="\n",
    )

    # tokens - BERT base
    base_pieces, base_ids = encode(base_tokenizer)
    print(
        base_model_checkpoint,
        f"tokens: {base_pieces}",
        f"tokens (ID-s): {base_ids}",
        sep="\n",
    )

In [31]:
# Words used to compare how the two tokenizers split the same input.
example_words = [
    "šumarstvo",
    "uzvratiti",
    "mačka",
]

for text in example_words:
    comparе_tokens_for_word(text)
    # Two newlines separate consecutive reports.
    print(end="\n\n")

Original text: šumarstvo
classla/bcms-bertic
tokens: ['šuma', '##rstvo']
tokens (ID-s): [7434, 11446]
bert-base-uncased
tokens: ['sum', '##ars', '##tv', '##o']
tokens (ID-s): [7680, 11650, 9189, 2080]


Original text: uzvratiti
classla/bcms-bertic
tokens: ['uzvrati', '##ti']
tokens (ID-s): [21020, 1916]
bert-base-uncased
tokens: ['u', '##z', '##vr', '##ati', '##ti']
tokens (ID-s): [1057, 2480, 19716, 10450, 3775]


Original text: mačka
classla/bcms-bertic
tokens: ['mačka']
tokens (ID-s): [22314]
bert-base-uncased
tokens: ['mack', '##a']
tokens (ID-s): [11349, 2050]


