[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Henya14/deep-learning-ner/blob/main/basic_training.ipynb)

In [1]:
!pip install torch
!pip install transformers



In [2]:
from transformers import AutoTokenizer, AutoModel, BertForTokenClassification


tokenizer = AutoTokenizer.from_pretrained("SZTAKI-HLT/hubert-base-cc")
model = BertForTokenClassification.from_pretrained("SZTAKI-HLT/hubert-base-cc",  num_labels=10)


Some weights of the model checkpoint at SZTAKI-HLT/hubert-base-cc were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized 

In [3]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32001, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [4]:
tokenizer(["alma", "körte", "banán", "manán", "asd", "[SEP]", "asd", "asdasdasdasd"], padding="max_length", is_split_into_words=True, truncation=True)

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [2, 17036, 16048, 31734, 27370, 12423, 2056, 17911, 31750, 3, 17911, 31750, 17911, 4305, 6566, 10213, 10213, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
import os
train_test_devel_data_path = os.path.join("data", "train-devel-test")
train_test_devel_data_dirs = [os.path.join(train_test_devel_data_path, data_dir) for data_dir in os.listdir(train_test_devel_data_path) if os.path.isdir(os.path.join(train_test_devel_data_path, data_dir))]

In [6]:
import re
import pandas as pd
csv_file_pattern = re.compile(".*_full.csv") 
def get_csv_files_in_dir(path_to_dir):
    return [f for f in os.listdir(path_to_dir) if csv_file_pattern.match(f)]

In [7]:
def get_train_devel_test_dirs():
    train_devel_test_file_dirs = {}
    for d in train_test_devel_data_dirs:
        file_dirs = [os.path.join(d, genre_dir, "no-morph") for genre_dir in os.listdir(d) if os.path.isdir(os.path.join(d, genre_dir)) and "no-morph" in os.listdir(os.path.join(d, genre_dir))]
        train_devel_test_file_dirs[os.path.basename(d)] = file_dirs
    return train_devel_test_file_dirs

In [8]:
def load_all_csv_files_in_dir(path_to_dir, train_test_devel, genre, save_intermediate_dataframes_to_csv = False):
    data_file_paths = [os.path.join(path_to_dir, cf) for cf in get_csv_files_in_dir(path_to_dir)]
    combined_df = pd.DataFrame()
    for csv_file in data_file_paths:
        print(csv_file)
        df = pd.read_csv(csv_file, index)
        if "sentence_index" in combined_df:
            df["sentence_index"] = df["sentence_index"] + (combined_df["sentence_index"].max() + 1)
            print(combined_df["sentence_index"].max())
            print(combined_df["sentence_index"].max() + 1)
        combined_df = pd.concat([combined_df, df])
    return combined_df

In [58]:
dfs = {}
train_devel_test_dirs = get_train_devel_test_dirs()
for data_set in train_devel_test_dirs:
    dfs[data_set] = pd.DataFrame()
    #word_counts[data_set] = {}
    for genre_dir in train_devel_test_dirs[data_set]:
        print(genre_dir)
        genre = genre_dir.split(os.path.sep)[-2]
        df = load_all_csv_files_in_dir(genre_dir, data_set, genre, True)
        #df.to_csv(os.path.join(genre_dir, f"{genre}_full.csv"), index=False)
        dfs[data_set] = pd.concat([dfs[data_set], df], ignore_index=True)

data\train-devel-test\devel\fiction\no-morph
data\train-devel-test\devel\fiction\no-morph\fiction_full.csv
data\train-devel-test\devel\legal\no-morph
data\train-devel-test\devel\legal\no-morph\legal_full.csv
data\train-devel-test\devel\news\no-morph
data\train-devel-test\devel\news\no-morph\news_full.csv
data\train-devel-test\test\fiction\no-morph
data\train-devel-test\test\fiction\no-morph\fiction_full.csv
data\train-devel-test\test\legal\no-morph
data\train-devel-test\test\legal\no-morph\legal_full.csv
data\train-devel-test\test\news\no-morph
data\train-devel-test\test\news\no-morph\news_full.csv
data\train-devel-test\train\fiction\no-morph
data\train-devel-test\train\fiction\no-morph\fiction_full.csv
data\train-devel-test\train\legal\no-morph
data\train-devel-test\train\legal\no-morph\legal_full.csv
data\train-devel-test\train\news\no-morph
data\train-devel-test\train\news\no-morph\news_full.csv
data\train-devel-test\train\wikipedia\no-morph
data\train-devel-test\train\wikipedia\no-

In [10]:
def get_sentences(df: pd.DataFrame):
    copy_df = df.copy()
    copy_df = copy_df.sort_values(["sentence_index", "position_number_in_sentence"])
    sentences = []
    print(copy_df["sentence_index"].max())
    for i in range(copy_df["sentence_index"].max()):
        form_tag_pairs = copy_df[copy_df["sentence_index"]==i][["position_number_in_sentence", "FORM", "CONLL:NER"]]
        sentences.append({"FORM": form_tag_pairs["FORM"].tolist(),"TAG": form_tag_pairs["CONLL:NER"].tolist()})
        
    return sentences

In [11]:
get_sentences(dfs["test"])

999


[{'FORM': ['Tehát',
   'most',
   'ezen',
   'dolgozom',
   '–',
   'a',
   'fejlesztések',
   'bevezetésén',
   '.'],
  'TAG': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']},
 {'FORM': ['GV',
   ':',
   'Megosztanál',
   'néhányat',
   'a',
   'reformelképzeléseid',
   'közül',
   '?'],
  'TAG': ['B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']},
 {'FORM': ['MI', ':', 'Nagyon', 'sok', 'ötletem', 'van', '!'],
  'TAG': ['B-PER', 'O', 'O', 'O', 'O', 'O', 'O']},
 {'FORM': ['Ezek',
   'két',
   'kategóriába',
   'sorolhatók',
   ':',
   'a',
   'népszerűsítés',
   'és',
   'a',
   'feladatok',
   'megosztása',
   '.'],
  'TAG': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']},
 {'FORM': ['Egyelőre',
   'nem',
   'vagyunk',
   'nagyon',
   'jók',
   'a',
   'promócióban',
   'és',
   'azt',
   'gondolom',
   ',',
   'hogy',
   'hiányzik',
   'az',
   'olvasottság',
   ',',
   'miközben',
   'a',
   'tartalom',
   'nagyon',
   'impozáns',
   '.'],
  'TAG': ['O',
   'O',
   'O',
   '

In [12]:
dfs["devel"].sort_values(["sentence_index"])["sentence_index"]

0          0
19         0
20         0
22         0
23         0
        ... 
19832    846
19831    846
19847    846
19838    846
19848    846
Name: sentence_index, Length: 19849, dtype: int64

In [13]:
test_sentences = get_sentences(dfs["test"])
print(test_sentences)

999
[{'FORM': ['Tehát', 'most', 'ezen', 'dolgozom', '–', 'a', 'fejlesztések', 'bevezetésén', '.'], 'TAG': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'FORM': ['GV', ':', 'Megosztanál', 'néhányat', 'a', 'reformelképzeléseid', 'közül', '?'], 'TAG': ['B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'FORM': ['MI', ':', 'Nagyon', 'sok', 'ötletem', 'van', '!'], 'TAG': ['B-PER', 'O', 'O', 'O', 'O', 'O', 'O']}, {'FORM': ['Ezek', 'két', 'kategóriába', 'sorolhatók', ':', 'a', 'népszerűsítés', 'és', 'a', 'feladatok', 'megosztása', '.'], 'TAG': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'FORM': ['Egyelőre', 'nem', 'vagyunk', 'nagyon', 'jók', 'a', 'promócióban', 'és', 'azt', 'gondolom', ',', 'hogy', 'hiányzik', 'az', 'olvasottság', ',', 'miközben', 'a', 'tartalom', 'nagyon', 'impozáns', '.'], 'TAG': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'FORM': ['Például', 'kicsit', 'nehéz', 'átvinni', 'azt', 'az', 

In [14]:
tokenized_text = tokenizer(test_sentences[0]["FORM"], padding='max_length', max_length=512, truncation=True, return_tensors="pt", is_split_into_words=True)

In [15]:
print(tokenizer.decode(tokenized_text.input_ids[0]))

[CLS] Tehát most ezen dolgozom – a fejlesztések bevezetésén. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [16]:
print(tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0]))

['[CLS]', 'Tehát', 'most', 'ezen', 'dolgozom', '–', 'a', 'fejlesztések', 'bevezet', '##ésén', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

In [40]:
word_ids = tokenized_text.word_ids()

In [18]:
word_ids

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 7,
 8,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 

In [39]:
tokenized_text

{'input_ids': tensor([[    2,  6738,  2672,  3690, 16913,  2292,  2005, 10554,  5781, 24994,
          4575,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [19]:
combined_df = pd.DataFrame()
for df_key in dfs:
    combined_df = pd.concat([combined_df, dfs[df_key]])
labels = combined_df["CONLL:NER"].unique()
print(labels)

ids_to_labels = {k: v for k, v in enumerate(sorted(labels)) }
labels_to_ids = {v: k for k, v in enumerate(sorted(labels)) }
print(ids_to_labels)
print(labels_to_ids)

['B-LOC' 'O' 'B-ORG' 'I-ORG' 'B-PER' 'I-PER' 'B-MISC' 'I-MISC' 'I-LOC']
{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PER', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PER', 8: 'O'}
{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}


In [20]:
print(tokenizer.convert_ids_to_tokens(tokenized_text["input_ids"][0]))
print(word_ids)

['[CLS]', 'Tehát', 'most', 'ezen', 'dolgozom', '–', 'a', 'fejlesztések', 'bevezet', '##ésén', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

Code from: https://towardsdatascience.com/named-entity-recognition-with-bert-in-pytorch-a454405e0b6a

In [21]:
def align_labels_of_tokenized_sentence(sentence, labels, should_tokenize_sub_words = False):
    SPECIAL_TOKEN_ID = -1
    label_ids = []
    previous_word_id = None
    for word_id in sentence:
        if word_id is None:
            label_ids.append(SPECIAL_TOKEN_ID)
        elif word_id != previous_word_id:
            label_ids.append(labels_to_ids[labels[word_id]])
        else:
            label_ids.append(labels_to_ids[labels[word_id]] if should_tokenize_sub_words else SPECIAL_TOKEN_ID)
        previous_word_id = word_id
    return label_ids
    
    

In [22]:
print(tokenizer.convert_ids_to_tokens(tokenized_text["input_ids"][0]))

['[CLS]', 'Tehát', 'most', 'ezen', 'dolgozom', '–', 'a', 'fejlesztések', 'bevezet', '##ésén', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

In [23]:
tokenized_text = tokenizer(test_sentences[0]["FORM"], padding='max_length', max_length=512, truncation=True, return_tensors="pt", is_split_into_words=True)

In [24]:
tokenized_text

{'input_ids': tensor([[    2,  6738,  2672,  3690, 16913,  2292,  2005, 10554,  5781, 24994,
          4575,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [25]:
align_labels_of_tokenized_sentence(tokenized_text.word_ids(), test_sentences[0]["TAG"])

[-1,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 -1,
 8,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,

In [27]:
aligned = align_labels_of_tokenized_sentence(tokenized_text.word_ids(), test_sentences[0]["TAG"])
converted = tokenizer.convert_ids_to_tokens(tokenized_text["input_ids"][0])
for i in range(len(aligned)):
    print(f"{converted[i].ljust(15)} {str(aligned[i]).ljust(4)} \t { 'None' if aligned[i] == -1 else ids_to_labels[aligned[i]]} ")

[CLS]           -1   	 None 
Tehát           8    	 O 
most            8    	 O 
ezen            8    	 O 
dolgozom        8    	 O 
–               8    	 O 
a               8    	 O 
fejlesztések    8    	 O 
bevezet         8    	 O 
##ésén          -1   	 None 
.               8    	 O 
[SEP]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]           -1   	 None 
[PAD]       

In [28]:
tokenized_text

{'input_ids': tensor([[    2,  6738,  2672,  3690, 16913,  2292,  2005, 10554,  5781, 24994,
          4575,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [29]:
tokenized_text["input_ids"]

tensor([[    2,  6738,  2672,  3690, 16913,  2292,  2005, 10554,  5781, 24994,
          4575,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

# Dataset creation

In [30]:
import torch
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, data_df):
        sentences = get_sentences(data_df)
        self.tokenized_sentences = [tokenizer(sentence["FORM"], padding='max_length', max_length=512, truncation=True, return_tensors="pt", is_split_into_words=True) for sentence in sentences]
        self.labels = [align_labels_of_tokenized_sentence(tokenized_sentences.word_ids(), sentence["TAG"]) for tokenized_sentences, sentence in zip(self.tokenized_sentences, sentences)]

        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
            return self.tokenized_sentences[index],  torch.LongTensor(self.labels[index])

In [31]:
a = NERDataset(dfs["test"])

999


# Training

https://www.kaggle.com/code/angyalfold/hugging-face-bert-with-custom-classifier-pytorch/notebook
https://www.youtube.com/watch?v=MqQ7rqRllIc
https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial
https://towardsdatascience.com/deep-dive-into-the-code-of-bert-model-9f618472353e

In [38]:
from torch import nn
from transformers import BertModel
class NERModel(torch.nn.Module):
    
    def __init__(self, num_labels):
        super(NERModel, self).__init__()
        self.bert = AutoModel.from_pretrained("SZTAKI-HLT/hubert-base-cc")
        self.dropout1 = nn.Dropout(0.1)
        self.linear1 = nn.Linear(in_features=768, out_features=512)
        self.relu1 =  nn.ReLU()
        self.linear2 = nn.Linear(in_features=512, out_features=num_labels)
        self.sigmoid = nn.Sigmoid()
    def forward(self, input_ids, attention_mask, labels):
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        print(x.last_hidden_state.size())
        x = self.dropout1(x[0])
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        x = nn.Sigmoid()
        print("heee", x)
        print(x.size())
        z =  torch.argmax(x, dim=2)
        print("max", torch.argmax(x, dim=2))
        print("asd", z.size())
        print("asd")
        
        return x
    def calculate_loss(predicted_labels, actual_labels):
        self.9*-
        
        

In [33]:
train_dataset, devel_dataset, test_dataset = NERDataset(dfs["train"][0:2000]), NERDataset(dfs["devel"]), NERDataset(dfs["test"])

146
846
999


In [34]:
len(dfs["train"])
from tqdm import tqdm


In [37]:
from torch.utils.data.dataloader import DataLoader
batch_size = 16
epoch_num = 1000
learning_rate = 0.0001
print(torch.cuda.is_available())
def loop(model, train_dataset, devel_dataset):
    df_val = dfs["devel"]
    df_train = dfs["train"]
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(devel_dataset, batch_size=batch_size)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    
    is_cuda_available = torch.cuda.is_available()
    device  = "cuda" if is_cuda_available else "cpu"
    if is_cuda_available:
        model.to(device)
    model.bert.requires_grad_(False)
    loss_function = nn.MSELoss() 
    
   
    for epoch in range(epoch_num):
        model.train()
        total_acc_train = 0
        total_loss_train = 0
        for tokenized_sentence, label in tqdm(train_dataloader):
            train_label = label.to(device)
            attention_mask = tokenized_sentence['attention_mask'].squeeze(1).to(device)
            input_ids = tokenized_sentence['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask, train_label)
            #print(logits, train_label)
            print(logits.size(), train_label.size())
            loss = loss_function(logits, train_label.float())
            for i in range(logits.shape[0]):
                
                logits_clean = logits[i][train_label[i] != -1]
                label_clean = train_label[i][train_label[i] != -1]
                #print(logits_clean)
                #print(label_clean)
       
                
            loss.backward()
            optimizer.step()
        model.eval()
        

def main():
    model = NERModel(len(labels))
    print(model)
    loop(model, train_dataset, devel_dataset)
main()
            

True


Some weights of the model checkpoint at SZTAKI-HLT/hubert-base-cc were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NERModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32001, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

torch.Size([16, 512, 768])
heee 

  0%|                                                                                           | 0/10 [00:01<?, ?it/s]

tensor([[[-2.3860e-01,  2.3554e-01, -6.5911e-02,  ..., -9.0682e-02,
          -1.6434e-01, -1.4770e-02],
         [-2.6156e-01,  3.6104e-01, -2.9063e-01,  ..., -3.6796e-03,
          -1.9368e-01,  1.8293e-01],
         [-1.3168e-01,  7.1370e-02, -1.5125e-01,  ..., -2.9445e-02,
          -6.0380e-02,  1.7509e-01],
         ...,
         [-1.5828e-01,  1.8755e-01, -1.3426e-01,  ...,  4.0988e-02,
          -9.2832e-02,  2.4808e-01],
         [-2.1980e-01,  2.7189e-01, -1.2024e-01,  ..., -2.5296e-02,
          -1.4686e-01,  2.9959e-01],
         [-1.7129e-01,  2.9584e-01, -1.1069e-01,  ..., -4.1005e-03,
          -1.7658e-01,  2.6323e-01]],

        [[-3.4297e-02,  1.8148e-01,  6.7710e-02,  ...,  1.7566e-01,
           2.0801e-02,  2.5981e-01],
         [-2.0423e-01,  2.1441e-01, -2.0650e-01,  ...,  9.4911e-02,
          -6.1172e-02,  3.3445e-01],
         [ 5.6949e-03,  1.8917e-01, -1.6483e-01,  ...,  1.7189e-01,
           1.3830e-01,  2.4446e-01],
         ...,
         [-2.6855e-01,  1




RuntimeError: The size of tensor a (9) must match the size of tensor b (512) at non-singleton dimension 2

In [61]:
test = dfs["test"]
test[test["genre"] == "news"]["CONLL:NER"].value_counts()

O         19536
B-PER       306
B-ORG       254
B-LOC       199
I-PER       195
B-MISC      143
I-ORG       106
I-MISC       86
I-LOC        23
Name: CONLL:NER, dtype: int64

In [None]:
model = AutoModel.from_pretrained("bert-base-cased",  num_labels=len(labels))

In [None]:
from transformers import BertForTokenClassification
BertForTokenClassification.from_pretrained("SZTAKI-HLT/hubert-base-cc",  num_labels=len(labels))