In [1]:
# Importing the required libraries
import os
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                          AutoConfig, Trainer, TrainingArguments, DataCollatorForTokenClassification)

from transformers import (BertTokenizerFast,
                          BertForTokenClassification,
                          Trainer,
                          TrainingArguments)
import torch
# Importing custom classes for loading and labeling the corpus
from utils.corpusprocessor import CorpusType
from utils.corpusprocessor import CorpusLoader
from utils.labeler import Labeler

# Loading the corpus using the custom CorpusLoader class
corpus = CorpusLoader()
labeler = Labeler( tags=(1, 2),
                 regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
                 chars=(" ", "‌"),
                 class_count=2,
                 )


bijan_data = corpus.load_bijan(CorpusType.whole_raw)
labeler.set_text(bijan_data, corpus_type=CorpusType.whole_raw)
chars, labels = labeler.labeler()
chars = chars[:3000]
labels = labels[:3000]
model_dir = "./Model2113/"
# pretrained_model = "HooshvareLab/bert-base-parsbert-uncased"
pretrained_model = "bert-base-multilingual-uncased"

2023-10-31 17:06:40.786830: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-10-31 17:06:40.786909: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model)
# Tokenize your data
tokenized_inputs = tokenizer(chars, padding="max_length", is_split_into_words=True,)
encoded_tokenized = [tokenizer.encode(x)[1] for x in chars]
# Prepare the labels. Here I'm simply padding the labels list with -100s (the default ignore index in PyTorch)
labels = labels + [-100] * (len(encoded_tokenized) - len(labels))
# data = Dataset.from_dict({"input_ids": [tokenized_inputs['input_ids']], "attention_mask": [tokenized_inputs['attention_mask']], "labels": [labels]})


In [3]:
def chunked_tokenization(tokens, labels,tokenizer,chunk_size=512):
    input_ids_list = []
    attention_mask_list = []
    labels_list = []

   
    # Create chunks
    for i in range(0, len(tokens), 512 - 2):  # We subtract 2 to account for special tokens
        chunk_tokens = tokens[i:i + 512 - 2]
        chunk_label_ids = labels[i:i + 512 - 2]

        # Add special tokens
        chunk_tokens = [tokenizer.encode('[CLS]')[1]] + chunk_tokens + [tokenizer.encode('[SEP]')[1]]
        # chunk_tokens = ['[CLS]'] + chunk_tokens + ['[SEP]']
        chunk_label_ids = [-100] + chunk_label_ids + [-100]

        # Convert tokens to input IDs and create attention mask
       
        chunk_attention_mask = [1] * len(chunk_tokens)

        # Pad sequenceschunk_input_ids = tokenizer.convert_tokens_to_ids(chunk_tokens)
        while len(chunk_tokens) < chunk_size:
            chunk_tokens.append(0)
            chunk_attention_mask.append(0)
            chunk_label_ids.append(-100)

        input_ids_list.append(chunk_tokens)
        attention_mask_list.append(chunk_attention_mask)
        labels_list.append(chunk_label_ids)

    return input_ids_list, attention_mask_list, labels_list



# Use this function to prepare your data
input_ids_list, attention_mask_list, labels_list = chunked_tokenization(encoded_tokenized, labels,tokenizer)

# Testing the Model Input

In [4]:
import numpy as np
atmask=np.array(attention_mask_list).shape
inpid=np.array(input_ids_list).shape
labls=np.array(labels_list).shape

print(atmask,inpid,labls)
def count_unique_types(two_d_list):
    unique_types = set()
    for sublist in two_d_list:
        for item in sublist:
            unique_types.add(type(item))
    return unique_types

# Example usage:



print('input_ids_list\n',np.array(input_ids_list))
print(inpid,count_unique_types(input_ids_list))  # Output: 4


print('\n\n\n\nlabels_list\n',np.array(labels_list))
print(labls,count_unique_types(labels_list))  # Output: 4

print('\n\n\n\nattention_mask_list\n',np.array(attention_mask_list))
print(atmask,count_unique_types(attention_mask_list))  # Output: 4

(6, 512) (6, 512) (6, 512)
input_ids_list
 [[101 108 451 479 475 481 477 463 481 451 461 478 458 451 461 456 451 462
  476 477 468 479 476 478 464 476 463 481 459 481 459 478 464 459 119 108
  108 479 451 464 477 507 454 477 471 458 452 461 507 462 451 461 481 456
  476 478 479 461 481 451 463 475 451 476 481 108 463 454 451 461 478 464
  477 451 463 451 477 476 481 507 479 481 477 459 474 478 476 476 474 477
  451 463 454 451 479 475 481 477 463 481 451 461 478 458 451 461 456 451
  462 476 477 468 479 476 478 464 476 463 481 461 451 459 481 459 478 452
  451 464 477 459 119 467 481 463 451 475 478 451 481 451 458 481 461 446
  452 481 464 451 462 459 479 459 479 456 481 477 463 481 451 461 478 459
  461 476 459 451 461 451 467 461 451 472 463 454 451 461 478 478 451 474
  464 472 464 459 478 451 477 459 119 475 481 474 477 463 454 451 461 478
  464 477 451 463 451 477 478 461 507 462 451 481 477 463 481 451 461 478
  478 451 461 451 479 451 473 469 451 102 477 459 481 459 478 452 479

In [5]:
import numpy as np
# np.array(labels_list)[:]
# tokenizer.decode(input_ids_list[0])


# object from the tokenized inputs and labels
dataset = Dataset.from_dict({"input_ids": input_ids_list, "labels": labels_list})
# train_dataset = torch.utils.data.Dataset()
# train_dataset.encodings = input_ids_list
# train_dataset.labels = labels_list
# train_dataset.attention_mask = attention_mask_list

# Initializing a data collator for token classification with the pre-trained tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [6]:
# Prepare your data

# Create a Dataset object
data = Dataset.from_dict({"input_ids": input_ids_list, "labels": labels_list,})
data
# "attention_mask":attention_mask_list

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 6
})

In [7]:
model = BertForTokenClassification.from_pretrained(pretrained_model,num_labels=3)
model

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [8]:

# Setting up training arguments for the Trainer class
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=200,
    # max_steps=200,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    # warmup_steps=500,2
    logging_dir=model_dir+'/logs',
    evaluation_strategy="no",
    save_strategy="no"
)

# Initializing a Trainer instance with the model, training arguments, dataset, tokenizer, and data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=data_collator
)



In [9]:
# Training the model on the dataset
trainer.train()


trainer.save_model(model_dir + "model/") 

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
trainer.save_model(model_dir + "model/") 
