In [1]:
# Importing the required libraries
import os
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                          AutoConfig, Trainer, TrainingArguments, DataCollatorForTokenClassification)

# Setting up a proxy for all connections made by the Python interpreter
# os.environ['all_proxy'] = "socks5://127.0.0.1:10808"

# Changing the current working directory to "/home/matin/Desktop/Masters"
# os.chdir("/home/matin/Desktop/Masters")

# Importing custom classes for loading and labeling the corpus
from utils.corpusloader import CorpusType
from utils.corpusloader import CorpusLoader
from utils.labeler import Labeler


2023-10-07 12:29:36.234776: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-10-07 12:29:36.234797: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Loading the corpus using the custom CorpusLoader class
corpus = CorpusLoader()
data = corpus.read_bijan(CorpusType.sents_raw)
# TODO: add label 1 to the end of sentences
# TODO: add the CLS/SEP labels to the label sequences

# Labeling the data using the custom Labeler class.
# `chars` and `labels` are parallel sequences: one entry per sample.
labeler = Labeler()
labeler.set_text(data, corpus_type=CorpusType.sents_raw)
chars, labels = labeler.labeler()

# Checkpoint used for the tokenizer/model, and the directory training output is written to
pretrained_model = "HooshvareLab/bert-base-parsbert-uncased"
model_dir = "./Model1/"
# pretrained_model = "bert-base-multilingual-cased"

# Limiting the data to the first NUM_SAMPLES samples for demonstration purposes
NUM_SAMPLES = 20
chars = chars[:NUM_SAMPLES]
labels = labels[:NUM_SAMPLES]

# Converting every per-character label to an integer.
# NOTE(review): presumably each entry of `labels` is a sequence of digit
# characters produced by Labeler -- confirm against utils.labeler.
labels = [[int(l) for l in label] for label in labels]


In [3]:
# Loading the pre-trained tokenizer and model configuration from Hugging Face's model hub
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
config = AutoConfig.from_pretrained(pretrained_model)

# Modifying the model configuration to have 3 labels for token classification
config.num_labels = 3

# Loading the checkpoint weights together with the modified configuration.
# `from_config` would only build the architecture with randomly initialised
# weights; `from_pretrained` loads the pre-trained encoder and only the new
# 3-way classification head starts from scratch.
model = AutoModelForTokenClassification.from_pretrained(pretrained_model, config=config)


In [4]:

# Building the model inputs: one id sequence and one label sequence per sample,
# each wrapped as [CLS] ... [SEP] and capped at MAX_SEQ_LEN positions.
MAX_SEQ_LEN = 512                # maximum sequence length, special tokens included
MAX_BODY_LEN = MAX_SEQ_LEN - 2   # room left for the characters once [CLS]/[SEP] are added

# Taking the special-token ids from the tokenizer instead of hard-coding 2 and 4,
# so the cell stays correct when `pretrained_model` points at another vocabulary.
cls_id = tokenizer.cls_token_id
sep_id = tokenizer.sep_token_id

# Tokenizing the input data character by character and adding the special tokens
tokens_all = []
for char in chars:
    # encode() wraps its output in special tokens, so index 1 is the first
    # piece produced for the character.
    # NOTE(review): if a character yields no piece at all, index 1 is the
    # closing special token -- confirm this cannot happen for this corpus.
    body_ids = [tokenizer.encode(ch)[1] for ch in char]
    # Truncating BEFORE adding [SEP] so long sequences still end with it
    tokens_all.append([cls_id] + body_ids[:MAX_BODY_LEN] + [sep_id])

# Creating the matching label sequences
labels_all = []
for label in labels:
    # Working on a copy: mutating `labels` in place would add another pair of
    # special-token labels every time this cell is re-run.
    body_labels = list(label)
    if body_labels:
        # at the end of a sentence there would be a space obviously
        body_labels[-1] = 1
    # Label 0 is used for the [CLS] and [SEP] positions.
    # NOTE(review): these positions therefore contribute to the loss as class 0;
    # confirm that is intended rather than masking them out.
    labels_all.append([0] + body_labels[:MAX_BODY_LEN] + [0])


In [5]:

# Wrapping the token ids and their labels in a Hugging Face Dataset
dataset = Dataset.from_dict({"input_ids": tokens_all, "labels": labels_all})

# Batch collator for token classification, built around the same tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Hyperparameters for the run, gathered in one place.
# Evaluation and checkpoint saving are both switched off.
hyperparameters = {
    "output_dir": model_dir,
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 2,
    "weight_decay": 0.01,
    "evaluation_strategy": "no",
    "save_strategy": "no",
}
training_args = TrainingArguments(**hyperparameters)

# The Trainer ties together the model, the arguments, the training data,
# the tokenizer and the collator; no evaluation set is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=None,
)


In [6]:
# Training the model on the dataset
trainer.train()

# Saving the trained model in the "model" sub-directory of `model_dir`
# (i.e. "./Model1/model" with the current setting). os.path.join keeps the
# path correct whether or not `model_dir` ends with a separator.
trainer.save_model(os.path.join(model_dir, "model"))


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
