In [2]:
from datasets import Dataset
from transformers import (DataCollatorForTokenClassification,
                          BertTokenizer,BertForTokenClassification)
from transformers import Trainer,TrainingArguments
#######
from utils.corpusprocessor import CorpusType
from utils.corpusprocessor import CorpusLoader
from utils.labeler import Labeler
from utils.datasetbuilder import DatasetBuilder
# Loading the corpus using the custom CorpusLoader class

In [1]:
corpus = CorpusLoader()
bijan_data = corpus.load_bijan(CorpusType.whole_raw)
labeler = Labeler( tags=(1, 2),
                 regexes=(r'[^\S\r\n\v\f]', r'\u200c'),
                 chars=(" ", "‌"),
                 class_count=2,
                 )
labeler.set_text(bijan_data, corpus_type=CorpusType.whole_raw)
chars, labels = labeler.labeler()
model_dir = "./Model3111/"

# pretrained_model = "HooshvareLab/bert-base-parsbert-uncased"
pretrained_model = "bert-base-multilingual-uncased"

tokenizer = BertTokenizer.from_pretrained(pretrained_model)
dataset_builder= DatasetBuilder(tokenizer)
# Initializing a data collator for token classification with the pre-trained tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# chars = chars[:2100]
# labels = labels[:2100]


NameError: name 'CorpusLoader' is not defined

In [24]:

tokens = tokenizer(chars, padding="max_length", is_split_into_words=True,max_length=512)["input_ids"]
# Prepare the labels. Here I'm simply padding the labels list with -100s (the default ignore index in PyTorch)
labels = labels + [-100] * (len(tokens) - len(labels))
# print(tokens)
# print(len(tokens))

In [14]:
input_ids, attention_mask, labels = dataset_builder.chunck_tokens(tokens, labels,chunk_size=512,summery=True)
dataset = Dataset.from_dict({"input_ids": input_ids,
                             "labels": labels,
                             "attention_mask":attention_mask})

Shapes
IDs:			 (5, 512)
Labels:			 (5, 512)
Attention Mask:	 (5, 512)


In [28]:
print(len(tokens))
print(len(labels))
print(len(chars))
b = 10000
for i in range(100):
    print(tokens[b+i],chars[b+i],labels[b+i])

9865889
9876930
9876930
464 پ 0
454 ش 0
474 ت 0
451 ك 0
461 ا 0
461 ر 1
479 ر 0
462 و 0
478 ز 0
459 ه 2
451 د 0
461 ا 0
476 ر 1
481 م 0
454 ي 2
479 ت 0
451 و 0
477 ا 0
459 ن 0
478 د 1
476 ه 0
481 م 0
464 ي 0
478 ش 0
451 ه 1
479 ا 0
461 و 1
451 ر 0
479 ا 1
451 و 0
459 ا 0
451 د 0
461 ا 0
474 ر 1
477 ك 0
459 ن 0
474 د 1
478 ك 0
451 ه 1
462 ا 0
451 ز 1
476 ا 0
454 م 0
457 ت 0
451 ح 0
477 ا 0
451 ن 0
454 ا 0
477 ت 1
454 ن 0
461 ت 0
463 ر 0
459 س 0
479 د 1
478 و 1
476 ه 0
481 م 0
464 ي 0
478 ش 0
459 ه 1
461 د 0
451 ر 1
476 ا 0
454 م 0
457 ت 0
451 ح 0
477 ا 0
451 ن 0
454 ا 0
476 ت 1
458 م 0
454 خ 0
475 ت 0
472 ل 0
463 ف 1
461 س 0
452 ر 0
475 ب 0
477 ل 0
459 ن 0
452 د 1
451 ب 0
464 ا 0
459 ش 0
119 د 0
479 . 1
478 و 1
476 ه 0
478 م 0
451 ه 1
481 ا 0
477 ي 0
478 ن 0
451 ه 0
474 ا 1
478 ك 0
507 ه 1
472 گ 0
454 ف 0
481 ت 0


In [29]:
import numpy as np
mask_shape = np.array(attention_mask).shape
ids_shape = np.array(input_ids).shape
lbl_shape = np.array(labels).shape
print("Shapes\nIDs:\t\t\t", ids_shape)
print("Labels:\t\t\t", lbl_shape)
print("Attention Mask:\t", mask_shape)

Shapes
IDs:			 (5, 512)
Labels:			 (9876930,)
Attention Mask:	 (5, 512)


In [None]:
dataset


In [None]:
model = BertForTokenClassification.from_pretrained(pretrained_model,num_labels=3)
import torch
# torch.backends.cuda.chunk_size = 512 * 1024 * 1024*2
model

In [11]:

# Setting up training arguments for the Trainer class
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=200,
    # max_steps=200,
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    # warmup_steps=500,2
    logging_dir=model_dir+'/logs',
    save_strategy="epoch"
)

# Initializing a Trainer instance with the model, training arguments, dataset, tokenizer, and data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [12]:
import torch
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)



In [7]:
torch.backends.cuda.chunk_size = 512 * 1024 * 1024


In [13]:
# Training the model on the dataset
trainer.train()
trainer.save_model(model_dir + "model/") 



Step,Training Loss


KeyboardInterrupt: 

In [1]:
import torch
print("Torch version:",torch.__version__)
print(f"Is CUDA supported by this system?	{torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
cuda_id = torch.cuda.current_device()
print(f"ID of current CUDA device:	{torch.cuda.current_device()}")
print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")
import os
print(os.environ.get("PYTORCH_CUDA_ALLOC_CONF"))
torch.backends.cuda.chunk_size = 512 * 1024 * 1024

Torch version: 2.1.0+cu121
Is CUDA supported by this system?	True
CUDA version: 12.1
ID of current CUDA device:	0
Name of current CUDA device:NVIDIA GeForce RTX 3050 Laptop GPU
None


In [10]:
torch.cuda.get_device_properties(0)

print(torch.cuda.memory_allocated())


0
