In [33]:
def read_lines(path):
	with open(path, 'r') as file:
		return file.readlines()

sentences = read_lines('temp/sentences.txt')
characters_fullnames = read_lines('temp/characters.txt')	

characters = []
for name in characters_fullnames:
	words = name.strip().split() 
	characters.extend(words) 

characters = set(characters)
characters = list(characters)
characters = [item for item in characters if item not in ['I', 'II', 'III', 'IV', 'V', 'VI']]
characters = characters[0:128]

In [34]:
import torch
from transformers import BertTokenizer, BertForTokenClassification


# Create a mapping of characters to labels
label_map = {char: i for i, char in enumerate(characters)}

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(characters))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [36]:

from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

# Custom Dataset for training
class NERDataset(Dataset):
	def __init__(self, sentences, labels, tokenizer):
		self.sentences = sentences
		self.labels = labels
		self.tokenizer = tokenizer

	def __len__(self):
		return len(self.sentences)

	def __getitem__(self, idx):
		sentence = self.sentences[idx]
		label = self.labels[idx]

		inputs = self.tokenizer.encode_plus(
			sentence,
			add_special_tokens=True,
			return_tensors="pt",
			max_length=128,
			truncation=True,
			padding='max_length',
		)
		return {
			'input_ids': inputs['input_ids'].squeeze(),
			'attention_mask': inputs['attention_mask'].squeeze(),
			'labels': torch.tensor(label, dtype=torch.long)
		}

# Prepare data
labels = [[1 if char in sentence else 0 for char in characters] for sentence in sentences]

# Pad sequences in batches
def collate_fn(batch):
	input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True)
	attention_mask = pad_sequence([item['attention_mask'] for item in batch], batch_first=True)
	labels = torch.stack([item['labels'] for item in batch])

	return {
		'input_ids': input_ids,
		'attention_mask': attention_mask,
		'labels': labels
	}

dataset = NERDataset(sentences, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
# Fine-tuning
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

print(f"Cuda={torch.cuda.is_available()}")
#device = ("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
model.to(device)

for epoch in range(3):
	model.train()
	total_loss = 0

	for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
		input_ids = batch['input_ids'].to(device)
		attention_mask = batch['attention_mask'].to(device)
		labels = batch['labels'].to(device)

		outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
		loss = outputs.loss
		loss.backward()

		optimizer.step()
		optimizer.zero_grad()

		total_loss += loss.item()

	print(f"Epoch {epoch+1} - Loss: {total_loss / len(dataloader)}")

print("Training finished!")

Cuda=True


Epoch 1: 100%|██████████| 1488/1488 [31:45<00:00,  1.28s/it]


Epoch 1 - Loss: 0.04302178915461428


Epoch 2: 100%|██████████| 1488/1488 [31:52<00:00,  1.29s/it]


Epoch 2 - Loss: 0.005218633826726908


Epoch 3: 100%|██████████| 1488/1488 [31:57<00:00,  1.29s/it]

Epoch 3 - Loss: 0.004035595181331671
Training finished!





In [38]:
model_path = 'models/chars.pth'  # Path where you want to save the model

# Save the entire model including both model architecture and trained weights
torch.save(model.state_dict(), model_path)

In [51]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the pre-trained BERT model architecture
trained_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(characters))
trained_model.load_state_dict(torch.load(model_path), strict=False)  # Load the trained weights
trained_model.eval()  # Set the model to evaluation mode

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [53]:
text = sentences[0]
inputs = tokenizer.encode_plus(
	text,
	add_special_tokens=True,
	return_tensors="pt",
	max_length=128,
	truncation=True,
	padding='max_length'
)
with torch.no_grad():
	input_ids = inputs['input_ids']
	attention_mask = inputs['attention_mask']
	outputs = trained_model(input_ids, attention_mask=attention_mask)

predicted_class = torch.argmax(outputs.logits, dim=1)
print("Predicted Class:", characters[predicted_class.item()])

Predicted Class: Caliondo
