Load and preprocess the data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch

# Function to parse the file
def parse_pos_file(file_path):
    sentences = []
    current_sentence = []
    
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line.startswith('<Sentence id='):
                if current_sentence:
                    sentences.append(current_sentence)
                current_sentence = []
            elif line and not line.startswith('</Sentence>'):
                current_sentence.extend(line.split())
        if current_sentence:
            sentences.append(current_sentence)
    
    data = []
    for sentence in sentences:
        words = []
        tags = []
        for word_tag in sentence:
            parts = word_tag.rsplit('_', 1)
            if len(parts) == 2:
                word, tag = parts
                words.append(word)
                tags.append(tag)
            else:
                print(f"Warning: Unexpected format in word_tag: {word_tag}")
        if words and tags:
            data.append((words, tags))
    
    return data

# Load your dataset
file_path = 'indian\hindi.pos'
data = parse_pos_file(file_path)

# Print some information about the dataset
print(f"Number of sentences: {len(data)}")
if data:
    print(f"Example sentence: {data[0]}")
else:
    print("No valid sentences found in the data.")

# Split the data
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Function to encode the data
def encode_data(sentences, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []
    labels = []
    
    for words, tags in sentences:
        encoded = tokenizer.encode_plus(
            words,
            is_split_into_words=True,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True
        )
        
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        
        # Adjust labels to match tokenized input
        label_ids = [-100] + [tag2id[tag] for tag in tags] + [-100] * (max_length - len(words) - 1)
        label_ids = label_ids[:max_length]
        labels.append(label_ids)
    
    return {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(attention_masks),
        'labels': torch.tensor(labels)
    }

# Create tag to ID mapping
all_tags = set(tag for _, tags in data for tag in tags)
tag2id = {tag: id for id, tag in enumerate(all_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

# Encode the datasets
train_encodings = encode_data(train_data, tokenizer)
val_encodings = encode_data(val_data, tokenizer)
test_encodings = encode_data(test_data, tokenizer)

# Print some information about the dataset
print(f"Number of sentences: {len(data)}")
print(f"Number of unique tags: {len(all_tags)}")
print(f"Tags: {', '.join(all_tags)}")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

  from .autonotebook import tqdm as notebook_tqdm


Number of sentences: 540
Example sentence: (['पूर्ण', 'प्रतिबंध', 'हटाओ', ':', 'इराक'], ['JJ', 'NN', 'VFM', 'SYM', 'NNP'])
Number of sentences: 540
Number of unique tags: 26
Tags: , NNPC, VRB, PUNC, VNN, NEG, SYM, VJJ, NNC, NVB, NN, INTF, VFM, QW, JJ, CC, VAUX, QF, JVB, QFNUM, NNP, RB, PREP, PRP, NLOC, RP
Training samples: 378
Validation samples: 81
Test samples: 81


Create dataset and dataloader

In [2]:
from torch.utils.data import Dataset, DataLoader

class POSDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['labels'])

# Create datasets
train_dataset = POSDataset(train_encodings)
val_dataset = POSDataset(val_encodings)
test_dataset = POSDataset(test_encodings)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

Set up the model

In [3]:
from transformers import BertForTokenClassification

num_labels = len(tag2id)
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Set up training

In [4]:
from transformers import AdamW, get_linear_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

num_epochs = 10
num_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



Training loop

In [5]:
from tqdm import tqdm

def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
    return total_loss / len(dataloader)

for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, scheduler, device)
    val_loss = evaluate(model, val_loader, device)
    print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 24/24 [00:21<00:00,  1.13it/s]


Epoch 1: Train Loss: 2.7226, Val Loss: 2.6157


 42%|████▏     | 10/24 [00:09<00:13,  1.07it/s]


KeyboardInterrupt: 

Evaluate the model

In [None]:
import torch
from sklearn.metrics import classification_report

def get_predictions(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=2)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return predictions, true_labels

# Get predictions
predictions, true_labels = get_predictions(model, test_loader, device)

# Convert numeric labels back to text labels
label_map = id2tag  # We already created this mapping earlier
true_labels_text = [[label_map[l] for l in label if l != -100] for label in true_labels]
predictions_text = [[label_map[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, true_labels)]

# Flatten the lists
flattened_true_labels = [label for sublist in true_labels_text for label in sublist]
flattened_predictions = [label for sublist in predictions_text for label in sublist]

# Print the classification report
print(classification_report(flattened_true_labels, flattened_predictions))


              precision    recall  f1-score   support

          CC       0.24      0.10      0.14        40
        INTF       0.00      0.00      0.00         3
          JJ       0.42      0.09      0.15        54
         JVB       0.00      0.00      0.00        13
         NEG       0.00      0.00      0.00         7
        NLOC       0.00      0.00      0.00         7
          NN       0.28      0.45      0.34       249
         NNC       0.25      0.09      0.13        55
         NNP       0.48      0.38      0.42       119
        NNPC       0.12      0.09      0.11        32
         NVB       0.00      0.00      0.00        23
        PREP       0.33      0.49      0.40       291
         PRP       0.48      0.25      0.33        60
        PUNC       0.44      0.39      0.42        82
          QF       0.00      0.00      0.00         6
       QFNUM       0.37      0.47      0.41        75
          QW       0.00      0.00      0.00         1
          RB       0.67    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Save the model

In [None]:
model.save_pretrained('./pos_tagger_model')
tokenizer.save_pretrained('./pos_tagger_model')

('./pos_tagger_model\\tokenizer_config.json',
 './pos_tagger_model\\special_tokens_map.json',
 './pos_tagger_model\\vocab.txt',
 './pos_tagger_model\\added_tokens.json')

Use the model for tagging:

In [None]:
def tag_text(text, model, tokenizer, device):
    model.eval()
    words = text.split()
    inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    tags = [id2tag[p.item()] for p in predictions[0][1:-1]]  # Ignore [CLS] and [SEP] tokens
    return list(zip(words, tags))

# Example usage
text = "पूर्ण प्रतिबंध हटाओ : इराक"
tagged = tag_text(text, model, tokenizer, device)
print(tagged)

[('पूर्ण', 'NNC'), ('प्रतिबंध', 'NN'), ('हटाओ', 'PREP'), (':', 'PREP'), ('इराक', 'PREP')]
