In [59]:
import string
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from TorchCRF import CRF

In [60]:
def get_words(filepath):
    # given the source file path, return all input words
    words = []
    with open(filepath, encoding='utf-8') as file:
        lines = file.readlines()
        for line in lines:
            words += [line.strip()]
    return words

def tag_original_word(original_word, tokenized_word):
    # Initialize pointers for the original word and tags
    org_idx = tok_idx = 0
    tags = []
    tag = "B"
    # Iterate through the tokenized word
    while tok_idx < len(tokenized_word) and org_idx < len(original_word):
        # Skip characters not in the original word
        if tokenized_word[tok_idx] == original_word[org_idx]:
            tags.append(tag)
            if tag == "B":
                tag = "I"
            org_idx += 1
        elif tokenized_word[tok_idx] == " ":
            tag = "B"
            tok_idx += 1
        else:
            tok_idx += 1

    # Return the final tags as a string
    return "".join(tags)

# preparing labels for the dataset
# Using "B", "I" labeling, B for beginning of the word, I for inside the word
def get_labels(origional_words, tokenized_words):
    labels = []
    for o, t in zip(origional_words, tokenized_words):
        labels.append(tag_original_word(o, t))
    return labels

In [61]:
def encode(word, char_to_idx, is_input):
    if is_input:
        return [char_to_idx[ch] if ch in char_to_idx else char_to_idx['<unk>'] for ch in word]
    return [char_to_idx['<start>']] + [char_to_idx[ch] for ch in word] + [char_to_idx['<end>']]

def decode(encoded_word, idx_to_char):
    return ''.join(idx_to_char[idx] for idx in encoded_word if idx_to_char[idx] not in ['<pad>', '<unk>', '<start>', '<end>'])

def encode_whole(words, char_to_idx, is_input):
    words_encoded = []
    for word in words:
        encoded_word = encode(word, char_to_idx, is_input)
        words_encoded.append(encoded_word)
    
    return words_encoded

def decode_whole(encoded_words, idx_to_char):   
    decoded_words = []
    for encoded_word in encoded_words:
        decoded_word = decode(encoded_word, idx_to_char)
        decoded_words.append(decoded_word)
    
    return decoded_words

In [62]:
from collections import Counter
# labels = get_labels('dataset/shp.train.tgt')
input_words = get_words('dataset/shp.train.src')
output_words = get_words('dataset/shp.train.tgt')

train_labels = get_labels(input_words, output_words)
# print(train_labels)
# print(Counter(''.join(train_labels)))

val_input_words = get_words('dataset/shp.dev.src')
val_output_words = get_words('dataset/shp.dev.tgt')

val_labels = get_labels(val_input_words, val_output_words)

test_words = get_words('dataset/shp.test.src')

# get char to index mapping and index to char mapping
all_chars = set(char for seq in input_words + val_input_words for char in seq)
all_labels = set(label for lbls in train_labels for label in lbls)

char_to_idx = {char: idx + 1 for idx, char in enumerate(sorted(all_chars))}  # Reserve 0 for padding
char_to_idx["<UNK>"] = len(char_to_idx) + 1 
label_to_idx = {label: idx for idx, label in enumerate(sorted(all_labels))}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}

# encoded_origin = encode_whole(input_words, char_to_idx, is_input=True)
# encoded_tokenized = encode_whole(train_labels, label_to_idx, is_input=False)
# encoded_val = encode_whole(val_input_words, char_to_idx, is_input=True)
# encoded_val_tokenized = encode_whole(val_labels, label_to_idx, is_input=False)


In [63]:
from torch.utils.data import Dataset

class SequenceLabelingDataset(Dataset):
    def __init__(self, sequences, labels, char_to_idx, label_to_idx):
        self.sequences = [[char_to_idx.get(char, char_to_idx['<UNK>']) for char in seq] for seq in sequences]
        self.labels = [[label_to_idx.get(label) for label in seq] for seq in labels]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        return torch.tensor(sequence, dtype=torch.long), torch.tensor(label, dtype=torch.long)


In [64]:
def collate_fn(batch):
    sequences, labels = zip(*batch)
    max_len = max(max(len(seq) for seq in sequences), max(len(lbl) for lbl in labels))
    
    # Pad sequences
    padded_sequences = [
        torch.cat([seq, torch.zeros(max_len - len(seq), dtype=torch.long)], dim=0)
        for seq in sequences
    ]
    
    # Pad labels
    padded_labels = [
        torch.cat([lbl, torch.full((max_len - len(lbl),), -1, dtype=torch.long)], dim=0)
        for lbl in labels
    ]

    # Create a mask to indicate valid positions (1 for valid, 0 for padding)
    mask = [
        torch.cat([torch.ones(len(seq), dtype=torch.uint8), torch.zeros(max_len - len(seq), dtype=torch.uint8)], dim=0)
        for seq in sequences
    ]
    
    # Ensure the first timestep of the mask is always 1
    for m in mask:
        m[0] = 1
    
    return torch.stack(padded_sequences), torch.stack(padded_labels), torch.stack(mask)


In [65]:
from torch.utils.data import DataLoader

train_dataset = SequenceLabelingDataset(input_words, train_labels, char_to_idx, label_to_idx)
val_dataset = SequenceLabelingDataset(val_input_words, val_labels, char_to_idx, label_to_idx)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [66]:
# BiLSTM-CRF Model
class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size):
        super(BiLSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        embeddings = self.embedding(x)
        lstm_out, _ = self.lstm(embeddings)
        emissions = self.hidden2tag(lstm_out)
        
        if tags is not None:
            valid_mask = mask.bool()
            loss = -self.crf(emissions, tags, mask=valid_mask, reduction='mean')
            return loss
        else:
            return self.crf.decode(emissions, mask=mask.bool())

In [67]:
from tqdm import tqdm

vocab_size = len(char_to_idx) + 1
tagset_size = len(label_to_idx)
embedding_dim = 50
hidden_dim = 100

# Instantiate Model
model = BiLSTMCRF(vocab_size, embedding_dim, hidden_dim, tagset_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
MODEL_PATH = "best_seq2seq_model.pth"


# Training Loop
def train_model(model, train_loader, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        with tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}/{epochs}", unit="batch") as pbar:
            for sequences, labels, mask in train_loader:
                sequences, labels, mask = sequences.to(device), labels.to(device), mask.to(device)
                # mask = (labels != -1).float()  # Mask padded labels

                optimizer.zero_grad()
                loss = model(sequences, tags=labels, mask=mask)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                pbar.update(1)

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")
        torch.save(model.state_dict(),MODEL_PATH)

train_model(model, train_loader, optimizer, epochs=10)

Epoch 1/10: 100%|██████████| 28/28 [00:01<00:00, 20.19batch/s]


Epoch 1/10, Loss: 2.6133


Epoch 2/10: 100%|██████████| 28/28 [00:01<00:00, 16.81batch/s]


Epoch 2/10, Loss: 1.6568


Epoch 3/10: 100%|██████████| 28/28 [00:02<00:00, 12.98batch/s]


Epoch 3/10, Loss: 1.4229


Epoch 4/10: 100%|██████████| 28/28 [00:03<00:00,  9.28batch/s]


Epoch 4/10, Loss: 1.2546


Epoch 5/10: 100%|██████████| 28/28 [00:03<00:00,  7.89batch/s]


Epoch 5/10, Loss: 1.0682


Epoch 6/10: 100%|██████████| 28/28 [00:02<00:00, 12.03batch/s]


Epoch 6/10, Loss: 0.9981


Epoch 7/10: 100%|██████████| 28/28 [00:02<00:00, 11.42batch/s]


Epoch 7/10, Loss: 0.8342


Epoch 8/10: 100%|██████████| 28/28 [00:02<00:00, 10.95batch/s]


Epoch 8/10, Loss: 0.7512


Epoch 9/10: 100%|██████████| 28/28 [00:02<00:00, 12.51batch/s]


Epoch 9/10, Loss: 0.7032


Epoch 10/10: 100%|██████████| 28/28 [00:02<00:00, 13.33batch/s]

Epoch 10/10, Loss: 0.5948





In [68]:
from sklearn.metrics import accuracy_score

def evaluate_model(model, data_loader, device):
    model.eval()
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for sequences, labels, mask in data_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            
            # Mask for valid positions (non-padded)
            # mask = (labels != -1).float()

            # Get predictions from the model
            predicted_indices = model(sequences, mask=mask)

            # Collect predictions and targets
            for pred_seq, true_seq, seq_mask in zip(predicted_indices, labels, mask):
                # Convert mask to int for slicing
                seq_len = int(seq_mask.sum().item())
                
                # Remove padding and collect valid predictions and targets
                all_predictions.extend(pred_seq[:seq_len])
                all_targets.extend(true_seq[:seq_len].tolist())

    # Calculate accuracy
    accuracy = accuracy_score(all_targets, all_predictions)
    return accuracy

# Prediction Loop
model.eval()
with torch.no_grad():
    for sequences, _, mask in val_loader:
        sequences = sequences.to(device)
        predictions = model(sequences, mask=mask)
        print("Predictions:", predictions)


# Evaluate the model on the dev dataset
accuracy = evaluate_model(model, val_loader, device)
print(f"Dev Set Accuracy: {accuracy * 100:.2f}%")

Predictions: [[0, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 0, 1, 1, 1], [0, 0, 1, 1, 0], [0, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1], [0, 1, 1, 1, 0, 1], [0, 1, 1, 1, 1], [0, 1, 1, 1], [0, 1, 1, 1, 1], [0, 1, 1, 0, 1, 1, 0], [0, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 0, 1], [0, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 0, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 0, 1, 1, 1, 1, 1], [0, 1, 1, 1, 0, 1], [0, 1, 1, 1, 0, 1], [0, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1], [0, 1, 1, 1, 1, 0, 1], [0, 1, 1, 1, 0, 1, 0, 1, 1, 1], [0, 1, 1, 1, 0, 1], [0, 1, 1, 1, 0, 1], [0, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1]]
Predictions: [[0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 1, 0, 1, 1, 0, 1], [0, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 0, 1], [1, 0, 1, 1, 1, 1, 1, 1, 0, 1], [0, 1, 1, 1, 0, 1], [0, 1, 1, 1, 1, 0, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1

In [69]:
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, sequences, char_to_idx):
        self.sequences = [[char_to_idx.get(char, char_to_idx['<UNK>']) for char in seq] for seq in sequences]
        
    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        return torch.tensor(sequence, dtype=torch.long)

In [70]:
def collate_fn_unlabeled(batch):
    if not batch:
        raise ValueError("Received an empty batch.")

    # Ensure all items are valid 1-dimensional tensors
    valid_sequences = [seq for seq in batch if len(seq.shape) == 1]
    if not valid_sequences:
        raise ValueError("All sequences in the batch are invalid or empty.")

    max_len = max(len(seq) for seq in valid_sequences)

    # Pad sequences to the maximum length
    padded_sequences = [
        torch.cat([seq, torch.zeros(max_len - len(seq), dtype=torch.long)], dim=0)
        for seq in valid_sequences
    ]

    return torch.stack(padded_sequences)


In [71]:
def predict_unlabeled(model, data_loader):
    model.eval()  # Set model to evaluation mode
    predictions = []

    with torch.no_grad():  # Disable gradient calculations
        for sequences, mask in data_loader:
            sequences, mask = sequences.to(device), mask.to(device)
            # Get the most likely tag sequence using the CRF decoder
            pred_tags = model(sequences, mask=mask)
            predictions.extend(pred_tags)

    return predictions


In [72]:
test_dataset = TestDataset(test_words, char_to_idx)
# print(test_dataset[40])
# print(char_to_idx)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn_unlabeled)

model = BiLSTMCRF(vocab_size, embedding_dim, hidden_dim, tagset_size)
model.load_state_dict(torch.load(MODEL_PATH))
model.to(device)
model.eval()

# print(vocab_size)
# for seq in test_loader:
#     print(seq)
#     print(torch.max(seq))
results = []
with torch.no_grad():
    for sequences in tqdm(test_loader, desc="Running Inference"):
        sequences = sequences.to(device)
        mask = (sequences != 0).float()  # Mask for non-padded positions
        predictions = model(sequences, mask=mask)
        results.extend(predictions)

def decode_tag(words, tags):
    decoded = []
    for word, tag in zip(words, tags):
        decoded_word = []
        for idx, (w, t) in enumerate(zip(word, tag)):
            if t == 0 and idx != 0:
                decoded_word.append(' ')
            decoded_word.append(w)
        decoded.append(''.join(decoded_word))
    return decoded

decoded = decode_tag(test_words, results)

Running Inference: 100%|██████████| 4/4 [00:00<00:00, 25.54it/s]


In [73]:
# write the decoded words to a file with name pred_tar.test.tgt
with open('pred_shp.test.tgt', 'w') as file:
    for word in decoded:
        file.write(word + '\n')