# 2 Tagger

## 2.1 LSTM Tagger

### 1. Data Loading and Preprocessing

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Function to read the data file
def read_data(file_path):
    """Return every line of the UTF-8 text file at ``file_path``.

    Lines are returned in file order and keep their trailing newline.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return list(handle)

# Function to extract sentences and tags
def extract_sentences_tags(lines):
    """Group token lines into sentences and their tag sequences.

    Each non-blank line is split on whitespace; its first field is taken as
    the word and its last field as the tag. A blank (or whitespace-only) line
    ends the current sentence.

    Args:
        lines: Iterable of text lines, with or without trailing newlines.

    Returns:
        A ``(sentences, tags)`` tuple of parallel lists; ``sentences[i]`` is
        the list of words of sentence ``i`` and ``tags[i]`` the matching tags.
    """
    sentences, tags = [], []
    sentence, tag = [], []
    for line in lines:
        fields = line.split()
        if not fields:
            # Separator line. Only flush a non-empty sentence so that runs of
            # blank lines do not produce zero-length sequences.
            if sentence:
                sentences.append(sentence)
                tags.append(tag)
                sentence, tag = [], []
            continue
        sentence.append(fields[0])
        tag.append(fields[-1])
    # The input may not end with a blank line; keep the trailing sentence.
    if sentence:
        sentences.append(sentence)
        tags.append(tag)
    return sentences, tags

# Read each split and immediately turn it into parallel sentence/tag lists.

# Training split
train_lines = read_data('/mnt/data/train.txt')
train_sentences, train_tags = extract_sentences_tags(train_lines)

# Validation split
valid_lines = read_data('/mnt/data/valid.txt')
valid_sentences, valid_tags = extract_sentences_tags(valid_lines)

# Test split
test_lines = read_data('/mnt/data/test.txt')
test_sentences, test_tags = extract_sentences_tags(test_lines)

# Create word and tag vocabularies.
# Every distinct training word gets its own index, in first-seen order.
# '<PAD>' (index 0) and '<UNK>' (index 1) are reserved up front so padding and
# out-of-vocabulary lookups have an index to use.
# dict.fromkeys de-duplicates while preserving insertion order.
unique_words = dict.fromkeys(
    ['<PAD>', '<UNK>'] + [word for sentence in train_sentences for word in sentence]
)
word_to_ix = {word: i for i, word in enumerate(unique_words)}
# Sort the tag set so the tag -> index mapping is the same on every run
# (plain set iteration order is not stable across interpreter sessions).
tag_to_ix = {
    tag: i
    for i, tag in enumerate(sorted({tag for tag_list in train_tags for tag in tag_list}))
}

# Function to convert sentences and tags to indices
def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a 1-D ``torch.long`` index tensor.

    Args:
        seq: Iterable of tokens (words or tags).
        to_ix: Mapping from token to integer index.

    Returns:
        A ``torch.long`` tensor holding one index per token of ``seq``.

    Raises:
        KeyError: If a token is missing from ``to_ix`` and ``to_ix`` has no
            ``'<UNK>'`` entry to fall back on.
    """
    unk_ix = to_ix.get('<UNK>')
    if unk_ix is None:
        # No unknown-token entry: keep strict lookups so a missing token is
        # reported rather than silently mapped.
        idxs = [to_ix[w] for w in seq]
    else:
        # Tokens unseen when the mapping was built share the '<UNK>' index.
        idxs = [to_ix.get(w, unk_ix) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# Pad the sequences
# Make sure the vocabulary has a '<PAD>' entry before looking it up; when it is
# missing, register it with an index no existing word uses.
if '<PAD>' not in word_to_ix:
    word_to_ix['<PAD>'] = max(word_to_ix.values(), default=-1) + 1
pad_token = word_to_ix['<PAD>']
# pad_sequence is called with its default batch_first=False, so each result is
# laid out as (longest_sentence_length, number_of_sentences).
padded_train_sentences = pad_sequence([prepare_sequence(s, word_to_ix) for s in train_sentences], padding_value=pad_token)
# Tag padding reuses the index of the 'O' tag, so padded positions are
# indistinguishable from real 'O' labels.
padded_train_tags = pad_sequence([prepare_sequence(t, tag_to_ix) for t in train_tags], padding_value=tag_to_ix['O'])

# padded_train_sentences and padded_train_tags now hold the padded training batch


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Pair every training sentence with its tag sequence. The training loop below
# unpacks train_data as (sentence, tags) pairs and converts each side to
# indices with prepare_sequence.
train_data = list(zip(train_sentences, train_tags))
# word_to_ix and tag_to_ix are the vocabularies built in the preprocessing
# cell above; they are reused here as-is and must not be reassigned.

class LSTMTagger(nn.Module):
    """Embedding -> LSTM -> linear tagger that scores every token of a sentence.

    Args:
        embedding_dim: Size of each word embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return per-token log-probabilities over the tag set.

        Args:
            sentence: Index tensor for a single sentence; it is reshaped to
                ``(len(sentence), 1, -1)`` so the LSTM sees a batch of one.

        Returns:
            A tensor with ``len(sentence)`` rows, one per token, each row
            holding log-probabilities over the tags (log-softmax on dim 1).
        """
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        # Use nn.functional directly: the file imports torch.nn as nn but
        # never binds the conventional alias F.
        tag_scores = nn.functional.log_softmax(tag_space, dim=1)
        return tag_scores

# Model instantiation
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
# LSTMTagger.forward already applies log_softmax, so the matching criterion is
# NLLLoss, which consumes log-probabilities directly. CrossEntropyLoss would
# apply a second, redundant log-softmax to them.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Training loop: one optimizer step per (sentence, tags) pair, i.e. no batching.
for epoch in range(300):  # 300 full passes over train_data
    for sentence, tags in train_data:
        # Clear gradients left over from the previous step; backward()
        # accumulates into .grad rather than overwriting it.
        model.zero_grad()

        # Convert the raw tokens and tags of this example to index tensors.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Forward pass: one row of tag scores per token of the sentence.
        tag_scores = model(sentence_in)

        # Compare the scores with the gold tags, backpropagate, and update
        # the parameters.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
