In [245]:
import re
import math
import conllu
import random
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [246]:
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
device

device(type='cpu')

In [247]:
word_to_index = {}
tag_to_index = {}
max_sentence_length = 0
word_count = {}
word_to_index['<PAD>'] = 0
tag_to_index['<UNK>'] = 0

Got all senteces with padding and each sentence with pos tags

In [248]:
# This will provide each sentence with paddings <unk>
def process_dataset(dataset_file, p=2, s=3):
    sentences_list = []
    pos_list = []

    with open(dataset_file, 'r', encoding='utf-8') as f:
        sentence_tokens = []
        pos_tags = []

        for line in f:
            line = line.strip()

            if line.startswith('#'):
                sentence_tokens = []
                pos_tags = []
                continue
            elif line == '':
                # Append padding to the end of the sentence
                padded_sentence = ' '.join(['<PAD>'] * p) + ' ' + ' '.join(sentence_tokens) + ' ' + ' '.join(['<PAD>'] * s)
                padded_pos = ' '.join(['<UNK>'] * p + pos_tags + ['<UNK>'] * s)
                sentences_list.append(padded_sentence)
                pos_list.append(padded_pos)
                continue
            else:
                # New sentence begins
                token_attrs = line.split('\t')
                word_form = token_attrs[1]  # Word form of the token
                pos_tag = token_attrs[3]    # POS tag of the token
                sentence_tokens.append(word_form)
                pos_tags.append(pos_tag)

    return sentences_list, pos_list

In [249]:
def get_indices(sentences_list, pos_list, word_to_index, tag_to_index, max_sentence_length, word_count):
    # Process each sentence to tokenize the data
    for sentence_str, tag_str in zip(sentences_list, pos_list):
        # Tokenize the sentence into individual tokens
        tokens = sentence_str.split(' ')
        tags = tag_str.split(' ')
            # Word to index
        for word, tag in zip(tokens, tags):
            if word not in word_to_index:
                word_to_index[word] = len(word_to_index)
                word_count[word] = word_count.get(word, 0) + 1
        # Tag to index
            if tag not in tag_to_index:
                tag_to_index[tag] = len(tag_to_index)
        max_sentence_length = max(max_sentence_length, len(tokens))
    return word_to_index, tag_to_index, max_sentence_length, word_count

In [250]:
train_dataset = "./conllu/train.conllu"
test_dataset = "./conllu/test.conllu"
val_dataset = "./conllu/val.conllu"
train_sentece_list, train_pos_list = process_dataset(train_dataset, p= 2, s= 3)
test_sentece_list, test_pos_list = process_dataset(test_dataset, p= 2, s= 3)
val_sentece_list, val_pos_list = process_dataset(val_dataset, p= 2, s= 3)


sentences_list = train_sentece_list + test_sentece_list + val_sentece_list
pos_list = train_pos_list + test_pos_list + val_pos_list

# Split the data into sentences
word_to_index = {'<UNK>': 0}
tag_to_index = {'<UNK>': 0}
word_count = {'<UNK>': 1}
max_sentence_length = 0
# get all indices
word_to_index, tag_to_index, max_sentence_length, word_count = get_indices(sentences_list, pos_list, word_to_index, tag_to_index, max_sentence_length, word_count)

## Function for calculating embedding for all sentences 

In [251]:
def PrepareEmbedding(sentence_dataset, pos_dataset, word_to_index, tag_to_index, max_sentence_length, word_count):
    token_embeddings = []
    labels_embedding = []
    for sentence_str, tag_str in zip(sentence_dataset, pos_dataset):
        # Tokenize the sentence into individual tokens
        tokens = sentence_str.split(' ')
        tags = tag_str.split(' ')
        one_sentence_token_embedding = []
        one_sentence_pos_embedding = []
        # Word to index
        for word, tag in zip(tokens, tags):
            if word in word_to_index:
                if word_count[word] < 2:
                    word_cur_idx = word_to_index['<UNK>']
                else:
                    word_cur_idx = word_to_index[word]
            else:
                word_cur_idx = word_to_index['<UNK>']
            # Tag to index
            if tag in tag_to_index:
                tag_cur_idx = tag_to_index[tag]
            else:
                tag_cur_idx = tag_to_index['<UNK>']
            one_sentence_token_embedding.append(word_cur_idx)
            one_sentence_pos_embedding.append(tag_cur_idx)
        # Pad sequences using PyTorch's pad_sequence function
        one_sentence_token_embedding.extend([0] * (max_sentence_length - len(one_sentence_token_embedding)))
        one_sentence_pos_embedding.extend([0] * (max_sentence_length - len(one_sentence_pos_embedding)))
        token_embeddings.append(one_sentence_token_embedding)
        labels_embedding.append(one_sentence_pos_embedding)
    return token_embeddings, labels_embedding

In [286]:
train_sentence_embeddings, train_pos_embeddings = PrepareEmbedding(train_sentece_list, train_pos_list, word_to_index, tag_to_index, max_sentence_length, word_count)
test_sentence_embeddings, test_pos_embeddings = PrepareEmbedding(test_sentece_list, test_pos_list, word_to_index, tag_to_index, max_sentence_length, word_count)
val_sentence_embeddings, val_pos_embeddings = PrepareEmbedding(val_sentece_list, val_pos_list, word_to_index, tag_to_index, max_sentence_length, word_count)
# It will save a lot of time

### Feed Forward netword

In [287]:
import numpy as np
# Step 1: Define the Model
class FFNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, p, s):
        super(FFNN, self).__init__()
        # Calculate the actual input size considering embedding dimensions
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear((p + s + 1) *embedding_dim , hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Flatten the input tensor
        first = self.embedding(x)
        first = first.view(-1)
        out = self.fc1(first)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [288]:
# Define the number of steps after which to print the loss and accuracy
print_interval = 10

# Step 2: Define Loss Function
criterion = nn.CrossEntropyLoss()

# Step 3: Instantiate Model
vocab_size = len(word_to_index)
embedding_dim = 100  # Example dimension, adjust as needed
hidden_size = 128    # Example size, adjust as needed
output_size = len(tag_to_index)
p = 2
s = 3
model = FFNN(vocab_size, embedding_dim, hidden_size, output_size, p, s)
optimizer = optim.Adam(model.parameters(), lr=0.001) # Example optimizer, adjust as needed

def train_model(model, criterion, optimizer, train_embeddings, train_pos_embeddings, print_interval=10):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    # Iterate over the training dataset
    for token_indices, pos_indices in zip(train_embeddings, train_pos_embeddings):
        # Create sliding window of size 6 and convert to tensors
        for i in range(p, len(token_indices) - s):
            window_tokens = token_indices[i-p:i+s+1]    
            window_tokens_tensor = torch.LongTensor(window_tokens)
            pos_tag = pos_indices[i]
            # creating one hot encoding for the pos tag
            # length should be the number of tags
            pos_tag_tensor = torch.zeros(len(tag_to_index))
            pos_tag_tensor[pos_tag] = 1
            optimizer.zero_grad()
            outputs = model(window_tokens_tensor)  # Forward pass
            # Calculate loss
            loss = criterion(outputs, pos_tag_tensor)  # Compare outputs with true labels
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

    return running_loss / len(train_embeddings)

def evaluate_model(model, criterion, val_embeddings, val_pos_embeddings):
    model.eval()  # Set the model to evaluation mode
    running_loss = 0.0

    # Iterate over the validation dataset
    with torch.no_grad():
        for token_indices, pos_indices in zip(val_embeddings, val_pos_embeddings):
            # Create sliding window of size 6 and convert to tensors
            for i in range(p, len(token_indices) - s):
                window_tokens = token_indices[i-p:i+s+1]    
                window_tokens_tensor = torch.LongTensor(window_tokens)
                pos_tag = pos_indices[i]
                # creating one hot encoding for the pos tag
                # length should be the number of tags
                pos_tag_tensor = torch.zeros(len(tag_to_index))
                pos_tag_tensor[pos_tag] = 1
                outputs = model(window_tokens_tensor)  # Forward pass
                # Calculate loss
                loss = criterion(outputs, pos_tag_tensor)  # Compare outputs with true labels
                running_loss += loss.item()

    return running_loss / len(val_embeddings)

def test_model(model, criterion, test_embeddings, test_pos_embeddings):
    model.eval()  # Set the model to evaluation mode
    running_loss = 0.0

    # Iterate over the test dataset
    with torch.no_grad():
        for token_indices, pos_indices in zip(test_embeddings, test_pos_embeddings):
            # Create sliding window of size 6 and convert to tensors
            for i in range(p, len(token_indices) - s):
                window_tokens = token_indices[i-p:i+s+1]    
                window_tokens_tensor = torch.LongTensor(window_tokens)
                pos_tag = pos_indices[i]
                # creating one hot encoding for the pos tag
                # length should be the number of tags
                pos_tag_tensor = torch.zeros(len(tag_to_index))
                pos_tag_tensor[pos_tag] = 1
                outputs = model(window_tokens_tensor)  # Forward pass
                # Calculate loss
                loss = criterion(outputs, pos_tag_tensor)  # Compare outputs with true labels
                running_loss += loss.item()

    return running_loss / len(test_embeddings)

# Number of epochs
num_epochs = 10
for epoch in range(num_epochs):
    # Training phase
    train_loss = train_model(model, criterion, optimizer, train_sentence_embeddings, train_pos_embeddings, print_interval)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}")

    # Validation phase
    val_loss = evaluate_model(model, criterion, val_sentence_embeddings, val_pos_embeddings)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Validation Loss: {val_loss:.4f}")

# Testing phase
test_loss = test_model(model, criterion, test_sentence_embeddings, test_pos_embeddings)
print(f"Test Loss: {test_loss:.4f}")


In [222]:
# Define the number of steps after which to print the loss and accuracy
print_interval = 10
running_loss = 0.0
running_accuracy = 0.0
# Step 2: Define Loss Function
criterion = nn.CrossEntropyLoss()

# Step 3: Instantiate Model
vocab_size = len(word_to_index)
embedding_dim = 100  # Example dimension, adjust as needed
hidden_size = 128    # Example size, adjust as needed
output_size = len(tag_to_index)
p = 2
s = 3
model = FFNN(vocab_size, embedding_dim, hidden_size, output_size, p, s)
optimizer = optim.Adam(model.parameters(), lr=0.001) # Example optimizer, adjust as needed
num_epochs = 10 
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    # Iterate over your dataset
    for sentence, pos_tags in zip(sentences_list, pos_list):
        # Convert tokens and POS tags to indices
        token_indices = [word_to_index[token] for token in sentence.strip().split()]
        pos_indices = [tag_to_index[pos_tag] for pos_tag in pos_tags.strip().split()]
        
        # Create sliding window of size 6 and convert to tensors
        for i in range(p, len(token_indices) - s):
            window_tokens = token_indices[i-p:i+s+1]    
            window_tokens_tensor = torch.LongTensor(window_tokens)
            pos_tag = pos_indices[i]
            # creating on e hot encodeing for the pos tag
            # length should be the number of tags
            pos_tag_tensor = torch.zeros(len(tag_to_index))
            pos_tag_tensor[pos_tag] = 1
            optimizer.zero_grad()
            outputs = model(window_tokens_tensor)  # Forward pass
            predicted = outputs.argmax() # Get the index of the max logit as the predicted class
            # Calculate loss
            loss = criterion(outputs, pos_tag_tensor)  # Compare outputs with true labels
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Print loss and accuracy every `print_interval` steps
        if i % print_interval == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(sentences_list)}], Loss: {running_loss / print_interval:.4f}')
            running_loss = 0.0
          

Epoch [1/10], Step [21/4273], Loss: 10.9461
Epoch [1/10], Step [11/4273], Loss: 79.7393
Epoch [1/10], Step [21/4273], Loss: 8.2242
Epoch [1/10], Step [11/4273], Loss: 9.8153
Epoch [1/10], Step [21/4273], Loss: 2.9105
Epoch [1/10], Step [21/4273], Loss: 2.1472
Epoch [1/10], Step [11/4273], Loss: 21.5849
Epoch [1/10], Step [11/4273], Loss: 5.2639
Epoch [1/10], Step [11/4273], Loss: 1.8262
Epoch [1/10], Step [11/4273], Loss: 16.7029
Epoch [1/10], Step [11/4273], Loss: 3.7315
Epoch [1/10], Step [11/4273], Loss: 0.5785
Epoch [1/10], Step [11/4273], Loss: 0.0211
Epoch [1/10], Step [11/4273], Loss: 0.9139
Epoch [1/10], Step [11/4273], Loss: 8.1671
Epoch [1/10], Step [11/4273], Loss: 0.0583
Epoch [1/10], Step [11/4273], Loss: 6.8401
Epoch [1/10], Step [11/4273], Loss: 6.9535
Epoch [1/10], Step [11/4273], Loss: 4.6516
Epoch [1/10], Step [11/4273], Loss: 0.0744
Epoch [1/10], Step [11/4273], Loss: 0.6762
Epoch [1/10], Step [11/4273], Loss: 0.1874
Epoch [1/10], Step [31/4273], Loss: 9.0454
Epoch [

KeyboardInterrupt: 

Data prepare

In [16]:
X_train, y_train = process_data(train_data, word_to_index, tag_to_index, max_sentence_length, word_count)
X_val, y_val = process_data(val_data, word_to_index, tag_to_index, max_sentence_length, word_count)
X_test, y_test = process_data(test_data, word_to_index, tag_to_index, max_sentence_length, word_count)




# Step 2: Create PyTorch DataLoaders

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

#### Create PyTorch datasets and data loaders for training data. This allows efficient batching and shuffling of data.

In [67]:
train_dataset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define Model

X_train.shape[1] is used to specify the input size of the neural network model.     X_train.shape[1] accesses the second element of the shape tuple, which represents the number of features or input dimensions in your dataset. In the example (1000, 50), X_train.shape[1] would be 50.

Sending X_train.shape[1] as the input_size parameter to the FFNN constructor ensures that the input layer of your neural network has the correct number of neurons to accommodate the input features of your dataset.

In [87]:
# Step 3: Define Loss Function and Optimizer
model = FFNN(input_size=X_train.shape[1], hidden_size=128, output_size=y_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Step 4: Training Loop

In [78]:
num_epochs = 20
# for training 
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")


Epoch [1/20], Loss: 1942.5025033523787
Epoch [2/20], Loss: 1406.451219245569
Epoch [3/20], Loss: 1080.4993978471898
Epoch [4/20], Loss: 874.9364979302705
Epoch [5/20], Loss: 744.392890588561
Epoch [6/20], Loss: 654.735463156629
Epoch [7/20], Loss: 589.1151683294951
Epoch [8/20], Loss: 538.1906392111707
Epoch [9/20], Loss: 498.44278773976794
Epoch [10/20], Loss: 465.6859440590019
Epoch [11/20], Loss: 438.57284591447063
Epoch [12/20], Loss: 415.38864682325675
Epoch [13/20], Loss: 395.12138571668027
Epoch [14/20], Loss: 377.8649383089436
Epoch [15/20], Loss: 362.8608712723006
Epoch [16/20], Loss: 348.303059819919
Epoch [17/20], Loss: 335.64912983908584
Epoch [18/20], Loss: 324.2286862045971
Epoch [19/20], Loss: 313.86049435743644
Epoch [20/20], Loss: 303.9825999701201


In [None]:
for epoch in range(1000):  # Adjust number of epochs as needed
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target.float())
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [96]:
index_to_tag = {v: k for k, v in tag_to_index.items()}

In [105]:
# Example input sentence
input_sentence = "Mary had a little lamb"
# Tokenize the sentence and convert it to a sequence of word indices
words = sentence.split()
input_sequence = []
for word in words:
    if word in word_to_index:
        if word_count[word] < 2:
            input_sequence.append(word_to_index['<UNK>'])
        else:
            input_sequence.append(word_to_index[word])
    else:
        input_sequence.append(word_to_index['<UNK>'])

# Tokenize the input sentence
# input_sequence = input_sentence.split()
# Pad the sequence with zeros to make it the same length as max_sen_len
if len(input_sequence) < max_sentence_length:
    input_sequence += [0] * (max_sentence_length - len(input_sequence))

# Convert tokens to indices using the vocabulary
input_indices = [word_to_index.get(token, word_to_index['<UNK>']) for token in input_sequence]
# Pad the input sequence if necessary
input_sequence = np.array(input_sequence).reshape(1, max_sentence_length)
print(input_sequence)
# Convert input indices to PyTorch tensor
# input_tensor = torch.tensor(input_sequence)
# Ensure input tensor has the correct shape and type if needed

# Pass the input through the model
with torch.no_grad():
    model.eval()  # Set model to evaluation mode
    outputs = model(input_sequence)

# Map output predictions to POS tags
predicted_tags = [index_to_tag[torch.argmax(output).item()] for output in outputs]

print("Input sentence:", input_sentence)
print("Predicted tags:", predicted_tags)


[[1 1 7 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0]]


AttributeError: 'numpy.ndarray' object has no attribute 'to'

# Question2 : 2 Recurrent Neural Network POS Tagging
Design and implement a model which uses Recurrent Neural Networks (Vanilla
RNN, LSTM, or GRU) for POS Tagging. The model should take the embeddings for all tokens in a sentence and output the corresponding POS tags in
sequence.
- For Example: In the sentence "An apple a day keeps the doctor away",
 the model takes the embeddings for 
- ["An", "apple", "a", "day", "keeps", "the","doctor", "away"] and
 outputs the POS tags for all the words in the sentence
- ["DET", "NOUN", "DET", "NOUN", "VERB", "DET", "NOUN", "ADV"] 

Step1 : Count all words and postags and provide them a index value

- Doing it for all 3 datasets train, validation and test-dataset

In [151]:
with open("./conllu/train.conllu", "r", encoding="utf-8") as f:
    train_data = f.read()
with open("./conllu/val.conllu", "r", encoding="utf-8") as f:
    val_data = f.read()
with open("./conllu/test.conllu", "r", encoding="utf-8") as f:
    test_data = f.read()

In [152]:
def process_dataset(dataset_file, p=2, s=3):
    sentences_list = []
    pos_list = []

    with open(dataset_file, 'r') as f:
        sentence_tokens = []
        pos_tags = []

        for line in f:
            line = line.strip()

            if line.startswith('#'):
                sentence_tokens = []
                pos_tags = []
                continue
            elif line == '':
                # Append padding to the end of the sentence
                padded_sentence = ' '.join(['<PAD>'] * p) + ' ' + ' '.join(sentence_tokens) + ' ' + ' '.join(['<PAD>'] * s)
                padded_pos = ' '.join(['<UNK>'] * p + pos_tags + ['<UNK>'] * s)
                sentences_list.append(padded_sentence)
                pos_list.append(padded_pos)
                continue
            else:
                # New sentence begins
                token_attrs = line.split('\t')
                word_form = token_attrs[1]  # Word form of the token
                pos_tag = token_attrs[3]    # POS tag of the token
                sentence_tokens.append(word_form)
                pos_tags.append(pos_tag)

    return sentences_list, pos_list



### for all train, test and validation dataset
- I got all sentences tokens and respective pos-tag in form of sentence sepateted by space 
- Max sentece length, word count , word to index adn tag t index 

In [163]:
# Split the data into sentences
word_to_index = {'<PAD>': 0, '<UNK>': 1}
tag_to_index = {'<PAD>': 0, '<UNK>': 1}
word_count = {'<PAD>': 1, '<UNK>': 1}
max_sentence_length = 0
train_dataset = "./conllu/train.conllu"
test_dataset = "./conllu/test.conllu"
val_dataset = "./conllu/val.conllu"
train_sentece_list, train_pos_list = process_dataset(train_dataset, p= 2, s= 3)
test_sentece_list, test_pos_list = process_dataset(test_dataset, p= 2, s= 3)
val_sentece_list, val_pos_list = process_dataset(val_dataset, p= 2, s= 3)

sentences_list = train_sentece_list + test_sentece_list + val_sentece_list
pos_list = train_pos_list + test_pos_list + val_pos_list

# Process each sentence to tokenize the data
for sentence_str, tag_str in zip(sentences_list, pos_list):
    # Tokenize the sentence into individual tokens
    tokens = sentence_str.split(' ')
    tags = tag_str.split(' ')
        # Word to index
    for word, tag in zip(tokens, tags):
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)
            word_count[word] = word_count.get(word, 0) + 1
    # Tag to index
        if tag not in tag_to_index:
            tag_to_index[tag] = len(tag_to_index)
    max_sentence_length = max(max_sentence_length, len(tokens))

    

In [164]:
with open("out.txt", "w") as f:
    for word in tag_to_index:
        f.write(f"{word}\n")    

# PAdding and provide tag <unk> to those words wshich comes once

- train_sentece_list, train_pos_list  
- test_sentece_list, test_pos_list 
- val_sentece_list, val_pos_list  

### GET EMBEDDINGS

- Change to embeddings

In [173]:
def PrepareEmbedding(sentence_dataset, pos_dataset, word_to_index, tag_to_index, max_sentence_length, word_count):
    token_embeddings = []
    labels_embedding = []
    for sentence_str, tag_str in zip(sentence_dataset, pos_dataset):
        # Tokenize the sentence into individual tokens
        tokens = sentence_str.split(' ')
        tags = tag_str.split(' ')
        one_sentence_token_embedding = []
        one_sentence_pos_embedding = []
        # Word to index
        for word, tag in zip(tokens, tags):
            if word in word_to_index:
                if word_count[word] < 2:
                    word_cur_idx = word_to_index['<UNK>']
                else:
                    word_cur_idx = word_to_index[word]
            else:
                word_cur_idx = word_to_index['<UNK>']
            # Tag to index
            if tag in tag_to_index:
                tag_cur_idx = tag_to_index[tag]
            else:
                tag_cur_idx = tag_to_index['<UNK>']
            one_sentence_token_embedding.append(word_cur_idx)
            one_sentence_pos_embedding.append(tag_cur_idx)
        # Pad sequences using PyTorch's pad_sequence function
        one_sentence_token_embedding.extend([0] * (max_sentence_length - len(one_sentence_token_embedding)))
        one_sentence_pos_embedding.extend([0] * (max_sentence_length - len(one_sentence_pos_embedding)))
        token_embeddings.append(one_sentence_token_embedding)
        labels_embedding.append(one_sentence_pos_embedding)
    return token_embeddings, labels_embedding
            
    
    

In [177]:
train_sentece_embeddings, train_pos_embeddings = PrepareEmbedding(train_sentece_list, train_pos_list, word_to_index, tag_to_index, max_sentence_length, word_count)
test_sentece_embeddings, test_pos_embeddings = PrepareEmbedding(test_sentece_list, test_pos_list, word_to_index, tag_to_index, max_sentence_length, word_count)
val_sentece_embeddings, val_pos_embeddings = PrepareEmbedding(val_sentece_list, val_pos_list, word_to_index, tag_to_index, max_sentence_length, word_count)


# index to tag dictionary will be used for predicting

In [179]:
index_to_tag = {v: k for k, v in tag_to_index.items()}
index_to_tag

{0: '<PAD>',
 1: '<UNK>',
 2: 'PRON',
 3: 'AUX',
 4: 'DET',
 5: 'NOUN',
 6: 'ADP',
 7: 'PROPN',
 8: 'VERB',
 9: 'NUM',
 10: 'ADJ',
 11: 'CCONJ',
 12: 'ADV',
 13: 'PART',
 14: 'INTJ',
 15: 'SYM'}

In [176]:
with open("out.txt", "w") as f:
    for word in train_sentece_embeddings:
        f.write(f"{word}\n")
        break

51


# Lstm for training as well as validting and testing 

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, input_sentence):
        embeds = self.word_embeddings(input_sentence)
        lstm_out, _ = self.lstm(embeds.view(len(input_sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(input_sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
