In [10]:
import re
import math
import conllu
import random
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [11]:
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
device

device(type='cpu')

In [12]:
train_data = open("./conllu/train.conllu", mode="r", encoding="utf-8")
annotated_train_data = train_data.read()
sentences = conllu.parse(annotated_train_data)

Got all senteces with padding and each sentence with pos tags

In [13]:
word_to_index = {}
tag_to_index = {}
max_sentence_length = 0
word_count = {}
word_to_index['<PAD>'] = 0
tag_to_index['<UNK>'] = 0

def process_dataset(dataset_file, p=2, s=3):
    sentences_list = []
    pos_list = []

    with open(dataset_file, 'r', encoding='utf-8') as f:
        sentence_tokens = []
        pos_tags = []

        for line in f:
            line = line.strip()

            if line.startswith('#'):
                sentence_tokens = []
                pos_tags = []
                continue
            elif line == '':
                # Append padding to the end of the sentence
                padded_sentence = ' '.join(['<PAD>'] * p) + ' ' + ' '.join(sentence_tokens) + ' ' + ' '.join(['<PAD>'] * s)
                padded_pos = ' '.join(['<UNK>'] * p + pos_tags + ['<UNK>'] * s)
                sentences_list.append(padded_sentence)
                pos_list.append(padded_pos)
                continue
            else:
                # New sentence begins
                token_attrs = line.split('\t')
                word_form = token_attrs[1]  # Word form of the token
                pos_tag = token_attrs[3]    # POS tag of the token
                sentence_tokens.append(word_form)
                pos_tags.append(pos_tag)

    return sentences_list, pos_list

# Example usage:
dataset_file = './conllu/train.conllu'
sentences_list, pos_list = process_dataset(dataset_file, p= 2, s= 3)



with open("out.txt", "w") as f:
    for word in sentences_list:
        f.write(f"{word}\n")

In [14]:
# Split the data into sentences
word_to_index = {'<PAD>': 0, '<UNK>': 1}
tag_to_index = {'<UNK>': 0}
word_count = {}
max_sentence_length = 0

# Process each sentence to tokenize the data
for sentence_str, tag_str in zip(sentences_list, pos_list):
    # Tokenize the sentence into individual tokens
    tokens = sentence_str.split(' ')
    tags = tag_str.split(' ')
        # Word to index
    for word, tag in zip(tokens, tags):
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)
            word_count[word] = word_count.get(word, 0) + 1
    # Tag to index
        if tag not in tag_to_index:
            tag_to_index[tag] = len(tag_to_index)

In [8]:
with open("out.txt", "w") as f:
    for word, a in word_to_index.items():
        f.write(f"{word} {a}\n")

In [113]:
import numpy as np
# Step 1: Define the Model
class FFNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, p, s):
        super(FFNN, self).__init__()
        # Calculate the actual input size considering embedding dimensions
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear((p + s + 1) *embedding_dim , hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Flatten the input tensor
        first = self.embedding(x)
        first = first.view(-1)
        out = self.fc1(first)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [58]:
sen = [sentences_list[0]]
pos = [pos_list[0]]
sen, pos

(['<PAD> <PAD> what is the cost of a round trip flight from pittsburgh to atlanta beginning on april twenty fifth and returning on may sixth <PAD> <PAD> <PAD>'],
 ['<UNK> <UNK> PRON AUX DET NOUN ADP DET NOUN NOUN NOUN ADP PROPN ADP PROPN VERB ADP NOUN NUM ADJ CCONJ VERB ADP NOUN ADJ <UNK> <UNK> <UNK>'])

In [130]:
criterion = nn.CrossEntropyLoss()  # Define the loss function

# Step 3: Instantiate Model
vocab_size = len(word_to_index)
embedding_dim = 100  # Example dimension, adjust as needed
hidden_size = 128    # Example size, adjust as needed
p = 2
s = 3
output_size = (s + p + 1)
output_size = len(tag_to_index)
model = FFNN(vocab_size, embedding_dim, hidden_size, output_size, p, s)
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Example optimizer, adjust as needed
num_epochs = 1

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    # Iterate over your dataset
    for sentence, pos_tags in zip(sen, pos):  # Consider your data structures here
        # Convert tokens and POS tags to indices
        token_indices = [word_to_index[token] for token in sentence.strip().split()]
        pos_indices = [tag_to_index[pos_tag] for pos_tag in pos_tags.strip().split()]
        
        # Create sliding window of size 6 and convert to tensors
        for i in range(p, len(token_indices) - s):
            window_tokens = token_indices[i-p:i+s+1]    
            window_tokens_tensor = torch.LongTensor(window_tokens)
            window_pos_tensor = torch.LongTensor(pos_indices[i-p:i+s+1])
            optimizer.zero_grad()
            outputs = model(window_tokens_tensor)  # Forward pass
            predicted = outputs.argmax() # Get the index of the max logit as the predicted class
            print(predicted)
            print("window_tokens_tensor", window_tokens_tensor)
            print("window_pos_tensor", window_pos_tensor)
            print("predicted", predicted)
            print("outputs", outputs)

            # Calculate loss
            loss = criterion(predicted, window_pos_tensor)  # Compare outputs with true labels
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            print(i, loss.item())  # Print loss for monitoring purposes


tensor(2)
window_tokens_tensor tensor([0, 0, 2, 3, 4, 5])
window_pos_tensor tensor([0, 0, 1, 2, 3, 4])
predicted tensor(2)
outputs tensor([-0.2841, -0.2653,  0.4636, -0.2601,  0.0806, -0.0814,  0.4453, -0.0682,
         0.2261,  0.0062,  0.3250, -0.0343, -0.0174,  0.1120],
       grad_fn=<AddBackward0>)


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [78]:

# Step 2: Define Loss Function
criterion = nn.CrossEntropyLoss()

# Step 3: Instantiate Model
vocab_size = len(word_to_index)
embedding_dim = 100  # Example dimension, adjust as needed
hidden_size = 128    # Example size, adjust as needed
output_size = len(tag_to_index)
p = 2
s = 3
model = FFNN(vocab_size, embedding_dim, hidden_size, output_size, p, s)
optimizer = optim.Adam(model.parameters(), lr=0.001) # Example optimizer, adjust as needed
num_epochs = 10 
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    # Iterate over your dataset
    for sentence, pos_tags in zip(sentences_list, pos_list):
        # Convert tokens and POS tags to indices
        token_indices = [word_to_index[token] for token in sentence.strip().split()]
        pos_indices = [tag_to_index[pos_tag] for pos_tag in pos_tags.strip().split()]
        # Create sliding window of size 6 and convert to tensors
        # print(token_indices, pos_indices)
        for i in range(p, len(token_indices) - s):
            window_tokens = token_indices[i-p:i+s+1]
            # window_pos = pos_indices[i+2]  # Assuming you want to predict the POS tag at the center of the window
            # Convert to PyTorch tensors
            window_tokens_tensor = torch.LongTensor(window_tokens)
            window_pos_tensor = torch.LongTensor(pos_indices[i-p:i+s+1])
            # Zero the gradients
            optimizer.zero_grad()
            outputs = model(window_tokens_tensor)
            predicted = outputs.argmax()
            # print(outputs, predicted,window_pos_tensor)

            # Calculate loss
            print(i)
            loss = criterion(outputs, window_pos_tensor[i])

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
          

2
3
4
5
6


IndexError: index 6 is out of bounds for dimension 0 with size 6

In [1]:
with open("out.txt", "w") as f:
    for word in encoded_inputs:
        f.write(f"{word}\n")
        break

NameError: name 'encoded_inputs' is not defined

In [55]:
len(word_to_index), len(tag_to_index), max_sentence_length

(865, 14, 46)

Now iterate over each list of size s + p + 1


Divide data in words (X) and tags (Y)

Vectorise X and Y
Encode X and Y to integer values

using LSTM

In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [47]:
with open("./conllu/train.conllu", "r", encoding="utf-8") as f:
    train_data = f.read()
with open("./conllu/val.conllu", "r", encoding="utf-8") as f:
    val_data = f.read()
with open("./conllu/test.conllu", "r", encoding="utf-8") as f:
    test_data = f.read()

In [48]:
# Split the data into sentences
sentences_data = train_data.strip().split("\n\n")

# Process each sentence to tokenize the data
word_to_index = {}
tag_to_index = {}
max_sentence_length = 0
word_count = {}

# Initialize special tokens
word_to_index['<PAD>'] = 0
word_to_index['<UNK>'] = 1
tag_to_index['<PAD>'] = 0

# Iterate through each sentence
for sentence_str in sentences_data:
    # Tokenize the sentence into individual tokens
    token_list = sentence_str.strip().split("\n")
    for token_str in token_list:
        # Ensure the line is not a comment
        if token_str[0] != '#':
            # Split the token fields
            token_fields = token_str.split("\t")

            # Extract token index, word, and tag
            token_index = int(token_fields[0])
            word = token_fields[1]
            tag = token_fields[3]
            # ------------------Word to index-------------------
            # Add the word to the word_to_index dictionary if it doesn't exist
            if word not in word_to_index:
                word_to_index[word] = len(word_to_index)
                # it will update the size after new word inserted

            # Update word count dictionary
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] += 1
            # ------------------TAg to index-------------------
            # Add the tag to the tag_to_index dictionary if it doesn't exist
            if tag not in tag_to_index:
                tag_to_index[tag] = len(tag_to_index)
            # no need to count the tag as it is already in the dictionary
            # Update max_sentence_length if needed
            max_sentence_length = max(token_index, max_sentence_length)


In [15]:
import numpy as np

def process_data(data, word_to_index, tag_to_index, max_sentence_length, word_count):
    input_sequences = []
    output_sequences = []
    # splitting all sentences
    All_sentences_list = data.strip().split("\n\n")
    for sentence in All_sentences_list:
        input_sequence = []
        output_sequence = []

        tokens_strings = sentence.strip().split("\n")
        for each_token in tokens_strings:
            if each_token[0] == '#':
                continue
            else:
                # as differnt fields
                # ex - 5	of	of	ADP	_	_	7	case
                fields = each_token.split("\t")
                word = fields[1]
                tag = fields[3]

                if word in word_to_index:
                    if word_count[word] < 2:
                        word_cur_idx = word_to_index['<UNK>']
                    else:
                        word_cur_idx = word_to_index[word]
                else:
                    word_cur_idx = word_to_index['<UNK>']
                
                if tag in tag_to_index:
                    tag_cur_idx = tag_to_index[tag]
                else:
                    # what to do here
                    tag_cur_idx = tag_to_index['PRON']
                #For each word append current word index to input_sequence and tag index to output_sequence 
                input_sequence.append(word_cur_idx)
                output_sequence.append(tag_cur_idx)
        
        # Pad sequences using PyTorch's pad_sequence function
        input_sequence = torch.tensor(input_sequence)
        output_sequence = torch.tensor(output_sequence)
        input_sequence = torch.nn.functional.pad(input_sequence, (0, max_sentence_length - len(input_sequence)))
        output_sequence = torch.nn.functional.pad(output_sequence, (0, max_sentence_length - len(output_sequence)))
        input_sequences.append(input_sequence)
        output_sequences.append(output_sequence)
     # Stack sequences to create tensors
    input_sequences = torch.stack(input_sequences).numpy()
    output_sequences = torch.stack(output_sequences).numpy()

    return input_sequences, output_sequences


Data prepare

In [16]:
X_train, y_train = process_data(train_data, word_to_index, tag_to_index, max_sentence_length, word_count)
X_val, y_val = process_data(val_data, word_to_index, tag_to_index, max_sentence_length, word_count)
X_test, y_test = process_data(test_data, word_to_index, tag_to_index, max_sentence_length, word_count)




In [45]:
from sklearn.preprocessing import StandardScaler
# Assuming X_train is your input data
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
y_train_normalized = scaler.fit_transform(y_train)
X_val_normalized = scaler.fit_transform(X_val)
y_val_normalized = scaler.fit_transform(y_val)
X_test_normalized = scaler.fit_transform(X_test)
y_test_normalized = scaler.fit_transform(y_test)

In [46]:
with open("out.txt", "w") as f:
    # save input sequences to file
    for i in y_test_normalized:
        f.write(f"{i}\n")

# Step 2: Create PyTorch DataLoaders

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

#### Create PyTorch datasets and data loaders for training data. This allows efficient batching and shuffling of data.

In [67]:
train_dataset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define Model

X_train.shape[1] is used to specify the input size of the neural network model.     X_train.shape[1] accesses the second element of the shape tuple, which represents the number of features or input dimensions in your dataset. In the example (1000, 50), X_train.shape[1] would be 50.

Sending X_train.shape[1] as the input_size parameter to the FFNN constructor ensures that the input layer of your neural network has the correct number of neurons to accommodate the input features of your dataset.

In [87]:
# Step 3: Define Loss Function and Optimizer
model = FFNN(input_size=X_train.shape[1], hidden_size=128, output_size=y_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Step 4: Training Loop

In [78]:
num_epochs = 20
# for training 
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")


Epoch [1/20], Loss: 1942.5025033523787
Epoch [2/20], Loss: 1406.451219245569
Epoch [3/20], Loss: 1080.4993978471898
Epoch [4/20], Loss: 874.9364979302705
Epoch [5/20], Loss: 744.392890588561
Epoch [6/20], Loss: 654.735463156629
Epoch [7/20], Loss: 589.1151683294951
Epoch [8/20], Loss: 538.1906392111707
Epoch [9/20], Loss: 498.44278773976794
Epoch [10/20], Loss: 465.6859440590019
Epoch [11/20], Loss: 438.57284591447063
Epoch [12/20], Loss: 415.38864682325675
Epoch [13/20], Loss: 395.12138571668027
Epoch [14/20], Loss: 377.8649383089436
Epoch [15/20], Loss: 362.8608712723006
Epoch [16/20], Loss: 348.303059819919
Epoch [17/20], Loss: 335.64912983908584
Epoch [18/20], Loss: 324.2286862045971
Epoch [19/20], Loss: 313.86049435743644
Epoch [20/20], Loss: 303.9825999701201


In [None]:
for epoch in range(1000):  # Adjust number of epochs as needed
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target.float())
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [96]:
index_to_tag = {v: k for k, v in tag_to_index.items()}

In [105]:
# Example input sentence
input_sentence = "Mary had a little lamb"
# Tokenize the sentence and convert it to a sequence of word indices
words = sentence.split()
input_sequence = []
for word in words:
    if word in word_to_index:
        if word_count[word] < 2:
            input_sequence.append(word_to_index['<UNK>'])
        else:
            input_sequence.append(word_to_index[word])
    else:
        input_sequence.append(word_to_index['<UNK>'])

# Tokenize the input sentence
# input_sequence = input_sentence.split()
# Pad the sequence with zeros to make it the same length as max_sen_len
if len(input_sequence) < max_sentence_length:
    input_sequence += [0] * (max_sentence_length - len(input_sequence))

# Convert tokens to indices using the vocabulary
input_indices = [word_to_index.get(token, word_to_index['<UNK>']) for token in input_sequence]
# Pad the input sequence if necessary
input_sequence = np.array(input_sequence).reshape(1, max_sentence_length)
print(input_sequence)
# Convert input indices to PyTorch tensor
# input_tensor = torch.tensor(input_sequence)
# Ensure input tensor has the correct shape and type if needed

# Pass the input through the model
with torch.no_grad():
    model.eval()  # Set model to evaluation mode
    outputs = model(input_sequence)

# Map output predictions to POS tags
predicted_tags = [index_to_tag[torch.argmax(output).item()] for output in outputs]

print("Input sentence:", input_sentence)
print("Predicted tags:", predicted_tags)


[[1 1 7 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0]]


AttributeError: 'numpy.ndarray' object has no attribute 'to'

# Question2 : 2 Recurrent Neural Network POS Tagging
Design and implement a model which uses Recurrent Neural Networks (Vanilla
RNN, LSTM, or GRU) for POS Tagging. The model should take the embeddings for all tokens in a sentence and output the corresponding POS tags in
sequence.
- For Example: In the sentence "An apple a day keeps the doctor away",
 the model takes the embeddings for 
- ["An", "apple", "a", "day", "keeps", "the","doctor", "away"] and
 outputs the POS tags for all the words in the sentence
- ["DET", "NOUN", "DET", "NOUN", "VERB", "DET", "NOUN", "ADV"] 

Step1 : Count all words and postags and provide them a index value

- Doing it for all 3 datasets train, validation and test-dataset

In [132]:
with open("./conllu/train.conllu", "r", encoding="utf-8") as f:
    train_data = f.read()
with open("./conllu/val.conllu", "r", encoding="utf-8") as f:
    val_data = f.read()
with open("./conllu/test.conllu", "r", encoding="utf-8") as f:
    test_data = f.read()

In [133]:
def process_dataset(dataset_file, p=2, s=3):
    sentences_list = []
    pos_list = []

    with open(dataset_file, 'r') as f:
        sentence_tokens = []
        pos_tags = []

        for line in f:
            line = line.strip()

            if line.startswith('#'):
                sentence_tokens = []
                pos_tags = []
                continue
            elif line == '':
                # Append padding to the end of the sentence
                padded_sentence = ' '.join(['<PAD>'] * p) + ' ' + ' '.join(sentence_tokens) + ' ' + ' '.join(['<PAD>'] * s)
                padded_pos = ' '.join(['<UNK>'] * p + pos_tags + ['<UNK>'] * s)
                sentences_list.append(padded_sentence)
                pos_list.append(padded_pos)
                continue
            else:
                # New sentence begins
                token_attrs = line.split('\t')
                word_form = token_attrs[1]  # Word form of the token
                pos_tag = token_attrs[3]    # POS tag of the token
                sentence_tokens.append(word_form)
                pos_tags.append(pos_tag)

    return sentences_list, pos_list



### for all train, test and validation dataset
- I got all sentences tokens and respective pos-tag in form of sentence sepateted by space 
- Max sentece length, word count , word to index adn tag t index 

In [134]:
# Split the data into sentences
word_to_index = {'<PAD>': 0, '<UNK>': 1}
tag_to_index = {'<UNK>': 0}
word_count = {}
max_sentence_length = 0
train_dataset = "./conllu/train.conllu"
test_dataset = "./conllu/test.conllu"
val_dataset = "./conllu/val.conllu"
train_sentece_list, train_pos_list = process_dataset(train_dataset, p= 2, s= 3)
test_sentece_list, test_pos_list = process_dataset(test_dataset, p= 2, s= 3)
val_sentece_list, val_pos_list = process_dataset(val_dataset, p= 2, s= 3)

sentences_list = train_sentece_list + test_sentece_list + val_sentece_list
pos_list = train_pos_list + test_pos_list + val_pos_list

# Process each sentence to tokenize the data
for sentence_str, tag_str in zip(sentences_list, pos_list):
    # Tokenize the sentence into individual tokens
    tokens = sentence_str.split(' ')
    tags = tag_str.split(' ')
        # Word to index
    for word, tag in zip(tokens, tags):
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)
            word_count[word] = word_count.get(word, 0) + 1
    # Tag to index
        if tag not in tag_to_index:
            tag_to_index[tag] = len(tag_to_index)
    max_sentence_length = max(max_sentence_length, len(tokens))
    
    

In [138]:
with open("out.txt", "w") as f:
    for word in pos_list:
        f.write(f"{word}\n")    

# PAdding and provide tag <unk> to those words wshich comes once

- train_sentece_list, train_pos_list  
- test_sentece_list, test_pos_list 
- val_sentece_list, val_pos_list  

In [6]:
with open("out.txt", "w") as f:
    for word, a in word_to_index.items():
        f.write(f"{word} {a}\n")