# INLP - Assignment 2
## Harshavardhan P - 2021111003

In [1]:
import torch
import torch.nn as nn
import conllu
import pandas as pd
import gensim.downloader as api
from torch.utils.data import Dataset, DataLoader

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Pick the compute device: CUDA if present, otherwise the MPS backend, otherwise CPU.
# `torch.backends.mps` is looked up defensively because torch builds that predate
# the MPS backend do not expose that attribute at all.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

mps


# Flow - 1

## Dataset

### Importing data file

In [3]:
# import the data files
dataset_path_train = 'ud-treebanks-v2.13/UD_English-Atis/en_atis-ud-train.conllu'
dataset_path_dev = 'ud-treebanks-v2.13/UD_English-Atis/en_atis-ud-dev.conllu'
dataset_path_test = 'ud-treebanks-v2.13/UD_English-Atis/en_atis-ud-test.conllu'

# parse_incr reads lazily from the open handle, so each split is materialised
# inside a `with` block: the file is closed as soon as parsing is done instead
# of leaking a handle per split. The encoding is explicit so the result does not
# depend on the platform default.
with open(dataset_path_train, encoding='utf-8') as conllu_file:
    dataset_train = list(conllu.parse_incr(conllu_file))
with open(dataset_path_dev, encoding='utf-8') as conllu_file:
    dataset_dev = list(conllu.parse_incr(conllu_file))
with open(dataset_path_test, encoding='utf-8') as conllu_file:
    dataset_test = list(conllu.parse_incr(conllu_file))

# create a dataframe from the data
def create_dataframe(dataset):
    """Flatten parsed sentences into a token-level DataFrame.

    Args:
        dataset: iterable of sentences, each an iterable of tokens that can be
            indexed with 'form' and 'upostag'.

    Returns:
        DataFrame with one row per token and the columns 'word' and 'pos'.
    """
    rows = [
        [token['form'], token['upostag']]
        for sentence in dataset
        for token in sentence
    ]
    return pd.DataFrame(rows, columns=['word', 'pos'])

# one token-level dataframe per split, built in train / dev / test order
df_train, df_dev, df_test = map(
    create_dataframe, (dataset_train, dataset_dev, dataset_test)
)

# preview the first rows of every split
print(df_train.head(), df_dev.head(), df_test.head(), sep='\n')

   word   pos
0  what  PRON
1    is   AUX
2   the   DET
3  cost  NOUN
4    of   ADP
       word   pos
0         i  PRON
1     would   AUX
2      like  VERB
3       the   DET
4  cheapest   ADJ
      word   pos
0     what  PRON
1      are   AUX
2      the   DET
3    coach  NOUN
4  flights  NOUN


### Preprocess the data

In [4]:
# function to find vocabulary and POS tags, as well as load the word embeddings to be used
def preprocess_train(df, embedding_type='glove-wiki-gigaword-100'):
    """Turn the training dataframe into (embedding, one-hot tag) pairs.

    Args:
        df: DataFrame with a 'word' and a 'pos' column, one row per token.
        embedding_type: model name passed to gensim's downloader (`api.load`).

    Returns:
        A tuple ``(dataset, word_vectors, pos_tags_one_hot)`` where
        - ``dataset`` is a list of ``[word_vector, one_hot_tag]`` pairs in row order,
        - ``word_vectors`` maps every training word to its pre-trained vector, or
          to a zero tensor when the word is missing from the embedding model,
        - ``pos_tags_one_hot`` maps every training tag to a one-hot float tensor.
    """
    vocab = set(df['word'])
    # Sort the tags so that every tag gets the same one-hot index on every run;
    # the iteration order of a set of strings is not stable across interpreter
    # sessions. `key=str` keeps the sort working if a tag is missing (None).
    pos_tags = sorted(set(df['pos']), key=str)
    word_vectors_all = api.load(embedding_type)
    # Ask the model for its dimensionality instead of probing a specific word.
    embed_dim = word_vectors_all.vector_size

    word_vectors = {
        word: word_vectors_all[word] if word in word_vectors_all else torch.zeros(embed_dim)
        for word in vocab
    }

    # one hot encode the POS tags
    num_tags = len(pos_tags)
    pos_tags_one_hot = {}
    for i, tag in enumerate(pos_tags):
        one_hot = torch.zeros(num_tags)
        one_hot[i] = 1
        pos_tags_one_hot[tag] = one_hot

    # pair every token with its vector and its tag encoding, preserving row order
    dataset = [
        [word_vectors[word], pos_tags_one_hot[tag]]
        for word, tag in zip(df['word'], df['pos'])
    ]

    return dataset, word_vectors, pos_tags_one_hot

In [5]:
# function to preprocess the dev and test data, using the word vectors and POS tags from the training data
def _zero_fallback(table, table_name):
    """Return a fresh zero tensor with the same length as the entries of ``table``."""
    template = next(iter(table.values()), None)
    if template is None:
        raise KeyError(f"{table_name} is empty; cannot size a fallback vector")
    return torch.zeros(len(template))


def preprocess_dev_test(df, word_vectors, pos_tags_one_hot):
    """Encode a dev/test dataframe with the lookup tables built from training data.

    Args:
        df: DataFrame whose first column holds the word and second column the tag.
        word_vectors: mapping word -> embedding vector (from ``preprocess_train``).
        pos_tags_one_hot: mapping tag -> one-hot tensor (from ``preprocess_train``).

    Returns:
        List of ``[word_embedding, pos_tag]`` pairs in row order. Words or tags
        that were not seen during training are encoded as all-zero vectors whose
        length matches the entries of the corresponding table.

    Raises:
        KeyError: if a fallback vector is needed but the corresponding table is empty.
    """
    dataset = []
    for row in df.values.tolist():
        word, tag = row[0], row[1]
        # The fallback size is taken from whatever entry the table holds, so the
        # function no longer requires 'the' / 'NOUN' to be present in training data.
        if word in word_vectors:
            word_embedding = word_vectors[word]
        else:
            word_embedding = _zero_fallback(word_vectors, 'word_vectors')
        if tag in pos_tags_one_hot:
            pos_tag = pos_tags_one_hot[tag]
        else:
            pos_tag = _zero_fallback(pos_tags_one_hot, 'pos_tags_one_hot')
        dataset.append([word_embedding, pos_tag])

    return dataset

### Creating Dataset and loading

In [6]:
class CoNLLUDataset(Dataset):
    """Dataset over preprocessed ``[input_vector, target_vector]`` pairs.

    Each item is returned as a pair of float32 tensors placed on the
    module-level ``device``.
    """

    def __init__(self, data):
        # data: indexable collection of [input, target] pairs
        self.dataset = data

    def __len__(self):
        return len(self.dataset)

    @staticmethod
    def _as_float_tensor(value):
        """Copy ``value`` into a new float32 tensor on ``device``."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning;
            # detach().clone() is the equivalent PyTorch recommends instead.
            return value.detach().clone().to(device=device, dtype=torch.float32)
        return torch.tensor(value, dtype=torch.float32, device=device)

    def __getitem__(self, idx):
        pair = self.dataset[idx]
        input_tensor = self._as_float_tensor(pair[0])
        target_tensor = self._as_float_tensor(pair[1])
        return input_tensor, target_tensor

In [7]:
# preprocess the data
# (dev/test reuse the lookup tables built from the training split)
train_data, word_vectors, pos_tags_one_hot = preprocess_train(df_train)
dev_data = preprocess_dev_test(df_dev, word_vectors, pos_tags_one_hot)
test_data = preprocess_dev_test(df_test, word_vectors, pos_tags_one_hot)

# create the dataloaders
train_conllu_dataset = CoNLLUDataset(train_data)
dev_conllu_dataset = CoNLLUDataset(dev_data)
test_conllu_dataset = CoNLLUDataset(test_data)

# Only the training split is shuffled. Dev and test are iterated in file order so
# that evaluation batches (and any per-batch output) are the same on every run.
train_dataloader = DataLoader(train_conllu_dataset, batch_size=32, shuffle=True)
dev_dataloader = DataLoader(dev_conllu_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_conllu_dataset, batch_size=32, shuffle=False)

## Models

## Training

## Evaluation

# Experimentations

## Importing Dataset

In [8]:
# import conllu from dataset paths
dataset_path_train = 'ud-treebanks-v2.13/UD_English-Atis/en_atis-ud-train.conllu'
dataset_path_dev = 'ud-treebanks-v2.13/UD_English-Atis/en_atis-ud-dev.conllu'
dataset_path_test = 'ud-treebanks-v2.13/UD_English-Atis/en_atis-ud-test.conllu'

# Parse each split completely while its file is open, then let the `with` block
# close the handle; the original bare open() calls were never closed.
with open(dataset_path_train, encoding='utf-8') as conllu_file:
    dataset_train = list(conllu.parse_incr(conllu_file))
with open(dataset_path_dev, encoding='utf-8') as conllu_file:
    dataset_dev = list(conllu.parse_incr(conllu_file))
with open(dataset_path_test, encoding='utf-8') as conllu_file:
    dataset_test = list(conllu.parse_incr(conllu_file))

print("Number of sentences in training dataset: ", len(dataset_train))
print("Number of sentences in dev dataset: ", len(dataset_dev))
print("Number of sentences in test dataset: ", len(dataset_test))

Number of sentences in training dataset:  4274
Number of sentences in dev dataset:  572
Number of sentences in test dataset:  586


In [9]:
# re-read every split from the start of its file
# (each split is parsed fully inside a `with` block so the file handle is closed
# afterwards instead of being left open)
with open(dataset_path_train, encoding='utf-8') as conllu_file:
    dataset_train = list(conllu.parse_incr(conllu_file))
with open(dataset_path_dev, encoding='utf-8') as conllu_file:
    dataset_dev = list(conllu.parse_incr(conllu_file))
with open(dataset_path_test, encoding='utf-8') as conllu_file:
    dataset_test = list(conllu.parse_incr(conllu_file))

# convert this data to a pandas dataframe
def conllu_to_pandas(dataset):
    """Flatten an iterable of sentences into one DataFrame row per token.

    Every token is handed to ``pd.DataFrame`` unchanged, so the resulting
    columns are whatever fields the tokens carry.
    """
    tokens = [token for sentence in dataset for token in sentence]
    return pd.DataFrame(tokens)

# full-field dataframes for the three splits, built in train / dev / test order
df_train, df_dev, df_test = map(
    conllu_to_pandas, (dataset_train, dataset_dev, dataset_test)
)

print(df_train.head())

   id  form lemma  upos  xpos  \
0   1  what  what  PRON  None   
1   2    is    be   AUX  None   
2   3   the   the   DET  None   
3   4  cost  cost  NOUN  None   
4   5    of    of   ADP  None   

                                               feats  head deprel  deps  misc  
0                            {'PronType': 'Int,Rel'}     0   root  None  None  
1  {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3...     1    cop  None  None  
2                                {'PronType': 'Art'}     4    det  None  None  
3                                 {'Number': 'Sing'}     1  nsubj  None  None  
4                                               None     7   case  None  None  


## Embedding

### Finding vocab

In [10]:
# vocabulary = distinct word forms of the training set, in order of first appearance
vocab = pd.unique(df_train['form'])
print("Number of unique words in the training set: ", len(vocab))

Number of unique words in the training set:  863


In [11]:
# add <s>, </s> and <unk> to the vocabulary
# Tokens that are already present are filtered out first, so re-running this cell
# does not prepend the special tokens a second time.
SPECIAL_TOKENS = ['<s>', '</s>', '<unk>']
vocab = SPECIAL_TOKENS + [word for word in vocab if word not in SPECIAL_TOKENS]
print("Number of unique words in the training set after adding <s>, </s>, <unk>: ", len(vocab))

Number of unique words in the training set after adding <s>, </s>, <unk>:  866


In [12]:
# distinct universal POS tags of the training set, in order of first appearance
upos = pd.unique(df_train['upos'])
print("Number of unique upos in the training set: ", len(upos))
print(upos)

Number of unique upos in the training set:  13
['PRON' 'AUX' 'DET' 'NOUN' 'ADP' 'PROPN' 'VERB' 'NUM' 'ADJ' 'CCONJ' 'ADV'
 'PART' 'INTJ']


In [13]:
# adding pos tags for <s>, </s>, <unk> to the upos
# Tags that are already present are filtered out first, so re-running this cell
# does not prepend the special tags a second time.
SPECIAL_UPOS = ['STRT', 'END', 'UNK']
upos = SPECIAL_UPOS + [tag for tag in upos if tag not in SPECIAL_UPOS]

print("Number of unique upos in the training set after adding STRT, END, UNK: ", len(upos))

Number of unique upos in the training set after adding STRT, END, UNK:  16


### Embedding layer

In [14]:
# for the vocab, we must create a nn embedding
# we will use the nn.Embedding class from pytorch

# create a dictionary to map words to indices
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
print(word_to_idx)

embedding = nn.Embedding(len(vocab), 100, device=device)
print(embedding)

# create a tensor of indices for the words in the first sentence
# `id == 1` marks the first token of EVERY sentence, so filtering on it alone
# returns one token per sentence. The running count of sentence starts equals 1
# exactly for the rows that belong to the first sentence.
sentence_number = (df_train['id'] == 1).cumsum()
sentence = df_train[sentence_number == 1]
print(len(sentence))
# print(sentence)

# words outside the vocabulary fall back to the index of <unk>
word_indices = [word_to_idx.get(word, word_to_idx['<unk>']) for word in sentence['form']]
print(word_indices)

word_indices = torch.tensor(word_indices, dtype=torch.long, device=device)
print(word_indices)

# pass the tensor of indices to the embedding
embedded = embedding(word_indices)
print(embedded)
print(embedded.shape)

{'<s>': 0, '</s>': 1, '<unk>': 2, 'what': 3, 'is': 4, 'the': 5, 'cost': 6, 'of': 7, 'a': 8, 'round': 9, 'trip': 10, 'flight': 11, 'from': 12, 'pittsburgh': 13, 'to': 14, 'atlanta': 15, 'beginning': 16, 'on': 17, 'april': 18, 'twenty': 19, 'fifth': 20, 'and': 21, 'returning': 22, 'may': 23, 'sixth': 24, 'now': 25, 'i': 26, 'need': 27, 'leaving': 28, 'fort': 29, 'worth': 30, 'arriving': 31, 'in': 32, 'denver': 33, 'no': 34, 'later': 35, 'than': 36, '2': 37, 'pm': 38, 'next': 39, 'monday': 40, 'fly': 41, 'kansas': 42, 'city': 43, 'chicago': 44, 'wednesday': 45, 'following': 46, 'day': 47, 'meaning': 48, 'meal': 49, 'code': 50, 's': 51, 'show': 52, 'me': 53, 'all': 54, 'flights': 55, 'which': 56, 'serve': 57, 'for': 58, 'after': 59, 'tomorrow': 60, 'us': 61, 'air': 62, 'list': 63, 'nonstop': 64, 'early': 65, 'tuesday': 66, 'morning': 67, 'dallas': 68, 'st.': 69, 'petersburg': 70, 'toronto': 71, 'that': 72, 'arrive': 73, 'listing': 74, 'new': 75, 'york': 76, 'montreal': 77, 'canada': 78, 'd

### Pre-trained GloVe embedding (loaded through gensim)

In [15]:
# load the pre-trained "glove-wiki-gigaword-100" vectors through gensim's downloader
# NOTE: despite the variable name, the model requested here is GloVe, not word2vec
word2vec = api.load("glove-wiki-gigaword-100")
print('loaded')
# sanity check: print the vector stored for the word 'king'
print(word2vec['king'])

loaded
[-0.32307  -0.87616   0.21977   0.25268   0.22976   0.7388   -0.37954
 -0.35307  -0.84369  -1.1113   -0.30266   0.33178  -0.25113   0.30448
 -0.077491 -0.89815   0.092496 -1.1407   -0.58324   0.66869  -0.23122
 -0.95855   0.28262  -0.078848  0.75315   0.26584   0.3422   -0.33949
  0.95608   0.065641  0.45747   0.39835   0.57965   0.39267  -0.21851
  0.58795  -0.55999   0.63368  -0.043983 -0.68731  -0.37841   0.38026
  0.61641  -0.88269  -0.12346  -0.37928  -0.38318   0.23868   0.6685
 -0.43321  -0.11065   0.081723  1.1569    0.78958  -0.21223  -2.3211
 -0.67806   0.44561   0.65707   0.1045    0.46217   0.19912   0.25802
  0.057194  0.53443  -0.43133  -0.34311   0.59789  -0.58417   0.068995
  0.23944  -0.85181   0.30379  -0.34177  -0.25746  -0.031101 -0.16285
  0.45169  -0.91627   0.64521   0.73281  -0.22752   0.30226   0.044801
 -0.83741   0.55006  -0.52506  -1.7357    0.4751   -0.70487   0.056939
 -0.7132    0.089623  0.41394  -1.3363   -0.61915  -0.33089  -0.52881
  0.16483  -

## Changing label representation to one-hot

In [16]:
# change the pos tags to one-hot vectors


## Creating torch dataset

In [17]:
class CoNLLUDataset(Dataset):
    """Token-level dataset of ``[form, upos]`` pairs read from a CoNLL-U file.

    Note: an earlier cell of this notebook defines a class with the same name
    that wraps already-preprocessed data; running this cell shadows it.
    """

    def __init__(self, conllu_file):
        # conllu_file: path of the CoNLL-U file to read
        self.data = self.load_conllu(conllu_file)

    def load_conllu(self, conllu_file):
        """Read ``conllu_file`` and return one ``[form, upos]`` list per token.

        The file is parsed completely inside the ``with`` block so the handle is
        closed before returning. An empty file yields an empty list.
        """
        with open(conllu_file, encoding='utf-8') as handle:
            # only the form and upos fields are retained
            return [
                [token['form'], token['upos']]
                for tokenlist in conllu.parse_incr(handle)
                for token in tokenlist
            ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [18]:
# build the token-level dataset from the training file and print its first element
dataset = CoNLLUDataset(dataset_path_train)
print(dataset[0])

['what', 'PRON']


In [19]:
# num_workers=0 loads batches in the main process. Worker processes started with
# the "spawn" method (the default on macOS and Windows) must re-import the dataset
# class, which fails for a class defined inside a notebook cell.
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0, drop_last=False)

## Models

### FNN

In [20]:
# creating a FNN which takes n dim input and returns pos tag
class FNN(nn.Module):
    """Feed-forward network over a window of concatenated embeddings.

    Args:
        embed_dim: size of a single embedding.
        prev_n: number of preceding positions included in the window.
        succ_n: number of following positions included in the window.
        hidden_params: sizes of the hidden layers, in order (at least one).
        output_dim: size of the output layer.

    The input width is ``embed_dim * (prev_n + 1 + succ_n)``. Every hidden
    linear layer is followed by a ReLU; the output layer is followed by a
    softmax along dimension 1.
    """

    def __init__(self, embed_dim, prev_n, succ_n, hidden_params, output_dim):
        super().__init__()
        window_size = prev_n + 1 + succ_n
        # layer widths from the input through the last hidden layer
        widths = [embed_dim * window_size, *hidden_params]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        self.hidden_layers = nn.Sequential(*stack)
        self.output_layer = nn.Linear(hidden_params[-1], output_dim)
        # softmax layer for output
        # NOTE(review): nn.CrossEntropyLoss expects unnormalised scores; if the
        # training cell uses it, this softmax is applied twice. Confirm there.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return softmax-normalised scores for the batch ``x``."""
        hidden = self.hidden_layers(x)
        scores = self.output_layer(hidden)
        return self.softmax(scores)

# create a model
input_dim = 100
hidden_params = [100, 50]
