In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset
from transformers import BertTokenizer

import spacy
import numpy as np

import time 
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Preparing Data

Dataset URL: https://huggingface.co/datasets/universal_dependencies

In [3]:
dataset = load_dataset("universal_dependencies", "en_ewt")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 12543
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 2002
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 2077
    })
})

`xpos` == `ptbtags`  
`upos` == `udtags`  

`upos` = Universal POS tags
`xpos` = Language-specific POS tags 

In [5]:
columns_to_remove = [col for col in dataset['train'].column_names if col not in ["tokens","upos", "xpos"]]
dataset = dataset.remove_columns(columns_to_remove)

In [6]:
def lowercase_tokens(tokens):
    return {'tokens': [token.lower() for token in tokens['tokens']]}

In [7]:
ud_dataset = dataset.map(lowercase_tokens)

In [8]:
train_data = ud_dataset["train"]
val_data = ud_dataset["validation"]
test_data = ud_dataset["test"]

In [9]:
train_data[]

Dataset({
    features: ['tokens', 'upos', 'xpos'],
    num_rows: 12543
})

In [13]:
train_data['tokens']

[['al',
  '-',
  'zaman',
  ':',
  'american',
  'forces',
  'killed',
  'shaikh',
  'abdullah',
  'al',
  '-',
  'ani',
  ',',
  'the',
  'preacher',
  'at',
  'the',
  'mosque',
  'in',
  'the',
  'town',
  'of',
  'qaim',
  ',',
  'near',
  'the',
  'syrian',
  'border',
  '.'],
 ['[',
  'this',
  'killing',
  'of',
  'a',
  'respected',
  'cleric',
  'will',
  'be',
  'causing',
  'us',
  'trouble',
  'for',
  'years',
  'to',
  'come',
  '.',
  ']'],
 ['dpa',
  ':',
  'iraqi',
  'authorities',
  'announced',
  'that',
  'they',
  'had',
  'busted',
  'up',
  '3',
  'terrorist',
  'cells',
  'operating',
  'in',
  'baghdad',
  '.'],
 ['two',
  'of',
  'them',
  'were',
  'being',
  'run',
  'by',
  '2',
  'officials',
  'of',
  'the',
  'ministry',
  'of',
  'the',
  'interior',
  '!'],
 ['the',
  'moi',
  'in',
  'iraq',
  'is',
  'equivalent',
  'to',
  'the',
  'us',
  'fbi',
  ',',
  'so',
  'this',
  'would',
  'be',
  'like',
  'having',
  'j.',
  'edgar',
  'hoover',
  'unwi

In [82]:
train_data[0]['uposb']

[10,
 1,
 10,
 1,
 6,
 0,
 16,
 10,
 10,
 10,
 1,
 10,
 1,
 8,
 0,
 2,
 8,
 0,
 2,
 8,
 0,
 2,
 10,
 1,
 2,
 8,
 6,
 0,
 1]

In [83]:
train_data[0]['xpos']

['NNP',
 'HYPH',
 'NNP',
 ':',
 'JJ',
 'NNS',
 'VBD',
 'NNP',
 'NNP',
 'NNP',
 'HYPH',
 'NNP',
 ',',
 'DT',
 'NN',
 'IN',
 'DT',
 'NN',
 'IN',
 'DT',
 'NN',
 'IN',
 'NNP',
 ',',
 'IN',
 'DT',
 'JJ',
 'NN',
 '.']

### Bulding a Vocabulary
maping of tokens to integers wich means that each token will be represented by a unique integer.

We want some unknown tokens within our dataset in order to replicate how this model would be used in real life, so we set the `min_freq` to 2 which means only tokens that appear twice in the training set will be added to the vocabulary and the rest will be replaced by `<unk>` tokens.

We also load the GloVe pre-trained token embeddings. Specifically, the 100-dimensional embeddings that have been trained on 6 billion tokens. Using pre-trained embeddings usually leads to improved performance - although admittedly the dataset used in this tutorial is too small to take advantage of the pre-trained embeddings.

`unk_init` is used to initialize the token embeddings which are not in the pre-trained embedding vocabulary. By default this sets those embeddings to zeros, however it is better to not have them all initialized to the same value, so we initialize them from a Normal/Gaussian distribution.

In [11]:
def load_glove_embeddings(file_path):
    embeddings_dict = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_dict[word] = vector
    return embeddings_dict

glove_file = "D:\Dawnloads\glove.6B\glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_file)

In [23]:
from collections import Counter

def build_vocab(dataset, min_freq=2):
    counter = Counter()
    for example in dataset:
        counter.update(example['tokens'])
    
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    
    return vocab

train_dataset = train_data
vocab = build_vocab(train_dataset, min_freq=2)



In [28]:
vocab['the']

11

In [26]:
def create_embedding_matrix(vocab, embeddings, embedding_dim=100):
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for word, idx in vocab.items():
        if word in embeddings:
            embedding_matrix[idx] = embeddings[word]
        else:
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return embedding_matrix

embedding_matrix = create_embedding_matrix(vocab, glove_embeddings)

In [31]:
def vectorize_tokens(example, vocab):
    return [vocab.get(token, vocab['<UNK>']) for token in example['tokens']]

def vectorize_tags(example, tag_list):
    # return [tag_list.index(tag) for tag in example['upos']]

# upos_tags = list(set([tag for example in train_dataset for tag in example['upos']]))
xpos_tags = list(set([tag for example in train_dataset for tag in example['xpos']]))

train_dataset = train_dataset.map(lambda x: {'token_ids': vectorize_tokens(x, vocab),
                                              
                                              'xpos_tag_ids': vectorize_tags(x, xpos_tags)})

val_dataset = ud_dataset['validation'].map(lambda x: {'token_ids': vectorize_tokens(x, vocab),
                                                       
                                                       'xpos_tag_ids': vectorize_tags(x, xpos_tags)})

test_dataset = ud_dataset['test'].map(lambda x: {'token_ids': vectorize_tokens(x, vocab),
                                                  
                                                  'xpos_tag_ids': vectorize_tags(x, xpos_tags)})

Map:   0%|          | 0/12543 [00:00<?, ? examples/s]


ValueError: 10 is not in list