In [2]:
import torch
from torch import nn
import torch.nn.functional as F

In [3]:
ABSENT_WORD = ("*****", "*****")
PADDING_CHAR = " "
PADDING_WORD = " "
ABSENT_PAIR = ABSENT_WORD
WINDOW_LEN = 5
RANDOM_CHANCE=0.1

In [4]:
def read_dataset(file_path, with_tags=True):
    """
    Read the dataset from file
    Args:
        file_path (str): path to the file to read from
        with_tags (bool): flag that indicates the presence of tags in data.
                          Use False to read test data.
    Returns:
        If with_tags is true, the list of tuples, one for each sentence
            One tuple contains list of lowercase words and corresponding list of tags
        Othervise the list of lowercase word lists, one fo each sentence
    """
    
    dataset = []
    with open(file_path, "r") as data_file:
        for line in data_file.readlines():
            # Split each sentence into items
            items = line[:-1].split(" ")
            if with_tags:
                # If tags are present, create separate lists of words and tags
                words = []
                tags = []
                for item in items:
                    [word, tag] = item.rsplit("/", 1)
                    words.append(word.lower())
                    tags.append(tag)
                dataset.append((words, tags))
            else:
                # If tags are not present, append word list to the dataset
                dataset.append([word.lower() for word in items])
    return dataset


def dataset_to_dictionary(dataset, absent_pair=None, absent_char=None):
    word_to_idx = {}
    idx_to_word = {}
    tag_to_idx = {}
    idx_to_tag = {}
    
    char_to_idx = {}
    idx_to_char = {}
    
    for (words, tags) in dataset:
        for word in words:
            if word not in word_to_idx:
                idx = len(word_to_idx)
                word_to_idx[word] = idx
                idx_to_word[idx] = word
            for letter in word:
                if letter not in char_to_idx:
                    idx = len(char_to_idx)
                    char_to_idx[letter] = idx
                    idx_to_char[idx] = letter
            
        for tag in tags:
            if tag not in tag_to_idx:
                idx = len(tag_to_idx)
                tag_to_idx[tag] = idx
                idx_to_tag[idx] = tag
                
    if absent_pair is not None:
        absent_word, absent_tag = absent_pair
        if absent_word not in word_to_idx:
            idx = len(word_to_idx)
            word_to_idx[absent_word] = idx
            idx_to_word[idx] = absent_word
        if absent_tag not in tag_to_idx:
            idx = len(tag_to_idx)
            tag_to_idx[absent_tag] = idx
            idx_to_tag[idx] = absent_tag
        for letter in absent_word:
                if letter not in char_to_idx:
                    idx = len(char_to_idx)
                    char_to_idx[letter] = idx
                    idx_to_char[idx] = letter
    if absent_char is not None and absent_char not in char_to_idx:
        idx = len(char_to_idx)
        char_to_idx[absent_char] = idx
        idx_to_char[idx] = absent_char
        
    return word_to_idx, tag_to_idx, char_to_idx, idx_to_word, idx_to_tag, idx_to_char


def prepare_sequence(sequence, dictionary, absent_key=None, random_key=None, random_chance=0.1):  
    """
    Translate sequence according to dictionary.
    Args:
        sequence (list): list of keys
        dictionary (dict): mapping from key to integer
        absent_key (str): key which will substitute absent keys in sequence.
                            if None, absent keys will be ignored
        random_key (bool): key which will substitute keys in sequence 
                            with some chance (10% maybe)
                            if None, random substitution will not be used.
    Returns:
        list of transformed sequence
    """
    translated_seq = []
    for key in sequence:
        # Handle absent keys if absent_key specified
        if key not in dictionary:
            if absent_key is not None:
                translated_seq.append(dictionary[absent_key])
        # Random substitute if random_key specified
        elif random_key is not None and torch.rand(1)[0]<random_chance:
            translated_seq.append(dictionary[random_key])
        else:
            translated_seq.append(dictionary[key])
    return torch.tensor(translated_seq, dtype=torch.long)

In [5]:
train_dataset = read_dataset("corpus.train", with_tags=True)

In [6]:
word_to_idx, tag_to_idx, char_to_idx, idx_to_word, idx_to_tag, idx_to_char = dataset_to_dictionary(train_dataset, 
                                                                                                   absent_pair=ABSENT_PAIR, 
                                                                                                   absent_char=PADDING_CHAR)

### Data analyze

In [13]:
max_len = 0
max_word = ''
lengths = []
for (words, tags) in train_dataset:
      for word in words:
        lengths.append(len(word))
        if max_len < len(word):
          max_len = len(word)
          max_word = word

print("Max length word", max_len, max_word)

Max length word 54 capitalist-exploiters-greedy-american-consumers-global


In [10]:
import statistics
mean = statistics.mean(lengths)
print("Mean length:", mean)

Mean length: 4.455600879956665


##Batching

In [None]:
for (idx, sent) in enumerate(train_dataset):
  print(f'{idx}: len = {len(sent[0])}')

In [7]:
def sort_func(el):
  return len(el[0])

In [8]:
def padd_word(word, length = 54, symbol = " "):
  return word + symbol*(length - len(word))

In [9]:
def padd_sentence(sentence, words_amount, padd_pair):
  padd_word, padd_tag = padd_pair
  # print("Padding...", padd_word, padd_tag)
  for i in range(words_amount):
    sentence[0].append(padd_word)
    sentence[1].append(padd_tag)
  return sentence

In [209]:
train_dataset = read_dataset("corpus.train", with_tags=True)
train_dataset.sort(key=sort_func)

In [10]:
def create_batch(sentences, word_to_idx, char_to_idx, tag_to_idx, label = True):
  max_words_num = max(len(s[0]) for s in sentences)
  max_words_len = find_max_word_len(sentences)
  batch_sentences = []
  batch_taggs = []
  batch_words = []
  # print('MAX WORD LEN:', max_words_len)
  # print('MAX WORD NUM:', max_words_num)
  for sent in sentences:
    padded_sent = padd_sentence(sent, max_words_num - len(sent[0]), ABSENT_PAIR)
    # print('\nWordsNum:', len(padded_sent[0]))
    words, taggs = padded_sent
    codded_sentence = prepare_sequence(words, word_to_idx, absent_key=ABSENT_WORD[0], random_key=None)
    codded_taggs = prepare_sequence(taggs, tag_to_idx, absent_key=ABSENT_WORD[0], random_key=None)
    # print(codded_sentence.shape)
    batch_sentences.append(codded_sentence)
    batch_taggs.append(codded_taggs)
    codded_words = []
    for word in words:
        paddedWord = padd_word(word, length = max_words_len, symbol=' ')
        codded_word = prepare_sequence(paddedWord, char_to_idx, absent_key=PADDING_CHAR)
        codded_word = torch.reshape(codded_word, (1,-1))
        codded_words.append(codded_word)
    words_ = torch.cat(codded_words,dim=0)
    batch_words.append(words_)
    # print( batch_words[-1].size(), batch_words[-1])
  # print(len(batch_sentences))
  batch_words = torch.stack(batch_words, dim=0)
  batch_sentences = torch.stack(batch_sentences, dim=0)
  batch_taggs = torch.stack(batch_taggs, dim=0)
  # print(batch_sentences.shape)
  # print(batch_words.size(), batch_senteces.size())
  return batch_sentences, batch_words, batch_taggs



In [11]:
def find_max_word_len(sentences):
  max_words_len = 0
  for s in sentences:
    ws, tg = s
    max_s= max(len(w) for w in ws)
    if max_words_len < max_s:
      max_words_len = max_s
  return max_words_len

In [26]:
import math

class Batchizer():
  def __init__(self, dataset, word_to_idx, char_to_idx, tag_to_idx, batch_size = 10, label = True, sorting = True):
    batches = []
    sentences_batches = []
    if (sorting and label):
      dataset.sort(key=sort_func)
    num_batches = math.ceil(len(dataset) / batch_size )
    for i in range(num_batches - 1):
      if i + batch_size >= len(dataset):
        sentences = dataset[i:len(dataset)]
      else: 
        sentences = dataset[i:(i + batch_size)]
      
      batch = create_batch(sentences, word_to_idx, char_to_idx, tag_to_idx, label)
      batches.append(batch)

    self.batches = batches
    self.word_to_idx = word_to_idx
    self.char_to_idx = char_to_idx

  def __len__(self):
    return len(self.batches)
  
  def get_batch(self, index):
    return self.batches[index]



In [13]:
train_dataset = read_dataset("corpus.train", with_tags=True)
batchizer = Batchizer(train_dataset, word_to_idx, char_to_idx, tag_to_idx,)

In [15]:
sentences, words, taggs = batchizer.get_batch(1)
print(words.size(), sentences.size(), taggs.size())

torch.Size([10, 1, 14]) torch.Size([10, 1]) torch.Size([10, 1])


## Model

In [16]:
class VeryComplicatedModel(nn.Module):
    def __init__(self, char_emb_dim, word_emb_dim, hidden_dim, vocab_size, charset_size, tagset_size, window, l):
        super(VeryComplicatedModel, self).__init__()
        self.char_embeddings = nn.Embedding(charset_size, char_emb_dim)
        self.word_embeddings = nn.Embedding(vocab_size, word_emb_dim)
        
        self.conv1 = nn.Conv1d(char_emb_dim, l, window, padding=(window-1)//2)
        
        self.lstm = nn.LSTM(word_emb_dim+l, hidden_dim, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentence, words):
        # Pass each window through CNN, max_pool the results for each word

        cnn_word_vecs = []
        chars_batch = self.char_embeddings(words)
        chars_batch = chars_batch.permute(0,2,1)
        conv_out = self.conv1(chars_batch)
        pool_out, _ = torch.max(conv_out, dim=2)
        cnn_word_vecs = pool_out
  
        word_embeds = self.word_embeddings(sentence)
        concated = torch.cat((word_embeds, cnn_word_vecs), dim=1)
        lstm_out, _ = self.lstm(concated.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores    

In [17]:
model = VeryComplicatedModel(char_emb_dim=10,
                         word_emb_dim=10,
                         hidden_dim=6,
                         charset_size=len(char_to_idx),
                         vocab_size=len(word_to_idx),
                         tagset_size=len(tag_to_idx),
                         window=WINDOW_LEN, 
                         l = 5)

In [195]:
for sentence in train_dataset:
    words, taggs = sentence
    codded_sentence = prepare_sequence(words, word_to_idx, absent_key=ABSENT_WORD[0], random_key=None)
    codded_words = []
    max_word_len = max(len(word) for word in words)
    for word in words:
        paddedWord = padd_word(word, length = max_word_len, symbol=' ')
        codded_word = prepare_sequence(paddedWord, char_to_idx, absent_key=PADDING_CHAR)
        codded_word = torch.reshape(codded_word, (1,-1))
        codded_words.append(codded_word)
    words_ = torch.cat(codded_words,dim=0)
    # print(words_.size(), len(codded_words), words_)
    print('Input words', words_.size())
    print(model(codded_sentence, words_).size())
    break


Input words torch.Size([49, 11])
torch.Size([49, 47])


In [18]:
class FinalModel(nn.Module):
    def __init__(self, char_emb_dim, word_emb_dim, hidden_dim, vocab_size, charset_size, tagset_size, window, l):
        super(FinalModel, self).__init__()
        self.char_embeddings = nn.Embedding(charset_size, char_emb_dim)
        self.word_embeddings = nn.Embedding(vocab_size, word_emb_dim)
        self.conv1 = nn.Conv1d(char_emb_dim, l, window, padding=(window-1)//2)
        self.lstm = nn.LSTM(word_emb_dim+l, hidden_dim, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentences, words, batch_size=10):
        # Pass each window through CNN, max_pool the results for each word

        words_ = words.reshape(-1, words.shape[-1])
        print("1", words.shape, words_.shape)
        chars_batch = self.char_embeddings(words_)
        chars_batch = chars_batch.permute(0,2,1)
        conv_out = self.conv1(chars_batch)
        pool_out, _ = torch.max(conv_out, dim=2)
        cnn_word_vecs = pool_out
        print("2", cnn_word_vecs.shape)

        sentences_ = sentences.reshape(-1)
        print("3", sentences.shape, sentences_.shape)
        word_embeds = self.word_embeddings(sentences_)
        print("4", word_embeds.shape)
        concated = torch.cat((word_embeds, cnn_word_vecs), dim=1)
        lstm_out, _ = self.lstm(concated.view(len(sentences_), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentences_), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        tag_scores = tag_scores.reshape(batch_size, -1, tag_scores.shape[-1])
        return tag_scores    

In [19]:
model = FinalModel(char_emb_dim=10,
                         word_emb_dim=10,
                         hidden_dim=6,
                         charset_size=len(char_to_idx),
                         vocab_size=len(word_to_idx),
                         tagset_size=len(tag_to_idx),
                         window=WINDOW_LEN, 
                         l = 5)

In [27]:
train_dataset = read_dataset("corpus.train", with_tags=True)
batchizer = Batchizer(train_dataset, word_to_idx, char_to_idx, tag_to_idx)
sentences, words, taggs = batchizer.get_batch(100)
model(sentences,words).size()

1 torch.Size([10, 2, 15]) torch.Size([20, 15])
2 torch.Size([20, 5])
3 torch.Size([10, 2]) torch.Size([20])
4 torch.Size([20, 10])


torch.Size([10, 2, 47])

In [28]:
len(batchizer)

3793

### Training

In [23]:
import torch.optim as optim

In [47]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
losses = []

train_dataset = read_dataset("corpus.train", with_tags=True)
batch_size = 10
batchizer = Batchizer(train_dataset, word_to_idx, char_to_idx, tag_to_idx, batch_size)

for epoch in range(100):
    for step in range(len(batchizer)):
        model.zero_grad()

        sentences, words, taggs = batchizer.get_batch(step)

        tag_scores = model(sentences, words, batch_size)
        # pred = torch.argmax(tag_scores, dim=2) #.reshape(-1)
        # print(pred)
        print(tag_scores.shape)
        loss = loss_function(tag_scores, taggs)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if step % 100 == 0:
          print(f"\t loss = {losses[-1]}")

    print(f"Epoch {epoch}: loss={losses[-1]}")
    torch.save(model, 'model.pth')

1 torch.Size([10, 1, 14]) torch.Size([10, 14])
2 torch.Size([10, 5])
3 torch.Size([10, 1]) torch.Size([10])
4 torch.Size([10, 10])
torch.Size([10, 1, 47])


ValueError: ignored

## Validation

In [80]:
model = torch.load('model.pth')

In [81]:
model.eval()

VeryComplicatedModel(
  (char_embeddings): Embedding(59, 10)
  (word_embeddings): Embedding(38473, 10)
  (conv1): Conv1d(10, 5, kernel_size=(5,), stride=(1,), padding=(2,))
  (lstm): LSTM(15, 6, bidirectional=True)
  (hidden2tag): Linear(in_features=12, out_features=47, bias=True)
)

In [82]:
test_dataset = read_dataset("corpus.answer", with_tags=True)

In [83]:
from sklearn.metrics import accuracy_score

sentence = test_dataset[2]
words, taggs = sentence
# print(words)
target = prepare_sequence(taggs, tag_to_idx, absent_key=ABSENT_WORD[0], random_key=None)
codded_sentence = prepare_sequence(words, word_to_idx, absent_key=ABSENT_WORD[0], random_key=None)
codded_words = []

max_word_len = max(len(word) for word in words)
for word in words:
  paddedWord = padd_word(word, length = max_word_len, symbol=' ')
  codded_word = prepare_sequence(paddedWord, char_to_idx, absent_key=PADDING_CHAR)
  codded_word = torch.reshape(codded_word, (1,-1))
  codded_words.append(codded_word)

words_ = torch.cat(codded_words,dim=0)
tag_scores = model(codded_sentence, words_)
pred = torch.argmax(tag_scores, dim=1)
print(accuracy_score(target, pred))

0.8666666666666667


In [84]:
word_to_idx[ABSENT_WORD[0]]

38472

In [85]:
y_true = []
y_pred = []
for (idx, sentence) in enumerate(test_dataset):
  words, taggs = sentence

  target = prepare_sequence(taggs, tag_to_idx, absent_key=ABSENT_WORD[0], random_key=None)
  codded_sentence = prepare_sequence(words, word_to_idx, absent_key=ABSENT_WORD[0], random_key=None)
  codded_words = []
  max_word_len = max(len(word) for word in words)
  for word in words:
    paddedWord = padd_word(word, length = max_word_len, symbol=' ')
    codded_word = prepare_sequence(paddedWord, char_to_idx, absent_key=PADDING_CHAR)
    codded_word = torch.reshape(codded_word, (1,-1))
    codded_words.append(codded_word)

  words_ = torch.cat(codded_words,dim=0)
  tag_scores = model(codded_sentence, words_)
  pred = torch.argmax(tag_scores, dim=1)
  y_true += target.tolist()
  y_pred += pred.tolist()


In [86]:
print(accuracy_score(y_true, y_pred))

0.8718248626222056
