In [4]:
import torch
from torch import nn
import torch.nn.functional as F

ModuleNotFoundError: No module named 'torch'

In [5]:
!pip3 install torch

You should consider upgrading via the '/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [200]:
# Placeholder (word, tag) pair; element 0 is used as the stand-in key for out-of-vocabulary words.
ABSENT_WORD = ("*****", "*****")

# Character registered in the character dictionary and used as the stand-in for unknown characters.
PADDING_CHAR = " "

# Width of the character window; passed to the model as its `window` argument.
WINDOW_LEN = 5

# Probability for random key substitution (matches the default `random_chance` of prepare_sequence).
RANDOM_CHANCE = 0.1

In [201]:
def read_dataset(file_path, with_tags=True):
    """
    Read a corpus with one space-separated sentence per line.

    Args:
        file_path (str): path to the file to read from
        with_tags (bool): flag that indicates the presence of tags in data.
                          Tagged tokens have the form ``word/TAG``; the split is done
                          on the last "/" so the word part may itself contain slashes.
                          Use False to read test data.
    Returns:
        If with_tags is true, a list of tuples, one for each sentence.
            Each tuple contains the list of lowercase words and the corresponding list of tags.
        Otherwise a list of lowercase word lists, one for each sentence.
    Raises:
        ValueError: if with_tags is true and a token contains no "/" separator.
    """
    dataset = []
    with open(file_path, "r") as data_file:
        # Iterate over the file object directly instead of materialising every line up front.
        for line in data_file:
            # Remove only the line terminator. Slicing with [:-1] would also cut the last
            # real character when the final line of the file has no trailing newline.
            line = line.rstrip("\n")
            if with_tags:
                # A blank line carries no word/tag pairs; skip it rather than failing
                # on the unpacking of an empty token below.
                if not line:
                    continue
                # Create separate, parallel lists of words and tags
                words = []
                tags = []
                for item in line.split(" "):
                    word, tag = item.rsplit("/", 1)
                    words.append(word.lower())
                    tags.append(tag)
                dataset.append((words, tags))
            else:
                # No tags: every line (including a blank one) yields one word list,
                # so the result stays aligned with the lines of the input file.
                dataset.append([word.lower() for word in line.split(" ")])
    return dataset


def dataset_to_dictionary(dataset, absent_pair=None, absent_char=None):
    """
    Build index dictionaries for words, tags and characters of a tagged dataset.

    Indices are assigned in order of first appearance, starting from 0.
    Args:
        dataset (list): iterable of (words, tags) pairs, one per sentence
        absent_pair (tuple): optional (word, tag) placeholder pair. If given, the word
                             and the tag are added to their dictionaries and the
                             characters of the word are added to the character dictionary.
        absent_char (str): optional placeholder character to add to the character dictionary
    Returns:
        tuple (word_to_idx, tag_to_idx, char_to_idx, idx_to_word, idx_to_tag, idx_to_char)
    """
    word_to_idx, idx_to_word = {}, {}
    tag_to_idx, idx_to_tag = {}, {}
    char_to_idx, idx_to_char = {}, {}

    def _register(key, forward, backward):
        # Give `key` the next free index unless it has one already.
        if key in forward:
            return
        new_idx = len(forward)
        forward[key] = new_idx
        backward[new_idx] = key

    for sentence_words, sentence_tags in dataset:
        for token in sentence_words:
            _register(token, word_to_idx, idx_to_word)
            for symbol in token:
                _register(symbol, char_to_idx, idx_to_char)
        for label in sentence_tags:
            _register(label, tag_to_idx, idx_to_tag)

    if absent_pair is not None:
        placeholder_word, placeholder_tag = absent_pair
        _register(placeholder_word, word_to_idx, idx_to_word)
        _register(placeholder_tag, tag_to_idx, idx_to_tag)
        for symbol in placeholder_word:
            _register(symbol, char_to_idx, idx_to_char)

    if absent_char is not None:
        _register(absent_char, char_to_idx, idx_to_char)

    return word_to_idx, tag_to_idx, char_to_idx, idx_to_word, idx_to_tag, idx_to_char


def prepare_sequence(sequence, dictionary, absent_key=None, random_key=None, random_chance=0.1):
    """
    Map every key of a sequence to its integer index.
    Args:
        sequence (iterable): keys to translate (e.g. a list of words or a string of characters)
        dictionary (dict): mapping from key to integer
        absent_key: key whose index replaces keys missing from the dictionary.
                    If None, missing keys are dropped from the result.
        random_key: key whose index randomly replaces keys that ARE in the dictionary.
                    If None, no random substitution takes place.
        random_chance (float): probability of a random substitution per known key
    Returns:
        torch.LongTensor with the translated indices
    """
    substitute_randomly = random_key is not None
    indices = []
    for item in sequence:
        if item in dictionary:
            # The random draw is made only for known keys and only when substitution is enabled.
            if substitute_randomly and torch.rand(1)[0] < random_chance:
                indices.append(dictionary[random_key])
            else:
                indices.append(dictionary[item])
        elif absent_key is not None:
            # Unknown key: fall back to the placeholder's index.
            indices.append(dictionary[absent_key])
    return torch.tensor(indices, dtype=torch.long)

In [202]:
# Load the tagged training corpus: a list of (words, tags) tuples, one per sentence.
train_dataset = read_dataset("corpus.train", with_tags=True)

In [203]:
# Build the word/tag/character vocabularies from the training data.
# The placeholder pair is the ABSENT_WORD constant (the name ABSENT_PAIR is not defined anywhere
# in this notebook); its first element is later used as the key for out-of-vocabulary words.
(word_to_idx, tag_to_idx, char_to_idx,
 idx_to_word, idx_to_tag, idx_to_char) = dataset_to_dictionary(
    train_dataset,
    absent_pair=ABSENT_WORD,
    absent_char=PADDING_CHAR,
)

In [207]:
class ComplicatedModel(nn.Module):
    """
    Sequence tagger that concatenates a word embedding with a character-level CNN
    feature vector for every word and feeds the result through a bidirectional LSTM.
    """

    def __init__(self, char_emb_dim, word_emb_dim, hidden_dim, vocab_size, charset_size, tagset_size, window, l):
        """
        Args:
            char_emb_dim (int): size of a character embedding
            word_emb_dim (int): size of a word embedding
            hidden_dim (int): hidden size of the LSTM (per direction)
            vocab_size (int): number of rows in the word embedding table
            charset_size (int): number of rows in the character embedding table
            tagset_size (int): number of output tags
            window (int): kernel size of the character convolution
            l (int): number of output channels of the character convolution
        """
        super().__init__()
        self.char_embeddings = nn.Embedding(charset_size, char_emb_dim)
        self.word_embeddings = nn.Embedding(vocab_size, word_emb_dim)

        # Padding of (window-1)//2 on both sides keeps the output length equal to the
        # word length when `window` is odd.
        self.conv1 = nn.Conv1d(char_emb_dim, l, window, padding=(window-1)//2)

        self.lstm = nn.LSTM(word_emb_dim+l, hidden_dim, bidirectional=True)
        # The bidirectional LSTM emits hidden_dim features per direction.
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentence, words):
        """
        Args:
            sentence (torch.LongTensor): word indices of one sentence, shape (n_words,)
            words (list): one LongTensor of character indices per word;
                          each is expected to have shape (1, word_len)
        Returns:
            torch.Tensor of shape (n_words, tagset_size) with log-probabilities over tags
        """
        # Pass each word through the CNN, max-pool the results over character positions
        cnn_word_vecs = []
        for chars in words:
            # Use the module's own embedding table (not a global of the same name) so that
            # the character embeddings are parameters of this model and get trained with it.
            chars_batch = self.char_embeddings(chars)
            # Conv1d expects (batch, channels, length): move the embedding dim to axis 1.
            chars_batch = chars_batch.permute(0,2,1)

            conv_out = self.conv1(chars_batch)

            # Max over the length axis leaves one value per convolution channel.
            pool_out, _ = torch.max(conv_out, dim=2)
            # Flatten to a 1-D feature vector (length l for a batch of one word).
            pool_out = torch.reshape(pool_out, (-1,))

            cnn_word_vecs.append(pool_out)

        cnn_word_vecs = torch.stack(cnn_word_vecs)
        word_embeds = self.word_embeddings(sentence)

        # Per-word features: [word embedding | character CNN features]
        concated = torch.cat((word_embeds, cnn_word_vecs), dim=1)
        # The LSTM takes (seq_len, batch, features); the sentence is a batch of size 1.
        lstm_out, _ = self.lstm(concated.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [208]:
# Instantiate the tagger. The word embedding table must be sized by the WORD vocabulary:
# sizing it by the character set makes any word index >= len(char_to_idx) fail with an
# out-of-range lookup.
model = ComplicatedModel(char_emb_dim=10,
                         word_emb_dim=10,
                         hidden_dim=6,
                         charset_size=len(char_to_idx),
                         vocab_size=len(word_to_idx),
                         tagset_size=len(tag_to_idx),
                         window=WINDOW_LEN,
                         l=5)

In [209]:
# Smoke test: run the model's forward pass on the first training sentence only
# and print the size of the returned score tensor.
for sentence in train_dataset:
    # Each dataset entry is a (words, tags) tuple; the tags are not used here.
    words, taggs = sentence
    # Encode the sentence as word indices; unknown words map to the placeholder word.
    codded_sentence = prepare_sequence(words, word_to_idx, absent_key=ABSENT_WORD[0], random_key=None)
    codded_words = []
    for word in words:
        # Iterating a string yields its characters, so each word becomes a tensor of
        # character indices; unknown characters map to the padding character.
        codded_word = prepare_sequence(word, char_to_idx, absent_key=PADDING_CHAR)
        # Add a leading axis of size 1: shape (1, word_len).
        codded_word = torch.reshape(codded_word, (1,-1))
        codded_words.append(codded_word)
    print(model(codded_sentence, codded_words).size())
    # Stop after the first sentence.
    break


torch.Size([49, 47])
