In [1]:
import torch

In [39]:
# Placeholder (word, tag) pair. Later cells pass it as `absent_pair` when
# building the dictionaries, and pass its word as `absent_key`/`random_key`
# when encoding a sequence.
ABSENT_PAIR = ("UNKNOWN", "UNKNOWN")
# Value passed as `random_chance` to prepare_sequence in a later cell.
RANDOM_CHANCE=0.1

In [40]:
def read_dataset(file_path, with_tags=True):
    """
    Read the dataset from file, one space-separated sentence per line.
    Args:
        file_path (str): path to the file to read from
        with_tags (bool): flag that indicates the presence of tags in data.
                          Use False to read test data.
    Returns:
        If with_tags is true, the list of tuples, one for each sentence.
            One tuple contains the list of lowercase words and the
            corresponding list of tags.
        Otherwise the list of lowercase word lists, one for each sentence.
    Raises:
        ValueError: if with_tags is true and an item contains no "/".
    """

    dataset = []
    with open(file_path, "r") as data_file:
        # Iterate the file lazily instead of loading every line up front.
        for line in data_file:
            # Strip only the line terminator. Blindly dropping the last
            # character would truncate a final line that has no newline.
            items = line.rstrip("\n").split(" ")
            if with_tags:
                # If tags are present, create separate lists of words and tags
                words = []
                tags = []
                for item in items:
                    # Split on the LAST slash so words that contain "/"
                    # themselves (e.g. "1/2/CD") keep their full text.
                    word, tag = item.rsplit("/", 1)
                    words.append(word.lower())
                    tags.append(tag)
                dataset.append((words, tags))
            else:
                # If tags are not present, append word list to the dataset
                dataset.append([word.lower() for word in items])
    return dataset


def dataset_to_dictionary(dataset, absent_pair=None):
    """
    Build index mappings for the words and tags of a tagged dataset.
    Args:
        dataset (list): list of (words, tags) tuples, one for each sentence
        absent_pair (tuple): optional (word, tag) pair that is added to the
                             mappings after the dataset entries, unless
                             already present. If None, nothing is added.
    Returns:
        tuple (word_to_idx, tag_to_idx, idx_to_word, idx_to_tag) of dicts.
        Indices are assigned in order of first appearance, starting from 0.
    """
    word_to_idx, idx_to_word = {}, {}
    tag_to_idx, idx_to_tag = {}, {}

    def register(key, forward, backward):
        # Give an unseen key the next free index in both directions.
        if key in forward:
            return
        next_idx = len(forward)
        forward[key] = next_idx
        backward[next_idx] = key

    for sentence_words, sentence_tags in dataset:
        for token in sentence_words:
            register(token, word_to_idx, idx_to_word)
        for label in sentence_tags:
            register(label, tag_to_idx, idx_to_tag)

    if absent_pair is not None:
        placeholder_word, placeholder_tag = absent_pair
        register(placeholder_word, word_to_idx, idx_to_word)
        register(placeholder_tag, tag_to_idx, idx_to_tag)

    return word_to_idx, tag_to_idx, idx_to_word, idx_to_tag


def prepare_sequence(sequence, dictionary, absent_key=None, random_key=None, random_chance=0.1):
    """
    Map every key of a sequence to its integer index.
    Args:
        sequence (list): list of keys
        dictionary (dict): mapping from key to integer
        absent_key (str): key whose index replaces keys that are missing
                          from the dictionary.
                          If None, missing keys are dropped from the output.
        random_key (str): key whose index randomly replaces keys that ARE
                          in the dictionary.
                          If None, no random substitution is done.
        random_chance (float): threshold compared against a fresh
                               torch.rand draw for each known key; the key
                               is substituted when the draw is below it.
    Returns:
        list of integer indices
    """
    substitute_randomly = random_key is not None
    indices = []
    for item in sequence:
        if item in dictionary:
            # Known key: one random draw per key, only when substitution is on.
            if substitute_randomly and torch.rand(1)[0] < random_chance:
                indices.append(dictionary[random_key])
            else:
                indices.append(dictionary[item])
        elif absent_key is not None:
            # Unknown key: fall back to the placeholder index.
            indices.append(dictionary[absent_key])
    return indices

In [41]:
# Load the tagged training corpus as a list of (words, tags) tuples.
train_dataset = read_dataset("corpus.train", with_tags=True)

In [42]:
# Build word/tag index mappings from the training data; ABSENT_PAIR adds the
# placeholder word and tag after all entries seen in the corpus.
word_to_idx, tag_to_idx, idx_to_word, idx_to_tag = dataset_to_dictionary(train_dataset, absent_pair=ABSENT_PAIR)

In [43]:
# Display the tag-to-index mapping.
tag_to_idx

{'IN': 0,
 'DT': 1,
 'NNP': 2,
 'CD': 3,
 'NN': 4,
 '``': 5,
 "''": 6,
 'POS': 7,
 '-LRB-': 8,
 'VBN': 9,
 'NNS': 10,
 'VBP': 11,
 ',': 12,
 'CC': 13,
 '-RRB-': 14,
 'VBD': 15,
 'RB': 16,
 'TO': 17,
 '.': 18,
 'VBZ': 19,
 'NNPS': 20,
 'PRP': 21,
 'PRP$': 22,
 'VB': 23,
 'JJ': 24,
 'MD': 25,
 'VBG': 26,
 'RBR': 27,
 ':': 28,
 'WP': 29,
 'WDT': 30,
 'JJR': 31,
 'PDT': 32,
 'RBS': 33,
 'WRB': 34,
 'JJS': 35,
 '$': 36,
 'RP': 37,
 'FW': 38,
 'EX': 39,
 'SYM': 40,
 '#': 41,
 'LS': 42,
 'UH': 43,
 'WP$': 44,
 '': 45,
 'UNKNOWN': 46}

In [44]:
# Encode a sample sentence. Words missing from word_to_idx map to the
# placeholder index, and known words are replaced by it when a random draw
# falls below RANDOM_CHANCE, so the output can differ between runs.
prepare_sequence(["innopolis","is","not","very","big","town"], word_to_idx, absent_key=ABSENT_PAIR[0], random_key=ABSENT_PAIR[0], random_chance=RANDOM_CHANCE) 

[38472, 305, 119, 578, 1936, 2510]