In [46]:
import itertools
import nltk
import os
import string
import torch
import numpy as np
import pandas as pd
from collections import Counter
from torch import nn
from typing import Dict, List, Tuple

#### In this notebook I show how to preprocess text and turn it into an Embedding layer in PyTorch

In [3]:
def get_glue_df(glue_df) -> pd.DataFrame:
    """Convert a raw QQP frame into the left/right text-pair layout.

    Rows containing a missing value in any column are dropped and the index
    is renumbered from zero. The result has the columns ``id_left``,
    ``id_right``, ``text_left``, ``text_right`` and an integer ``label``.

    Args:
        glue_df: frame with the columns ``qid1``, ``qid2``, ``question1``,
            ``question2`` and ``is_duplicate``.

    Returns:
        A new DataFrame with the renamed columns.
    """
    complete_rows = glue_df.dropna(axis=0, how='any').reset_index(drop=True)
    # (target column, source column) pairs, in output column order.
    column_pairs = (
        ('id_left', 'qid1'),
        ('id_right', 'qid2'),
        ('text_left', 'question1'),
        ('text_right', 'question2'),
        ('label', 'is_duplicate'),
    )
    columns = {target: complete_rows[source] for target, source in column_pairs}
    columns['label'] = columns['label'].astype(int)
    return pd.DataFrame(columns)

In [4]:
# Data lives one level above the notebook's working directory.
parent_dir = os.path.abspath(os.path.join('', os.pardir))
# Read the raw QQP training split and convert it to the left/right layout in one step.
train_df = get_glue_df(
    pd.read_csv(parent_dir + '/data/raw/QQP/train.tsv', sep='\t')
)

##### The first step is removing punctuation and other unnecessary symbols

In [7]:
def handle_punctuation(inp_str: str) -> str:
    """Return ``inp_str`` with each ``string.punctuation`` character replaced by a space.

    The replacement is one-for-one, so the length of the string is unchanged.
    """
    punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return inp_str.translate(punct_to_space)

In [10]:
# Quick check: the '!' should come back as a single space.
handle_punctuation('return!None')

'return None'

##### The second step is lowercasing and tokenization

In [11]:
# pipeline: strip punctuation -> lowercase -> tokenize
def simple_preproc(inp_str: str) -> List[str]:
    """Turn a raw string into a list of lowercase tokens.

    Punctuation is replaced by spaces via ``handle_punctuation``, the text is
    lowercased, and the result is split with ``nltk.word_tokenize``.
    """
    cleaned = handle_punctuation(inp_str).lower()
    return nltk.word_tokenize(cleaned)

In [12]:
# Quick check: punctuation removed, text lowercased, then split into tokens.
simple_preproc('return!None')

['return', 'none']

##### The third step is to filter out rarely occurring words and build a list of all remaining tokens, from which the embedding matrix is created

In [19]:
def _filter_rare_words(vocab: Dict[str, int], min_occurancies: int) -> Dict[str, int]:
    filtered_vocab = {x: count for x, count in vocab.items() if count >= min_occurancies}
    return filtered_vocab

def get_all_tokens(list_of_df: List[pd.DataFrame], min_occurancies: int) -> List[str]:
    """Collect the vocabulary of all question texts in the given frames.

    Every value of the ``text_left`` and ``text_right`` columns is run through
    ``simple_preproc`` and its tokens are counted. Tokens seen fewer than
    ``min_occurancies`` times are discarded.

    Args:
        list_of_df: frames that each contain ``text_left`` and ``text_right``.
        min_occurancies: inclusive lower bound on a token's total count.

    Returns:
        The surviving tokens in order of first appearance (frame by frame,
        ``text_left`` before ``text_right``). An empty list is returned when
        there are no frames or no tokens at all.
    """
    # Count incrementally instead of flattening the whole corpus into one
    # list first; the first-seen order of the keys is the same either way.
    token_counts = Counter()
    for df in list_of_df:
        for column in ('text_left', 'text_right'):
            for text in df[column]:
                token_counts.update(simple_preproc(text))

    if not token_counts:
        # Nothing to filter (no frames, or only empty frames).
        return []

    vocab = _filter_rare_words(dict(token_counts), min_occurancies)
    return list(vocab.keys())

In [22]:
%%time
# Build the vocabulary from the training set; min_occurancies=1 keeps every token.
all_tokens = get_all_tokens([train_df], min_occurancies=1)

CPU times: total: 58.7 s
Wall time: 59.5 s


In [23]:
# Peek at the first ten vocabulary tokens.
all_tokens[:10]

['how', 'is', 'the', 'life', 'of', 'a', 'math', 'student', 'could', 'you']

##### The next step is creating the Embedding layer

In [33]:
def _read_glove_embeddings(file_path: str) -> Dict[str, List[str]]:
    with open(file_path, encoding='utf-8') as file:
        glove_dict = {}
        for line in file:
            splitted_line = line.split()
            word, embedding = splitted_line[0], splitted_line[1:]
            glove_dict[word] = embedding
    return glove_dict
    
def create_glove_emb_from_file(file_path: str, inner_keys: List[str],
                               random_seed: int, rand_uni_bound: float
                               ) -> Tuple[np.ndarray, Dict[str, int], List[str]]:
    """Build an embedding matrix for ``inner_keys`` from a GloVe text file.

    Row 0 is the ``PAD`` vector and row 1 the ``OOV`` vector; the token at
    position ``i`` of ``inner_keys`` gets row ``i + 2``. Tokens found in the
    GloVe file use their pretrained vector. All other vectors (PAD, OOV and
    unknown tokens) are drawn from ``U(-rand_uni_bound, rand_uni_bound)``.

    Args:
        file_path: path to the GloVe text file.
        inner_keys: vocabulary tokens, in the order their rows should appear.
        random_seed: seed passed to ``np.random.seed`` (this reseeds NumPy's
            global generator, as before).
        rand_uni_bound: half-width of the uniform distribution for random vectors.

    Returns:
        A tuple ``(emb_matrix, vocab, unk_words)``: the float matrix of shape
        ``(len(inner_keys) + 2, emb_dim)``, the token-to-row mapping (including
        ``'PAD'`` and ``'OOV'``), and the tokens that had no pretrained vector.

    Raises:
        ValueError: if the GloVe file contains no embeddings.
    """
    np.random.seed(random_seed)
    glove_dict = _read_glove_embeddings(file_path)
    if not glove_dict:
        raise ValueError('no embeddings found in {!r}'.format(file_path))
    # Take the dimension from the first entry rather than from a hard-coded
    # word, which would fail for vocabularies that do not contain it.
    emb_dim = len(next(iter(glove_dict.values())))

    def _random_vec() -> np.ndarray:
        return np.random.uniform(low=-rand_uni_bound, high=rand_uni_bound, size=emb_dim)

    # Draw order (PAD, OOV, then unknown tokens in input order) is kept so a
    # given seed keeps producing the same matrix.
    emb_matrix = [_random_vec(), _random_vec()]
    vocab = {'PAD': 0, 'OOV': 1}
    unk_words = []
    for ind, token in enumerate(inner_keys, 2):
        vocab[token] = ind
        pretrained = glove_dict.get(token)
        if pretrained is None:
            unk_words.append(token)
            emb_matrix.append(_random_vec())
        else:
            # Parse the component strings now, so the matrix never mixes
            # string rows with float rows.
            emb_matrix.append(np.asarray(pretrained, dtype=float))
    emb_matrix = np.array(emb_matrix, dtype=float)
    return (emb_matrix, vocab, unk_words)

##### We look up pretrained GloVe vectors; if a word is not found, we replace it with a vector drawn from a uniform distribution

In [37]:

%%time
# Positional arguments: random_seed=0, rand_uni_bound=0.2
# (random vectors are drawn uniformly from [-0.2, 0.2)).
emb_matrix, vocab, unk_words = create_glove_emb_from_file(
    parent_dir + '/data/raw/glove.6B.50d.txt', all_tokens, 0, 0.2)

CPU times: total: 7.41 s
Wall time: 7.49 s


In [42]:
# emb_matrix has two more rows than all_tokens: the PAD and OOV vectors come first
len(unk_words), len(all_tokens), len(emb_matrix)

(24217, 83203, 83205)

In [43]:
# fraction (not a percentage) of vocabulary tokens with no pretrained GloVe vector
len(unk_words) / len(all_tokens)

0.2910592166147855

In [47]:
# trying Torch API
# Wrap the NumPy matrix in an embedding layer; freeze=True keeps the pretrained
# weights fixed and padding_idx=0 marks row 0 (PAD) as the padding row.
# NOTE(review): this rebinds `emb_matrix` from the NumPy array to the
# nn.Embedding layer, so re-running this cell on its own would pass the layer,
# not the array, to torch.FloatTensor — consider a separate name (e.g. `emb_layer`).
emb_matrix = torch.nn.Embedding.from_pretrained(torch.FloatTensor(emb_matrix), freeze=True, padding_idx=0)
emb_matrix

Embedding(83205, 50, padding_idx=0)

In [48]:
# Example batch: 2 documents, each encoded as 3 vocabulary indices
# (index 1 is the OOV row). This is the input format the layer receives in the model.
indices = torch.LongTensor([
    [1, 33, 2],
    [2, 4, 3]]
)

In [49]:
# Each index is replaced by its embedding row: (documents, tokens, embedding dim).
emb_matrix(indices).shape

torch.Size([2, 3, 50])