In [1]:
import os
import torch
from src.data.text_retriever import TextRetriever
from src.features.build_embedding_matrix import EmbeddingMatrixBuilder

  from .autonotebook import tqdm as notebook_tqdm


#### This notebook shows how the text is preprocessed and transformed into an Embedding layer in PyTorch

In [2]:
# Project root is taken to be the parent of the current working directory,
# so the notebook must be started from a direct sub-directory of the project.
PARENT_DIR = os.path.abspath(os.pardir)
# Build dataset paths with os.path.join so separators are correct on every platform.
QQP_DIR = os.path.join(PARENT_DIR, 'data', 'raw', 'QQP')
TRAIN_PATH = os.path.join(QQP_DIR, 'train.tsv')
VAL_PATH = os.path.join(QQP_DIR, 'dev.tsv')
# Build the retriever over the QQP train and dev TSV files; this same instance is
# reused by the preprocessing cells that follow.
text_retriever = TextRetriever(TRAIN_PATH, VAL_PATH)

##### The first step is removing punctuation and other unnecessary symbols

In [3]:
# The '!' is replaced by a space here: 'return!None' -> 'return None'.
# NOTE(review): this calls an underscore-prefixed (private) method directly, for demonstration.
text_retriever._handle_punctuation('return!None')

'return None'

##### The second step is lowercasing and tokenization

In [4]:
# Same sample string as above: the result is lowercased and split into a list of tokens.
text_retriever.lower_and_tokenize_words('return!None')

['return', 'none']

##### The third step is to filter out words with low occurrence counts and build a list of all tokens for creating the embedding matrix

In [5]:
%%time
TOKENS_SAVE_PATH = PARENT_DIR + '/data/processed/tokens.pickle'
all_tokens = text_retriever.get_all_tokens(min_occurancies=1, save_path=TOKENS_SAVE_PATH)

CPU times: total: 57.6 s
Wall time: 1min


In [6]:
# Peek at the first ten tokens to sanity-check the result.
all_tokens[:10]

['how', 'is', 'the', 'life', 'of', 'a', 'math', 'student', 'could', 'you']

In [7]:
# Use a dedicated name for the documents path: assigning it to TOKENS_SAVE_PATH
# would silently overwrite the tokens pickle path defined in the earlier cell.
DOCUMENTS_SAVE_PATH = PARENT_DIR + '/data/processed/documents.json'
# NOTE(review): presumably writes the processed documents to this JSON file — confirm.
text_retriever.get_and_save_documents(DOCUMENTS_SAVE_PATH)

##### The next step is creating the Embedding layer

In [9]:
# Builder used below to create the embedding matrix from GloVe vectors.
# NOTE(review): random_vec_bound presumably bounds the random vectors generated for
# words missing from GloVe, and random_seed makes them reproducible — confirm
# against EmbeddingMatrixBuilder.
emb_builder = EmbeddingMatrixBuilder(random_vec_bound=1.0, random_seed=0)

##### We retrieve pretrained GloVe vectors; when a word is not found, we replace it with a random vector drawn from a uniform distribution

In [10]:

%%time
GLOVE_PATH = PARENT_DIR + '/data/raw/glove.6B.50d.txt'
emb_matrix, vocab, unk_words = emb_builder.create_glove_emb_from_file(GLOVE_PATH, all_tokens)

CPU times: total: 10.3 s
Wall time: 10.8 s


In [11]:
# (number of rows, embedding dimension); 50 matches the 50-d GloVe file loaded above.
emb_matrix.shape

(87164, 50)

In [12]:
# The embedding matrix has two more rows than there are tokens because two special
# symbols are added (presumably padding and unknown — confirm in the builder).
len(unk_words), len(all_tokens), len(emb_matrix)

(26197, 87162, 87164)

In [13]:
# Fraction (not percentage) of tokens reported as unknown, i.e. without a pretrained
# vector; multiply by 100 to get a percentage.
len(unk_words) / len(all_tokens)

0.3005552878547991

In [14]:
# Wrap the pretrained weights in a torch Embedding layer.
# freeze=True keeps the vectors fixed (no gradient updates); padding_idx=0 treats row 0
# as the padding entry (assumes the builder puts the padding symbol at index 0 — confirm).
#
# This cell rebinds `emb_matrix` from the raw weight matrix to the layer, and the lookup
# cell further down calls `emb_matrix(indices)`, so the name is kept. The isinstance guard
# makes the cell safe to re-run: without it, a second execution would hand the Embedding
# layer itself to the tensor constructor and fail.
if not isinstance(emb_matrix, torch.nn.Embedding):
    emb_matrix = torch.nn.Embedding.from_pretrained(
        # torch.tensor copies the data and converts it to float32, replacing the
        # legacy torch.FloatTensor constructor.
        torch.tensor(emb_matrix, dtype=torch.float32),
        freeze=True,
        padding_idx=0,
    )
emb_matrix

Embedding(87164, 50, padding_idx=0)

In [15]:
# Toy batch of token ids: 2 documents with 3 tokens each,
# mirroring how the model will index into the embedding layer.
indices = torch.tensor(
    [[1, 33, 2], [2, 4, 3]],
    dtype=torch.long,
)

In [16]:
# The lookup result has shape (documents, tokens per document, embedding dimension).
emb_matrix(indices).shape

torch.Size([2, 3, 50])