In [1]:
import itertools
import math
import nltk
import string
import torch
import numpy as np
import pandas as pd
from collections import Counter
from typing import Dict, List, Tuple, Union, Callable
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gwyn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 1. Preprocessing and tokenization

In [2]:
# 1.1 handle_punctuation
def handle_punctuation(inp_str: str) -> str:
    """Return ``inp_str`` with every ``string.punctuation`` character replaced by a space."""
    punctuation_to_space = {ord(symbol): ' ' for symbol in string.punctuation}
    return inp_str.translate(punctuation_to_space)

In [3]:
handle_punctuation('return!None')

'return None'

In [4]:
nltk.word_tokenize(handle_punctuation('return!None'))

['return', 'None']

In [5]:
# 1.2 simple_preproc
# replace punctuation with spaces -> lowercase -> tokenize
def simple_preproc(inp_str: str) -> List[str]:
    """Tokenize a raw string.

    Punctuation is first replaced via ``handle_punctuation``, the result is
    lower-cased and finally split into tokens with ``nltk.word_tokenize``.
    """
    normalized = handle_punctuation(inp_str).lower()
    return nltk.word_tokenize(normalized)

In [6]:
simple_preproc('return!None')

['return', 'none']

In [7]:
# 1.3 get_all_tokens and filter rare words

def _filter_rare_words(vocab: Dict[str, int], min_occurancies: int) -> Dict[str, int]:
    filtered_vocab = {x: count for x, count in vocab.items() if count >= min_occurancies}
    return filtered_vocab

def get_all_tokens(list_of_df: List[pd.DataFrame], min_occurancies: int) -> List[str]:
    """Collect the tokens that occur at least ``min_occurancies`` times.

    Both the ``text_left`` and ``text_right`` columns of every frame are
    tokenized with ``simple_preproc``; occurrences are counted over all of
    them together and rare tokens are dropped by ``_filter_rare_words``.
    Tokens are returned in order of first appearance.
    """
    tokenized_columns = [
        df[column].apply(simple_preproc)
        for df in list_of_df
        for column in ('text_left', 'text_right')
    ]
    all_documents = pd.concat(tokenized_columns).to_list()
    token_counts = Counter(itertools.chain.from_iterable(all_documents))
    frequent_tokens = _filter_rare_words(dict(token_counts), min_occurancies)
    return list(frequent_tokens)

In [8]:
def get_glue_df(glue_df) -> pd.DataFrame:
    """Convert a raw QQP frame into the left/right layout used in this notebook.

    Rows containing any missing value are dropped and the index is reset.
    The result has the columns ``id_left``, ``id_right``, ``text_left``,
    ``text_right`` and ``label`` (``is_duplicate`` cast to ``int``).
    """
    complete_rows = glue_df.dropna(axis=0, how='any').reset_index(drop=True)
    source_to_target = (
        ('qid1', 'id_left'),
        ('qid2', 'id_right'),
        ('question1', 'text_left'),
        ('question2', 'text_right'),
    )
    columns = {target: complete_rows[source] for source, target in source_to_target}
    columns['label'] = complete_rows['is_duplicate'].astype(int)
    return pd.DataFrame(columns)

In [9]:
train_df = pd.read_csv('data/QQP/train.tsv', sep='\t')
dev_df = pd.read_csv('data/QQP/dev.tsv', sep='\t')
train_df = get_glue_df(train_df)
dev_df = get_glue_df(dev_df)
list_of_df = [train_df, dev_df]

In [10]:
train_df.head()

Unnamed: 0,id_left,id_right,text_left,text_right,label
0,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [11]:
all_tokens = get_all_tokens(list_of_df, min_occurancies=1)

In [12]:
all_tokens[:10]

['how', 'is', 'the', 'life', 'of', 'a', 'math', 'student', 'could', 'you']

### 2. Embedding matrix implementation

In [13]:
# Basic retrieval of GloVe embeddings from a txt file.
# Words missing from GloVe get a random vector drawn uniformly from
# [-rand_uni_bound, rand_uni_bound] (0.2 in the call below).
# Returns emb_matrix, vocabulary and unk_words.

def _read_glove_embeddings(file_path: str) -> Dict[str, List[str]]:
    with open(file_path, encoding='utf-8') as file:
        glove_dict = {}
        for line in file:
            splitted_line = line.split()
            word, embedding = splitted_line[0], splitted_line[1:]
            glove_dict[word] = embedding
    return glove_dict
    
def create_glove_emb_from_file(file_path: str, inner_keys: List[str],
                               random_seed: int, rand_uni_bound: float
                               ) -> Tuple[np.ndarray, Dict[str, int], List[str]]:
    """Build an embedding matrix for ``inner_keys`` from an embeddings file.

    Row 0 is the ``'PAD'`` vector and row 1 the ``'OOV'`` vector; both are
    drawn uniformly from ``[-rand_uni_bound, rand_uni_bound]``. Every token of
    ``inner_keys`` then gets the next row (starting at index 2): its vector
    from the file when present, otherwise a fresh random vector from the same
    uniform distribution.

    Args:
        file_path: path passed to ``_read_glove_embeddings``.
        inner_keys: tokens that need an embedding row.
        random_seed: seed for NumPy's global random generator.
        rand_uni_bound: half-width of the uniform distribution.

    Returns:
        ``(emb_matrix, vocab, unk_words)`` where ``emb_matrix`` is a float
        array with ``len(inner_keys) + 2`` rows, ``vocab`` maps token -> row
        index and ``unk_words`` lists the tokens that were not in the file.

    Raises:
        ValueError: if the file contains no embeddings.
    """
    np.random.seed(random_seed)
    glove_dict = _read_glove_embeddings(file_path)
    if not glove_dict:
        raise ValueError(f'No embeddings found in {file_path!r}')
    # Take the dimension from the first entry instead of requiring the word
    # 'the' to be present in the file.
    emb_dim = len(next(iter(glove_dict.values())))

    def random_vector() -> np.ndarray:
        return np.random.uniform(low=-rand_uni_bound, high=rand_uni_bound, size=emb_dim)

    vocab = {'PAD': 0, 'OOV': 1}
    # Draw order (PAD, OOV, then unknown tokens in input order) determines the
    # random values for a given seed.
    emb_matrix = [random_vector(), random_vector()]
    unk_words = []
    for ind, token in enumerate(inner_keys, 2):
        vocab[token] = ind
        if token in glove_dict:
            # Parse the string components right away so the final array is
            # numeric from the start, not a string array converted later.
            emb_matrix.append(np.asarray(glove_dict[token], dtype=float))
        else:
            unk_words.append(token)
            emb_matrix.append(random_vector())
    emb_matrix = np.array(emb_matrix, dtype=float)
    return (emb_matrix, vocab, unk_words)

In [14]:
emb_matrix, vocab, unk_words = create_glove_emb_from_file('data/glove.6B.50d.txt', all_tokens, 0, 0.2)

In [15]:
vocab['the']

4

In [16]:
len(unk_words)

26197

In [17]:
len(all_tokens)

87162

In [18]:
len(emb_matrix)

87164

In [19]:
len(unk_words) / len(all_tokens)

0.3005552878547991

In [20]:
# trying Torch API
emb_matrix = torch.nn.Embedding.from_pretrained(torch.FloatTensor(emb_matrix), freeze=True, padding_idx=0)
emb_matrix

Embedding(87164, 50, padding_idx=0)

In [21]:
# retrieve 2 docs with 3 words example
indices = torch.LongTensor([
    [1, 33, 2],
    [2, 4, 3]]
)

In [22]:
print(emb_matrix(indices).shape)

torch.Size([2, 3, 50])


### 3. Implementation of KNRM and Gaussian Kernel

In [7]:
# We use Gaussian kernels to measure how similar a query and a document are.
# Cosine similarities lie in [-1, 1], so the kernel centres (mus) are spread across [-1, 1].
# The more query words that are similar to document words, the larger the values produced by
# the kernels centred in (0.7, 1] and the smaller the values produced by the other kernels.

class GaussianKernel(torch.nn.Module):
    """Element-wise Gaussian kernel ``exp(-(x - mu)^2 / (2 * sigma^2))``."""

    def __init__(self, mu: float = 1., sigma: float = 1.):
        super().__init__()
        self.sigma = sigma
        self.mu = mu

    def forward(self, x):
        """Apply the kernel to every element of ``x``; the shape is unchanged."""
        squared_distance = (x - self.mu) ** 2
        two_sigma_squared = 2 * self.sigma**2
        return torch.exp(-squared_distance / two_sigma_squared)

In [24]:
gk = GaussianKernel()

In [25]:
a = torch.Tensor([[2, 3, 4],
                  [1, 2, 3]]) 
b = torch.Tensor([[1, 1, 4],
                  [1, 2, 3]]) 

In [26]:
gk(a)

tensor([[0.6065, 0.1353, 0.0111],
        [1.0000, 0.6065, 0.1353]])

In [27]:
1/20

0.05

In [28]:
np.append(np.linspace(-0.95, 0.95, 20), 1.0)

array([-0.95, -0.85, -0.75, -0.65, -0.55, -0.45, -0.35, -0.25, -0.15,
       -0.05,  0.05,  0.15,  0.25,  0.35,  0.45,  0.55,  0.65,  0.75,
        0.85,  0.95,  1.  ])

In [29]:
np.append(np.array([1,2] + [3]), 4)

array([1, 2, 3, 4])

In [30]:
# The idea of KNRM is to use several Gaussian kernels with different mus.
# Every query word is matched with every document word, which gives a matching matrix:
# M_ij is the cosine similarity between the i-th query word and the j-th document word.
# Applying the kernels to the matching matrix tells us how similar or dissimilar the query and the document are.

In [2]:
class KNRM(torch.nn.Module):
    """Kernel-based ranking model over pretrained word embeddings.

    A (query, document) pair is scored by building the cosine-similarity
    matrix between their word embeddings, pooling it through a bank of
    Gaussian kernels and feeding the pooled features to an MLP.
    ``forward`` compares two such pairs and returns the sigmoid of the
    difference of their scores.

    Args:
        embedding_matrix: array of shape ``[vocab_size, emb_dim]``; row 0 is
            used as the padding index.
        freeze_embeddings: passed as ``freeze`` to ``Embedding.from_pretrained``.
        kernel_num: total number of kernels (must be at least 2).
        sigma: width of all kernels except the last one.
        exact_sigma: width of the last kernel, which is centred at 1.0.
        out_layers: hidden layer sizes of the output MLP; an empty list gives
            a single linear layer.

    Raises:
        ValueError: if ``kernel_num`` is smaller than 2.
    """

    def __init__(self, embedding_matrix: np.ndarray, freeze_embeddings: bool, kernel_num: int = 21,
                 sigma: float = 0.1, exact_sigma: float = 0.001,
                 out_layers: List[int] = [10, 5]):
        super().__init__()
        self.embeddings = torch.nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix),
            freeze=freeze_embeddings,
            padding_idx=0
        )

        self.kernel_num = kernel_num
        self.sigma = sigma
        self.exact_sigma = exact_sigma
        # Copy so the model never aliases the shared default list (or the
        # caller's list); later mutation of either cannot leak across models.
        self.out_layers = list(out_layers)

        self.kernels = self._get_kernels_layers()

        self.mlp = self._get_mlp()

        self.out_activation = torch.nn.Sigmoid()

    def _get_kernels_layers(self) -> torch.nn.ModuleList:
        """Create the kernel bank.

        The first ``kernel_num - 1`` kernels have centres evenly spaced between
        ``-1 + 1/(kernel_num - 1)`` and ``1 - 1/(kernel_num - 1)`` with width
        ``sigma``; the last kernel is centred at 1.0 with width ``exact_sigma``.
        """
        if self.kernel_num < 2:
            # Fail with a clear message; the spacing formula below would
            # divide by zero for kernel_num == 1.
            raise ValueError(
                'kernel_num must be at least 2, got {}'.format(self.kernel_num))

        kernels = torch.nn.ModuleList()
        shrink_len = 1.0 / (self.kernel_num - 1)
        left, right = -1.0 + shrink_len, 1.0 - shrink_len
        mus = np.append(np.linspace(left, right, self.kernel_num - 1), 1.0)
        sigmas = np.array((self.kernel_num - 1) * [self.sigma] + [self.exact_sigma])

        for mu, sigma in zip(mus, sigmas):
            kernels.append(GaussianKernel(mu=mu, sigma=sigma))
        return kernels

    def _get_mlp(self) -> torch.nn.Sequential:
        """Build the output MLP: ``kernel_num`` inputs, one output.

        Every hidden ``Linear`` layer listed in ``out_layers`` is followed by a
        ``ReLU``; the final ``Linear`` layer has no activation.
        """
        if len(self.out_layers) == 0:
            return torch.nn.Sequential(torch.nn.Linear(self.kernel_num, 1))

        layers = []
        layers.append(torch.nn.Linear(self.kernel_num, self.out_layers[0]))
        layers.append(torch.nn.ReLU())
        for i in range(1, len(self.out_layers)):
            layers.append(torch.nn.Linear(self.out_layers[i-1], self.out_layers[i]))
            layers.append(torch.nn.ReLU())
        layers.append(torch.nn.Linear(self.out_layers[-1], 1))
        return torch.nn.Sequential(*layers)

    def forward(self, input_1: Dict[str, torch.Tensor], input_2: Dict[str, torch.Tensor]) -> torch.FloatTensor:
        """Return ``sigmoid(predict(input_1) - predict(input_2))``."""
        logits_1 = self.predict(input_1)
        logits_2 = self.predict(input_2)

        logits_diff = logits_1 - logits_2

        out = self.out_activation(logits_diff)
        return out

    def _get_matching_matrix(self, query: torch.Tensor, doc: torch.Tensor) -> torch.FloatTensor:
        """Cosine similarity between every query word and every document word.

        ``query`` and ``doc`` hold token indices of shape ``[Batch, Left]`` and
        ``[Batch, Right]``; the result has shape ``[Batch, Left, Right]``.
        """
        # https://stackoverflow.com/questions/50411191/
        # how-to-compute-the-cosine-similarity-in-pytorch-for-all-rows-in-a-matrix-with-re
        eps = 1e-8
        query_m, doc_m = self.embeddings(query), self.embeddings(doc)
        query_norm, doc_norm = query_m.norm(dim=2)[:, :, None], doc_m.norm(dim=2)[:, :, None]
        # Clamp the norms so that all-zero vectors do not cause division by zero
        query_normalised = query_m / torch.clamp(query_norm, min=eps)
        doc_normalised = doc_m / torch.clamp(doc_norm, min=eps)
        similarity_m = torch.bmm(query_normalised, doc_normalised.transpose(1, 2))
        return similarity_m

    def _apply_kernels(self, matching_matrix: torch.FloatTensor) -> torch.FloatTensor:
        """Pool the matching matrix through every kernel.

        For each kernel the activations are summed over the last axis,
        passed through ``log1p`` and summed over the remaining word axis.
        """
        KM = []
        for kernel in self.kernels:
            # shape = [B]
            K = torch.log1p(kernel(matching_matrix).sum(dim=-1)).sum(dim=-1)
            KM.append(K)

        # shape = [B, K]
        kernels_out = torch.stack(KM, dim=1)
        return kernels_out

    def predict(self, inputs: Dict[str, torch.Tensor]) -> torch.FloatTensor:
        """Score the (query, document) pairs stored under ``'query'`` and ``'document'``."""
        # shape = [Batch, Left], [Batch, Right]
        query, doc = inputs['query'], inputs['document']

        # shape = [Batch, Left, Right]
        matching_matrix = self._get_matching_matrix(query, doc)
        # shape = [Batch, Kernels]
        kernels_out = self._apply_kernels(matching_matrix)
        # shape = [Batch, 1] (the last Linear layer of the MLP has one output)
        out = self.mlp(kernels_out)
        return out

In [32]:
emb_matrix, vocab, unk_words = create_glove_emb_from_file('data/glove.6B.50d.txt', all_tokens, 0, 0.2)

In [33]:
type(emb_matrix)

numpy.ndarray

In [34]:
emb_matrix.astype(float)

array([[ 0.0195254 ,  0.08607575,  0.04110535, ..., -0.14842948,
        -0.07382866, -0.05451569],
       [ 0.02807871, -0.02455939,  0.19534954, ..., -0.19195698,
         0.13157601, -0.19812181],
       [ 0.68938   , -0.10644   ,  0.17083   , ...,  0.41761   ,
        -0.22504   ,  0.61412   ],
       ...,
       [-0.60955   ,  0.62538   , -0.035572  , ...,  1.4036    ,
         0.81419   ,  0.097825  ],
       [-0.17766197,  0.16951815,  0.13465873, ..., -0.08008661,
        -0.11563979,  0.12498598],
       [ 0.11233   ,  1.4166    , -1.0127    , ...,  0.012063  ,
        -1.0092    , -0.37959   ]])

In [35]:
knrm = KNRM(emb_matrix, False)

In [36]:
knrm._get_mlp()

Sequential(
  (0): Linear(in_features=21, out_features=10, bias=True)
  (1): ReLU()
  (2): Linear(in_features=10, out_features=5, bias=True)
  (3): ReLU()
  (4): Linear(in_features=5, out_features=1, bias=True)
)

In [37]:
# Batch 
a.shape

torch.Size([2, 3])

In [38]:
# Toy batch of size 2: `a` and `b` each hold 2 word vectors of dimension 3 per example.
a = torch.Tensor([
    [[1, 3, 4],
     [1, 1, 1]],
    [[1, 3, 4],
     [1, 2, 3]],
])
b = torch.Tensor([
    [[1, 1, 4],
     [1, 10, 3]],
    [[-1, 1, -4],
     [2, 2, 3]],
])
eps = 1e-8
# L2 norm of every word vector, kept as a trailing axis so it broadcasts in the division
a_n = a.norm(dim=2, keepdim=True)
b_n = b.norm(dim=2, keepdim=True)
# Clamp the norms so that an all-zero vector cannot cause division by zero
a_norm = a / torch.clamp(a_n, min=eps)
b_norm = b / torch.clamp(b_n, min=eps)
# Batched dot products of unit vectors = pairwise cosine similarities
sim_mt = torch.bmm(a_norm, b_norm.transpose(1, 2))
sim_mt, sim_mt.shape

(tensor([[[ 0.9245,  0.8041],
          [ 0.8165,  0.7707]],
 
         [[-0.6472,  0.9513],
          [-0.6929,  0.9723]]]),
 torch.Size([2, 2, 2]))

In [39]:
matching_matrix = torch.einsum(
    'bld,brd->blr',
    torch.nn.functional.normalize(a, p=2, dim=-1),
    torch.nn.functional.normalize(b, p=2, dim=-1)
)
matching_matrix

tensor([[[ 0.9245,  0.8041],
         [ 0.8165,  0.7707]],

        [[-0.6472,  0.9513],
         [-0.6929,  0.9723]]])

In [40]:
# check similarity of first word of query and second word of document
aa = np.array([1, 3, 4])
bb = np.array([1, 10, 3])
cos_sim = np.dot(aa, bb)/(np.linalg.norm(aa)*np.linalg.norm(bb))
cos_sim

0.8040544114072732

In [41]:
def _get_kernels_layers() -> torch.nn.ModuleList:
    """Build 11 ``GaussianKernel`` modules with the default sigma.

    Ten centres are evenly spaced from -0.9 to 0.9 and one extra kernel is
    centred at 1.0.
    """
    step = 1.0 / 10
    centres = np.append(np.linspace(-1.0 + step, 1.0 - step, 10), 1.0)

    kernel_bank = torch.nn.ModuleList()
    kernel_bank.extend([GaussianKernel(mu=centre) for centre in centres])
    return kernel_bank

In [42]:
kernels = _get_kernels_layers()

In [43]:
def _apply_kernels(kernels, matching_matrix: torch.FloatTensor) -> torch.FloatTensor:
    KM = []
    for kernel in kernels:
        # shape = [B]
        K = torch.log1p(kernel(matching_matrix).sum(dim=-1)).sum(dim=-1)
        KM.append(K)
    
    # shape = [B, K]
    kernels_out = torch.stack(KM, dim=1)
    return kernels_out

In [44]:
b, sim_mt

(tensor([[[ 1.,  1.,  4.],
          [ 1., 10.,  3.]],
 
         [[-1.,  1., -4.],
          [ 2.,  2.,  3.]]]),
 tensor([[[ 0.9245,  0.8041],
          [ 0.8165,  0.7707]],
 
         [[-0.6472,  0.9513],
          [-0.6929,  0.9723]]]))

In [45]:
kernels[-1].mu

1.0

In [46]:
kernels[-2](sim_mt)

tensor([[[0.9997, 0.9954],
         [0.9965, 0.9917]],

        [[0.3021, 0.9987],
         [0.2812, 0.9974]]])

In [47]:
kernels[0](sim_mt).sum(dim=-1)

tensor([[0.4234, 0.4769],
        [1.1487, 1.1521]])

In [48]:
torch.log1p(kernels[0](sim_mt).sum(dim=-1))

tensor([[0.3531, 0.3899],
        [0.7649, 0.7664]])

In [49]:
torch.log1p(kernels[0](sim_mt).sum(dim=-1)).sum(dim=-1)

tensor([0.7430, 1.5313])

In [50]:
_apply_kernels(kernels, sim_mt)

tensor([[0.7430, 0.9680, 1.2062, 1.4431, 1.6645, 1.8580, 2.0135, 2.1238, 2.1840,
         2.1916, 2.1756],
        [1.5313, 1.6225, 1.6908, 1.7381, 1.7666, 1.7783, 1.7741, 1.7536, 1.7152,
         1.6568, 1.6194]])

### 4. Implementation of Dataset and Dataloader for train and validation sets

In [51]:
def get_idx_to_text_mapping(inp_df: pd.DataFrame) -> Dict[str, str]:
    """Map every id found in ``id_left`` / ``id_right`` to its text.

    Keys keep whatever type the id columns hold. When an id appears on both
    sides, the text from the right-hand side wins.
    """
    def one_side(id_column, text_column):
        unique_pairs = inp_df[[id_column, text_column]].drop_duplicates()
        return unique_pairs.set_index(id_column)[text_column].to_dict()

    idx_to_text = one_side('id_left', 'text_left')
    idx_to_text.update(one_side('id_right', 'text_right'))
    return idx_to_text

In [52]:
d = get_idx_to_text_mapping(train_df)
list(d.items())[:5]

[(213221,
  'How is the life of a math student? Could you describe your own experiences?'),
 (536040, 'How do I control my horny emotions?'),
 (364011, 'What causes stool color to change to yellow?'),
 (155721, 'What can one do after MBBS?'),
 (279958,
  'Where can I find a power outlet for my laptop at Melbourne Airport?')]

In [53]:
train_df.head()

Unnamed: 0,id_left,id_right,text_left,text_right,label
0,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [54]:
### Walkthrough of `create_val_pairs` (the function itself is a method of the Solution class)

In [55]:
inp_df_select = train_df[['id_left', 'id_right', 'label']]
inf_df_group_sizes = inp_df_select.groupby('id_left').size()

In [56]:
min_group_size=2
glue_dev_leftids_to_use = list(inf_df_group_sizes[inf_df_group_sizes >= min_group_size].index)

In [57]:
len(glue_dev_leftids_to_use)

47573

In [58]:
groups = inp_df_select[inp_df_select.id_left.isin(glue_dev_leftids_to_use)].groupby('id_left')
all_ids = set(train_df['id_left']).union(set(train_df['id_right']))

In [59]:
for id_left, group in groups:
    id_left, grp = id_left, group
    break

In [60]:
print(grp)

        id_left  id_right  label
37749         3    282170      0
175701        3    488853      0
297478        3         4      0
317731        3    380197      0


In [61]:
ones_ids = group[group.label > 0].id_right.values
zeroes_ids = group[group.label == 0].id_right.values
ones_ids, zeroes_ids

(array([], dtype=int64), array([282170, 488853,      4, 380197], dtype=int64))

In [62]:
sum_len = len(ones_ids) + len(zeroes_ids)
num_pad_items = max(0, 15 - sum_len)
num_pad_items

11

In [63]:
if num_pad_items > 0:
    cur_chosen = set(ones_ids).union(
        set(zeroes_ids)).union({id_left})
    pad_sample = np.random.choice(
        list(all_ids - cur_chosen), num_pad_items,
        replace=False).tolist()
else:
    pad_sample = []

In [64]:
out_pairs = []
for i in ones_ids:
    out_pairs.append([id_left, i, 2])
for i in zeroes_ids:
    out_pairs.append([id_left, i, 1])
for i in pad_sample:
    out_pairs.append([id_left, i, 0])

In [65]:
out_pairs

[[3, 282170, 1],
 [3, 488853, 1],
 [3, 4, 1],
 [3, 380197, 1],
 [3, 476710, 0],
 [3, 486530, 0],
 [3, 334570, 0],
 [3, 249341, 0],
 [3, 56180, 0],
 [3, 254444, 0],
 [3, 158257, 0],
 [3, 431212, 0],
 [3, 120488, 0],
 [3, 264336, 0],
 [3, 343946, 0]]

In [10]:
class RankingDataset(torch.utils.data.Dataset):
    """Base dataset that turns text ids into lists of vocabulary indices.

    Subclasses implement ``__getitem__`` for their own row layout.

    Args:
        index_pairs_or_triplets: rows of text ids followed by a label.
        idx_to_text_mapping: text id -> raw text.
        vocab: token -> vocabulary index.
        oov_val: index used for tokens missing from ``vocab``.
        preproc_func: callable turning a raw text into a list of tokens.
        max_len: texts are truncated to this many tokens.
    """

    def __init__(self, index_pairs_or_triplets: List[List[Union[str, float]]],
                 idx_to_text_mapping: Dict[int, str],
                 vocab: Dict[str, int],
                 oov_val: int,
                 preproc_func: Callable, max_len: int = 30):
        self.index_pairs_or_triplets = index_pairs_or_triplets
        self.idx_to_text_mapping = idx_to_text_mapping
        self.vocab = vocab
        self.oov_val = oov_val
        self.preproc_func = preproc_func
        self.max_len = max_len

    def __len__(self):
        return len(self.index_pairs_or_triplets)

    def _tokenized_text_to_index(self, tokenized_text: List[str]) -> List[int]:
        """Map the first ``max_len`` tokens to vocabulary indices (``oov_val`` if unknown)."""
        return [self.vocab.get(token, self.oov_val)
                for token in tokenized_text[:self.max_len]]

    def _lookup_text(self, idx):
        """Fetch the text for ``idx``, tolerating int-vs-str id mismatches.

        The subclasses in this notebook pass ``str(id)`` while
        ``get_idx_to_text_mapping`` produces keys of the id column's own type,
        so a plain lookup can miss. The id is tried as given, then as ``str``
        and as ``int``.

        Raises:
            KeyError: if none of the representations is a key of the mapping.
        """
        mapping = self.idx_to_text_mapping
        if idx in mapping:
            return mapping[idx]

        alternatives = [str(idx)]
        try:
            alternatives.append(int(idx))
        except (TypeError, ValueError):
            # idx has no integer form; only the string form is tried
            pass
        for key in alternatives:
            if key in mapping:
                return mapping[key]
        raise KeyError(idx)

    def _convert_text_idx_to_token_idxs(self, idx: int) -> List[int]:
        """Look up the text for ``idx``, preprocess it and return its token indices."""
        text = self._lookup_text(idx)
        tokenized_text = self.preproc_func(text)
        token_idxs = self._tokenized_text_to_index(tokenized_text)
        return token_idxs

    def __getitem__(self, idx: int):
        """Left unimplemented here; subclasses define the sample layout."""
        pass

In [67]:
list(d.items())[:5]

[(213221,
  'How is the life of a math student? Could you describe your own experiences?'),
 (536040, 'How do I control my horny emotions?'),
 (364011, 'What causes stool color to change to yellow?'),
 (155721, 'What can one do after MBBS?'),
 (279958,
  'Where can I find a power outlet for my laptop at Melbourne Airport?')]

In [68]:
out_pairs[:3]

[[3, 282170, 1], [3, 488853, 1], [3, 4, 1]]

In [69]:
len(out_pairs[:3])

3

In [70]:
list(range(3))

[0, 1, 2]

In [71]:
np.random.choice(list(range(3)), 1).item()

1

In [11]:
class TrainTripletsDataset(RankingDataset):
    """Dataset over rows ``[query_id, left_doc_id, right_doc_id, label]``."""

    def __getitem__(self, idx):
        """Return ``(left_pair, right_pair, label)`` for row ``idx``.

        Both pairs are dicts with the keys ``'query'`` and ``'document'`` that
        share the same query token indices; the label is the fourth element of
        the row, returned unchanged.
        """
        row = self.index_pairs_or_triplets[idx]
        # Ids are stringified before the lookup, one text at a time
        query_idxs, left_doc_idxs, right_doc_idxs = [
            self._convert_text_idx_to_token_idxs(str(row[position]))
            for position in range(3)
        ]
        relevance = row[3]
        return (
            {'query': query_idxs, 'document': left_doc_idxs},
            {'query': query_idxs, 'document': right_doc_idxs},
            relevance,
        )


class ValPairsDataset(RankingDataset):
    """Dataset over rows ``[query_id, doc_id, label]``."""

    def __getitem__(self, idx):
        """Return ``(pair, label)`` for row ``idx``.

        ``pair`` is a dict with the token indices under ``'query'`` and
        ``'document'``; the label is the third element of the row, returned
        unchanged.
        """
        row = self.index_pairs_or_triplets[idx]
        sample = {
            'query': self._convert_text_idx_to_token_idxs(str(row[0])),
            'document': self._convert_text_idx_to_token_idxs(str(row[1])),
        }
        return sample, row[2]

In [73]:
train_df.head()

Unnamed: 0,id_left,id_right,text_left,text_right,label
0,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [74]:
train_df[train_df['id_left']==3]

Unnamed: 0,id_left,id_right,text_left,text_right,label
37749,3,282170,What is the story of Kohinoor (Koh-i-Noor) Dia...,Is it possible to melt down diamonds?,0
175701,3,488853,What is the story of Kohinoor (Koh-i-Noor) Dia...,Could India keep the Koh-I-Noor safe?,0
297478,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
317731,3,380197,What is the story of Kohinoor (Koh-i-Noor) Dia...,What are some interesting facts about Kohinoor...,0


In [78]:
def sample_data_for_train_iter(inp_df: pd.DataFrame, seed: int
                               ) -> List[List[Union[str, float]]]:
    """Sample training triplets ``[query_id, left_doc_id, right_doc_id, label]``.

    Up to 3000 ``id_left`` values that have at least 3 rows in ``inp_df`` are
    chosen at random. For each of them, 3 distinct pairs of its ``id_right``
    values are sampled; the label is 1 when the first candidate's label is
    greater than the second's and 0 otherwise. One extra triplet with label 0
    pairs the last sampled right candidate with a "negative" id: a random id
    for the first query, the previous query's ``id_left`` afterwards.

    Args:
        inp_df: frame with the columns ``id_left``, ``id_right`` and ``label``.
        seed: seed for NumPy's global random generator; the same frame and
            seed give the same triplets.

    Returns:
        The triplets stacked into a NumPy array with one row per triplet.
    """
    # Seed before the first random draw so that the whole sample is reproducible
    np.random.seed(seed)

    inp_df_select = inp_df[['id_left', 'id_right', 'label']]
    inf_df_group_sizes = inp_df_select.groupby('id_left').size()
    leftids_to_use = list(inf_df_group_sizes[inf_df_group_sizes >= 3].index)
    # Never ask for more ids than are eligible: choice(replace=False) would raise
    num_queries = min(3000, len(leftids_to_use))
    leftids_to_use = np.random.choice(leftids_to_use, size=num_queries, replace=False)
    groups = inp_df_select[inp_df_select.id_left.isin(
        leftids_to_use)].groupby('id_left')

    all_ids = set(inp_df['id_left']).union(set(inp_df['id_right']))

    out_triplets = []

    negative_example = np.random.choice(list(all_ids), size=1).item()
    for id_left, group in groups:
        right_ids = np.array(group['id_right'].to_list())
        np.random.shuffle(right_ids)
        candidates = list(itertools.combinations(right_ids, 2))
        candidates_inds = np.random.choice(len(candidates), size=3, replace=False)

        for ind in candidates_inds:
            candidate_left, candidate_right = candidates[ind]
            left_label = group[group['id_right'] == candidate_left]['label'].item()
            right_label = group[group['id_right'] == candidate_right]['label'].item()
            label = 1 if left_label - right_label > 0 else 0
            out_triplets.append([id_left, candidate_left, candidate_right, label])

        # candidate_right is the right-hand candidate of the last sampled pair
        out_triplets.append([id_left, candidate_right, negative_example, 0])
        negative_example = id_left
    return np.array(out_triplets)

In [79]:
group

Unnamed: 0,id_left,id_right,label
37749,3,282170,0
175701,3,488853,0
297478,3,4,0
317731,3,380197,0


In [80]:
triplets = sample_data_for_train_iter(train_df, 0)

In [81]:
triplets

array([[    51,     52, 161854,      0],
       [    51,     52, 291074,      0],
       [    51,     52, 102258,      0],
       ...,
       [493505, 354541, 128937,      0],
       [493505, 354541,  44893,      0],
       [493505,  44893, 475986,      0]])

In [82]:
type(triplets[0][-1])

numpy.int32

In [3]:
def collate_fn(
        batch_objs: List[Union[Dict[str, torch.Tensor], torch.FloatTensor]]):
    """Pad a batch of pairs or triplets and convert it to tensors.

    Every element is either ``(left_inputs, label)`` or
    ``(left_inputs, right_inputs, label)`` where the inputs are dicts with the
    token index lists ``'query'`` and ``'document'``. Each of the four
    sequence groups is right-padded with 0 to the longest sequence of that
    group within the batch.

    Returns:
        ``(left, labels)`` for pairs or ``(left, right, labels)`` for
        triplets; the dict values are ``LongTensor`` and ``labels`` is a
        ``FloatTensor`` of shape ``[batch, 1]``.
    """
    def pad_and_stack(token_lists):
        longest = max((len(tokens) for tokens in token_lists), default=-1)
        padded = [tokens + [0] * (longest - len(tokens)) for tokens in token_lists]
        return torch.LongTensor(padded)

    def collate_side(side_inputs):
        return {
            'query': pad_and_stack([inputs['query'] for inputs in side_inputs]),
            'document': pad_and_stack([inputs['document'] for inputs in side_inputs]),
        }

    # A 3-element sample marks the batch as triplets
    is_triplets = any(len(elem) == 3 for elem in batch_objs)

    labels = torch.FloatTensor([[float(elem[-1])] for elem in batch_objs])
    ret_left = collate_side([elem[0] for elem in batch_objs])
    if not is_triplets:
        return ret_left, labels

    ret_right = collate_side([elem[1] for elem in batch_objs])
    return ret_left, ret_right, labels

In [84]:
triplets.shape

(12000, 4)

In [85]:
condition = triplets[:, 3] == 0
triplets[condition].shape

(11531, 4)

In [86]:
condition = triplets[:, 3] == 1
triplets[condition].shape

(469, 4)

In [87]:
pad_vec = np.zeros_like((50,1))

In [88]:
pad_vec

array([0, 0])

In [89]:
pad_vec = np.zeros((1, 50))
oov_vec = np.random.uniform(low=-1,
                            high=1,
                            size=50)

In [90]:
oov_vec.shape

(50,)

In [26]:
class Solution:
    def __init__(self, glue_qqp_dir: str,
                 glove_vectors_path: str,
                 min_token_occurancies: int = 1,
                 random_seed: int = 0,
                 emb_rand_uni_bound: float = 0.2,
                 freeze_knrm_embeddings: bool = True,
                 knrm_kernel_num: int = 21,
                 knrm_out_mlp: List[int] = [],
                 dataloader_bs: int = 1024,
                 train_lr: float = 0.1,
                 change_train_loader_ep: int = 10
                 ):
        self.glue_qqp_dir = glue_qqp_dir
        self.glove_vectors_path = glove_vectors_path
        self.glue_train_df = self.get_glue_df('train')
        self.glue_dev_df = self.get_glue_df('dev')
        self.dev_pairs_for_ndcg = self.create_val_pairs(self.glue_dev_df)
        self.min_token_occurancies = min_token_occurancies
        self.all_tokens = self.get_all_tokens(
            [self.glue_train_df, self.glue_dev_df], self.min_token_occurancies)

        self.random_seed = random_seed
        self.emb_rand_uni_bound = emb_rand_uni_bound
        self.freeze_knrm_embeddings = freeze_knrm_embeddings
        self.knrm_kernel_num = knrm_kernel_num
        self.knrm_out_mlp = knrm_out_mlp
        self.dataloader_bs = dataloader_bs
        self.train_lr = train_lr
        self.change_train_loader_ep = change_train_loader_ep

        self.model, self.vocab, self.unk_words = self.build_knrm_model()
        self.idx_to_text_mapping_train = self.get_idx_to_text_mapping(self.glue_train_df)
        self.idx_to_text_mapping_dev = self.get_idx_to_text_mapping(self.glue_dev_df)

        self.val_dataset = ValPairsDataset(self.dev_pairs_for_ndcg,
                                           self.idx_to_text_mapping_dev,
                                           vocab=self.vocab,
                                           oov_val=self.vocab['OOV'],
                                           preproc_func=self.simple_preproc)
        self.val_dataloader = torch.utils.data.DataLoader(
            self.val_dataset, batch_size=self.dataloader_bs, num_workers=0,
            collate_fn=collate_fn, shuffle=False)

    def get_glue_df(self, partition_type: str) -> pd.DataFrame:
        assert partition_type in ['dev', 'train']
        glue_df = pd.read_csv(self.glue_qqp_dir + f'/{partition_type}.tsv', sep='\t', dtype=object)
        glue_df = glue_df.dropna(axis=0, how='any').reset_index(drop=True)
        glue_df_fin = pd.DataFrame({
            'id_left': glue_df['qid1'],
            'id_right': glue_df['qid2'],
            'text_left': glue_df['question1'],
            'text_right': glue_df['question2'],
            'label': glue_df['is_duplicate'].astype(int)
        })
        return glue_df_fin

    def handle_punctuation(self, inp_str: str) -> str:
        # my code below
        translator = str.maketrans(string.punctuation,
                                   ' '*len(string.punctuation))
        new_str = inp_str.translate(translator)
        return new_str

    def simple_preproc(self, inp_str: str) -> List[str]:
        # my code below
        no_punctuation_str = self.handle_punctuation(inp_str)
        lowered_str = no_punctuation_str.strip().lower()
        splitted_doc = nltk.word_tokenize(lowered_str)
        return splitted_doc

    def _filter_rare_words(self, vocab: Dict[str, int],
                           min_occurancies: int) -> Dict[str, int]:
        # my code below
        filtered_vocab = {x: count for x, count in vocab.items() if
                          count >= min_occurancies}
        return filtered_vocab

    def get_all_tokens(self, list_of_df: List[pd.DataFrame], min_occurancies: int) -> List[str]:
        def flatten(t): return [item for sublist in t for item in sublist]
        tokens = []
        for df in list_of_df:
            unique_texts = set(
                df[['text_left', 'text_right']].values.reshape(-1))
            df_tokens = flatten(map(self.simple_preproc, unique_texts))
            tokens.extend(list(df_tokens))
        count_filtered = self._filter_rare_words(
            Counter(tokens), min_occurancies)
        return list(count_filtered.keys())

    def _read_glove_embeddings(self, file_path: str) -> Dict[str, List[str]]:
        # my code below
        with open(file_path, encoding='utf-8') as file:
            glove_dict = {}
            for line in file:
                splitted_line = line.rstrip().split()
                word, embedding = splitted_line[0], splitted_line[1:]
                glove_dict[word] = embedding
        return glove_dict

    def create_glove_emb_from_file(self, file_path: str,
                                   inner_keys: List[str],
                                   random_seed: int,
                                   rand_uni_bound: float
                                   ) -> Tuple[np.ndarray,
                                              Dict[str, int],
                                              List[str]]:
        # my code below
        np.random.seed(random_seed)
        glove_dict = self._read_glove_embeddings(file_path)
        emb_dim = len(glove_dict['the'])

        emb_matrix = []
        pad_vec = np.zeros((emb_dim, ))
        oov_vec = np.random.uniform(low=-rand_uni_bound,
                                    high=rand_uni_bound,
                                    size=emb_dim)
        emb_matrix.append(pad_vec)
        emb_matrix.append(oov_vec)

        vocab = {}
        unk_words = []
        vocab['PAD'], vocab['OOV'] = 0, 1
        for ind, token in enumerate(inner_keys, 2):
            if token in glove_dict.keys():
                emb_matrix.append(glove_dict[token])
                vocab[token] = ind
            else:
                random_emb = np.random.uniform(low=-rand_uni_bound,
                                               high=rand_uni_bound,
                                               size=emb_dim)
                emb_matrix.append(random_emb)
                unk_words.append(token)
                vocab[token] = ind
        emb_matrix = np.array(emb_matrix).astype(float)
        return emb_matrix, vocab, unk_words

    def build_knrm_model(self) -> Tuple[
            torch.nn.Module, Dict[str, int], List[str]]:
        emb_matrix, vocab, unk_words = \
            self.create_glove_emb_from_file(self.glove_vectors_path,
                                            self.all_tokens,
                                            self.random_seed,
                                            self.emb_rand_uni_bound)
        torch.manual_seed(self.random_seed)
        knrm = KNRM(emb_matrix,
                    freeze_embeddings=self.freeze_knrm_embeddings,
                    out_layers=self.knrm_out_mlp,
                    kernel_num=self.knrm_kernel_num)
        return knrm, vocab, unk_words

    def sample_data_for_train_iter(self, inp_df: pd.DataFrame, seed: int
                                   ) -> List[List[Union[str, float]]]:
        groups = inp_df[['id_left', 'id_right', 'label']].groupby('id_left')
        pairs_w_labels = []
        np.random.seed(seed)
        all_right_ids = inp_df.id_right.values
        for id_left, group in groups:
            labels = group.label.unique()
            if len(labels) > 1:
                for label in labels:
                    same_label_samples = group[group.label ==
                                               label].id_right.values
                    if label == 0 and len(same_label_samples) > 1:
                        sample = np.random.choice(
                            same_label_samples, 2, replace=False)
                        pairs_w_labels.append(
                            [id_left, sample[0], sample[1], 0.5])
                    elif label == 1:
                        less_label_samples = group[group.label <
                                                   label].id_right.values
                        pos_sample = np.random.choice(
                            same_label_samples, 1, replace=False)
                        if len(less_label_samples) > 0:
                            neg_sample = np.random.choice(
                                less_label_samples, 1, replace=False)
                        else:
                            neg_sample = np.random.choice(
                                all_right_ids, 1, replace=False)
                        pairs_w_labels.append(
                            [id_left, pos_sample[0], neg_sample[0], 1])
        return pairs_w_labels

    def create_val_pairs(self,
                         inp_df: pd.DataFrame,
                         fill_top_to: int = 15,
                         min_group_size: int = 2,
                         seed: int = 0) -> List[List[Union[str, float]]]:
        inp_df_select = inp_df[['id_left', 'id_right', 'label']]
        inf_df_group_sizes = inp_df_select.groupby('id_left').size()
        glue_dev_leftids_to_use = list(
            inf_df_group_sizes[inf_df_group_sizes >= min_group_size].index)
        groups = inp_df_select[inp_df_select.id_left.isin(
            glue_dev_leftids_to_use)].groupby('id_left')

        all_ids = set(inp_df['id_left']).union(set(inp_df['id_right']))

        out_pairs = []

        np.random.seed(seed)

        for id_left, group in groups:
            ones_ids = group[group.label > 0].id_right.values
            zeroes_ids = group[group.label == 0].id_right.values
            sum_len = len(ones_ids) + len(zeroes_ids)
            num_pad_items = max(0, fill_top_to - sum_len)
            if num_pad_items > 0:
                cur_chosen = set(ones_ids).union(
                    set(zeroes_ids)).union({id_left})
                pad_sample = np.random.choice(
                    list(all_ids - cur_chosen), num_pad_items,
                    replace=False).tolist()
            else:
                pad_sample = []
            for i in ones_ids:
                out_pairs.append([id_left, i, 2])
            for i in zeroes_ids:
                out_pairs.append([id_left, i, 1])
            for i in pad_sample:
                out_pairs.append([id_left, i, 0])
        return out_pairs

    def get_idx_to_text_mapping(self, inp_df: pd.DataFrame) -> Dict[str, str]:
        left_dict = (
            inp_df[
                ['id_left', 'text_left']
            ].drop_duplicates()
            .set_index('id_left')
            ['text_left'].to_dict()
        )
        right_dict = (
            inp_df[
                ['id_right', 'text_right']
            ].drop_duplicates()
            .set_index('id_right')
            ['text_right'].to_dict()
        )
        left_dict.update(right_dict)
        return left_dict

    def _dcg(self, ys_true: np.array, ys_pred: np.array, k: int) -> float:
        indices = np.argsort(-ys_pred)
        ys_true = ys_true[indices[:k]]

        sum_dcg = 0
        for i, y_true in enumerate(ys_true, 1):
            sum_dcg += (2 ** y_true - 1) / math.log2(i + 1)
        return sum_dcg
    
    
    def ndcg_k(self, ys_true: np.array, ys_pred: np.array,
               ndcg_top_k: int = 10) -> float:
        ideal_dcg = self._dcg(ys_true, ys_true, ndcg_top_k)
        case_dcg = self._dcg(ys_true, ys_pred, ndcg_top_k)
        return float(case_dcg / ideal_dcg)

    def valid(self, model: torch.nn.Module,
              val_dataloader: torch.utils.data.DataLoader) -> float:
        labels_and_groups = val_dataloader.dataset.index_pairs_or_triplets
        labels_and_groups = pd.DataFrame(labels_and_groups,
                                         columns=['left_id', 'right_id',
                                                  'rel'])

        all_preds = []
        for batch in (val_dataloader):
            inp_1, y = batch
            preds = model.predict(inp_1)
            preds_np = preds.detach().numpy()
            all_preds.append(preds_np)
        all_preds = np.concatenate(all_preds, axis=0)
        labels_and_groups['preds'] = all_preds

        ndcgs = []
        for cur_id in labels_and_groups.left_id.unique():
            cur_df = labels_and_groups[labels_and_groups.left_id == cur_id]
            ndcg = self.ndcg_k(cur_df.rel.values.reshape(-1),
                               cur_df.preds.values.reshape(-1))
            if np.isnan(ndcg):
                ndcgs.append(0)
            else:
                ndcgs.append(ndcg)
        return np.mean(ndcgs)
    
    def train(self, n_epochs: int):
        opt = torch.optim.SGD(self.model.parameters(), lr=self.train_lr)
        criterion = torch.nn.BCELoss()
        for ep in range(n_epochs):
            if ep % self.change_train_loader_ep == 0:
                sampled_train_triplets = self.sample_data_for_train_iter(self.glue_train_df, seed=ep)
                train_dataset = TrainTripletsDataset(sampled_train_triplets, 
                        self.idx_to_text_mapping_train, 
                        vocab=self.vocab, oov_val=self.vocab['OOV'], 
                        preproc_func=self.simple_preproc)
                train_dataloader = torch.utils.data.DataLoader(
                    train_dataset, batch_size=self.dataloader_bs, num_workers=0, 
                    collate_fn=collate_fn, shuffle=True,)
            for j, data in enumerate(train_dataloader):
                opt.zero_grad()
                query_left_docs, query_right_docs, labels = data
                outputs = self.model(query_left_docs, query_right_docs)
                loss = criterion(outputs, labels)
                loss.backward()
                opt.step()

            val_ndcg = self.valid(self.model, self.val_dataloader)
            print(f'Epoch: {ep}, validation ndcg {val_ndcg}')
            if val_ndcg > 0.925:
                 break

In [27]:
# Build the whole pipeline. Solution.__init__ reads train.tsv / dev.tsv from the QQP directory,
# collects the vocabulary, builds the KNRM model from the GloVe file and prepares the
# validation dataloader.
slt = Solution(glue_qqp_dir='data/QQP', glove_vectors_path='data/glove.6B.50d.txt')

In [28]:
# Train for at most 20 epochs; Solution.train stops early once the validation NDCG
# exceeds 0.925 (reached at epoch 2 in the recorded output).
# NOTE(review): the previous note here read "10 iterations, NDCG starts from ~0.4", which
# matches neither the epoch count passed below nor the first logged value (~0.85) - confirm.
slt.train(20)

Epoch: 0, validation ndcg 0.8489403954532958
Epoch: 1, validation ndcg 0.8367880379232742
Epoch: 2, validation ndcg 0.9372785146576229
