In [1]:
import itertools
import os
import numpy as np
import pandas as pd
from typing import List, Union

In [14]:
def get_glue_df(glue_df) -> pd.DataFrame:
    """Convert a raw QQP frame into the id/text/label pair layout.

    Rows with a missing value in any column are discarded and the index is
    renumbered from zero before the columns are selected and renamed. The
    input frame itself is not modified.

    Args:
        glue_df: Frame containing at least the columns ``qid1``, ``qid2``,
            ``question1``, ``question2`` and ``is_duplicate``.

    Returns:
        A new frame with the columns ``id_left``, ``id_right``, ``text_left``,
        ``text_right`` and ``label`` (cast to ``int``), in that order.
    """
    # Source column -> output column; dict order fixes the output column order.
    column_map = {
        'qid1': 'id_left',
        'qid2': 'id_right',
        'question1': 'text_left',
        'question2': 'text_right',
        'is_duplicate': 'label',
    }
    complete_rows = glue_df.dropna(axis=0, how='any').reset_index(drop=True)
    pairs = complete_rows[list(column_map)].rename(columns=column_map)
    return pairs.assign(label=pairs['label'].astype(int))

In [15]:
# Directory one level above the current working directory.
parent_dir = os.path.abspath(os.pardir)
# Read the raw QQP training split and convert it to the pair layout in one step.
train_df = get_glue_df(
    pd.read_csv(parent_dir + '/data/raw/QQP/train.tsv', sep='\t')
)

In [16]:
def sample_data_for_train_step(inp_df: pd.DataFrame,
                            seed: int) -> List[List[Union[str, float]]]:
    """Sample ranking triplets ``[id_left, id_right_a, id_right_b, target]``.

    Rows are grouped by ``id_left``. A group is used only if it has at least
    three rows, its labels do not sum to zero, and it has more than one row
    with ``label == 0``. For each such group two kinds of triplets are emitted:

    * up to 4 randomly chosen ``(label-1 id, label-0 id)`` pairs with target
      ``1.0`` (the first candidate is the more relevant one);
    * up to 2 randomly chosen pairs of distinct label-0 ids with target
      ``0.5`` (both candidates are equally relevant).

    Args:
        inp_df: Frame with the columns ``id_left``, ``id_right`` and ``label``.
        seed: Seed for the sampling. The same seed always yields the same
            triplets.

    Returns:
        A list of ``[id_left, id_right_a, id_right_b, target]`` lists, ordered
        by ``id_left`` group.

    Raises:
        KeyError: If ``inp_df`` lacks one of the required columns.
    """
    NUM_ZERO_ONE_REL_PAIRS = 4
    NUM_EQUAL_REL_PAIRS = 2
    MIN_GROUP_SIZE = 3

    # Use a private generator instead of np.random.seed(): it produces the same
    # shuffles for a given seed but leaves the process-wide NumPy RNG untouched,
    # so calling this function does not change later random draws elsewhere.
    rng = np.random.RandomState(seed)

    inp_df_select = inp_df[['id_left', 'id_right', 'label']]
    group_sizes = inp_df_select.groupby('id_left').size()
    left_ids_to_use = group_sizes[group_sizes >= MIN_GROUP_SIZE].index
    groups = inp_df_select[inp_df_select['id_left'].isin(left_ids_to_use)].groupby('id_left')

    out_triplets = []
    for id_left, group in groups:
        labels = group['label']
        if labels.sum() == 0:
            continue

        zeros_ids = group.loc[labels == 0, 'id_right'].to_list()
        # NOTE(review): a group with exactly one label-0 candidate yields no
        # triplets at all, not even (label-1, label-0) pairs - confirm intended.
        if len(zeros_ids) <= 1:
            continue
        ones_ids = group.loc[labels == 1, 'id_right'].to_list()

        # Every (relevant, non-relevant) combination, then a random subset.
        zero_one_pairs = list(itertools.product(ones_ids, zeros_ids))
        rng.shuffle(zero_one_pairs)
        out_triplets.extend(
            [id_left, one_id, zero_id, 1.0]
            for one_id, zero_id in zero_one_pairs[:NUM_ZERO_ONE_REL_PAIRS]
        )

        # Unordered pairs of non-relevant candidates, then a random subset.
        zero_zero_pairs = list(itertools.combinations(zeros_ids, 2))
        rng.shuffle(zero_zero_pairs)
        out_triplets.extend(
            [id_left, first_id, second_id, 0.5]
            for first_id, second_id in zero_zero_pairs[:NUM_EQUAL_REL_PAIRS]
        )
    return out_triplets

In [17]:
%%time
# Sample the training triplets with seed 0; %%time reports how long the call takes.
triplets = sample_data_for_train_step(train_df, 0)

CPU times: total: 5.67 s
Wall time: 5.68 s


In [18]:
# Total number of triplets produced by the sampling step.
len(triplets)

6535

In [30]:
# Preview the first six triplets: [id_left, id_right, id_right, target].
triplets[:6]

[[57, 55585, 58, 1.0],
 [57, 35933, 58, 1.0],
 [57, 6800, 58, 1.0],
 [57, 28280, 174911, 1.0],
 [57, 58, 174911, 0.5],
 [119, 64307, 120, 1.0]]

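##### Sanity check passed: as the lookup below shows, question 55585 is a duplicate of question 57 (label 1) while question 58 is not (label 0), so the triplet (57, 55585, 58) correctly gets a relevance target of 1.0.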

In [28]:
# Pull the source rows for question 57 paired with 55585 and 58 to check the first triplet's labels.
train_df[(train_df['id_left']==57) & (train_df['id_right'].isin([55585, 58]))]

Unnamed: 0,id_left,id_right,text_left,text_right,label
66948,57,55585,What is best way to make money online?,What is the best way for making money online?,1
154537,57,58,What is best way to make money online?,What is best way to ask for money online?,0


##### The finalized scripts that create the train and validation datasets are available in `src/data/make_dataset.py`.