In [1]:
import ast
from collections import defaultdict
import numpy as np
import pandas as pd
import pickle
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Checking the random negative samples created automatically in the pipeline

Each negative sample is the same sentence as the corresponding positive sample, but with a different, randomly chosen word masked.

In [2]:
# Read the pipeline's pickled samples and wrap them in a DataFrame with named columns.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open('../data/wikitext/sample_sents_m1.pickle', 'rb') as f:
    sample_records = pickle.load(f)

sample_sents = pd.DataFrame(
    sample_records,
    columns=['token', 'token_id', 'positive_samples', 'negative_samples'],
)

sample_sents

Unnamed: 0,token,token_id,positive_samples,negative_samples
0,a,1037,"[[101, 2728, 8945, 11314, 2121, 2003, 2019, 23...","[[101, 2728, 8945, 11314, 2121, 2003, 2019, 23..."
1,aa,9779,"[[101, 3424, 1030, 1011, 1030, 2948, 4721, 200...","[[101, 3424, 1030, 1011, 1030, 2948, 4721, 200..."
2,abandon,10824,"[[101, 2004, 1996, 2154, 2979, 1010, 2007, 929...","[[101, 2004, 1996, 2154, 2979, 1010, 2007, 929..."
3,abandoned,4704,"[[101, 2014, 3535, 2001, 7736, 1998, 9610, 767...","[[101, 2014, 3535, 2001, 7736, 1998, 9610, 767..."
4,abandoning,19816,"[[101, 2174, 1010, 1999, 1996, 6234, 3134, 380...","[[101, 2174, 1010, 103, 1996, 6234, 3134, 3805..."
...,...,...,...,...
9075,zone,4224,"[[101, 2006, 1996, 20198, 3483, 1005, 1055, 21...","[[101, 2006, 1996, 20198, 3483, 1005, 1055, 21..."
9076,zones,10019,"[[101, 6059, 3387, 2435, 1996, 3036, 6987, 315...","[[101, 6059, 3387, 2435, 1996, 3036, 6987, 315..."
9077,zoo,9201,"[[101, 2350, 14345, 2421, 1996, 17692, 2103, 1...","[[101, 2350, 14345, 2421, 1996, 17692, 2103, 1..."
9078,zoom,24095,"[[101, 27916, 1005, 7444, 2806, 2001, 15063, 1...","[[101, 27916, 1005, 7444, 2806, 2001, 15063, 1..."


In [3]:
# Draw a single random row to inspect by hand (no random_state is passed,
# so a different row may be picked on every run).
sample_row = sample_sents.sample(n=1)
sample_row

Unnamed: 0,token,token_id,positive_samples,negative_samples
6071,prediction,17547,"[[101, 1996, 4034, 4358, 2184, 3371, 8760, 726...","[[101, 1996, 4034, 4358, 2184, 3371, 8760, 726..."


In [4]:
tokenizer = AutoTokenizer.from_pretrained('google/multiberts-seed_0')

# First positive / negative sentence (a list of token ids) stored in the sampled row.
pos_sent_ids = sample_row['positive_samples'].iloc[0][0]
neg_sent_ids = sample_row['negative_samples'].iloc[0][0]

# Map the ids back to tokens so the two sentences can be compared side by side.
pos_sent, neg_sent = (
    tokenizer.convert_ids_to_tokens(sent_ids)
    for sent_ids in (pos_sent_ids, neg_sent_ids)
)

print(pos_sent)
print(neg_sent)



In [5]:
# Select the rows whose positive and negative sample lists differ in length.
# An empty result means all positive and negative samples have the same size.
n_positive = sample_sents['positive_samples'].map(len)
n_negative = sample_sents['negative_samples'].map(len)
sample_sents[n_positive != n_negative]

Unnamed: 0,token,token_id,positive_samples,negative_samples


### Creating negative samples within the same POS category

In [6]:
def disarrange_within_pos(df, token_column, pos_column, seed=None, max_attempts=1000):
    """Shuffle the token labels among rows that share a POS tag.

    Every row is assigned to one group per tag listed in its ``pos_column``
    entry. For each group with at least two rows, the tokens currently held by
    the group's rows are randomly redistributed among those rows, such that no
    row receives the token it had in the input ``df``. All other columns stay
    attached to their row, so after the final rename each token label points
    at the samples of a different token.

    Notes:
        * Downside: the negative samples won't necessarily have the same size
          as the positive samples.
        * A row with several POS tags takes part in several groups, so a token
          can end up on a row that does not list the token's own tag.
        * Rows whose tags are not shared with any other row keep their token.

    Args:
        df: DataFrame with one row per token.
        token_column: name of the column whose values are shuffled.
        pos_column: name of the column holding an iterable of POS tags per row.
        seed: optional seed for a private ``np.random.RandomState``. When
            ``None`` the global ``np.random`` state is used.
        max_attempts: number of random permutations tried per POS group before
            giving up.

    Returns:
        A copy of ``df`` with shuffled ``token_column`` and the column
        ``'positive_samples'`` renamed to ``'negative_samples'``. ``df`` itself
        is not modified.

    Raises:
        ValueError: if no valid arrangement is found for a group within
            ``max_attempts`` tries (e.g. the group only holds duplicates of
            one token, for which no valid arrangement exists).
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Row *positions* (not index labels) per POS tag, so the function also
    # works when the index is not unique. dict.fromkeys drops repeated tags
    # within one row while keeping their order.
    pos_to_rows = defaultdict(list)
    for row, pos_list in enumerate(df[pos_column]):
        for pos in dict.fromkeys(pos_list):
            pos_to_rows[pos].append(row)

    original = df[token_column].to_numpy(copy=True)
    tokens = original.copy()

    for pos, rows in pos_to_rows.items():
        if len(rows) < 2:
            continue  # nothing to swap with

        rows = np.asarray(rows)
        current = tokens[rows]
        # Compare against the ORIGINAL tokens: rows belonging to several POS
        # groups are shuffled more than once, and checking only against the
        # intermediate state could hand a row its own token back.
        forbidden = original[rows]

        # Rejection sampling. With unique tokens a random permutation is valid
        # with a probability of roughly 1/3 or more, so the cap is in practice
        # only reached when no valid arrangement exists.
        for _ in range(max_attempts):
            candidate = current[rng.permutation(len(rows))]
            if not np.any(candidate == forbidden):
                tokens[rows] = candidate
                break
        else:
            raise ValueError(
                f"Could not disarrange the tokens of POS group {pos!r} "
                f"within {max_attempts} attempts"
            )

    disarranged_df = df.copy()
    disarranged_df[token_column] = tokens
    return disarranged_df.rename(columns={'positive_samples': 'negative_samples'})

In [7]:
# Reload the pipeline samples, ordered alphabetically by token.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open('../data/wikitext/sample_sents_m1.pickle', 'rb') as f:
    raw_records = pickle.load(f)

samples = (
    pd.DataFrame(raw_records, columns=['token', 'token_id', 'positive_samples', 'negative_samples'])
    .sort_values('token')
    .reset_index(drop=True)
)

wordbank = pd.read_csv('../data/wikitext/wikitext_wordbank.tsv', sep='\t')
# The POS column is stored as text; parse every entry back into a Python list.
pos_tags = wordbank['POS'].apply(ast.literal_eval)

# Swap the pipeline's negative samples for the POS tags.
# NOTE(review): concat aligns on the row index, so this assumes the wordbank rows
# are in the same order as the token-sorted samples -- confirm.
samples = pd.concat([samples.drop(columns=['negative_samples']), pos_tags], axis=1)

# Keep only the tokens that have at least one positive sample.
has_positive = samples.positive_samples.apply(lambda x: len(x) != 0)
samples = samples[has_positive].reset_index(drop=True)
samples

Unnamed: 0,token,token_id,positive_samples,POS
0,a,1037,"[[101, 2728, 8945, 11314, 2121, 2003, 2019, 23...",[DET]
1,aa,9779,"[[101, 3424, 1030, 1011, 1030, 2948, 4721, 200...","[PROPN, NOUN]"
2,abandon,10824,"[[101, 2004, 1996, 2154, 2979, 1010, 2007, 929...",[VERB]
3,abandoned,4704,"[[101, 2014, 3535, 2001, 7736, 1998, 9610, 767...",[VERB]
4,abandoning,19816,"[[101, 2174, 1010, 1999, 1996, 6234, 3134, 380...",[VERB]
...,...,...,...,...
9073,zone,4224,"[[101, 2006, 1996, 20198, 3483, 1005, 1055, 21...",[NOUN]
9074,zones,10019,"[[101, 6059, 3387, 2435, 1996, 3036, 6987, 315...",[NOUN]
9075,zoo,9201,"[[101, 2350, 14345, 2421, 1996, 17692, 2103, 1...","[PROPN, NOUN]"
9076,zoom,24095,"[[101, 27916, 1005, 7444, 2806, 2001, 15063, 1...",[NOUN]


In [8]:
# Shuffle the token labels within POS groups, then restore alphabetical order so
# the rows line up with `samples` again.
shuffled_df = disarrange_within_pos(samples, token_column='token', pos_column='POS')
shuffled_df = shuffled_df.sort_values('token').reset_index(drop=True)
shuffled_df

Unnamed: 0,token,token_id,negative_samples,POS
0,a,3183,"[[101, 2002, 2018, 2019, 6422, 2567, 1010, 204...",[PRON]
1,aa,2974,"[[101, 1996, 12637, 1997, 15986, 4409, 2006, 2...",[NOUN]
2,abandon,8752,"[[101, 1999, 1996, 2397, 6641, 1996, 2543, 103...",[VERB]
3,abandoned,24195,"[[101, 1996, 2206, 2305, 2006, 6315, 1010, 464...",[VERB]
4,abandoning,9124,"[[101, 2144, 2010, 5328, 2020, 5263, 2000, 210...",[VERB]
...,...,...,...,...
9073,zone,13171,"[[101, 17214, 2226, 8026, 8525, 7630, 1998, 14...",[NOUN]
9074,zones,8479,"[[101, 1996, 5082, 1997, 11865, 6499, 1005, 10...",[NOUN]
9075,zoo,20714,"[[101, 1999, 11040, 1010, 2002, 2649, 1037, 31...",[ADJ]
9076,zoom,17808,"[[101, 9690, 1010, 1996, 2640, 2001, 5837, 213...",[VERB]


In [9]:
# checking the results: decode the second positive and the second negative
# sentence stored for the token 'a'
tokenizer = AutoTokenizer.from_pretrained('google/multiberts-seed_0')
# .iloc[0] picks the matching row by position. A plain [0] is a label lookup on
# the filtered Series and only works while 'a' happens to sit at index label 0.
a_pos_sample = samples.loc[samples.token == 'a', 'positive_samples'].iloc[0][1]
a_neg_sample = shuffled_df.loc[shuffled_df.token == 'a', 'negative_samples'].iloc[0][1]
print(tokenizer.decode(a_pos_sample))
print(tokenizer.decode(a_neg_sample))

[CLS] this was followed by [MASK] starring role in the play herons written by simon stephens, which was performed in 2001 at the royal court theatre. [SEP] he had a guest role in the television series judge john deed in 2002. [SEP]
[CLS] the ghost, a combination of many literary figures, was originally addressed in the poem as " ser brunetto " before being revised as an ambiguous " you ". " [SEP] ser brunetto " was dante's way of addressing brunetto latini, a former mentor [MASK] he meets in hell to which he has been condemned for sodomy. [SEP]


In [10]:
# Put the shuffled negative samples next to the positive ones (aligned on the row
# index), then drop the POS column, which is no longer needed.
with_negatives = pd.concat([samples, shuffled_df['negative_samples']], axis=1)
pos_neg_samples = with_negatives.drop(columns=['POS'])
pos_neg_samples

Unnamed: 0,token,token_id,positive_samples,negative_samples
0,a,1037,"[[101, 2728, 8945, 11314, 2121, 2003, 2019, 23...","[[101, 2002, 2018, 2019, 6422, 2567, 1010, 204..."
1,aa,9779,"[[101, 3424, 1030, 1011, 1030, 2948, 4721, 200...","[[101, 1996, 12637, 1997, 15986, 4409, 2006, 2..."
2,abandon,10824,"[[101, 2004, 1996, 2154, 2979, 1010, 2007, 929...","[[101, 1999, 1996, 2397, 6641, 1996, 2543, 103..."
3,abandoned,4704,"[[101, 2014, 3535, 2001, 7736, 1998, 9610, 767...","[[101, 1996, 2206, 2305, 2006, 6315, 1010, 464..."
4,abandoning,19816,"[[101, 2174, 1010, 1999, 1996, 6234, 3134, 380...","[[101, 2144, 2010, 5328, 2020, 5263, 2000, 210..."
...,...,...,...,...
9073,zone,4224,"[[101, 2006, 1996, 20198, 3483, 1005, 1055, 21...","[[101, 17214, 2226, 8026, 8525, 7630, 1998, 14..."
9074,zones,10019,"[[101, 6059, 3387, 2435, 1996, 3036, 6987, 315...","[[101, 1996, 5082, 1997, 11865, 6499, 1005, 10..."
9075,zoo,9201,"[[101, 2350, 14345, 2421, 1996, 17692, 2103, 1...","[[101, 1999, 11040, 1010, 2002, 2649, 1037, 31..."
9076,zoom,24095,"[[101, 27916, 1005, 7444, 2806, 2001, 15063, 1...","[[101, 9690, 1010, 1996, 2640, 2001, 5837, 213..."


In [11]:
# Disabled export: uncomment to save the positive/negative pairs as a pickled list of tuples.
# list_of_tuples = [tuple(row) for row in pos_neg_samples.itertuples(index=False, name=None)]
# with open('../data/wikitext/shuffled_sample_sents.pickle', 'wb') as f:
#     pickle.dump(list_of_tuples, f)

In [12]:
# Positive and negative samples vary in size: list the rows where the number of
# positive sentences differs from the number of negative sentences.
n_positive = pos_neg_samples['positive_samples'].map(len)
n_negative = pos_neg_samples['negative_samples'].map(len)
pos_neg_samples[n_positive != n_negative]

Unnamed: 0,token,token_id,positive_samples,negative_samples
0,a,1037,"[[101, 2728, 8945, 11314, 2121, 2003, 2019, 23...","[[101, 2002, 2018, 2019, 6422, 2567, 1010, 204..."
1,aa,9779,"[[101, 3424, 1030, 1011, 1030, 2948, 4721, 200...","[[101, 1996, 12637, 1997, 15986, 4409, 2006, 2..."
2,abandon,10824,"[[101, 2004, 1996, 2154, 2979, 1010, 2007, 929...","[[101, 1999, 1996, 2397, 6641, 1996, 2543, 103..."
3,abandoned,4704,"[[101, 2014, 3535, 2001, 7736, 1998, 9610, 767...","[[101, 1996, 2206, 2305, 2006, 6315, 1010, 464..."
4,abandoning,19816,"[[101, 2174, 1010, 1999, 1996, 6234, 3134, 380...","[[101, 2144, 2010, 5328, 2020, 5263, 2000, 210..."
...,...,...,...,...
9073,zone,4224,"[[101, 2006, 1996, 20198, 3483, 1005, 1055, 21...","[[101, 17214, 2226, 8026, 8525, 7630, 1998, 14..."
9074,zones,10019,"[[101, 6059, 3387, 2435, 1996, 3036, 6987, 315...","[[101, 1996, 5082, 1997, 11865, 6499, 1005, 10..."
9075,zoo,9201,"[[101, 2350, 14345, 2421, 1996, 17692, 2103, 1...","[[101, 1999, 11040, 1010, 2002, 2649, 1037, 31..."
9076,zoom,24095,"[[101, 27916, 1005, 7444, 2806, 2001, 15063, 1...","[[101, 9690, 1010, 1996, 2640, 2001, 5837, 213..."
