> The idea of this notebook is to create a model to predictc tokens sequence probabilities with the objective of replacing the unknown tokens on the test data with this predicted tokens. The chosen model is a BiLSTM_NN with softmax activation due to these kind of models are state of the art for such tasks if we dont consider transformers or other kind of pretrained models.

# 1. Data preprocessing

In [1]:
# %pip install pandas

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [4]:
# Read data
df1 = pd.read_json('domain1_train_data.json', lines=True)
df2 = pd.read_json('domain2_train_data.json', lines=True)

# Join data in both domains for trating them jointly (Augmentation)
df = pd.concat([df1, df2]).reset_index(drop=True)['text']
df = [[str(ch) for ch in l] for l in df]

# Build Text Generator model

In [5]:
from collections import Counter
# Function to get corpus
def get_vocab(txt):
    l_ = []
    for l in txt:
        l_.extend(l)
    corpus = Counter(l_)
    return corpus
#  Get vocab
unigram_counts = get_vocab([txt for txt in df])
vocab = list(set(unigram_counts.keys()))
vocab_size = len(vocab)

In [73]:
from collections import defaultdict
def convert_sentence(word, token): # Helper to add starting token
    return [token] + [w for w in word]
    
# Function to create the bigrams
def bigram_counter(set, token):
    bigram_counts=defaultdict(Counter)
    # collect bigram counts
    for word in set:
        word = convert_sentence(word, token) # Add starting token
        # Count bigrams
        for first, second in zip(word, word[1:]):
            bigram_counts[first][second] += 1
    return bigram_counts

# Function to create the trigrams
def trigram_counter(set, token):
    trigram_counts = defaultdict(lambda: defaultdict(Counter))
    for word in set:
        word = convert_sentence(word, token) # Add starting token
        # Count trigrams
        for first, second, third in zip(word, word[1:], word[2:]):
            trigram_counts[first][second][third] += 1
    return trigram_counts

# Function to create the quatrigrams
def quadrigram_counter(set, token):
    quadrigram_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(Counter)))
    for word in set:
        word = convert_sentence(word, token) # Add starting token
        # Count quatrigrams
        for first, second, third, fourth in zip(word, word[1:], word[2:], word[3:]):
            quadrigram_counts[first][second][third][fourth] += 1
    return quadrigram_counts

In [88]:
# Instanciate the n-igrams
bigram_counts_l = bigram_counter(df, '<s>')
bigram_counts_r = bigram_counter([''.join(list(reversed(w))) for w in df], token= '</s>')    
trigram_counts_l = trigram_counter(df, token= '<s>')                                              # Left trigram counts
trigram_counts_r = trigram_counter([''.join(list(reversed(w))) for w in df], token= '</s>')       # Rigth trigram counts
quadrigram_counts_r = quadrigram_counter(df, token= '<s>')                                        # Left cuatrigram counts
quadrigram_counts_l = quadrigram_counter([''.join(list(reversed(w))) for w in df], token= '</s>') # Rigth cuatrigram counts

# Replace masked tokens

In [89]:
test = pd.read_json('test_data.json', lines=True)['text']

In [128]:
def replace_char(idx, t):
    l_char = 0
    if quadrigram_counts_l[str(t[idx-3])][str(t[idx-2])][str(t[idx-1])]:
        l_char = quadrigram_counts_l[str(t[idx-3])][str(t[idx-2])][str(t[idx-1])].most_common()[0][0]
    elif trigram_counts_l[str(t[idx-2])][str(t[idx-1])]:
        l_char = trigram_counts_l[str(t[idx-2])][str(t[idx-1])].most_common()[0][0]
    elif bigram_counts_l[str(t[idx-1])]:
        l_char = bigram_counts_l[str(t[idx-1])].most_common()[0][0]
    else:
        l_char = 1

    r_char = 0
    # if quadrigram_counts_r[str(t[idx+3])][str(t[idx+2])][str(t[idx+1])]:
    #     l_char = quadrigram_counts_r[str(t[idx+3])][str(t[idx+2])][str(t[idx+1])].most_common()[0][0]
    # elif trigram_counts_r[str(t[idx+2])][str(t[idx+1])]:
    #     l_char = trigram_counts_r[str(t[idx+2])][str(t[idx+1])].most_common()[0][0]
    if bigram_counts_r[str(t[idx+1])]:
        r_char = bigram_counts_r[str(t[idx+1])].most_common()[0][0]
    else:
        r_char = 1
    if r_char != 1:
        token = (int(l_char) + int(r_char))/2
    else:
        token = int(l_char)
    return token

# Function to replace
def replace(t):
    t = ['<s>', '<s>', '<s>'] + [str(w) for w in t] + ['</s>', '</s>', '</s>']
    for idx, tok in enumerate(t):
        if tok == '0':
            t[idx] = replace_char(idx, t)
    t = t[3:-3] # Drop padding token
    t = [int(tok) for tok in t]
    return t

In [129]:
test_4 = [replace(t) for t in test]

In [50]:
indexes = [[i for i,j in enumerate (k) if j == 0] for k in test]
def replacements(test):
    l = []
    for txt in test:
        l_ = []
        for idx, val in enumerate(txt):
            if idx in indexes[0]:
                l_.append(test[0][idx])
        l.append(l_)
    print(l)

In [127]:
replacements(test_4)

[[12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508, 25226, 1, 1, 991, 10], [12, 5081, 3508], [12, 5081, 3508, 25226, 1, 1, 991, 1

In [130]:
replacements(test_4)

[[12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508], [12, 5081, 3508, 25226, 1, 1, 48, 10], [12, 5081, 3508], [1

In [124]:
replacements(test_4)

[[6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754], [6, 5081, 1754, 12613, 1, 1, 991, 10], [6, 5081, 1754], [6, 5

In [22]:
pd.Series(test_).to_csv('../Data/test_Data_unmasked.csv',)

In [26]:
import json
with open('../Data/test_Data_unmasked.json', 'w') as f:
    json.dump(test_, f)