> The idea of this notebook is to create a model that predicts token-sequence probabilities, with the objective of replacing the unknown tokens in the test data with the predicted tokens. The chosen model is a BiLSTM_NN with softmax activation, because this kind of model is state of the art for such tasks if we don't consider transformers or other kinds of pretrained models.

# 1. Data preprocessing

In [1]:
# %pip install pandas

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [3]:
# Load the training data of both domains (JSON Lines files)
df1 = pd.read_json('../Data/domain1_train_data.json', lines=True)
df2 = pd.read_json('../Data/domain2_train_data.json', lines=True)

# Stack both domains so they are treated jointly (augmentation), keep only the
# 'text' column, then turn every element of every sequence into a string
df = pd.concat([df1, df2], ignore_index=True)['text']
df = [list(map(str, l)) for l in df]

# Build Text Generator model

In [4]:
from collections import Counter
# Function to get corpus
def get_vocab(txt):
    """Count how often every token occurs across a collection of sequences.

    Args:
        txt: Iterable of token sequences (each sequence is itself iterable).

    Returns:
        collections.Counter mapping each token to its total number of
        occurrences over all sequences.
    """
    # Flatten lazily instead of materialising one big intermediate list.
    return Counter(token for sequence in txt for token in sequence)
# Token frequencies over the whole training corpus, plus the distinct tokens
unigram_counts = get_vocab(list(df))
vocab = list(set(unigram_counts))
vocab_size = len(vocab)

In [5]:
from collections import defaultdict
def convert_sentence(word, token): # Helper to add starting token
    """Return a new list holding ``token`` followed by every element of ``word``.

    ``word`` may be any iterable (a list of tokens or a string); it is not
    modified.
    """
    prefixed = [token]
    prefixed.extend(word)
    return prefixed
    
# Function to create the bigrams
def bigram_counter(set, token):
    """Count adjacent-element transitions over a collection of sequences.

    Every sequence is first prefixed with ``token`` (via ``convert_sentence``)
    so that its first real element is also counted as a successor.

    Args:
        set: Iterable of sequences to scan. (The name shadows the builtin
            ``set`` inside this function.)
        token: Boundary marker prepended to each sequence.

    Returns:
        defaultdict of Counter: ``result[first][second]`` is the number of
        times ``second`` immediately followed ``first``.
    """
    transitions = defaultdict(Counter)
    for sequence in set:
        padded = convert_sentence(sequence, token)  # boundary marker + elements
        # Walk every adjacent pair (padded[pos], padded[pos + 1]).
        for pos in range(len(padded) - 1):
            transitions[padded[pos]][padded[pos + 1]] += 1
    return transitions

In [6]:
# Instantiate the bigram tables.
# Forward table: counts of "token -> next token"; every sequence starts at '<s>'.
bigram_counts = bigram_counter(df, '<s>')
# Backward table: counts of "token -> previous token", built from each sequence
# reversed and started at '</s>'. The reversed sequence is kept as a list of
# tokens: joining it into a single string would make bigram_counter iterate
# over individual characters and count character pairs instead of token pairs.
bigram_counts_r = bigram_counter([list(reversed(w)) for w in df], token='</s>')

# Replace masked tokens

In [8]:
# Sequences to unmask: the 'text' column of the JSON Lines test file
test = pd.read_json('../Data/test_data.json', lines=True)['text']

In [9]:
def replace_char(idx, t, forward_counts=None, backward_counts=None):
    """Predict a replacement for the masked token at position ``idx`` of ``t``.

    The prediction is the most frequent successor of the left neighbour
    according to the forward bigram table. When the left neighbour yields no
    prediction, the most frequent entry for the right neighbour in the backward
    table is used instead. When neither side yields a prediction the integer
    ``1`` is returned.

    A neighbour yields no prediction when it does not exist (``idx`` sits on a
    boundary of ``t``), when it is itself the mask ``'0'``, or when the table
    holds no counts for it.

    Args:
        idx: Position of the masked token inside ``t``.
        t: Sequence of tokens; neighbours are converted with ``str()`` before
            they are looked up.
        forward_counts: Optional mapping ``token -> Counter`` of following
            tokens. Defaults to the module-level ``bigram_counts``.
        backward_counts: Optional mapping ``token -> Counter`` built from the
            reversed sequences. Defaults to the module-level
            ``bigram_counts_r``.

    Returns:
        The predicted token (a key of the consulted Counter), or the integer
        ``1`` when no prediction is available.
    """
    def _most_likely(table, neighbour):
        # Tokens are compared as strings: the neighbour is stringified before
        # lookup, so the mask is '0' and a comparison with the integer 0
        # could never match.
        key = str(neighbour)
        if key == '0':
            return None
        # .get() instead of [] so that probing a defaultdict does not insert
        # an empty Counter for every unseen token.
        candidates = table.get(key)
        if not candidates:
            return None
        return candidates.most_common(1)[0][0]

    # Left context first (guarding idx == 0, where idx - 1 would wrap around).
    if idx > 0:
        if forward_counts is None:
            forward_counts = bigram_counts
        l_char = _most_likely(forward_counts, t[idx - 1])
        if l_char is not None:
            return l_char

    # Right context as fallback (guarding the last position of the sequence).
    if idx + 1 < len(t):
        if backward_counts is None:
            backward_counts = bigram_counts_r
        r_char = _most_likely(backward_counts, t[idx + 1])
        if r_char is not None:
            return r_char

    # No usable context on either side.
    return 1

# Function to replace
def replace(t):
    """Return ``t`` as a list of ints with every masked token filled in.

    The sequence is stringified and wrapped in '<s>' / '</s>' boundary markers,
    each element equal to '0' is overwritten (left to right) with the result of
    ``replace_char``, then the markers are dropped and every element is
    converted with ``int()``.
    """
    padded = ['<s>']
    padded.extend(str(w) for w in t)
    padded.append('</s>')
    for pos in range(len(padded)):
        # Earlier replacements are already in place when later positions are filled.
        if padded[pos] == '0':
            padded[pos] = replace_char(pos, padded)
    # Strip the two boundary markers before converting back to integers.
    return [int(tok) for tok in padded[1:-1]]

In [10]:
# Fill in the masked positions of every test sequence
test_ = list(map(replace, test))

In [22]:
# Write the unmasked sequences to CSV
pd.Series(test_).to_csv('../Data/test_Data_unmasked.csv')

In [26]:
import json
# Also store the unmasked sequences as JSON (a list of integer lists)
with open('../Data/test_Data_unmasked.json', 'w') as out_file:
    json.dump(test_, out_file)