> The idea of this notebook is to create a model to predict token sequence probabilities, with the objective of replacing the unknown tokens in the test data with the predicted tokens. The chosen model is a BiLSTM_NN with softmax activation, because this kind of model is state of the art for such tasks if we don't consider transformers or other kinds of pretrained models.

# 1. Data preprocessing

In [1]:
# %pip install pandas

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [132]:
# Load the two training domains (one JSON record per line)
df1 = pd.read_json('domain1_train_data.json', lines=True)
df2 = pd.read_json('domain2_train_data.json', lines=True)

# Stack both domains so they are treated jointly (augmentation) and keep
# only the 'text' column, re-indexed from zero.
df = pd.concat([df1, df2], ignore_index=True)['text']
# Turn every sequence into a plain list of string tokens.
df = [list(map(str, seq)) for seq in df]

# Build Text Generator model

In [133]:
from collections import Counter
# Function to get corpus
def get_vocab(txt):
    """Count how often each token occurs across all sequences in ``txt``.

    Args:
        txt: Iterable of token sequences.

    Returns:
        A ``Counter`` mapping each token to its total number of occurrences.
    """
    counts = Counter()
    for sequence in txt:
        # Counter.update adds one count per element of the sequence.
        counts.update(sequence)
    return counts
# Get vocab
# Token frequencies over the whole training corpus.
unigram_counts = get_vocab(df)
# Counter keys are already unique, so no set() is needed; listing them
# directly keeps first-seen order instead of a hash-dependent set order,
# which makes the vocabulary reproducible between runs.
vocab = list(unigram_counts)
vocab_size = len(vocab)

In [134]:
from collections import defaultdict
def convert_sentence(word, token):
    """Return a new list holding ``token`` followed by the items of ``word``.

    Helper used to put a starting token in front of a sequence.

    Args:
        word: Iterable of tokens.
        token: Marker to place at the front.

    Returns:
        A list whose first element is ``token`` and whose remaining elements
        are the items of ``word`` in their original order.
    """
    return [token, *word]
    
# Function to create the bigrams
def bigram_counter(set, token):
    """Build a table of adjacent-token counts.

    Every sequence is first prefixed with ``token`` (via
    ``convert_sentence``), then each adjacent pair ``(left, right)`` adds one
    to ``table[left][right]``.

    Args:
        set: Iterable of token sequences. (The name shadows the builtin but
            is kept because it is part of the function's signature.)
        token: Marker placed at the start of every sequence.

    Returns:
        A ``defaultdict(Counter)`` mapping each token to a ``Counter`` of the
        tokens that directly follow it.
    """
    table = defaultdict(Counter)
    for sequence in set:
        padded = iter(convert_sentence(sequence, token))
        # The padded sequence always holds at least the start marker.
        left = next(padded)
        for right in padded:
            table[left][right] += 1
            left = right
    return table

In [135]:
# Instantiate the bigram tables
# Forward table: token -> counts of the token that follows it, with '<s>'
# marking the start of each sequence.
bigram_counts = bigram_counter(df, '<s>')
# Backward table: built from each sequence in reverse order, with '</s>'
# marking the (reversed) start. Each sequence must stay a list of tokens:
# joining it into a single string would make the counter walk over
# individual characters instead of whole tokens.
bigram_counts_r = bigram_counter([list(reversed(w)) for w in df], token='</s>')

# Replace masked tokens

In [219]:
# Load the test split (one JSON record per line) and keep its 'text' column.
test = pd.read_json('test_data.json', lines=True)
test = test['text']

In [263]:
def replace_char(idx, t):
    """Predict a replacement for the masked token at position ``idx`` of ``t``.

    The prediction is the most frequent successor of the token immediately to
    the left of ``idx``, looked up in the module-level ``bigram_counts``
    table. Only the left context is used; the reversed table
    ``bigram_counts_r`` is not consulted here.

    Args:
        idx: Position of the masked token in ``t``. ``t[idx - 1]`` is read,
            so callers are expected to pad the sequence (as ``replace``
            does) so a left neighbour exists.
        t: Sequence of tokens; entries are converted with ``str`` before
            being compared or looked up.

    Returns:
        The most common successor of the left neighbour (a key of the bigram
        table), or the integer ``1`` when the left neighbour is itself the
        mask ``'0'`` or has no recorded successors.
    """
    previous = str(t[idx - 1])
    # Tokens arrive as strings, so the mask has to be compared as '0'; a
    # comparison against the integer 0 can never match a string token.
    if previous != '0':
        # .get avoids inserting an empty Counter into the defaultdict every
        # time an unseen token is looked up.
        successors = bigram_counts.get(previous)
        if successors:
            return successors.most_common(1)[0][0]
    # Fallback used when no left-context prediction is available.
    return 1

# Function to replace
def replace(t):
    """Fill in every masked entry of ``t`` and return the result as ints.

    The sequence is converted to strings and wrapped in ``'<s>'`` /
    ``'</s>'`` markers, each entry equal to ``'0'`` is overwritten (left to
    right) with the value returned by ``replace_char``, then the markers are
    removed and every entry is converted with ``int``.

    Args:
        t: Iterable of tokens.

    Returns:
        A new list of ints with the same length as ``t``.
    """
    padded = ['<s>', *(str(w) for w in t), '</s>']
    for position, token in enumerate(padded):
        if token != '0':
            continue
        padded[position] = replace_char(position, padded)
    # Strip the two boundary markers before converting back to integers.
    return [int(token) for token in padded[1:-1]]

In [268]:
# Fill in the masked tokens of every test sequence.
test_ = list(map(replace, test))

In [269]:
# Display the filled-in test sequences (the notebook cell output follows).
test_

[[12,
  10839,
  1083,
  2881,
  12159,
  2356,
  6267,
  3,
  1426,
  18,
  21776,
  6,
  5528,
  6,
  1105,
  3427,
  1354,
  1,
  13364,
  935,
  4820,
  10104,
  1426,
  18,
  19,
  37,
  19,
  1052,
  3,
  1827,
  8676,
  6,
  4491,
  8676,
  1,
  2,
  37325,
  48,
  2356,
  6,
  1993,
  6326,
  18,
  2356,
  11544,
  19,
  18144,
  1105,
  19,
  6,
  2,
  3,
  9547,
  7010,
  11544,
  14,
  73872,
  13,
  18,
  5033,
  6,
  1871,
  14513,
  1],
 [12,
  858,
  7,
  1179,
  944,
  1485,
  10,
  2,
  4532,
  12245,
  499,
  254,
  307,
  4,
  7,
  360,
  1528,
  404,
  179,
  179,
  519,
  949,
  1,
  10,
  106,
  120,
  3,
  949,
  236,
  6,
  106,
  209,
  85,
  12,
  98,
  11,
  48,
  595,
  27,
  5591,
  230,
  877,
  1,
  39,
  11,
  14,
  16,
  13,
  2,
  281,
  598,
  411,
  14,
  179,
  519,
  404,
  179,
  13,
  949,
  22,
  743,
  4,
  58,
  7629,
  4,
  19809,
  3,
  3435,
  411,
  14,
  404,
  13,
  85,
  6,
  14,
  1194,
  13,
  3,
  55,
  7048,
  92,
  20,
  9,
  2459,