In [24]:
import pandas as pd

# Parallel corpus: the preview below shows an "English" column and a "Translated"
# (Spanish) column.
csv_path = "Dataset1/Dataset1/spa.txt.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,English,Translated
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
128079,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
128080,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
128081,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
128082,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


In [25]:
import re


def preproc(x):
    """Normalise an English sentence.

    Every character that is not an ASCII letter, a digit or whitespace is removed
    (so punctuation disappears and hyphenated words are fused, e.g.
    "alcohol-related" -> "alcoholrelated"), then the result is lower-cased.
    """
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", x)
    return stripped.lower()


def preproc_spanish(text):
    """Normalise a Spanish sentence.

    Same idea as the English cleaner, but the accented vowels, "ü" and "ñ"
    (both cases) are kept. Everything else that is not an ASCII letter, a digit
    or whitespace is dropped, and the result is lower-cased.
    """
    allowed_only = re.sub(r"[^a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ\s]", "", text)
    return allowed_only.lower()

def spec_tokens(text):
    """Wrap ``text`` in the start/end markers, each separated from it by one space."""
    return "".join(("<SOS> ", text, " <EOS>"))

# Clean each language with its own normaliser, then add the <SOS>/<EOS> markers.
# The cleaned Spanish text lands in a new "Spanish" column and the raw
# "Translated" column is dropped, so this cell cannot be re-run without
# reloading the CSV first.
df = df.assign(
    English=df["English"].apply(preproc).apply(spec_tokens),
    Spanish=df["Translated"].apply(preproc_spanish).apply(spec_tokens),
).drop(columns=["Translated"])
df

Unnamed: 0,English,Spanish
0,<SOS> go <EOS>,<SOS> ve <EOS>
1,<SOS> go <EOS>,<SOS> vete <EOS>
2,<SOS> go <EOS>,<SOS> vaya <EOS>
3,<SOS> go <EOS>,<SOS> váyase <EOS>
4,<SOS> hi <EOS>,<SOS> hola <EOS>
...,...,...
128079,<SOS> there are four main causes of alcoholrel...,<SOS> hay cuatro causas principales de muertes...
128080,<SOS> there are mothers and fathers who will l...,<SOS> hay madres y padres que se quedan despie...
128081,<SOS> a carbon footprint is the amount of carb...,<SOS> una huella de carbono es la cantidad de ...
128082,<SOS> since there are usually multiple website...,<SOS> como suele haber varias páginas web sobr...


In [26]:
from tokenizers import ByteLevelBPETokenizer

# One sentence per line: every English sentence first, then every Spanish one.
sentences = [*df["English"], *df["Spanish"]]
corpus = "\n".join(sentences)

with open("joint_corpus.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.write(corpus)

In [27]:
# Reserved tokens. The encoded frames shown further down start every row with 2
# and end it with 3, i.e. <SOS>/<EOS> receive ids in list order; the padding
# code later on uses 0, which would then be <PAD>.
special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]

# A single byte-level BPE vocabulary shared by both languages, learned from the
# joint corpus file written in the previous cell.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="joint_corpus.txt",
    vocab_size=40000,
    special_tokens=special_tokens,
)






In [28]:
import os

# Make sure the output directory exists before writing into it, so the notebook
# also runs from a clean checkout. The CSV export at the end of the notebook
# writes into this same directory.
os.makedirs("EnglishOrSpanish", exist_ok=True)
tokenizer.save_model("EnglishOrSpanish")

['EnglishOrSpanish/vocab.json', 'EnglishOrSpanish/merges.txt']

In [29]:
def encode(text, tokenizer):
    """Return the token ids that ``tokenizer`` produces for ``text``.

    ``tokenizer`` only needs an ``encode(text)`` method whose result exposes an
    ``ids`` attribute.
    """
    encoding = tokenizer.encode(text)
    return encoding.ids


# Replace the text in both language columns with lists of token ids.
# The columns are overwritten, so re-running this cell on already-encoded data
# will not work.
for column in ("English", "Spanish"):
    df[column] = df[column].apply(encode, args=(tokenizer,))
df

Unnamed: 0,English,Spanish
0,"[2, 382, 224, 3]","[2, 537, 224, 3]"
1,"[2, 382, 224, 3]","[2, 5706, 224, 3]"
2,"[2, 382, 224, 3]","[2, 2776, 224, 3]"
3,"[2, 382, 224, 3]","[2, 16315, 224, 3]"
4,"[2, 978, 224, 3]","[2, 6848, 224, 3]"
...,...,...
128079,"[2, 568, 449, 3260, 7570, 9289, 362, 4959, 285...","[2, 664, 3275, 15912, 14216, 294, 7440, 298, 2..."
128080,"[2, 568, 449, 6644, 481, 4992, 703, 575, 3003,...","[2, 664, 21802, 292, 1880, 316, 336, 6915, 167..."
128081,"[2, 265, 15190, 31251, 334, 300, 7099, 362, 15...","[2, 410, 16988, 294, 15191, 318, 321, 7229, 29..."
128082,"[2, 2131, 568, 449, 2266, 4749, 76, 1158, 3286...","[2, 669, 5330, 952, 4635, 9314, 3977, 1086, 25..."


In [32]:
# How many English sequences have each token length, longest length first.
english_lengths = df["English"].apply(len)
length_counts = english_lengths.value_counts().sort_index(ascending=False)

# Running total from the long end, flipped back to ascending order: each entry
# is the number of sequences with at least that many tokens.
cumulative_counts = length_counts.cumsum().iloc[::-1]
cumulative_counts

English
4     128084
5     128018
6     124820
7     114283
8      95252
9      72829
10     51504
11     34047
12     21868
13     13610
14      8555
15      5230
16      3271
17      2073
18      1279
19       784
20       504
21       365
22       267
23       186
24       119
25        95
26        66
27        51
28        40
29        27
30        24
31        19
32        18
33        17
34        15
35        14
37         7
38         5
39         3
49         2
52         1
Name: count, dtype: int64

In [33]:
max_len = 18  # fixed length of every emitted chunk


def _chunk_and_pad(ids, size, pad=0):
    """Split ``ids`` into consecutive windows and right-pad each one to ``size``.

    The window (and stride) is ``size - 1`` tokens, not ``size``, so every
    returned chunk ends with at least one ``pad`` value. An empty ``ids`` gives
    an empty list. ``ids`` itself is never modified.
    """
    window = size - 1
    chunks = []
    for start in range(0, len(ids), window):
        piece = ids[start : start + window]
        chunks.append(piece + [pad] * (size - len(piece)))
    return chunks


chunk_pairs = []
for english, spanish in zip(df["English"], df["Spanish"]):
    eng_chunks = _chunk_and_pad(english, max_len)
    spa_chunks = _chunk_and_pad(spanish, max_len)
    # The i-th English chunk is paired with the i-th Spanish chunk.
    # NOTE(review): zip() stops at the shorter side, so when the two sentences
    # split into a different number of chunks the surplus chunks (and the
    # tokens in them, possibly including <EOS>) are silently dropped. Confirm
    # this loss is intended before training on df1.
    chunk_pairs.extend(zip(eng_chunks, spa_chunks))

l = chunk_pairs  # old name kept as an alias in case other cells still use it

df1 = pd.DataFrame(chunk_pairs, columns=["English", "Spanish"])
df1

Unnamed: 0,English,Spanish
0,"[2, 382, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 537, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"[2, 382, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 5706, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,"[2, 382, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 2776, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,"[2, 382, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 16315, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[2, 978, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 6848, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
...,...,...
128977,"[272, 300, 1341, 7231, 1578, 722, 23869, 481, ...","[265, 321, 4484, 6173, 983, 2809, 387, 23869, ..."
128978,"[2, 658, 304, 429, 272, 2368, 498, 265, 4848, ...","[2, 453, 1050, 7008, 669, 323, 7359, 5947, 208..."
128979,"[300, 1647, 4942, 1092, 481, 1092, 332, 300, 1...","[410, 292, 1400, 786, 294, 321, 2587, 2731, 32..."
128980,"[1092, 481, 1092, 2014, 538, 461, 830, 361, 13...","[5529, 82, 410, 292, 1400, 786, 1207, 316, 349..."


In [38]:
# Sanity check: how many Spanish chunks are longer than 18 tokens (expected 0).
too_long = df1["Spanish"].apply(len) > 18
too_long.sum()

0

In [39]:
def pad_sequence(sequence, max_len, pad_id=0):
    """Return a new list holding ``sequence`` right-padded to ``max_len`` items.

    Args:
        sequence: token ids to pad; any sized iterable (list, tuple, ...).
        max_len: target length.
        pad_id: value used for padding. Defaults to 0, the value this function
            always used before the parameter existed.

    Returns:
        A fresh list; the input is never modified. A sequence that is already
        ``max_len`` items or longer is returned as an unchanged copy; it is
        NOT truncated.
    """
    padded = list(sequence)
    # A non-positive difference multiplies out to an empty list, i.e. no padding.
    padded.extend([pad_id] * (max_len - len(padded)))
    return padded

# Whole-sentence variant: no chunking, every row padded to one fixed length.
# 52 is the longest English sequence in the length table above.
# 53 is presumably the longest Spanish sequence - TODO confirm; pad_sequence
# leaves anything longer than its target untouched.
df2 = df.assign(
    English=df["English"].apply(pad_sequence, args=(52,)),
    Spanish=df["Spanish"].apply(pad_sequence, args=(53,)),
)
df2

Unnamed: 0,English,Spanish
0,"[2, 382, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 537, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"[2, 382, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 5706, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,"[2, 382, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 2776, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,"[2, 382, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 16315, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[2, 978, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 6848, 224, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
...,...,...
128079,"[2, 568, 449, 3260, 7570, 9289, 362, 4959, 285...","[2, 664, 3275, 15912, 14216, 294, 7440, 298, 2..."
128080,"[2, 568, 449, 6644, 481, 4992, 703, 575, 3003,...","[2, 664, 21802, 292, 1880, 316, 336, 6915, 167..."
128081,"[2, 265, 15190, 31251, 334, 300, 7099, 362, 15...","[2, 410, 16988, 294, 15191, 318, 321, 7229, 29..."
128082,"[2, 2131, 568, 449, 2266, 4749, 76, 1158, 3286...","[2, 669, 5330, 952, 4635, 9314, 3977, 1086, 25..."


In [44]:
# Mean English length in df2. A mean of exactly 52.0 is what you get when every
# row has been padded to 52 items.
padded_english_lengths = df2["English"].apply(len)
padded_english_lengths.sum() / len(df2)

52.0

In [45]:
# Write the three dataset variants next to the saved tokenizer files:
#   df  - variable-length token ids
#   df1 - fixed-length chunks
#   df2 - whole sentences padded to a fixed length
# NOTE(review): the cells hold Python lists, which CSV stores as their text
# form; whoever reads these files back has to parse the lists again.
exports = (
    (df, "EnglishOrSpanish/output_joint.csv"),
    (df1, "EnglishOrSpanish/output_joint1.csv"),
    (df2, "EnglishOrSpanish/output_joint2.csv"),
)
for frame, csv_path in exports:
    frame.to_csv(csv_path, index=False)