In [1]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub and
# report the local directory it was extracted into. `path` is reused by the
# following cells to locate the CSV.
dataset_handle = "shusrith/machine-trainslation"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shusrith/machine-trainslation?dataset_version_number=4...


100%|██████████| 169M/169M [00:04<00:00, 38.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/shusrith/machine-trainslation/versions/4


In [4]:
# List the top-level contents of the downloaded Dataset1 folder. The listing
# shows another Dataset1 entry, which is why the CSV is later read from
# Dataset1/Dataset1/.
!ls $path/Dataset1

Dataset1


In [5]:
import pandas as pd

# Load the English/Spanish sentence pairs. The archive nests the CSV inside
# a second Dataset1 directory.
csv_path = f"{path}/Dataset1/Dataset1/spa.txt.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,English,Translated
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
128079,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
128080,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
128081,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
128082,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


In [21]:
import re

def preproc(x):
    """Normalise an English sentence.

    Every character that is not an ASCII letter, an ASCII digit or
    whitespace is deleted (not replaced by a space, so "alcohol-related"
    becomes "alcoholrelated"), and the remainder is lower-cased.

    Args:
        x: The raw sentence as a string.

    Returns:
        The cleaned, lower-cased string.
    """
    # Strip first, lower-case second: the character class is ASCII-only, so
    # the order matters for non-ASCII letters whose lower-case form is ASCII.
    stripped = re.compile(r"[^a-zA-Z0-9\s]").sub("", x)
    return stripped.lower()

def preproc_spanish(text):
    """Normalise a Spanish sentence.

    Keeps ASCII letters, ASCII digits, whitespace and the Spanish accented
    letters (á é í ó ú ü ñ in either case); every other character,
    including the inverted punctuation marks, is deleted. The result is
    lower-cased.

    Args:
        text: The raw sentence as a string.

    Returns:
        The cleaned, lower-cased string.
    """
    stripped = re.compile(r"[^a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ\s]").sub("", text)
    return stripped.lower()

# The CSV names the Spanish column "Translated". Rename it before cleaning so
# this cell works on a freshly loaded frame instead of relying on a rename
# done in a cell that no longer exists. The guard keeps the cell re-runnable
# once the column has already been renamed.
if "Translated" in df.columns:
    df = df.rename(columns={"Translated": "Spanish"})

# Clean both languages with their language-specific character whitelist.
df["English"] = df["English"].apply(preproc)
df["Spanish"] = df["Spanish"].apply(preproc_spanish)
df

Unnamed: 0,English,Spanish
0,go,ve
1,go,vete
2,go,vaya
3,go,váyase
4,hi,hola
...,...,...
128079,there are four main causes of alcoholrelated d...,hay cuatro causas principales de muertes relac...
128080,there are mothers and fathers who will lie awa...,hay madres y padres que se quedan despiertos d...
128081,a carbon footprint is the amount of carbon dio...,una huella de carbono es la cantidad de contam...
128082,since there are usually multiple websites on a...,como suele haber varias páginas web sobre cual...


In [22]:
# Concatenate every cleaned English sentence into a single space-separated
# string and write it to the file the tokenizer is trained on.
corpus = " ".join(df["English"].tolist())
# Write as UTF-8 explicitly so the file does not depend on the platform's
# default encoding (the vocabulary JSON files are written the same way).
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write(corpus)

In [23]:
from tokenizers import ByteLevelBPETokenizer
# Train a byte-level BPE tokenizer on corpus.txt (the English corpus written
# by the preceding cell), capping the vocabulary at 30000 entries.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=["corpus.txt"], vocab_size=30000)

In [None]:
# Pull the token -> id mapping out of the trained tokenizer and order it by
# id. The result is a list of (token, id) tuples, not a dict.
eng_vocab = sorted(tokenizer.get_vocab().items(), key=lambda entry: entry[1])
eng_vocab

In [28]:
# Concatenate every cleaned Spanish sentence into a single space-separated
# string. This overwrites the same corpus.txt that held the English corpus.
corpus = " ".join(df["Spanish"].tolist())
# The Spanish text keeps accented letters, so write UTF-8 explicitly rather
# than relying on the platform's default encoding.
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write(corpus)

In [29]:
from tokenizers import ByteLevelBPETokenizer
# Train a fresh byte-level BPE tokenizer on corpus.txt, which now holds the
# Spanish corpus. This rebinds `tokenizer`, so the English-trained instance
# is no longer reachable under that name.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=["corpus.txt"], vocab_size=30000)

In [None]:
# Pull the token -> id mapping out of the Spanish-trained tokenizer and order
# it by id. The result is a list of (token, id) tuples, not a dict.
spa_vocab = sorted(tokenizer.get_vocab().items(), key=lambda entry: entry[1])
spa_vocab

In [None]:
# Turn the id-sorted (token, id) lists back into token -> id dictionaries;
# insertion order follows the id order of the lists.
e_vocab = dict(eng_vocab)
s_vocab = dict(spa_vocab)
e_vocab

In [37]:
import json

# Persist both token -> id mappings as UTF-8 JSON. ensure_ascii=False keeps
# non-ASCII tokens readable instead of \u-escaping them. The `with` block
# closes each file on exit, so no explicit close() is needed.
for vocab_path, vocab in (("eng_vocab.json", e_vocab), ("spa_vocab.json", s_vocab)):
    with open(vocab_path, "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)



In [38]:
def encode(text, vocab):
    """Map each whitespace-separated word of ``text`` to an integer id.

    Args:
        text: Sentence to encode; it is split on whitespace.
        vocab: Mapping from token string to integer id.

    Returns:
        A list with one id per word. Words that are not keys of ``vocab``
        all receive the same fallback id, ``len(vocab) + 1``.
    """
    # NOTE(review): the callers pass vocabularies taken from a byte-level BPE
    # tokenizer. Looking up whole words in such a vocabulary presumably
    # misses most entries (the displayed results are dominated by the
    # fallback id); tokenizer.encode(text).ids may be what was intended.
    # Confirm before training on this data.
    unknown_id = len(vocab) + 1
    ids = []
    for word in text.split():
        ids.append(vocab.get(word, unknown_id))
    return ids

# Replace each cleaned sentence with its list of ids, using the vocabulary of
# the matching language. The columns are overwritten in place, so this cell
# expects them to still hold strings.
for column, vocab_map in (("English", e_vocab), ("Spanish", s_vocab)):
    df[column] = df[column].apply(encode, args=(vocab_map,))
df

Unnamed: 0,English,Spanish
0,[4024],[491]
1,[4024],[10639]
2,[4024],[23720]
3,[4024],[23720]
4,[7954],[23720]
...,...,...
128079,"[14450, 714, 7053, 14450, 14450, 11907, 14450,...","[23720, 23720, 23720, 23720, 331, 23720, 23720..."
128080,"[14450, 714, 14450, 462, 14450, 14450, 14450, ...","[23720, 23720, 88, 23720, 568, 424, 23720, 237..."
128081,"[64, 14450, 14450, 283, 14450, 14450, 11907, 1...","[8327, 23720, 331, 23720, 277, 312, 23720, 331..."
128082,"[14450, 14450, 714, 14450, 14450, 14450, 277, ...","[23720, 23720, 23720, 23720, 23720, 23720, 237..."


In [41]:
# For every sentence length L, count how many English sentences have at least
# L ids: tally the lengths, order them longest-first, accumulate, then flip
# the result back to ascending length.
english_lengths = df["English"].apply(len)
length_counts = english_lengths.value_counts().sort_index(ascending=False)
cumulative_counts = length_counts.cumsum().iloc[::-1]
cumulative_counts

Unnamed: 0_level_0,count
English,Unnamed: 1_level_1
1,128084
2,128018
3,124761
4,114016
5,94735
6,72168
7,50863
8,33431
9,21332
10,13195


In [42]:
# For every sentence length L, count how many Spanish sentences have at least
# L ids: tally the lengths, order them longest-first, accumulate, then flip
# the result back to ascending length.
spanish_lengths = df["Spanish"].apply(len)
length_counts = spanish_lengths.value_counts().sort_index(ascending=False)
cumulative_counts = length_counts.cumsum().iloc[::-1]
cumulative_counts

Unnamed: 0_level_0,count
Spanish,Unnamed: 1_level_1
1,128084
2,127324
3,122001
4,108153
5,87430
6,65786
7,46446
8,31089
9,20138
10,12978


In [43]:
# Split every sentence pair into consecutive windows of at most `max_len` ids
# and pair the windows up by position.
max_len = 12
new_data = []

for eng_ids, spa_ids in zip(df["English"], df["Spanish"]):
    eng_chunks = [
        eng_ids[start : start + max_len]
        for start in range(0, len(eng_ids), max_len)
    ]
    spa_chunks = [
        spa_ids[start : start + max_len]
        for start in range(0, len(spa_ids), max_len)
    ]
    # zip() stops at the shorter list, so when the two sides produce a
    # different number of windows the surplus windows are dropped.
    # NOTE(review): window k of the English side is paired with window k of
    # the Spanish side purely by position - confirm this alignment is the
    # intended training signal.
    new_data.extend(zip(eng_chunks, spa_chunks))

df1 = pd.DataFrame(new_data, columns=["English", "Spanish"])

In [44]:
def pad(text, max_len):
    """Right-pad a list of ids with zeros up to ``max_len`` entries.

    Args:
        text: List of ids.
        max_len: Target length.

    Returns:
        A new list. Inputs already at or beyond ``max_len`` are returned
        with their contents unchanged - nothing is truncated.
    """
    # A zero or negative repeat count yields an empty list, which is what
    # leaves over-long inputs untouched.
    missing = max_len - len(text)
    filler = [0] * missing
    return text + filler

# Pad every chunk to the chunk width. Reuse `max_len` (the width the chunks
# were cut with) instead of repeating the literal, so the two cannot drift
# apart if the chunk size is changed.
df1["English"] = df1["English"].apply(lambda x: pad(x, max_len))
df1["Spanish"] = df1["Spanish"].apply(lambda x: pad(x, max_len))

In [45]:
# Build a copy of the un-chunked frame in which whole sentences are padded to
# a fixed width per language.
# NOTE(review): 47 and 49 are presumably the longest English and Spanish
# sentence lengths - confirm; pad() leaves anything longer unpadded and
# untruncated.
df2 = df.copy()
for column, width in (("English", 47), ("Spanish", 49)):
    df2[column] = df2[column].apply(pad, args=(width,))
df2

Unnamed: 0,English,Spanish
0,"[4024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[491, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,"[4024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[10639, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"[4024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[23720, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,"[4024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[23720, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,"[7954, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[23720, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
...,...,...
128079,"[14450, 714, 7053, 14450, 14450, 11907, 14450,...","[23720, 23720, 23720, 23720, 331, 23720, 23720..."
128080,"[14450, 714, 14450, 462, 14450, 14450, 14450, ...","[23720, 23720, 88, 23720, 568, 424, 23720, 237..."
128081,"[64, 14450, 14450, 283, 14450, 14450, 11907, 1...","[8327, 23720, 331, 23720, 277, 312, 23720, 331..."
128082,"[14450, 14450, 714, 14450, 14450, 14450, 277, ...","[23720, 23720, 23720, 23720, 23720, 23720, 237..."


In [46]:
# Write the three variants to CSV without the index column: the encoded
# sentences (df), the padded chunks (df1) and the padded whole sentences
# (df2).
exports = (
    (df, "output_bpe.csv"),
    (df1, "output1_bpe.csv"),
    (df2, "output2_bpe.csv"),
)
for frame, filename in exports:
    frame.to_csv(filename, index=False)