# Description

In this notebook, I will prepare the dataset:
- Split the `pho_mt` dataset into a train set (80%) and a test set (20%).

In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import regex as re
from sklearn.model_selection import train_test_split

from utils import *

In [8]:
# Folder that the train/test sentence files are written into.
PATH_FOLDER_PROCESS = "processed_data"

# Fraction of the sentence pairs held out as the test set.
TEST_SIZE = 0.2

# English and Vietnamese sides of the pho_mt corpus.
PATH_EN_PHO_MT_FILE = "processed_data/pho_mt_en_sent.txt"
PATH_VI_PHO_MT_FILE = "processed_data/pho_mt_vi_sent.txt"

# 1. Load dataset

In [5]:
# Read both sides of the parallel corpus; later cells pair them up by index.
list_en_sentence = read_text_file(PATH_EN_PHO_MT_FILE)
list_vi_sentence = read_text_file(PATH_VI_PHO_MT_FILE)

# Index-based pairing only makes sense when both sides have the same number of
# entries, so report both counts if they ever disagree.
assert len(list_en_sentence) == len(list_vi_sentence), (
    f"Misaligned corpus: {len(list_en_sentence)} English vs "
    f"{len(list_vi_sentence)} Vietnamese sentences"
)

print(f"Number of pair sentence: {len(list_en_sentence)}")

Number of pair sentence: 3010915


In [7]:
# Spot-check one randomly chosen sentence pair. The same index is used on both
# lists so the two printed sentences belong together.
# np.random.randint draws from NumPy's global RNG, so the pair shown can differ
# from run to run.
idx = np.random.randint(0, len(list_en_sentence))

en_sentence = list_en_sentence[idx]
vi_sentence = list_vi_sentence[idx]

print(f"English: {en_sentence}")
print(f"Vietnamese: {vi_sentence}")

English: mark zuckerberg wants to create a global community .
Vietname: mark zuckerberg muốn tạo ra một cộng đồng toàn cầu .


# 2. Train test split

In [10]:
# Split both languages in one call so every English sentence lands in the same
# split as its Vietnamese counterpart; the fixed random_state keeps the
# shuffle repeatable across runs.
(
    list_en_sentence_train,
    list_en_sentence_test,
    list_vi_sentence_train,
    list_vi_sentence_test,
) = train_test_split(
    list_en_sentence,
    list_vi_sentence,
    test_size=TEST_SIZE,
    random_state=42,
)

In [11]:
# Report how many sentence pairs ended up in each split (counted on the English lists).
num_train_pairs = len(list_en_sentence_train)
num_test_pairs = len(list_en_sentence_test)

print(f"Number of pair train sentence: {num_train_pairs}")
print(f"Number of pair test sentence: {num_test_pairs}")

Number of pair train sentence: 2408732
Number of pair test sentence: 602183


# 3. Number of words

In this section, we explore the vocabulary size of the English and Vietnamese training sets.

In [12]:
# Build the English vocabulary from the training split only, print its size,
# and display the first ten entries as a quick sanity check.
en_vocab = get_vocab_from_list_sentence(list_en_sentence_train)
en_vocab_size = len(en_vocab)
print(f"Number of word in English vocab: {en_vocab_size}")
en_vocab[:10]

Number of word in English vocab: 416448


['xiangkhouang',
 'schpielt',
 'houria',
 'santander")',
 'dadaratatatah',
 '11february',
 't25%',
 '(kaigun',
 'ninkasi',
 "'bila"]

In [13]:
# Build the Vietnamese vocabulary from the training split only, print its size,
# and display the first ten entries as a quick sanity check.
vi_vocab = get_vocab_from_list_sentence(list_vi_sentence_train)
vi_vocab_size = len(vi_vocab)
print(f"Number of word in Vietnamese vocab: {vi_vocab_size}")
vi_vocab[:10]

Number of word in Vietnamese vocab: 345761


['schpielt',
 'houria',
 'santander")',
 'dadaratatatah',
 'mgsa-α)',
 '(kaigun',
 'ninkasi',
 "tụ'",
 '("log")',
 "'bila"]

# 4. Save dataset

In [14]:
# Write the training split to the processed-data folder, one file per language.
path_en_train = os.path.join(PATH_FOLDER_PROCESS, "en_sent_train.txt")
path_vi_train = os.path.join(PATH_FOLDER_PROCESS, "vi_sent_train.txt")

save_sentences_to_file(list_en_sentence_train, path_en_train)
save_sentences_to_file(list_vi_sentence_train, path_vi_train)

In [15]:
# Write the test split to the processed-data folder, one file per language.
path_en_test = os.path.join(PATH_FOLDER_PROCESS, "en_sent_test.txt")
path_vi_test = os.path.join(PATH_FOLDER_PROCESS, "vi_sent_test.txt")

save_sentences_to_file(list_en_sentence_test, path_en_test)
save_sentences_to_file(list_vi_sentence_test, path_vi_test)