In [None]:
!pip install sentencepiece




In [None]:
import os
import re
import json
import math
import string
import random
import warnings
import sentencepiece as spm
from timeit import default_timer as timer
from typing import Iterable, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import Transformer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


In [None]:
# Path of the CSV that holds the "en" and "vi" sentence columns.
file_path = "/content/drive/MyDrive/Dataset/PhoMT/preprocessed_200k.csv"
df = pd.read_csv(file_path)

# Pull each language column out as a plain Python list of sentences.
en_sents, vi_sents = (df[column].tolist() for column in ("en", "vi"))
print(f"Số mẫu: {len(en_sents)}")

# Preview the first rows as the cell output.
df.head()

Số mẫu: 200000


Unnamed: 0,en,vi
0,it begins with a countdown,câu chuyện bắt đầu với buổi lễ đếm ngược
1,on august 14th 1947 a woman in bombay goes int...,ngày 14 tháng 8 năm 1947 gần nửa đêm ở bombay ...
2,across india people hold their breath for the ...,cùng lúc trên khắp đất ấn người ta nín thở chờ...
3,and at the stroke of midnight a squirming infa...,khi đồng hồ điểm thời khắc nửa đêm một đứa trẻ...
4,these events form the foundation of midnight s...,những sự kiện này là nền móng tạo nên những đứ...


In [None]:
def preprocessing(df):
    """Normalize the ``en`` and ``vi`` text columns and drop unusable rows.

    For both columns every value is converted with ``str()``, stripped of
    ASCII punctuation (``string.punctuation``), lower-cased, trimmed, and has
    runs of whitespace collapsed to a single space.

    Rows are dropped when either column is missing (NaN/None) or is empty
    *after* cleaning, so a sentence that consisted only of punctuation or
    whitespace does not survive as an empty string.

    Args:
        df: DataFrame containing at least the columns ``en`` and ``vi``.

    Returns:
        A new DataFrame with the cleaned columns. Other columns and the
        original index labels of the surviving rows are preserved. The input
        frame is not modified.
    """
    text_cols = ["en", "vi"]
    # Build the translation table and the regex once instead of once per row.
    punct_table = str.maketrans('', '', string.punctuation)
    whitespace_run = re.compile(r"\s+")

    def _normalize(text):
        # Same order as before: punctuation -> lower -> strip -> collapse spaces.
        text = str(text).translate(punct_table).lower().strip()
        return whitespace_run.sub(" ", text)

    # Missing values would break the string operations, so drop them first.
    # The explicit copy keeps the caller's frame untouched and avoids
    # SettingWithCopyWarning on the column assignments below.
    cleaned = df.dropna(subset=text_cols).copy()
    for col in text_cols:
        cleaned[col] = cleaned[col].map(_normalize)

    # Filter after cleaning: punctuation-only sentences are empty by now.
    keep = (cleaned["en"] != "") & (cleaned["vi"] != "")
    return cleaned[keep].copy()

In [None]:
# Rebind `df` to the result of preprocessing(), then preview the first rows as the cell output.
df = preprocessing(df)
df.head()

Unnamed: 0,en,vi
0,it begins with a countdown,câu chuyện bắt đầu với buổi lễ đếm ngược
1,on august 14th 1947 a woman in bombay goes int...,ngày 14 tháng 8 năm 1947 gần nửa đêm ở bombay ...
2,across india people hold their breath for the ...,cùng lúc trên khắp đất ấn người ta nín thở chờ...
3,and at the stroke of midnight a squirming infa...,khi đồng hồ điểm thời khắc nửa đêm một đứa trẻ...
4,these events form the foundation of midnight s...,những sự kiện này là nền móng tạo nên những đứ...


In [None]:
import os

# 2. Helper that dumps one DataFrame column to a text file
def export_text_file(df, filename, col):
    """Write every value of ``df[col]`` to ``filename``, one value per line.

    Each value is converted with ``str()`` and stripped of surrounding
    whitespace before it is written (UTF-8). A one-line summary is printed
    once the file has been closed.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(str(value).strip() + '\n' for value in df[col])
    print(f"Exported {len(df)} lines to {filename}")

# 3. Helper that trains a SentencePiece tokenizer
def train_sentencepiece(input_file, model_prefix, vocab_size=32000,
                        character_coverage=1.0, model_type='bpe'):
    """Train a SentencePiece model on a plain-text file.

    The trainer options are passed as keyword arguments rather than being
    interpolated into a single flag string, so paths containing spaces or
    other special characters are handled correctly.

    Args:
        input_file: Path of the training text file (one sentence per line).
        model_prefix: Prefix of the output files; the notebook later reads
            ``<model_prefix>.model`` and ``<model_prefix>.vocab``.
        vocab_size: Target vocabulary size.
        character_coverage: Character coverage passed to the trainer
            (defaults to the previously hard-coded 1.0).
        model_type: SentencePiece model type (defaults to the previously
            hard-coded ``'bpe'``).
    """
    print(f"Training SentencePiece model for {input_file} ...")
    spm.SentencePieceTrainer.Train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        character_coverage=character_coverage,
        model_type=model_type,
    )
    print(f"Model {model_prefix} training done!")


# Export each language column to its own plain-text file
for lang in ('en', 'vi'):
    export_text_file(df, f'{lang}.txt', lang)

# Train a separate tokenizer for each language
for lang in ('en', 'vi'):
    train_sentencepiece(f'{lang}.txt', f'spm_{lang}', vocab_size=32000)



Exported 200000 lines to en.txt
Exported 200000 lines to vi.txt
Training SentencePiece model for en.txt ...
Model spm_en training done!
Training SentencePiece model for vi.txt ...
Model spm_vi training done!


In [None]:
def print_file_content(filename, num_lines=10):
    """Print the first ``num_lines`` lines of a UTF-8 text file.

    The lines are printed with surrounding whitespace stripped, framed by a
    header banner and a footer banner.
    """
    print(f"--- Nội dung đầu {num_lines} dòng của file {filename} ---")
    with open(filename, 'r', encoding='utf-8') as handle:
        lines_shown = 0
        for raw_line in handle:
            # Stop as soon as the requested number of lines has been printed.
            if lines_shown >= num_lines:
                break
            print(raw_line.strip())
            lines_shown += 1
    print('--- Kết thúc ---\n')

# Show the first 10 lines of en.txt and vi.txt
for text_file in ('en.txt', 'vi.txt'):
    print_file_content(text_file, num_lines=10)

--- Nội dung đầu 10 dòng của file en.txt ---
it begins with a countdown
on august 14th 1947 a woman in bombay goes into labor as the clock ticks towards midnight
across india people hold their breath for the declaration of independence after nearly two centuries of british occupation and rule
and at the stroke of midnight a squirming infant and two new nations are born in perfect synchronicity
these events form the foundation of midnight s children a dazzling novel by the british indian author salman rushdie
the baby who is the exact same age as the nation is saleem sinai the novel s protagonist
his narrative stretches over 30 years of his life jumping backwards and forwards in time to speculate on family secrets and deep seated mysteries
these include the greatest enigma of all saleem has magic powers and they re somehow related to the time of his birth
and he s not the only one
all children born in and around the stroke of midnight are imbued with extraordinary powers like parvati th

In [None]:
def print_first_n_vocab(filename, n=1000):
    """Print the first ``n`` lines of a vocab file.

    The file is read as UTF-8; each line is printed with surrounding
    whitespace stripped, between a header banner and a footer banner.
    """
    print(f"--- First {n} vocab tokens from {filename} ---")
    with open(filename, 'r', encoding='utf-8') as vocab_file:
        printed = 0
        for entry in vocab_file:
            # Leave the loop once n entries have been shown.
            if printed >= n:
                break
            print(entry.strip())
            printed += 1
    print('--- End of vocab ---')

# Print the first 1000 vocab entries: English first, then Vietnamese
for vocab_path in ('spm_en.vocab', 'spm_vi.vocab'):
    print_first_n_vocab(vocab_path, 1000)


--- First 1000 vocab tokens from spm_en.vocab ---
<unk>	0
<s>	0
</s>	0
▁t	-0
▁a	-1
▁th	-2
▁i	-3
▁s	-4
▁w	-5
▁the	-6
re	-7
▁o	-8
in	-9
at	-10
er	-11
nd	-12
ou	-13
▁c	-14
▁b	-15
▁m	-16
▁h	-17
ing	-18
▁and	-19
en	-20
▁to	-21
▁p	-22
▁f	-23
▁d	-24
on	-25
es	-26
▁l	-27
an	-28
or	-29
▁of	-30
al	-31
▁in	-32
▁n	-33
ed	-34
is	-35
▁y	-36
it	-37
ar	-38
▁that	-39
▁we	-40
▁g	-41
om	-42
▁you	-43
▁wh	-44
as	-45
le	-46
ic	-47
▁it	-48
ve	-49
▁e	-50
▁re	-51
us	-52
▁be	-53
ll	-54
ow	-55
▁is	-56
ly	-57
ut	-58
ion	-59
ot	-60
et	-61
▁on	-62
▁ha	-63
ent	-64
ct	-65
ld	-66
▁this	-67
ver	-68
gh	-69
ay	-70
▁so	-71
se	-72
▁st	-73
ur	-74
id	-75
st	-76
▁do	-77
ro	-78
▁he	-79
ke	-80
▁for	-81
ch	-82
ce	-83
▁li	-84
▁they	-85
ra	-86
ght	-87
▁was	-88
▁an	-89
ir	-90
im	-91
▁are	-92
op	-93
▁u	-94
ith	-95
▁go	-96
▁have	-97
▁but	-98
th	-99
▁k	-100
▁with	-101
am	-102
▁se	-103
ally	-104
▁ab	-105
▁what	-106
ri	-107
▁as	-108
▁mo	-109
ol	-110
▁j	-111
ation	-112
▁sh	-113
out	-114
▁wor	-115
▁ne	-116
▁me	-117
▁nt	-118
▁can	-119
▁ch	

In [None]:
import shutil

# Copy the artifacts of BOTH trained tokenizers to the Drive folder.
# Previously only the Vietnamese model/vocab were copied, although an English
# tokenizer is trained alongside it.
vocab_dir = '/content/drive/MyDrive/vocab_folder'
# shutil.copy raises FileNotFoundError when the destination folder is missing.
os.makedirs(vocab_dir, exist_ok=True)
for prefix in ('spm_en', 'spm_vi'):
    for ext in ('.model', '.vocab'):
        shutil.copy(prefix + ext, os.path.join(vocab_dir, prefix + ext))


'/content/drive/MyDrive/vocab_folder/spm_vi.vocab'

In [None]:
def save_sentences_to_file(df, filename, col):
    """Save the values of ``df[col]`` to ``filename``, one sentence per line.

    Values are converted with ``str()`` and stripped of surrounding
    whitespace; the file is written as UTF-8. A summary line is printed
    after the file has been closed.
    """
    with open(filename, 'w', encoding='utf-8') as sink:
        for item in df[col]:
            cleaned = str(item).strip()
            sink.write(f"{cleaned}\n")
    print(f"Saved {len(df)} sentences to {filename}")

In [None]:
# Write the current `df` to a local CSV file (UTF-8); index=False leaves the row index out of the file.
df.to_csv('preprocessed_200k.csv', index=False, encoding='utf-8')


In [None]:
# Copy the local CSV to Drive.
# NOTE(review): the destination is the same path the notebook reads its input from
# (see `file_path` in the data-loading cell), so this overwrites the input file.
shutil.copy('preprocessed_200k.csv', '/content/drive/MyDrive/Dataset/PhoMT/preprocessed_200k.csv')

'/content/drive/MyDrive/Dataset/PhoMT/preprocessed_200k.csv'