# Import libraries

In [1]:
import pandas as pd
import numpy as np
import string as pystring
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
from tensorflow.keras import backend as K

# Load Dataset

In [2]:
!gdown --id 1USrci90pyG2E76_VFH1igIYj2i8BvlmE

Downloading...
From: https://drive.google.com/uc?id=1USrci90pyG2E76_VFH1igIYj2i8BvlmE
To: /content/truyen_kieu.txt
100% 140k/140k [00:00<00:00, 49.6MB/s]


In [3]:
with open('truyen_kieu.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Preprocessing

In [4]:
PUNCT_TO_REMOVE = pystring.punctuation + pystring.digits + "\n"
def clean_text (text):
  """custom function to removal: punctuations and digits"""
  text = text.translate(str.maketrans(' ',' ', PUNCT_TO_REMOVE))
  text = text.lower()
  return text
clean_text(lines[0])

'trăm năm trong cõi người ta'

In [5]:
corpus = [clean_text(line) for line in lines]
print(corpus[0])

trăm năm trong cõi người ta


# Build Data

In [6]:
def get_centers_and_contexts(corpus, max_window_size=2):
  centers, contexts = [], []
  for line in corpus:
    line = line.split()
    if len(line) <= 2*max_window_size:
      continue
    for i in range(max_window_size, len(line)-max_window_size):
      centers.append(line[1])
      idxs = list(range(i-max_window_size, i+max_window_size+1))
      idxs.remove(i)
      contexts.append(" ".join([line[idx] for idx in idxs]))
  return centers, contexts

In [7]:
centers, contexts = get_centers_and_contexts(corpus)
len(centers), len(contexts)

(9778, 9778)

In [8]:
centers[:2], contexts[:2]

(['năm', 'năm'], ['trăm năm cõi người', 'năm trong người ta'])

# Representation

In [9]:
max_length = 4
embedding_size = 200

In [10]:
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(corpus)

In [11]:
vocab_size = len(tokenizer.index_word) + 1

In [12]:

train_seq = tokenizer.texts_to_sequences(contexts)
train_seq_pad = pad_sequences(train_seq, maxlen=max_length, truncating='post', padding="post")

In [13]:
train_labels = [to_categorical(tokenizer.word_index[label], len(tokenizer.word_index) + 1) for label in centers]

In [14]:
train_labels = np.array(train_labels)

# CBOW Model

In [15]:
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=4))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embedding_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 200)            482400    
                                                                 
 lambda (Lambda)             (None, 200)               0         
                                                                 
 dense (Dense)               (None, 2412)              484812    
                                                                 
Total params: 967212 (3.69 MB)
Trainable params: 967212 (3.69 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
cbow.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
cbow.fit(train_seq_pad, train_labels, epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7be90cc64940>

# Predict

In [17]:
sample_text = 'trăm năm cõi người'
sample_seq = tokenizer.texts_to_sequences([sample_text])
sample_seq_pad = pad_sequences(sample_seq, maxlen=max_length, truncating='post', padding="post")
cbow.predict(sample_seq_pad)



array([[1.5228716e-13, 1.8188395e-13, 6.2183113e-07, ..., 1.6474524e-13,
        1.8357078e-13, 1.5568460e-13]], dtype=float32)

In [18]:
tokenizer.index_word[np.argmax(cbow.predict(sample_seq_pad))]



'năm'