## 1. Import Libraries

In [2]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from sklearn.metrics  import accuracy_score,f1_score,precision_score,recall_score
from keras.activations import relu
from keras.utils import to_categorical
import numpy as np 
import regex as re 
np.set_printoptions(threshold=np.inf)

## 2. Data Preprocessing

#### 2.1. Read the file and divide it into sentences

In [3]:
def file_to_sentence_list(file_path, encoding='utf-8'):
    """Read a text file and split its content into sentences.

    A sentence boundary is any run of whitespace that directly follows
    '.', '!' or '?'. The punctuation mark stays attached to the sentence
    it ends; surrounding whitespace is stripped and empty fragments are
    dropped.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            'utf-8' so the result does not depend on the platform's locale
            encoding; pass another codec explicitly for files stored
            differently.

    Returns:
        list[str]: The non-empty, stripped sentences in file order
        (an empty list for an empty or whitespace-only file).
    """
    with open(file_path, 'r', encoding=encoding) as file:
        text = file.read()
    # The lookbehind keeps the terminating punctuation on the left-hand piece.
    fragments = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text))
    return [sentence for sentence in fragments if sentence]

#### 2.2. Create a dictionary for each word in the text file

In [4]:
# Load the training corpus as a list of sentences.
file_path = 'train.txt'
text_data = file_to_sentence_list(file_path)
print(text_data)

# Fit the word -> integer-id vocabulary on the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)

# One slot more than the number of known words, leaving room for id 0
# (presumably the padding value written by pad_sequences later on).
total_words = 1 + len(tokenizer.word_index)
print(tokenizer.word_index)


['Ho Chi Minh City University of Technology and Education, with a long and prestigious history in the field of education and research, is a modern school with a rich and diverse learning and research environment.', 'Located in the city center, Ho Chi Minh City University of Technology and Education is not only a provider of professional knowledge but also a vibrant and creative academic community.', 'With modern facilities, classrooms, laboratories and libraries are fully equipped, creating favorable conditions for students and lecturers in the learning and research process.', 'The school is also proud of its team of experienced, highly qualified lecturers who are committed to providing students with the best support and development.', 'Training programs are diverse, flexible and reflect the actual needs of the labor market, giving students the opportunity to develop themselves and prepare for their future careers.', 'In addition, the school also promotes international research and coo

#### 2.3. Create Input 

In [59]:
def CreateInput(text_data):
    """Expand each sentence into its growing prefixes of token ids.

    Every sentence is converted to a list of ids with the module-level
    ``tokenizer``; for a sentence of ids ``[a, b, c]`` the prefixes
    ``[a, b]`` and ``[a, b, c]`` are produced. Sentences that yield fewer
    than two ids contribute nothing.

    Args:
        text_data: Iterable of sentence strings.

    Returns:
        list[list[int]]: All prefixes of length >= 2, sentence by sentence,
        shortest prefix first.
    """
    prefixes = []
    for sentence in text_data:
        ids = tokenizer.texts_to_sequences([sentence])[0]
        # Prefix lengths run from 2 up to the full sentence length.
        prefixes.extend(ids[:end] for end in range(2, len(ids) + 1))
    return prefixes
# Build the n-gram prefixes for the whole corpus and show them.
input_sequences = CreateInput(text_data)
print(input_sequences)

[[20, 21], [20, 21, 22], [20, 21, 22, 12], [20, 21, 22, 12, 13], [20, 21, 22, 12, 13, 3], [20, 21, 22, 12, 13, 3, 23], [20, 21, 22, 12, 13, 3, 23, 1], [20, 21, 22, 12, 13, 3, 23, 1, 9], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30, 31], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30, 31, 4], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30, 31, 4, 2], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30, 31, 4, 2, 32], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30, 31, 4, 2, 32, 3], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30, 31, 4, 2, 32, 3, 9], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30, 31, 4, 2, 32, 3, 9, 1], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30, 31, 4, 2, 32, 3, 9, 1, 5], [20, 21, 22, 12, 13, 3, 23, 1, 9, 6, 7, 29, 1, 30, 31

#### 2.4. Pad the sequences to fixed-length vectors

In [60]:
# The longest prefix determines the common length of every row.
max_sequence_len = max(len(seq) for seq in input_sequences)

# padding='pre' puts the filler values in front of shorter prefixes, so the
# last column of every row is always the final word of that prefix.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

#### 2.5. Split each padded sequence into input (X) and next-word label (y)

In [61]:
# Everything except the last column is the context fed to the model ...
X = input_sequences[:, :-1]
# ... and the last column is the word id the model has to predict.
y = input_sequences[:, -1]
# One-hot encode the targets: one class per vocabulary slot.
y = to_categorical(y, num_classes=total_words)

### 3 RNN Model

#### 3.1. Configure the RNN

In [62]:
# Next-word predictor: embedding -> three stacked SimpleRNN layers -> softmax.
model = Sequential([
    # Maps each word id to a 10-dimensional vector.
    Embedding(total_words, 10, name='embedding_layer'),
    # The first two recurrent layers emit the full sequence so that the
    # following recurrent layer still receives one vector per time step.
    SimpleRNN(8, return_sequences=True, activation=relu, name='rnn_layer_1'),
    SimpleRNN(9, return_sequences=True, activation=relu, name='rnn_layer_2'),
    # The last recurrent layer only returns its final state.
    SimpleRNN(9, activation=relu, name='rnn_layer_3'),
    # One probability per vocabulary slot.
    Dense(total_words, activation='softmax', name='output_layer'),
])


#### 3.2. Training the RNN

In [63]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on the full set of prefixes; the History object returned by fit()
# is the cell's displayed value.
model.fit(X, y, epochs=300, verbose=1)

Epoch 1/300


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.0090 - loss: 4.6247    
Epoch 2/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0581 - loss: 4.6212
Epoch 3/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1037 - loss: 4.6178 
Epoch 4/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.1141 - loss: 4.6136
Epoch 5/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0913 - loss: 4.6100
Epoch 6/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1055 - loss: 4.6038 
Epoch 7/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0616 - loss: 4.6007   
Epoch 8/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1044 - loss: 4.5905
Epoch 9/300
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

<keras.src.callbacks.history.History at 0x1c203b77f10>

#### 3.3 Model Summary

In [64]:
# Print the layer-by-layer overview of the trained model.
model.summary()

#### 3.4 Predicting 

In [65]:
# How many words Recommend() appends to the seed text.
next_words = 4
# Sentence start the model is asked to continue.
seed_text = "In short, the University of Technical Education is an ideal"
# Collects the generated words, in order, for the evaluation step.
y_predict = []
def Recommend(seed_text, num_words=None):
    """Greedily extend ``seed_text`` with the model's most likely next words.

    For each generated word the current text is tokenized with the
    module-level ``tokenizer``, left-padded/truncated to the model's input
    length (``max_sequence_len - 1``) and passed to ``model.predict``; the
    highest-scoring vocabulary word is appended to the text.

    Side effect: every generated word is also appended to the module-level
    list ``y_predict``.

    Args:
        seed_text: Text to continue.
        num_words: Number of words to generate. When omitted, the
            module-level ``next_words`` is used.

    Returns:
        str: ``seed_text`` followed by the generated words, separated by
        single spaces.
    """
    if num_words is None:
        num_words = next_words
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = np.ravel(model.predict(token_list))
        # Index 0 is the padding id and has no entry in tokenizer.index_word,
        # so it is excluded from the argmax; without this a prediction of
        # index 0 raises KeyError on the lookup below.
        predicted_index = int(np.argmax(predicted_probs[1:])) + 1
        predicted_word = tokenizer.index_word[predicted_index]
        seed_text += " " + predicted_word
        y_predict.append(predicted_word)
    return seed_text
# Generate the continuation, then show the full text and the added words.
y_result = Recommend(seed_text)
print(y_result)
print(y_predict)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
In short, the University of Technical Education is an ideal learning and research environment
['learning', 'and', 'research', 'environment']


#### 3.5 Evaluating 

In [66]:
# Reference continuation for the seed sentence, compared word by word with
# the generated words in y_predict.
# NOTE(review): this phrase occurs in the training text, so the scores below
# reflect recall of training data rather than generalisation - confirm
# whether a held-out reference is intended.
y_true = ['learning', 'and', 'research', 'environment']

accuracy = accuracy_score(y_true=y_true, y_pred=y_predict)
print(f'Accuracy: {accuracy:.4f}')

# Each distinct word is treated as a class; per-class scores are combined
# with average='weighted'.
precision = precision_score(y_true=y_true, y_pred=y_predict, average='weighted')
print(f"Precision: {precision:.4f}")

recall = recall_score(y_true=y_true, y_pred=y_predict, average='weighted')
print(f"Recall: {recall:.4f}")

f1score = f1_score(y_true=y_true, y_pred=y_predict, average='weighted')
print(f"F1-score: {f1score}")

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0
