# lab6_rnns_text_classification
- Task 0: Data loading & label encoding
- Task 1: Pipeline TF-IDF + Logistic Regression
- Task 2: Pipeline Word2Vec (Trung bình) + Dense Layer
- Task 3: Mô hình Nâng cao (Embedding Pre-trained + LSTM)
- Task 4: Mô hình Nâng cao (Embedding học từ đầu + LSTM)
- Task 5: Đánh giá, So sánh và Phân tích

In [1]:
# Imports and utilities
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, f1_score, log_loss
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [2]:
import pandas as pd
# Dữ liệu có thể được phân tách bằng tab và không có header
df_train = pd.read_csv('C:\\Users\\ADMIN\\.vscode\\NLP_APP\\hwu\\train.csv', sep=',')
df_val = pd.read_csv('C:\\Users\\ADMIN\\.vscode\\NLP_APP\\hwu\\val.csv', sep=',')
df_test = pd.read_csv('C:\\Users\\ADMIN\\.vscode\\NLP_APP\\hwu\\test.csv', sep=',')
print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)
df_train.head()

Train shape: (8954, 2)
Validation shape: (1076, 2)
Test shape: (1076, 2)


Unnamed: 0,text,category
0,what alarms do i have set right now,alarm_query
1,checkout today alarm of meeting,alarm_query
2,report alarm settings,alarm_query
3,see see for me the alarms that you have set to...,alarm_query
4,is there an alarm for ten am,alarm_query


In [3]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()

all_intents = pd.concat(
    [df_train['category'], 
     df_val['category'], 
     df_test['category']]).dropna()

labelEncoder.fit(all_intents)
y_train = labelEncoder.transform(df_train['category'])
y_val = labelEncoder.transform(df_val['category'])
y_test = labelEncoder.transform(df_test['category'])

print("Number of classes:", len(labelEncoder.classes_))

Number of classes: 64


**Task 1: Pipeline TF-IDF + Logistic Regression**

In [4]:
tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000, random_state=RANDOM_SEED)
)

tfidf_lr_pipeline.fit(df_train['text'], y_train)
y_pred = tfidf_lr_pipeline.predict(df_test['text'])

print(classification_report(y_test, y_pred, target_names=labelEncoder.classes_))

                          precision    recall  f1-score   support

             alarm_query       0.90      0.95      0.92        19
            alarm_remove       1.00      0.73      0.84        11
               alarm_set       0.77      0.89      0.83        19
       audio_volume_down       1.00      0.75      0.86         8
       audio_volume_mute       0.92      0.80      0.86        15
         audio_volume_up       0.93      1.00      0.96        13
          calendar_query       0.45      0.53      0.49        19
         calendar_remove       0.89      0.89      0.89        19
            calendar_set       0.87      0.68      0.76        19
          cooking_recipe       0.59      0.68      0.63        19
        datetime_convert       0.67      0.75      0.71         8
          datetime_query       0.74      0.89      0.81        19
        email_addcontact       0.78      0.88      0.82         8
             email_query       0.83      0.79      0.81        19
      ema

**Task 2: Pipeline Word2Vec (Trung bình) + Dense Layer**

In [5]:
# 1. Huấn luyện mô hình Word2Vec trên dữ liệu text của bạn
sentences = [text.split() for text in df_train['text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# 2. Viết hàm để chuyển mỗi câu thành vector trung bình
def sentence_to_avg_vector(text, model):
    word = text.split()
    word_vector = []
    for w in word:
        if w in model.wv:
            word_vector.append(model.wv[w])
    if len(word_vector) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vector, axis=0)

# 3. Tạo dữ liệu train/val/test X_train_avg, X_val_avg, X_test_avg
X_train_avg = np.array([sentence_to_avg_vector(t, w2v_model) for t in df_train['text']])
X_val_avg = np.array([sentence_to_avg_vector(t, w2v_model) for t in df_val['text']])
X_test_avg = np.array([sentence_to_avg_vector(t, w2v_model) for t in df_test['text']])

# 4. Xây dựng mô hình Sequential của Keras
model = Sequential([
    Dense(128, activation='relu', input_shape=(w2v_model.vector_size,)),
    Dropout(0.5),
    Dense(len(labelEncoder.classes_), activation='softmax')])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

# 5. Compile, huấn luyện và đánh giá mô hình
history = model.fit(X_train_avg, y_train,
                    validation_data=(X_val_avg, y_val),
                    epochs=150,
                    batch_size=64)

print(classification_report(y_test, model.predict(X_test_avg).argmax(axis=1), target_names=labelEncoder.classes_))


Epoch 1/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.0141 - loss: 4.1649 - val_accuracy: 0.0362 - val_loss: 4.1124
Epoch 2/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0296 - loss: 4.1139 - val_accuracy: 0.0762 - val_loss: 4.0727
Epoch 3/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0454 - loss: 4.0722 - val_accuracy: 0.0688 - val_loss: 4.0089
Epoch 4/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0555 - loss: 4.0122 - val_accuracy: 0.0688 - val_loss: 3.9294
Epoch 5/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0634 - loss: 3.9346 - val_accuracy: 0.0753 - val_loss: 3.8418
Epoch 6/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0660 - loss: 3.8543 - val_accuracy: 0.0818 - val_loss: 3.7624
Epoch 7/150
[1m140/14

**Task 3: Mô hình Nâng cao (Embedding Pre-trained + LSTM)**

In [6]:
# 1. Tiền xử lý cho mô hình chuỗi
# a. Tokenizer: Tạo vocab và chuyển text thành chuỗi chỉ số
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(pd.concat([df_train['text'], df_val['text']]))
train_sequences = tokenizer.texts_to_sequences(df_train['text'])
val_sequences = tokenizer.texts_to_sequences(df_val['text'])
test_sequences = tokenizer.texts_to_sequences(df_test['text'])

# b. Padding: Đảm bảo các chuỗi có cùng độ dài
max_len = 50
X_train_pad = pad_sequences(train_sequences, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(val_sequences, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# 2. Tạo ma trận trọng số cho Embedding Layer từ Word2Vec
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = w2v_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# 3. Xây dựng mô hình Sequential với LSTM
lstm_model_pretrained = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        # Khởi tạo trọng số
        weights=[embedding_matrix],
        input_length=max_len,
        # Đóng băng lớp Embedding
        trainable=False),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(labelEncoder.classes_), activation='softmax')])

# 4. Compile, huấn luyện (sử dụng EarlyStopping) và đánh giá
lstm_model_pretrained.compile(optimizer='adam',
                              loss='sparse_categorical_crossentropy',
                              metrics=['accuracy'])
lstm_model_pretrained.summary()
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = lstm_model_pretrained.fit(X_train_pad, y_train,
                                    validation_data=(X_val_pad, y_val),
                                    epochs=150,
                                    batch_size=64,
                                    callbacks=[early_stopping])
print(classification_report(y_test, lstm_model_pretrained.predict(X_test_pad).argmax(axis=1), target_names=labelEncoder.classes_))



Epoch 1/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 68ms/step - accuracy: 0.0148 - loss: 4.1493 - val_accuracy: 0.0335 - val_loss: 4.0594
Epoch 2/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 63ms/step - accuracy: 0.0280 - loss: 4.0631 - val_accuracy: 0.0586 - val_loss: 3.8880
Epoch 3/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 56ms/step - accuracy: 0.0462 - loss: 3.9568 - val_accuracy: 0.0548 - val_loss: 3.9517
Epoch 4/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 26ms/step - accuracy: 0.0472 - loss: 3.9202 - val_accuracy: 0.0660 - val_loss: 3.8092
Epoch 5/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.0545 - loss: 3.8492 - val_accuracy: 0.0762 - val_loss: 3.7591
Epoch 6/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.0529 - loss: 3.8371 - val_accuracy: 0.0716 - val_loss: 3.7283
Epoch 7/150
[1

**Task 4: Mô hình Nâng cao (Embedding học từ đầu + LSTM)**

In [7]:
# 1. Xây dựng mô hình
lstm_model_scratch = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=100,  # Chọn một chiều embedding, ví dụ 100
              input_length=max_len,
              trainable=True),  # Không có weights từ Word2Vec, học từ đầu
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(labelEncoder.classes_), activation='softmax')])

# 2. Compile, huấn luyện và đánh giá mô hình
lstm_model_scratch.compile(optimizer='adam',
                           loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])

lstm_model_scratch.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = lstm_model_scratch.fit(X_train_pad, y_train,
                                 validation_data=(X_val_pad, y_val),
                                 epochs=150,
                                 batch_size=64,
                                 callbacks=[early_stopping],
                                 verbose=1)

print("\nScratch Embedding + LSTM Classification Report:")
print(classification_report(y_test, lstm_model_scratch.predict(X_test_pad).argmax(axis=1), target_names=labelEncoder.classes_))

Epoch 1/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 37ms/step - accuracy: 0.0186 - loss: 4.1502 - val_accuracy: 0.0177 - val_loss: 4.1302
Epoch 2/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - accuracy: 0.0145 - loss: 4.1355 - val_accuracy: 0.0177 - val_loss: 4.1291
Epoch 3/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 32ms/step - accuracy: 0.0196 - loss: 4.1344 - val_accuracy: 0.0177 - val_loss: 4.1286
Epoch 4/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.0160 - loss: 4.1333 - val_accuracy: 0.0177 - val_loss: 4.1286
Epoch 5/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.0168 - loss: 4.1322 - val_accuracy: 0.0177 - val_loss: 4.1284
Epoch 6/150
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.0129 - loss: 4.1331 - val_accuracy: 0.0177 - val_loss: 4.1275
Epoch 7/150
[1m

**Task 5: Đánh giá, So sánh và Phân tích**

In [8]:
# 1. Bảng tổng hợp F1-score (macro) và Test Loss cho 4 pipeline
def safe_predict_proba(model, X):
    try:
        return model.predict_proba(X)
    except Exception:
        return None

# TF-IDF + Logistic Regression metrics
y_pred_tfidf = tfidf_lr_pipeline.predict(df_test['text'])
y_proba_tfidf = safe_predict_proba(tfidf_lr_pipeline, df_test['text'])
tfidf_macro_f1 = f1_score(y_test, y_pred_tfidf, average='macro')
tfidf_test_loss = log_loss(y_test, y_proba_tfidf) if y_proba_tfidf is not None else None

# Word2Vec (Avg) + Dense metrics
y_proba_avg = model.predict(X_test_avg)
y_pred_avg = y_proba_avg.argmax(axis=1)
avg_macro_f1 = f1_score(y_test, y_pred_avg, average='macro')
avg_test_loss = log_loss(y_test, y_proba_avg)

# Pretrained Embedding + LSTM metrics
y_proba_lstm_pre = lstm_model_pretrained.predict(X_test_pad)
y_pred_lstm_pre = y_proba_lstm_pre.argmax(axis=1)
pre_macro_f1 = f1_score(y_test, y_pred_lstm_pre, average='macro')
pre_test_loss = log_loss(y_test, y_proba_lstm_pre)

# Scratch Embedding + LSTM metrics
y_proba_lstm_scratch = lstm_model_scratch.predict(X_test_pad)
y_pred_lstm_scratch = y_proba_lstm_scratch.argmax(axis=1)
scratch_macro_f1 = f1_score(y_test, y_pred_lstm_scratch, average='macro')
scratch_test_loss = log_loss(y_test, y_proba_lstm_scratch)

results_df = pd.DataFrame({
    'Pipeline': ['TF-IDF + Logistic Regression', 'Word2Vec (Avg) + Dense', 'Embedding (Pre-trained) + LSTM', 'Embedding (Scratch) + LSTM'],
    'F1-score (Macro)': [tfidf_macro_f1, avg_macro_f1, pre_macro_f1, scratch_macro_f1],
    'Test Loss': [tfidf_test_loss, avg_test_loss, pre_test_loss, scratch_test_loss]
})
print('\n=== Bảng Tổng Hợp Kết Quả ===')
display(results_df)

# 2. Phân tích định tính với các câu khó
difficult_sentences = [
    ('can you remind me to not call my mom', 'reminder_create'),
    ('is it going to be sunny or rainy tomorrow', 'weather_query'),
    ('find a flight from new york to london but not through paris', 'flight_search')
]

def predict_pipeline(sentence):
    # TF-IDF
    p_tfidf_idx = tfidf_lr_pipeline.predict([sentence])[0]
    p_tfidf = labelEncoder.inverse_transform([p_tfidf_idx])[0]
    # Word2Vec avg
    avg_vec = np.array([sentence_to_avg_vector(sentence, w2v_model)])
    p_avg_idx = model.predict(avg_vec).argmax(axis=1)[0]
    p_avg = labelEncoder.inverse_transform([p_avg_idx])[0]
    # Pretrained LSTM
    seq_pre = pad_sequences(tokenizer.texts_to_sequences([sentence]), maxlen=max_len, padding='post')
    p_pre_idx = lstm_model_pretrained.predict(seq_pre).argmax(axis=1)[0]
    p_pre = labelEncoder.inverse_transform([p_pre_idx])[0]
    # Scratch LSTM
    p_scr_idx = lstm_model_scratch.predict(seq_pre).argmax(axis=1)[0]
    p_scr = labelEncoder.inverse_transform([p_scr_idx])[0]
    return p_tfidf, p_avg, p_pre, p_scr

analysis_rows = []
for sent, true_label in difficult_sentences:
    ptfidf, pavg, ppre, pscr = predict_pipeline(sent)
    analysis_rows.append({
        'Sentence': sent,
        'True Intent': true_label,
        'TF-IDF+LR': ptfidf,
        'W2V Avg + Dense': pavg,
        'Pretrained LSTM': ppre,
        'Scratch LSTM': pscr
    })

qual_df = pd.DataFrame(analysis_rows)
print('\n=== Phân Tích Định Tính Các Câu Khó ===')
display(qual_df)



[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 996us/step
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

=== Bảng Tổng Hợp Kết Quả ===


Unnamed: 0,Pipeline,F1-score (Macro),Test Loss
0,TF-IDF + Logistic Regression,0.835298,1.050197
1,Word2Vec (Avg) + Dense,0.304154,2.452722
2,Embedding (Pre-trained) + LSTM,0.376478,2.108491
3,Embedding (Scratch) + LSTM,0.178246,2.868297


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step

=== Phân Tích Định Tính Các Câu Khó ===


Unnamed: 0,Sentence,True Intent,TF-IDF+LR,W2V Avg + Dense,Pretrained LSTM,Scratch LSTM
0,can you remind me to not call my mom,reminder_create,calendar_set,general_quirky,takeaway_query,email_sendemail
1,is it going to be sunny or rainy tomorrow,weather_query,weather_query,qa_maths,qa_maths,takeaway_order
2,find a flight from new york to london but not ...,flight_search,general_negate,transport_query,email_sendemail,calendar_set
