In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, SimpleRNN
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import re





In [2]:
MAX_WORDS = 5000
MAX_LEN = 970
EMBED_DIM = 128
BATCH_SIZE = 32
EPOCHS = 10
NUM_CLASSES = 4  # 0: Ko đc đề cập, 1: positive, 2: negative, 3: neutral


df_train = pd.read_csv(r"C:\Users\leduc\OneDrive\Desktop\bap tap uit\NLP-CS221\DoAn\dataset\train.csv")
df_test = pd.read_csv(r"C:\Users\leduc\OneDrive\Desktop\bap tap uit\NLP-CS221\DoAn\dataset\test.csv")
df_eval = pd.read_csv(r"C:\Users\leduc\OneDrive\Desktop\bap tap uit\NLP-CS221\DoAn\dataset\val.csv")

ASPECT_NAMES = df_train.columns[1:]
NUM_ASPECTS = len(ASPECT_NAMES)


all_reviews = pd.concat([df_train["Review"], df_test["Review"], df_eval["Review"]])
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(all_reviews)

def preprocess_text(df):
    sequences = tokenizer.texts_to_sequences(df["Review"])
    return pad_sequences(sequences, maxlen=MAX_LEN, padding='post')

X_train = preprocess_text(df_train)
X_test = preprocess_text(df_test)
X_eval = preprocess_text(df_eval)


def preprocess_labels(df):
    labels = []
    for aspect in ASPECT_NAMES:
        labels.append(to_categorical(df[aspect], num_classes=NUM_CLASSES))
    return np.stack(labels, axis=1)

y_train = preprocess_labels(df_train)  
y_test = preprocess_labels(df_test)
y_eval = preprocess_labels(df_eval)



In [3]:
df_train["Review"].iloc[0]

'_ Ảnh chụp từ hôm qua, đi chơi với gia đình và 1 nhà họ hàng đang sống tại Sài Gòn. _ Hôm qua đi ăn trưa muộn, ai cũng đói hết nên lúc có đồ ăn là nhào vô ăn liền, bởi vậy mới quên chụp các phần gọi thêm với nước mắm, chỉ chụp món chính thôi! _ Đói quá nên không biết đánh giá đồ ăn kiểu gì luôn 😅😅😅_ Chọn cái này vì thấy nó lạ với tui.'

In [4]:
df_train.drop(columns=["Review"]).iloc[0].to_list()

[0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0]

In [5]:
X_train[0]

array([ 652,  551,   95,  281,  211,   39,  658,   30,  232,  568,    9,
         20,  156,  914,  133,  396,  498,  261,  569,  589,  281,  211,
         39,    2,  304, 1419,  209,    6,  661,   70,   18,  168,    5,
         73,    2,    4, 2939,  220,    2,  499,  861,  178,   81,  546,
        551,   56,   63,   78,   48,   30,   13,  150,   64,  551,   19,
        429,   74,  661,   44,   18,   10,  180,  668,   24,   73,    2,
        192,  137,   25, 3292,  239,   58,   17,  102,   37,  240,  165,
         30,  716,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [6]:
y_test[0]

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)

In [7]:
input_layer = Input(shape=(MAX_LEN,))
x = Embedding(input_dim=MAX_WORDS, output_dim=EMBED_DIM)(input_layer)
x = LSTM(64)(x)
x = Dense(128, activation='relu')(x)

def sanitize(name):
    return re.sub(r'[^A-Za-z0-9_.\\/>-]', '_', name)

outputs = [
    Dense(NUM_CLASSES, activation='softmax', name=f"{sanitize(aspect)}_output")(x)
    for aspect in ASPECT_NAMES
]

model = Model(inputs=input_layer, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


model.fit(
    X_train,
    [y_train[:, i] for i in range(NUM_ASPECTS)],
    validation_data=(X_test, [y_test[:, i] for i in range(NUM_ASPECTS)]),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[EarlyStopping(patience=2, restore_best_weights=True)]
)


y_pred_probs = model.predict(X_eval) 
y_pred_labels = np.stack([np.argmax(p, axis=1) for p in y_pred_probs], axis=1)


y_true_labels = np.argmax(y_eval, axis=2)


for i, aspect in enumerate(ASPECT_NAMES):
    print(f"\n[Aspect] {aspect}")
    print(classification_report(y_true_labels[:, i], y_pred_labels[:, i], digits=3))




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 970)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 970, 128)             640000    ['input_1[0][0]']             
                                                                                                  
 lstm (LSTM)                 (None, 64)                   49408     ['embedding[0][0]']           
                                                                                                  
 dense (Dense)               (None, 128)                  8320      ['lstm[0][0]']                
                                                                                            

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize