In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
import nltk

# Đặt thư mục riêng để lưu stopwords
nltk_data_dir = "/workspace/competitions/NCKH/Week2/nltk_data"
os.makedirs(nltk_data_dir, exist_ok=True)

# Thêm đường dẫn vào nltk
nltk.data.path.append(nltk_data_dir)

# Tải stopwords về thư mục cụ thể
nltk.download('stopwords', download_dir=nltk_data_dir)

# Đọc dữ liệu
data = pd.read_csv("/workspace/competitions/NCKH/Week2/IMDB Dataset.csv")

# Khởi tạo tokenizer và stopwords
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stemmer = nltk.PorterStemmer()

# 1. Loại bỏ thẻ HTML
def strip_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

# 2. Loại bỏ nội dung trong []
def remove_between_square_brackets(text):
    return re.sub(r'\[[^]]*\]', '', text)

# 3. Loại bỏ ký tự đặc biệt
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# 4. Stemming
def simple_stemmer(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

# 5. Loại bỏ stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    return ' '.join(filtered_tokens)

# 6. Tổng hợp hàm làm sạch
def clean_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text)
    text = text.lower()
    text = remove_stopwords(text, is_lower_case=True)
    text = simple_stemmer(text)
    return text

# Áp dụng hàm làm sạch cho cột review
data['review'] = data['review'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /workspace/competitions/NCKH/Week2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic there famili littl boy jake think there ...,negative
4,petter mattei love time money visual stun film...,positive


In [4]:
data.shape

(50000, 2)

In [5]:
type(data)

pandas.core.frame.DataFrame

In [6]:
data.tail()

Unnamed: 0,review,sentiment
49995,thought movi right good job wasnt creativ orig...,positive
49996,bad plot bad dialogu bad act idiot direct anno...,negative
49997,cathol taught parochi elementari school nun ta...,negative
49998,im go disagre previou comment side maltin one ...,negative
49999,one expect star trek movi high art fan expect ...,negative


In [7]:
data["sentiment"].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [8]:
# one hot encoding
# label encoder

In [9]:
# positive -> 1
# negative -> 0
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

In [10]:
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


In [11]:
data.tail()

Unnamed: 0,review,sentiment
49995,thought movi right good job wasnt creativ orig...,1
49996,bad plot bad dialogu bad act idiot direct anno...,0
49997,cathol taught parochi elementari school nun ta...,0
49998,im go disagre previou comment side maltin one ...,0
49999,one expect star trek movi high art fan expect ...,0


In [12]:
data["sentiment"].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [13]:
# LSTM -> LONG SHORT TERM MEMORY
# RNN -> TEXTUAL DATA

In [14]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tách tập train/test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Tiền xử lý văn bản
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

# Lưu kết quả độ chính xác
accuracies = []

for run in range(3):
    print(f"\n Run {run + 1}/3")

    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=32, input_length=200))
    model.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

    model.fit(X_train, Y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=2)

    test_loss, test_accuracy = model.evaluate(X_test, Y_test, verbose=0)
    print(f" Test Accuracy (Run {run + 1}): {test_accuracy:.4f}")
    accuracies.append(test_accuracy)

# Tính trung bình và độ lệch chuẩn
mean_acc = np.mean(accuracies)
std_acc = np.std(accuracies)

print("\n Final Result for this configuration:")
print(f"Average Test Accuracy: {mean_acc:.4f}")
print(f"Standard Deviation: {std_acc:.4f}")


2025-04-16 21:49:01.019865: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-16 21:49:01.829786: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/ai_intern/.wasmedge/lib
2025-04-16 21:49:01.829852: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/ai_intern/.wasmedge/lib



 Run 1/3


2025-04-16 21:49:14.248712: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/ai_intern/.wasmedge/lib
2025-04-16 21:49:14.248782: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2025-04-16 21:49:14.249266: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropria

Epoch 1/5
1000/1000 - 78s - loss: 0.3937 - accuracy: 0.8260 - val_loss: 0.3079 - val_accuracy: 0.8717 - 78s/epoch - 78ms/step
Epoch 2/5
1000/1000 - 69s - loss: 0.2816 - accuracy: 0.8867 - val_loss: 0.2936 - val_accuracy: 0.8766 - 69s/epoch - 69ms/step
Epoch 3/5
1000/1000 - 67s - loss: 0.2567 - accuracy: 0.8988 - val_loss: 0.2916 - val_accuracy: 0.8804 - 67s/epoch - 67ms/step
Epoch 4/5
1000/1000 - 72s - loss: 0.2455 - accuracy: 0.9050 - val_loss: 0.3513 - val_accuracy: 0.8640 - 72s/epoch - 72ms/step
Epoch 5/5
1000/1000 - 70s - loss: 0.2346 - accuracy: 0.9081 - val_loss: 0.2825 - val_accuracy: 0.8809 - 70s/epoch - 70ms/step
 Test Accuracy (Run 1): 0.8863

 Run 2/3
Epoch 1/5
1000/1000 - 70s - loss: 0.3967 - accuracy: 0.8200 - val_loss: 0.2969 - val_accuracy: 0.8811 - 70s/epoch - 70ms/step
Epoch 2/5
1000/1000 - 66s - loss: 0.2830 - accuracy: 0.8864 - val_loss: 0.2938 - val_accuracy: 0.8758 - 66s/epoch - 66ms/step
Epoch 3/5
1000/1000 - 65s - loss: 0.2580 - accuracy: 0.8980 - val_loss: 0.298