In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
import re

import kagglehub
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Download the IMDB 50k movie-review dataset from Kaggle.
# dataset_download returns the local directory that holds the dataset files.
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

# Build the CSV location from the directory kagglehub returned instead of a
# hard-coded cache path, so this keeps working when the cache directory or the
# dataset version number differs from the machine this was first written on.
dataset_path = os.path.join(path, 'IMDB Dataset.csv')
df = pd.read_csv(dataset_path)
print("Dữ liệu tải thành công")

# Show the first few rows of the data
print(df.head())

Downloading from https://www.kaggle.com/api/v1/datasets/download/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:02<00:00, 12.2MB/s]

Extracting files...





Dữ liệu tải thành công
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
import nltk
# Download the NLTK 'stopwords' package (read later through nltk.corpus.stopwords).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Make sure the stop-word corpus is available, then create the stemmer and the
# English stop-word lookup set that clean_text relies on.
nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize a raw review string into space-joined, stemmed tokens.

    Steps, in order: lowercase the text, replace HTML-like tags with a space,
    delete every character that is not an ASCII letter or whitespace, collapse
    runs of whitespace, drop tokens present in the module-level ``stop_words``
    set, and stem the remaining tokens with the module-level ``stemmer``.

    Args:
        text: The raw review text (must support ``.lower()``).

    Returns:
        A single string of the kept, stemmed tokens separated by one space.
    """
    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', ' ', lowered)  # strip HTML tags
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_tags)  # drop punctuation/digits/symbols
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()  # squeeze extra whitespace

    kept_tokens = []
    for token in collapsed.split():
        if token in stop_words:
            continue
        kept_tokens.append(stemmer.stem(token))
    return ' '.join(kept_tokens)

# Clean every review and keep the result in a new column next to the raw text
df['cleaned_review'] = df['review'].map(clean_text)
preview_columns = ['review', 'cleaned_review']
print(df[preview_columns].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one review mention watch oz episod youll hook ...  
1  wonder littl product film techniqu unassum old...  
2  thought wonder way spend time hot summer weeke...  
3  basic there famili littl boy jake think there ...  
4  petter mattei love time money visual stun film...  


In [None]:
# Encode the sentiment labels as integers: 'positive' -> 1, anything else -> 0.
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# applying the comparison again would turn every label (including the 1s) into 0.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [None]:
# Hyperparameters for the model and the text preprocessing
vocab_size = 5000  # number of words kept by the tokenizer
embedding_dim = 100  # embedding vector size
max_length = 100  # every sequence is padded/truncated to this many tokens
padding_type = 'post'
trunc_type = 'post'
oov_token = "<OOV>"

# Decide which rows belong to train and test BEFORE fitting the tokenizer, so
# the vocabulary is learned from training rows only (no leakage from the test
# rows). Splitting row indices with the same test_size and random_state selects
# the same rows as splitting the arrays directly.
all_indices = np.arange(len(df))
train_idx, test_idx = train_test_split(all_indices, test_size=0.2, random_state=42)

# Fit the tokenizer on the training reviews only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(df['cleaned_review'].iloc[train_idx])

# Convert all reviews to index sequences and pad/truncate them to max_length
sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Materialize the train/test arrays from the indices chosen above
labels = df['sentiment'].values
X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

In [None]:
# Build the CNN: embedding -> 1D convolution -> global max pooling -> dense head.
# The deprecated `input_length` argument is not passed to Embedding; the
# sequence length is supplied through model.build below instead.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')  # binary classification
])

# Build with an explicit input shape (so summary() can report layer shapes),
# then compile for binary classification.
model.build(input_shape=(None, max_length))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the model.
# NOTE(review): the validation data here is the test split, so the val_* metrics
# are not independent of the final test evaluation.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 1.0000 - loss: 1.6342e-06 - val_accuracy: 0.8591 - val_loss: 1.0974
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 1.0000 - loss: 8.2730e-07 - val_accuracy: 0.8589 - val_loss: 1.1478
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 1.0000 - loss: 4.1271e-07 - val_accuracy: 0.8594 - val_loss: 1.1976
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 1.0000 - loss: 2.1014e-07 - val_accuracy: 0.8596 - val_loss: 1.2477
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 1.0000 - loss: 1.1121e-07 - val_accuracy: 0.8594 - val_loss: 1.2971


In [None]:
# Evaluate on the test split; the result unpacks into the loss followed by the
# accuracy metric the model was compiled with.
evaluation = model.evaluate(X_test, y_test, verbose=2)
test_loss, test_accuracy = evaluation
print(f'Test Accuracy: {test_accuracy}')

313/313 - 1s - 2ms/step - accuracy: 0.8591 - loss: 1.0483
Test Accuracy: 0.8590999841690063


In [None]:
# Sample reviews used as a quick sanity check of the trained model
new_reviews = [
    "This movie was absolutely fantastic! The storyline and acting were incredible.",
    "I loved every minute of this film. It was captivating and heartwarming.",
    "An amazing experience! The visuals were stunning, and the performances were top-notch.",
    "This movie was a complete waste of time. The plot was boring, and the acting was terrible.",
    "I didn’t enjoy this film at all. It was too predictable and slow-paced.",
    "The story was all over the place, and the characters were not well-developed."
]

# Run the same preprocessing as for training: clean, index, then pad
new_reviews_cleaned = list(map(clean_text, new_reviews))
new_reviews_sequences = tokenizer.texts_to_sequences(new_reviews_cleaned)
new_reviews_padded = pad_sequences(
    new_reviews_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Predict with the model (one probability per review)
predictions = model.predict(new_reviews_padded)

# Report each review with its predicted label and probability
for review, prediction in zip(new_reviews, predictions):
    probability = prediction[0]
    label = 'Positive' if probability > 0.5 else 'Negative'
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {label}")
    print(f"Probability: {probability:.4f}")
    print("------")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 424ms/step
Review: This movie was absolutely fantastic! The storyline and acting were incredible.
Predicted Sentiment: Positive
Probability: 1.0000
------
Review: I loved every minute of this film. It was captivating and heartwarming.
Predicted Sentiment: Positive
Probability: 0.9949
------
Review: An amazing experience! The visuals were stunning, and the performances were top-notch.
Predicted Sentiment: Positive
Probability: 1.0000
------
Review: This movie was a complete waste of time. The plot was boring, and the acting was terrible.
Predicted Sentiment: Negative
Probability: 0.0000
------
Review: I didn’t enjoy this film at all. It was too predictable and slow-paced.
Predicted Sentiment: Negative
Probability: 0.0001
------
Review: The story was all over the place, and the characters were not well-developed.
Predicted Sentiment: Negative
Probability: 0.0749
------


In [None]:
# Save the model to an HDF5 file (relative path, so it lands in the current working directory).
# NOTE(review): recent Keras versions treat .h5 as a legacy format; consider the
# native .keras format if whatever loads this file allows it.
model.save("sentiment_cnn_model.h5")




In [None]:
import joblib

# Persist the fitted tokenizer so inference code can reuse the same word-to-index mapping.
# NOTE(review): joblib files are pickle-based; only load this file from a trusted source.
joblib.dump(tokenizer, 'tokenizer.pkl')

['tokenizer.pkl']