In [36]:
import os, io, re, zipfile, pathlib, random, pickle
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm

from datasets import load_dataset

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

SEED = 1234
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

BASE_DIR = pathlib.Path.cwd()
DATA_DIR = BASE_DIR / "data"
ARTIFACTS_DIR = BASE_DIR / "artifacts"
DATA_DIR.mkdir(exist_ok=True, parents=True)
ARTIFACTS_DIR.mkdir(exist_ok=True, parents=True)

EMBED_DIM = 100 
VOCAB_SIZE = 20000
MAX_LEN = 300
BATCH_SIZE = 64
EPOCHS = 10
LR = 1e-3

print('BASE_DIR:', BASE_DIR)


BASE_DIR: /home/meu1404/projects/test/Deep_learning_tutorial


In [25]:
print('TF:', tf.__version__)
gpus = tf.config.list_physical_devices('GPU')
print('GPUs:', gpus)
for g in gpus:
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except Exception as e:
        print('memory_growth error:', e)


TF: 2.20.0
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
memory_growth error: Physical devices cannot be modified after being initialized


In [26]:
ds = load_dataset("stanfordnlp/imdb")
print(ds)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [27]:
train_texts = [r["text"] for r in ds["train"]]
train_labels = np.array([int(r["label"]) for r in ds["train"]], dtype="int32")

test_texts  = [r["text"] for r in ds["test"]]
test_labels = np.array([int(r["label"]) for r in ds["test"]], dtype="int32")

len(train_texts), len(test_texts), sum(train_labels), sum(test_labels)


(25000, 25000, np.int32(12500), np.int32(12500))

In [28]:
def basic_clean(s: str) -> str:
    s = s.replace("<br />", " ")
    s = re.sub(r"<.*?>", " ", s)
    s = re.sub(r"[^A-Za-z0-9'.,!?;:()\- ]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

train_clean = [basic_clean(t) for t in train_texts]
test_clean  = [basic_clean(t) for t in test_texts]

print(train_clean[0][:300])


i rented i am curious-yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u.s. customs if it ever tried to enter this country, therefore being a fan of films considered controversial i really had


In [29]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_clean)

x_train = tokenizer.texts_to_sequences(train_clean)
x_test  = tokenizer.texts_to_sequences(test_clean)

x_train = pad_sequences(x_train, maxlen=MAX_LEN, padding="post", truncating="post")
x_test  = pad_sequences(x_test,  maxlen=MAX_LEN, padding="post", truncating="post")

word_index = tokenizer.word_index
nb_words = min(VOCAB_SIZE, len(word_index) + 1)

with open(ARTIFACTS_DIR / "tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

x_train.shape, x_test.shape, nb_words


((25000, 300), (25000, 300), 20000)

In [30]:
GLOVE_URL = "https://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = keras.utils.get_file(origin=GLOVE_URL, fname="glove.6B.zip", cache_dir=str(DATA_DIR), cache_subdir=".")
glove_root = pathlib.Path(glove_zip_path).parent / "glove.6B"
glove_root.mkdir(exist_ok=True, parents=True)

zip_path = pathlib.Path(glove_zip_path)
with zipfile.ZipFile(zip_path, 'r') as z:
    for name in z.namelist():
        if name.startswith("glove.6B.") and name.endswith(".txt"):
            target = glove_root / pathlib.Path(name).name
            if not target.exists():
                z.extract(name, path=glove_root)

glove_txt = glove_root / f"glove.6B.{EMBED_DIM}d.txt"
print("GloVe file:", glove_txt, "exists:", glove_txt.exists())


GloVe file: /home/meu1404/projects/test/Deep_learning_tutorial/data/glove.6B/glove.6B.100d.txt exists: True


In [31]:
def build_glove_matrix(word_index: dict, glove_txt_path: pathlib.Path, vocab_size: int, emb_dim: int):
    embeddings_index = {}
    with io.open(glove_txt_path, encoding="utf-8") as f:
        for line in f:
            values = line.strip().split()
            word = " ".join(values[:-emb_dim]) if len(values) > emb_dim+1 else values[0]
            coefs = np.asarray(values[-emb_dim:], dtype="float32")
            embeddings_index[word] = coefs
    print(f"GloVe loaded: {len(embeddings_index):,} tokens")

    nb_words = min(vocab_size, len(word_index) + 1)
    embedding_matrix = np.random.normal(scale=0.6, size=(nb_words, emb_dim)).astype("float32")
    found = 0
    for w, i in word_index.items():
        if i >= nb_words: 
            continue
        vec = embeddings_index.get(w)
        if vec is not None and len(vec) == emb_dim:
            embedding_matrix[i] = vec
            found += 1
    print(f"Init embeddings: matched {found:,}/{nb_words:,} tokens")
    return embedding_matrix, nb_words

emb_matrix, nb_words = build_glove_matrix(word_index, glove_txt, nb_words, EMBED_DIM)
emb_matrix.shape, nb_words


GloVe loaded: 400,000 tokens
Init embeddings: matched 19,154/20,000 tokens


((20000, 100), 20000)

In [33]:
def build_model(nb_words, emb_matrix, max_len, emb_dim):
    inp = keras.layers.Input(shape=(max_len,), name="input_ids")
    x = keras.layers.Embedding(nb_words, emb_dim,
                               weights=[emb_matrix],
                               trainable=True, name="embedding")(inp)
    x = keras.layers.SpatialDropout1D(0.2)(x)
    x = keras.layers.Bidirectional(
        keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.0, return_sequences=False)
    )(x)
    x = keras.layers.Dense(64, activation="relu")(x)
    x = keras.layers.Dropout(0.5)(x)
    out = keras.layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inp, out)
    opt = keras.optimizers.Adam(learning_rate=LR)   # ❌ KHÔNG truyền jit_compile ở đây
    model.compile(
        optimizer=opt,
        loss="binary_crossentropy",
        metrics=[keras.metrics.BinaryAccuracy(name="acc"), keras.metrics.AUC(name="auc")],
        jit_compile=True                              # ✅ jit_compile đặt ở đây
    )
    return model


model = build_model(nb_words, emb_matrix, MAX_LEN, EMBED_DIM)
model.summary()




In [37]:
callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_acc", patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-6)
]

history = model.fit(
    x_train, train_labels,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/10


2025-09-20 11:35:43.988787: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 27000000 exceeds 10% of free system memory.


[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 76ms/step - acc: 0.8631 - auc: 0.9286 - loss: 0.3383 - val_acc: 0.8672 - val_auc: 0.0000e+00 - val_loss: 0.3323 - learning_rate: 7.8125e-06
Epoch 2/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 71ms/step - acc: 0.8600 - auc: 0.9276 - loss: 0.3405 - val_acc: 0.8640 - val_auc: 0.0000e+00 - val_loss: 0.3394 - learning_rate: 7.8125e-06
Epoch 3/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 77ms/step - acc: 0.8638 - auc: 0.9295 - loss: 0.3363 - val_acc: 0.8664 - val_auc: 0.0000e+00 - val_loss: 0.3325 - learning_rate: 3.9063e-06
Epoch 4/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 69ms/step - acc: 0.8568 - auc: 0.9288 - loss: 0.3380 - val_acc: 0.8656 - val_auc: 0.0000e+00 - val_loss: 0.3323 - learning_rate: 1.9531e-06


In [None]:
BATCH_SIZE = 128
split = int(0.9 * len(x_train))
x_tr, y_tr = x_train[:split], train_labels[:split]
x_val, y_val = x_train[split:], train_labels[split:]

ds_tr  = tf.data.Dataset.from_tensor_slices((x_tr,  y_tr)).shuffle(25000).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
ds_val = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_acc', patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-6)
]

history = model.fit(ds_tr, validation_data=ds_val, epochs=EPOCHS, callbacks=callbacks, verbose=1)


Epoch 1/10


2025-09-20 11:37:36.529685: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 27000000 exceeds 10% of free system memory.


[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 75ms/step - acc: 0.8635 - auc: 0.9298 - loss: 0.3365 - val_acc: 0.8676 - val_auc: 0.0000e+00 - val_loss: 0.3297 - learning_rate: 1.0000e-06
Epoch 2/10
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 76ms/step - acc: 0.8619 - auc: 0.9295 - loss: 0.3363 - val_acc: 0.8660 - val_auc: 0.0000e+00 - val_loss: 0.3336 - learning_rate: 1.0000e-06
Epoch 3/10
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 87ms/step - acc: 0.8608 - auc: 0.9284 - loss: 0.3390 - val_acc: 0.8612 - val_auc: 0.0000e+00 - val_loss: 0.3420 - learning_rate: 1.0000e-06
Epoch 4/10
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 75ms/step - acc: 0.8589 - auc: 0.9289 - loss: 0.3377 - val_acc: 0.8624 - val_auc: 0.0000e+00 - val_loss: 0.3413 - learning_rate: 1.0000e-06


In [42]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

def basic_clean(s: str) -> str:
    s = s.replace("<br />", " ")
    s = re.sub(r"<.*?>", " ", s)
    s = re.sub(r"[^A-Za-z0-9'.,!?;:()\-\s]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

def predict_sentiment(text: str, model=model, tokenizer=tokenizer, max_len=MAX_LEN):
    t = basic_clean(text)
    seq = tokenizer.texts_to_sequences([t])
    seq = pad_sequences(seq, maxlen=max_len, padding="post", truncating="post")
    prob = float(model.predict(seq, verbose=0)[0][0])
    label = "positive" if prob >= 0.5 else "negative"
    return prob, label

# Nhập từ bàn phím
s = input("Nhập câu đánh giá phim: ")
prob, lab = predict_sentiment(s)
print(f"Câu văn cần đánh giá: {s}")
print(f"Kết quả: {lab.upper()} | score={prob:.3f}")


Câu văn cần đánh giá: The cinematography is stunning; every frame looks like a painting.
Kết quả: POSITIVE | score=0.696
