In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os

In [2]:
root = "https://ai.stanford.edu/~amaas/data/sentiment/"
filename = "aclImdb_v1.tar.gz"
filepath = tf.keras.utils.get_file(filename, root + filename, extract=True,
                                   cache_dir=".")
if "_extracted" in filepath:
    path = Path(filepath) / "aclImdb"
else:
    path = Path(filepath).with_name("aclImdb")

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 0us/step


In [56]:
def print_tree(path: Path, prefix: str = "", max_entries: int = 5):
    """In ra cấu trúc thư mục dạng cây từ một Path, giới hạn số mục con."""
    if not path.is_dir():
        print(f"{prefix}{path.name} [Not a directory]")
        return

    entries = sorted(path.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower()))
    total = len(entries)
    display_entries = entries[:max_entries]

    for i, entry in enumerate(display_entries):
        connector = "── " if i == len(display_entries) - 1 else "── "
        print(f"{prefix}{connector}{entry.name}")

        if entry.is_dir():
            extension = "    " if i == len(display_entries) - 1 else "|  "
            print_tree(entry, prefix + extension, max_entries=max_entries)

    if total > max_entries:
        print(f"{prefix}── ... ({total - max_entries} more items)")


In [57]:
print_tree(Path("/content/datasets/aclImdb_v1_extracted/aclImdb"))

── test
|  ── neg
|  |  ── 0_2.txt
|  |  ── 10000_4.txt
|  |  ── 10001_1.txt
|  |  ── 10002_3.txt
|  |  ── 10003_3.txt
|  |  ── ... (12495 more items)
|  ── pos
|  |  ── 0_10.txt
|  |  ── 10000_7.txt
|  |  ── 10001_9.txt
|  |  ── 10002_8.txt
|  |  ── 10003_8.txt
|  |  ── ... (12495 more items)
|  ── labeledBow.feat
|  ── urls_neg.txt
|  ── urls_pos.txt
── train
|  ── neg
|  |  ── 0_3.txt
|  |  ── 10000_4.txt
|  |  ── 10001_4.txt
|  |  ── 10002_1.txt
|  |  ── 10003_1.txt
|  |  ── ... (12495 more items)
|  ── pos
|  |  ── 0_9.txt
|  |  ── 10000_8.txt
|  |  ── 10001_10.txt
|  |  ── 10002_7.txt
|  |  ── 10003_8.txt
|  |  ── ... (12495 more items)
|  ── unsup
|  |  ── 0_0.txt
|  |  ── 10000_0.txt
|  |  ── 10001_0.txt
|  |  ── 10002_0.txt
|  |  ── 10003_0.txt
|  |  ── ... (49995 more items)
|  ── labeledBow.feat
|  ── unsupBow.feat
|  ── ... (3 more items)
── imdb.vocab
── imdbEr.txt
── README


# Load tất cả data vào bộ nhớ

In [61]:
path = Path("/content/datasets/aclImdb_v1_extracted/aclImdb")

In [58]:
def review_path(folder_path):
  return [path for path in folder_path.glob("*.txt")]

In [68]:
train_pos = review_path(path/"train"/"pos")

In [69]:
for p in list(train_pos)[:5]:
    print(p)

/content/datasets/aclImdb_v1_extracted/aclImdb/train/pos/10614_7.txt
/content/datasets/aclImdb_v1_extracted/aclImdb/train/pos/6068_9.txt
/content/datasets/aclImdb_v1_extracted/aclImdb/train/pos/696_10.txt
/content/datasets/aclImdb_v1_extracted/aclImdb/train/pos/2791_10.txt
/content/datasets/aclImdb_v1_extracted/aclImdb/train/pos/3887_10.txt


In [70]:
train_neg = review_path(path/"train"/"neg")
test_pos = review_path(path/"test"/"pos")
test_neg = review_path(path/"test"/"neg")

In [71]:
len(train_pos), len(train_neg), len(test_pos), len(test_neg)

(12500, 12500, 12500, 12500)

In [72]:
test_pos = test_pos[:5000]
test_neg = test_neg[:5000]
valid_pos = test_pos[5000:]
valid_neg = test_neg[5000:]

In [81]:
def create_data(filepaths_pos, filepaths_neg):
  reviews = []
  labels = []
  for filepaths, label in ((filepaths_pos,1), (filepaths_neg,0)):
    for filepath in filepaths:
      text = filepath.read_text()
      reviews.append(text)
      labels.append(label)
  return tf.data.Dataset.from_tensor_slices((tf.constant(reviews), tf.constant(labels)))

In [82]:
training_data = create_data(train_pos,train_neg)
validation_data = create_data(valid_pos, valid_neg)
test_data = create_data(test_pos, test_neg)

# Load lần lượt từng batch data

In [91]:
def imdb_dataset(filepath_pos, filepath_neg):
  dataset_pos = tf.data.TextLineDataset(filepath_pos,num_parallel_reads = tf.data.AUTOTUNE)
  dataset_pos = dataset_pos.map(lambda review : (review, 1))

  dataset_neg = tf.data.TextLineDataset(filepath_neg,num_parallel_reads = tf.data.AUTOTUNE)
  dataset_neg = dataset_neg.map(lambda review : (review, 0))

  return tf.data.Dataset.concatenate(dataset_pos, dataset_neg)

In [92]:
batch_size = 32

train_set = imdb_dataset(train_pos, train_neg).shuffle(25000, seed=42)
train_set = train_set.batch(batch_size).prefetch(1)
valid_set = imdb_dataset(valid_pos, valid_neg).batch(batch_size).prefetch(1)
test_set = imdb_dataset(test_pos, test_neg).batch(batch_size).prefetch(1)

In [95]:
maxtoken = 10000
vectorize = tf.keras.layers.TextVectorization(max_tokens=maxtoken ,output_mode = 'tf-idf' )
review = train_set.map(lambda review, label: review)
vectorize.adapt(review)

In [101]:
vectorize.get_vocabulary()[:20]

['[UNK]',
 np.str_('the'),
 np.str_('and'),
 np.str_('a'),
 np.str_('of'),
 np.str_('to'),
 np.str_('is'),
 np.str_('in'),
 np.str_('it'),
 np.str_('i'),
 np.str_('this'),
 np.str_('that'),
 np.str_('br'),
 np.str_('was'),
 np.str_('as'),
 np.str_('for'),
 np.str_('with'),
 np.str_('movie'),
 np.str_('but'),
 np.str_('film')]

In [107]:
model = tf.keras.Sequential([
    vectorize,
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit(train_set, epochs=10, validation_data=valid_set)

Epoch 1/10
    781/Unknown [1m27s[0m 28ms/step - accuracy: 0.8221 - loss: 0.4014



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 28ms/step - accuracy: 0.8222 - loss: 0.4012
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 30ms/step - accuracy: 0.9552 - loss: 0.1190
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 32ms/step - accuracy: 0.9844 - loss: 0.0468
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 30ms/step - accuracy: 0.9947 - loss: 0.0207
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 28ms/step - accuracy: 0.9953 - loss: 0.0175
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 27ms/step - accuracy: 0.9983 - loss: 0.0072
Epoch 7/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 28ms/step - accuracy: 0.9974 - loss: 0.0115
Epoch 8/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 28ms/step - acc

<keras.src.callbacks.history.History at 0x7e1a8ca3dc50>

In [108]:
model.evaluate(test_set)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.8130 - loss: 1.5903


[1.2667756080627441, 0.8481000065803528]