In [1]:
# ======================================
# 1. INSTALL LIBRARIES
# ======================================
# Installs the Kaggle package quietly; the `kaggle` command is invoked in the
# dataset-download cell further down.
!pip install kaggle --quiet
import os, zipfile, random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50, VGG16, EfficientNetB0
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_pre
from tensorflow.keras.applications.vgg16 import preprocess_input as vgg_pre
from tensorflow.keras.applications.efficientnet import preprocess_input as eff_pre

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
# ======================================
# 2. TẢI DATASET FLOWERS TỪ KAGGLE
# (Upload kaggle.json trước khi chạy)
# ======================================
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d l3llff/flowers -p ./datasets
!unzip -q ./datasets/flowers.zip -d ./datasets

parent_path = "./datasets/flowers/flowers/"
print("Path:", parent_path)


cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/kaggle/cli.py", line 68, in main
    out = args.func(**command_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 1741, in dataset_download_cli
    with self.build_kaggle_client() as kaggle:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 688, in build_kaggle_client
    username=self.config_values['username'],
             ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
KeyError: 'username'
unzip:  cannot find or open ./datasets/flowers.zip, ./datasets/flowers.zip.zip or ./datasets/flowers.zip.ZIP.
Path: ./datasets/flowers/flowers/


In [3]:
# ======================================
# 3. BASIC EDA
# ======================================
# Each class is a sub-directory of parent_path. Stray files (e.g. archives,
# hidden files) are ignored, and the list is sorted so the order is stable
# across runs and machines.
classes = sorted(
    entry for entry in os.listdir(parent_path)
    if os.path.isdir(os.path.join(parent_path, entry))
)
if not classes:
    raise RuntimeError(f"No class folders found in {parent_path!r}")
print("Các lớp:", classes)

# Count the images in each class
class_counts = {cls: len(os.listdir(os.path.join(parent_path, cls))) for cls in classes}
print("Số lượng ảnh mỗi lớp:", class_counts)

# Class distribution chart
plt.figure(figsize=(8,5))
sns.barplot(x=list(class_counts.keys()), y=list(class_counts.values()))
plt.title("Số lượng ảnh trong từng lớp")
plt.show()

# Show one random sample image per class on a 2-row grid.
# Ceiling division so an odd number of classes (or a single class) still fits;
# floor division would leave the last class without a subplot slot.
n_cols = max(1, (len(classes) + 1) // 2)
plt.figure(figsize=(12,8))
for i, cls in enumerate(classes):
    sample_paths = glob(os.path.join(parent_path, cls, "*"))
    if not sample_paths:
        # Empty class folder: nothing to show, leave the slot blank.
        continue
    img_path = random.choice(sample_paths)
    img = plt.imread(img_path)
    plt.subplot(2, n_cols, i+1)
    plt.imshow(img)
    plt.title(cls)
    plt.axis("off")
plt.tight_layout()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: './datasets/flowers/flowers/'

In [None]:
# ======================================
# 4. PIPELINE CONFIG
# ======================================
# Single place for every tunable of the pipeline:
#   img_size          - (height, width) the images are resized to
#   batch_size        - batch size used by the image generators
#   feature_extractor - one of "ResNet50", "VGG16", "EfficientNetB0"
#   ml_models         - classifier name -> keyword arguments for its constructor
config = dict(
    img_size=(224, 224),
    batch_size=32,
    feature_extractor="ResNet50",
    ml_models=dict(
        LogisticRegression=dict(max_iter=5000),
        SVM=dict(kernel="linear", C=1.0),
        RandomForest=dict(n_estimators=300, max_depth=None, random_state=42),
    ),
)


In [None]:
# ======================================
# 5. FEATURE EXTRACTOR SETUP
# ======================================
# Registry: backbone name -> (Keras model constructor, matching preprocess_input).
feature_extractors = dict(
    ResNet50=(ResNet50, resnet_pre),
    VGG16=(VGG16, vgg_pre),
    EfficientNetB0=(EfficientNetB0, eff_pre),
)

def get_feature_extractor(name, img_size):
    """Build a pretrained backbone and return it with its preprocessing function.

    Args:
        name: Key into the module-level `feature_extractors` registry.
        img_size: (height, width) of the input images; any 2-item sequence
            (tuple or list) is accepted.

    Returns:
        (model, preprocess_fn): the model instantiated with ImageNet weights,
        without the classification top and with `pooling="avg"`, plus the
        preprocessing function registered for that backbone.

    Raises:
        KeyError: if `name` is not a registered feature extractor.
    """
    try:
        ModelClass, preprocess_fn = feature_extractors[name]
    except KeyError:
        raise KeyError(
            f"Unknown feature extractor {name!r}; expected one of {list(feature_extractors)}"
        ) from None
    # tuple(...) so a list such as [224, 224] works too (list + tuple raises TypeError).
    input_shape = tuple(img_size) + (3,)
    model = ModelClass(weights="imagenet", include_top=False, pooling="avg", input_shape=input_shape)
    return model, preprocess_fn


In [None]:
# ======================================
# 6. PREPROCESSING AND GENERATORS
# ======================================
model_class, preprocess_fn = feature_extractors[config["feature_extractor"]]

# One generator object provides both subsets through validation_split.
datagen = ImageDataGenerator(
    preprocessing_function=preprocess_fn,
    validation_split=0.2
)

# Settings shared by the training and validation iterators.
# shuffle=False for BOTH subsets: the feature-extraction step pairs the
# predictions with `<generator>.classes`, which is in file order. A shuffled
# training iterator would yield images in a different order than `.classes`,
# silently misaligning features and labels. No model is trained on these
# batches, so shuffling has no benefit here.
flow_kwargs = dict(
    target_size=config["img_size"],
    batch_size=config["batch_size"],
    class_mode='categorical',
    shuffle=False,
)

train_gen = datagen.flow_from_directory(parent_path, subset='training', **flow_kwargs)
val_gen = datagen.flow_from_directory(parent_path, subset='validation', **flow_kwargs)


In [None]:
# ======================================
# 7. FEATURE EXTRACTION
# ======================================
extractor, _ = get_feature_extractor(config["feature_extractor"], config["img_size"])


def _extract_features_and_labels(model, generator):
    """Run `model` over every batch of `generator`, keeping labels aligned.

    Labels are taken from the same batch that produced the features, so the
    result is correct even if the generator shuffles its samples. Pairing
    `model.predict(generator)` with `generator.classes` is only valid for an
    unshuffled generator.

    Args:
        model: Keras model used as feature extractor.
        generator: Indexable batch iterator yielding (images, one_hot_labels),
            as produced by flow_from_directory with class_mode='categorical'.

    Returns:
        (features, labels): features stacked along axis 0 and the integer
        class index of every sample, in matching order.
    """
    features, labels = [], []
    n_batches = len(generator)
    for batch_idx in range(n_batches):
        batch_images, batch_onehot = generator[batch_idx]
        features.append(model.predict_on_batch(batch_images))
        # One-hot rows -> integer class ids.
        labels.append(np.argmax(batch_onehot, axis=1))
        if (batch_idx + 1) % 20 == 0 or batch_idx + 1 == n_batches:
            print(f"  processed {batch_idx + 1}/{n_batches} batches")
    return np.concatenate(features, axis=0), np.concatenate(labels, axis=0)


X_train, y_train = _extract_features_and_labels(extractor, train_gen)
X_val, y_val = _extract_features_and_labels(extractor, val_gen)

print("Train features:", X_train.shape)
print("Val features:", X_val.shape)

# Save as .npy files, as required by the assignment
np.save("train_features.npy", X_train)
np.save("train_labels.npy", y_train)
np.save("val_features.npy", X_val)
np.save("val_labels.npy", y_val)


In [None]:
# ======================================
# 8. TRAIN THE MACHINE LEARNING MODELS
# ======================================
# Classifier name (as used in config["ml_models"]) -> estimator class.
ml_models_map = dict(
    LogisticRegression=LogisticRegression,
    SVM=SVC,
    RandomForest=RandomForestClassifier,
)

# name -> {"acc": validation accuracy, "y_pred": validation predictions}
results = {}

for name in config["ml_models"]:
    params = config["ml_models"][name]
    estimator_cls = ml_models_map[name]
    model = estimator_cls(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    results[name] = dict(acc=acc, y_pred=y_pred)
    print(f"{name} acc: {acc:.4f}")


In [None]:
# ======================================
# 9. EVALUATION & VISUALIZATION
# ======================================
def evaluate_model(y_true, y_pred, model_name, class_names=None):
    """Print a classification report and plot the confusion matrix.

    Args:
        y_true: True integer class ids.
        y_pred: Predicted integer class ids.
        model_name: Label used in the printed header and the plot title.
        class_names: Optional class names ordered by class id. Defaults to
            the names in `train_gen.class_indices`, ordered by their index.
    """
    if class_names is None:
        class_indices = train_gen.class_indices
        # Order names by their integer id so position i names class i.
        class_names = sorted(class_indices, key=class_indices.get)
    class_names = list(class_names)
    # Pass the full label set explicitly: without it, a class that is missing
    # from y_true/y_pred makes classification_report reject target_names and
    # gives a confusion matrix smaller than the tick labels.
    label_ids = list(range(len(class_names)))

    print(f"\n===== {model_name} =====")
    print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_names))
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f"Confusion Matrix - {model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

# Report every trained classifier against the validation labels.
for name in results:
    res = results[name]
    evaluate_model(y_val, res["y_pred"], name)
