In [None]:
# Smoke-test cell: emit a greeting so it is obvious the kernel executes code.
print("Hello")

In [None]:
import torch
import os
import random
from PIL import Image
from collections import defaultdict
import re


def is_image_file(file_path):
    """Return True when ``file_path`` carries a known image file extension.

    Only the name is inspected (the file is never opened), and the comparison
    is case-insensitive.
    """
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() in (
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".bmp",
        ".tiff",
        ".webp",
    )


def is_valid_part_format(s):
    """Return True when ``s`` is ``"part"`` followed by a number from 1 to 14.

    The number must not have a leading zero ("part01" is rejected) and the
    prefix is case-sensitive.
    """
    # Alternation: a single digit 1-9, or 10-14.
    return re.match(r"^part([1-9]|1[0-4])$", s) is not None


class TripletDataset(torch.utils.data.Dataset):
    """Dataset of (anchor, positive, negative) image triplets read from disk.

    Directory layout scanned by ``__init__``::

        root_dir/<part>/<part>/<product>/...   when type == 'train'
        root_dir/<part>/<product>/...          for any other type

    ``<part>`` folders must satisfy ``is_valid_part_format``. Inside a product
    folder the first plain image file becomes the anchor, and the first image
    found inside a sub-directory (a review folder) becomes the positive. Each
    product contributes at most one (anchor, positive) pair, and the product
    folder name is used as the class label.
    """

    def __init__(
        self,
        root_dir,
        transform=None,
        sample_negatives="epoch",
        limit=-1,
        neg_only_reviews=True,
        type='train'
    ):
        """
        root_dir: Path to dataset (folders as classes)
        transform: Image transformations (e.g., augmentation, normalization)
        sample_negatives:
            - "batch" → Selects a random negative on every ``__getitem__`` call.
            - "epoch" → Uses a negative precomputed by ``assign_negatives``;
              call ``update_negatives`` to re-draw it.
            Any other value makes ``__getitem__`` raise ``ValueError``.
        limit: Stop scanning once this many samples were collected; values
            <= 0 disable the limit.
        neg_only_reviews: When True the negative is always the review
            (positive) image of another product; when False it is drawn at
            random from that product's [anchor, positive] pair.
        type: 'train' expects the doubled ``<part>/<part>`` folder nesting;
            any other value expects a single ``<part>`` level. (The name
            shadows the builtin but is kept for caller compatibility.)
        """
        self.root_dir = root_dir
        self.transform = transform
        self.sample_negatives = sample_negatives
        self.neg_only_reviews = neg_only_reviews

        self.class_to_images = defaultdict(list)  # { class: [anchor, positive] }
        self.samples = []  # [(anchor_path, positive_path, class)]

        # Read dataset structure
        for part_folder in os.listdir(root_dir):
            if self._limit_reached(limit):
                break
            if type == 'train':
                part_path = os.path.join(root_dir, part_folder, part_folder)
            else:
                part_path = os.path.join(root_dir, part_folder)
            print(part_path)
            if not os.path.isdir(part_path) or not is_valid_part_format(part_folder):
                continue
            for product_folder in os.listdir(part_path):
                if self._limit_reached(limit):
                    break

                product_path = os.path.join(part_path, product_folder)
                if not os.path.isdir(product_path):
                    continue

                pair = self._find_pair(product_path)
                if pair is None:
                    continue

                anchor, positive = pair
                self.class_to_images[product_folder] = [anchor, positive]
                self.samples.append((anchor, positive, product_folder))

        # Precompute negatives if needed
        if self.sample_negatives == "epoch":
            self.negative_map = self.assign_negatives()

    def _limit_reached(self, limit):
        """Return True when a positive ``limit`` has been met by the sample count."""
        return 0 < limit <= len(self)

    @staticmethod
    def _find_pair(product_path):
        """Return ``(anchor_path, positive_path)`` for a product folder, or None.

        Scanning stops at the first complete pair. The previous implementation
        kept iterating and re-appended the sample for every remaining entry,
        which duplicated samples and let the dataset exceed ``limit``.
        """
        entries = [
            os.path.join(product_path, name) for name in os.listdir(product_path)
        ]
        # Ensure at least an anchor-positive pair is possible.
        if len(entries) < 2:
            return None

        anchor = None
        positive = None
        for entry in entries:
            if os.path.isdir(entry):  # review folder
                review_images = [
                    path
                    for path in (
                        os.path.join(entry, name) for name in os.listdir(entry)
                    )
                    if is_image_file(path)
                ]
                # Only the first image of a review folder is used.
                if review_images:
                    positive = review_images[0]
            elif anchor is None and is_image_file(entry):
                anchor = entry

            if anchor is not None and positive is not None:
                return anchor, positive
        return None

    def _pick_negative(self, product_label, candidate_labels):
        """Draw a negative image path from a class other than ``product_label``.

        Honors ``neg_only_reviews`` so that "batch" and "epoch" sampling follow
        the same rule. Raises ``IndexError`` (from ``random.choice``) when no
        other class exists.
        """
        other_labels = [cls for cls in candidate_labels if cls != product_label]
        negative_label = random.choice(other_labels)
        images = self.class_to_images[negative_label]
        if self.neg_only_reviews:
            return images[1]  # index 1 is the review (positive) image
        return random.choice(images)

    @staticmethod
    def _load_rgb(path):
        """Open ``path`` and return an RGB copy, closing the underlying file."""
        with Image.open(path) as img:
            return img.convert("RGB")

    def assign_negatives(self):
        """Assigns a random negative from a different class to every class.

        Returns:
            dict mapping each class label to the path of its negative image.
        """
        product_list = list(self.class_to_images.keys())
        return {
            product_label: self._pick_negative(product_label, product_list)
            for product_label in product_list
        }

    def __getitem__(self, index):
        """Return the ``(anchor, positive, negative)`` images for ``index``.

        Images are loaded as RGB and passed through ``transform`` when set.

        Raises:
            ValueError: if ``sample_negatives`` is neither 'batch' nor 'epoch'.
        """
        anchor_path, positive_path, product_label = self.samples[index]

        # Choose negative based on sampling strategy
        if self.sample_negatives == "batch":
            negative_path = self._pick_negative(product_label, self.class_to_images)
        elif self.sample_negatives == "epoch":
            negative_path = self.negative_map[product_label]
        else:
            raise ValueError("Unsupported sampling strategy. Use 'batch' or 'epoch'.")

        # Load images
        anchor = self._load_rgb(anchor_path)
        positive = self._load_rgb(positive_path)
        negative = self._load_rgb(negative_path)

        if self.transform:
            anchor = self.transform(anchor)
            positive = self.transform(positive)
            negative = self.transform(negative)

        return anchor, positive, negative

    def __len__(self):
        """Number of (anchor, positive) samples collected."""
        return len(self.samples)

    def update_negatives(self):
        """Call this at the start of each epoch if using 'epoch' sampling."""
        if self.sample_negatives == "epoch":
            self.negative_map = self.assign_negatives()


class EvalTripletDataset(TripletDataset):
    """Triplet dataset variant whose negatives are assigned deterministically."""

    def assign_negatives(self):
        """
        Assign one negative sample to every product.

        Description:
        - Walks the products in ``class_to_images`` in insertion order.
        - Each product is paired with the next product in that order, and the
          last product wraps around to the first (circular indexing).
        - The negative is the image at index 1 of the paired product's entry.

        Returns:
            dict: Mapping from each product label to its negative image.
        """
        labels = list(self.class_to_images.keys())
        label_count = len(labels)
        return {
            label: self.class_to_images[labels[(position + 1) % label_count]][1]
            for position, label in enumerate(labels)
        }


In [None]:
import torch.nn.functional as F

def evaluate_batch(anchor, pos, neg, threshold=0.5, metric="l2"):
    """
    Count confusion-matrix entries for one batch of triplet embeddings.

    Each anchor/positive pair is a "same" example and each anchor/negative
    pair is a "different" example. A pair is predicted "same" when its cosine
    similarity is >= ``threshold`` (metric 'cosine') or when its L2 distance
    is <= ``threshold`` (metric 'l2').

    Args:
        anchor (torch.Tensor): Anchor embeddings, shape (batch_size, embedding_dim)
        pos (torch.Tensor): Positive embeddings, shape (batch_size, embedding_dim)
        neg (torch.Tensor): Negative embeddings, shape (batch_size, embedding_dim)
        threshold (float): Decision threshold between "same" and "different".
        metric (str): 'cosine' or 'l2', selecting the comparison measure.

    Returns:
        tuple: (TP, TN, FP, FN)

    Raises:
        ValueError: if ``metric`` is neither 'cosine' nor 'l2'.
    """

    def _count(mask):
        # Number of True entries as a plain Python number.
        return mask.sum().item()

    if metric == "cosine":
        pos_score = F.cosine_similarity(anchor, pos, dim=-1)
        neg_score = F.cosine_similarity(anchor, neg, dim=-1)
        # Higher similarity means "same".
        return (
            _count(pos_score >= threshold),  # TP: positive judged same
            _count(neg_score < threshold),  # TN: negative judged different
            _count(neg_score >= threshold),  # FP: negative judged same
            _count(pos_score < threshold),  # FN: positive judged different
        )

    if metric == "l2":
        pos_dist = torch.norm(anchor - pos, p=2, dim=-1)
        neg_dist = torch.norm(anchor - neg, p=2, dim=-1)
        # Smaller distance means "same".
        return (
            _count(pos_dist <= threshold),  # TP
            _count(neg_dist > threshold),  # TN
            _count(neg_dist <= threshold),  # FP
            _count(pos_dist > threshold),  # FN
        )

    raise ValueError("Metric must be 'cosine' or 'l2'")


def evaluate_metrics(tp, tn, fp, fn):
    """Derive classification metrics from confusion-matrix counts.

    Every ratio falls back to 0 when its denominator is 0. This now includes
    accuracy, which previously raised ``ZeroDivisionError`` when all four
    counts were zero (e.g. an empty evaluation set).

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        dict with keys "Accuracy", "Precision", "Recall", "F1-Score",
        "False Positive Rate (FPR)" and "False Negative Rate (FNR)".
    """

    def _ratio(numerator, denominator):
        # Shared zero-denominator guard so all metrics behave consistently.
        return numerator / denominator if denominator != 0 else 0

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * (precision * recall), precision + recall)
    fpr = _ratio(fp, fp + tn)
    fnr = _ratio(fn, fn + tp)

    return {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1_score,
        "False Positive Rate (FPR)": fpr,
        "False Negative Rate (FNR)": fnr,
    }


In [None]:
import torch
import wandb
from typing import Literal
from eval import evaluate_batch, evaluate_metrics


class SpamDetectorTrainer:
    """Training loop for a triplet-embedding model with early stopping.

    Each loader is expected to yield ``(anchor, positive, negative)`` batches.
    The model embeds the three inputs separately and ``criterion`` receives the
    three embeddings.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        criterion: torch.nn.Module,
        train_loader: torch.utils.data.DataLoader,
        valid_loader: torch.utils.data.DataLoader,
        optimizer: torch.optim.Optimizer,
        scheduler=None,
        lr_types: Literal["step", "epoch"] = "step",
        device: torch.device = "cuda",
        epochs: int = 10,
        max_norm: float = 0,
        log_writer=None,
        patience=3,
    ):
        """
        Args:
            model: Network producing embeddings; moved to ``device``.
            criterion: Loss called as ``criterion(anchor, positive, negative)``.
            train_loader: Iterable of training triplet batches (needs ``len``).
            valid_loader: Iterable of validation triplet batches (needs ``len``).
            optimizer: Optimizer updating ``model``'s parameters.
            scheduler: Optional LR scheduler with a ``step()`` method.
            lr_types: "step" steps the scheduler after every batch, "epoch"
                after every epoch.
            device: Device the model and batches are moved to.
            epochs: Number of epochs to run per ``train`` call.
            max_norm: Gradient-clipping norm; values <= 0 disable clipping.
            log_writer: Optional object exposing ``log(dict)`` (for example the
                ``wandb`` module). Left unannotated because a module is not a
                valid type annotation.
            patience: Epochs without validation improvement before stopping.
        """
        self.model = model.to(device)
        self.criterion = criterion
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.lr_types = lr_types
        self.device = device
        self.epochs = epochs
        self.max_norm = max_norm
        self.log_writer = log_writer
        self.patience = patience

    def _embed_triplet(self, anchor, positive, negative):
        """Move a triplet batch to the device and return its three embeddings."""
        return (
            self.model(anchor.to(self.device)),
            self.model(positive.to(self.device)),
            self.model(negative.to(self.device)),
        )

    def _mean_grad_norm(self):
        """Return the mean of the per-parameter gradient norms (0 if none)."""
        norms = [
            param.grad.norm().item()
            for param in self.model.parameters()
            if param.grad is not None
        ]
        return sum(norms) / len(norms) if norms else 0

    def train_one_epoch(self, epoch):
        """Run one pass over ``train_loader`` and return the mean batch loss.

        Args:
            epoch: Zero-based epoch index, used only for logged step counters.
        """
        self.model.train()
        total_loss = 0
        steps_per_epoch = len(self.train_loader)
        global_step = epoch * steps_per_epoch  # Global step tracking

        for step, (anchor, positive, negative) in enumerate(self.train_loader):
            self.optimizer.zero_grad()
            anchor_embeded, positive_embeded, negative_embeded = self._embed_triplet(
                anchor, positive, negative
            )
            loss = self.criterion(anchor_embeded, positive_embeded, negative_embeded)

            loss.backward()

            # The gradient norm is only used for logging, so skip the
            # per-parameter host syncs when nothing is logged. It is measured
            # before clipping so the logged value reflects the raw gradients.
            mean_grad_norm = self._mean_grad_norm() if self.log_writer else 0

            # Apply gradient clipping when enabled
            if self.max_norm > 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_norm)

            self.optimizer.step()

            # Step scheduler per step
            if self.scheduler and self.lr_types == "step":
                self.scheduler.step()

            batch_loss = loss.item()

            # Log to W&B
            if self.log_writer:
                self.log_writer.log(
                    {
                        "train/loss": batch_loss,
                        "train/learning_rate": self.optimizer.param_groups[0]["lr"],
                        "train/grad_norm": mean_grad_norm,
                        "train/global_step": global_step + step,
                        "train/epoch": epoch + step / steps_per_epoch,
                    }
                )

            total_loss += batch_loss

        epoch_loss = total_loss / steps_per_epoch
        if self.log_writer:
            self.log_writer.log({"train/mean_loss": epoch_loss})

        # Step scheduler per epoch
        if self.scheduler and self.lr_types == "epoch":
            self.scheduler.step()  # Step based on epoch

        return epoch_loss

    def validate(self):
        """Evaluate on ``valid_loader``; log metrics and return the mean loss."""
        self.model.eval()
        running_loss = 0.0
        counts = [0, 0, 0, 0]  # tp, tn, fp, fn — the order evaluate_batch returns

        with torch.no_grad():
            for anchor, positive, negative in self.valid_loader:
                anchor_embeded, positive_embeded, negative_embeded = (
                    self._embed_triplet(anchor, positive, negative)
                )
                loss = self.criterion(
                    anchor_embeded, positive_embeded, negative_embeded
                )
                running_loss += loss.item()

                batch_counts = evaluate_batch(
                    anchor_embeded, positive_embeded, negative_embeded
                )
                counts = [
                    total + batch for total, batch in zip(counts, batch_counts)
                ]

        metrics = evaluate_metrics(*counts)

        # Calculate average loss
        avg_loss = running_loss / len(self.valid_loader)

        # Log metrics and loss to W&B
        if self.log_writer:
            self.log_writer.log(
                {
                    "val/accuracy": metrics["Accuracy"],
                    "val/precision": metrics["Precision"],
                    "val/recall": metrics["Recall"],
                    "val/f1_score": metrics["F1-Score"],
                    "val/False Positive Rate": metrics["False Positive Rate (FPR)"],
                    "val/False Negative Rate": metrics["False Negative Rate (FNR)"],
                    "val/val_loss": avg_loss,
                }
            )
        return avg_loss

    def train(self, resume_from_checkpoint=None):
        """Train for ``self.epochs`` epochs with checkpointing and early stopping.

        A checkpoint ``checkpoint_{epoch}.pth`` is written whenever validation
        loss improves. Training stops early after ``self.patience`` consecutive
        epochs without improvement.

        Args:
            resume_from_checkpoint: Optional path to a checkpoint written by
                this method. Training continues from the epoch after the one
                stored in it and runs ``self.epochs`` further epochs.
        """
        start_epoch = 0
        best_val_loss = float("inf")
        epochs_without_improvement = 0  # Track epochs without improvement

        # Load checkpoint if provided
        if resume_from_checkpoint:
            checkpoint = torch.load(resume_from_checkpoint, map_location=self.device)
            self.model.load_state_dict(checkpoint["model_state_dict"])
            self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

            # Checkpoints saved without a scheduler store None under this key;
            # passing None to load_state_dict would crash.
            scheduler_state = checkpoint.get("scheduler_state_dict")
            if self.scheduler and scheduler_state is not None:
                self.scheduler.load_state_dict(scheduler_state)

            start_epoch = checkpoint["epoch"] + 1
            best_val_loss = checkpoint.get("best_val_loss", float("inf"))
            epochs_without_improvement = checkpoint.get("epochs_without_improvement", 0)

            print(f"Resuming training from epoch {start_epoch}")
        else:
            print("Start training!")

        # Last epoch number shown in progress output; accounts for resumed runs
        # so the counter never reads e.g. "12/10".
        end_epoch = start_epoch + self.epochs
        for epoch in range(start_epoch, end_epoch):
            epoch_loss = self.train_one_epoch(epoch)
            val_loss = self.validate()

            print(
                f"Epoch [{epoch+1}/{end_epoch}], Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}"
            )

            # Check if validation loss improved
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_without_improvement = 0  # Reset counter
                print(
                    f"New best validation loss: {best_val_loss:.4f}. Saving checkpoint."
                )

                checkpoint = {
                    "epoch": epoch,
                    "model_state_dict": self.model.state_dict(),
                    "optimizer_state_dict": self.optimizer.state_dict(),
                    "scheduler_state_dict": (
                        self.scheduler.state_dict() if self.scheduler else None
                    ),
                    "best_val_loss": best_val_loss,
                    "epochs_without_improvement": epochs_without_improvement,
                }
                torch.save(checkpoint, f"checkpoint_{epoch}.pth")
            else:
                epochs_without_improvement += 1
                print(
                    f"No improvement for {epochs_without_improvement}/{self.patience} epochs."
                )

            # Stop training if no improvement for `self.patience` epochs
            if epochs_without_improvement >= self.patience:
                print(
                    f"Validation loss hasn't improved for {self.patience} epochs. Stopping training early."
                )
                break

        print("Training complete.")


In [None]:
from torch import nn


class EmbeddingHead(nn.Module):
    """Projection head turning a feature map into a unit-length embedding.

    Pipeline: BatchNorm2d over ``in_features`` channels, flatten, dropout,
    linear projection to ``embedding_size``, then normalization along the
    last dimension.

    Args:
        embedding_size: Size of the output embedding.
        in_features: Channel count of the input and input width of the linear
            layer, so the flattened input must have exactly ``in_features``
            values per sample.
        dropout: Dropout probability applied before the linear layer.
    """

    def __init__(self, embedding_size, in_features, dropout=0.2):
        super().__init__()
        # Attribute names define the state_dict keys; keep them stable.
        self.batchnorm = nn.BatchNorm2d(num_features=in_features)
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(
            in_features=in_features, out_features=embedding_size, bias=True
        )

    def forward(self, x):
        flat_features = self.flatten(self.batchnorm(x))
        projected = self.fc(self.dropout(flat_features))  # (N, embedding_size)
        # Normalize along the feature dimension.
        return nn.functional.normalize(projected, dim=-1)

In [None]:

import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import OneCycleLR
import torch
from torchvision.models import ConvNeXt_Tiny_Weights, convnext_tiny
import wandb

# Dataset locations (Kaggle input mounts).
data_path = "/kaggle/input/review-thesis-datasets"
val_data_path = "/kaggle/input/val-review-thesis-datasets"
# Hyperparameters
base_lr = 2e-5  # Peak learning rate handed to the scheduler
num_epochs = 50
dataset_size = 5000  # Upper bound on training samples (dataset `limit`)
val_dataset_size = 1000  # Upper bound on validation samples
batch_size = 64
warmup_ratio = 0.1  # Fraction of the schedule spent warming up (10%)
device = "cuda" if torch.cuda.is_available() else "cpu"

# The keyword is `weights` (plural). The previous `weight=` spelling is not a
# parameter of convnext_tiny, so the pretrained weights were never loaded and
# the backbone started from random initialization.
model = convnext_tiny(weights=ConvNeXt_Tiny_Weights.DEFAULT)
# Swap the classification head for a 128-d embedding head; 768 is passed as
# the head's input feature count.
model.classifier = EmbeddingHead(128, 768)
model = torch.nn.DataParallel(model)
model.to(device)

# Values shared by every pipeline below.
_IMAGE_SIZE = (224, 224)  # all images are resized to 224x224
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Earlier, heavier augmentation recipe (adds horizontal flip and rotation).
old_transform = transforms.Compose(
    [
        transforms.Resize(_IMAGE_SIZE),
        transforms.ToTensor(),  # PIL image -> tensor
        transforms.RandomHorizontalFlip(p=0.5),  # flip horizontally half the time
        transforms.ColorJitter(brightness=0.3),  # brightness jitter (±30%)
        transforms.RandomPerspective(distortion_scale=0.5, p=0.5),  # perspective warp
        transforms.RandomRotation(degrees=30),  # rotate within ±30 degrees
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Training augmentation: brightness jitter and perspective warp only.
transform = transforms.Compose(
    [
        transforms.Resize(_IMAGE_SIZE),
        transforms.ToTensor(),  # PIL image -> tensor
        transforms.ColorJitter(brightness=0.3),  # brightness jitter (±30%)
        transforms.RandomPerspective(distortion_scale=0.5, p=0.5),  # perspective warp
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Validation: deterministic resize + normalization, no augmentation.
val_transform = transforms.Compose(
    [
        transforms.Resize(_IMAGE_SIZE),
        transforms.ToTensor(),  # PIL image -> tensor
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

print('Load train datasets!')
dataset = TripletDataset(data_path, transform=transform, limit=dataset_size)
print('Load val datasets!')
val_dataset = EvalTripletDataset(val_data_path, transform=val_transform, limit=val_dataset_size, type='val')


train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Adam optimizer with a triplet margin loss
optimizer = optim.Adam(model.parameters(), lr=base_lr)
criterion = torch.nn.TripletMarginLoss()
# OneCycleLR raises once it is stepped more than epochs * steps_per_epoch
# times, so the step count must match what the loader really yields. The
# previous int(dataset_size / batch_size) rounded down (the loader keeps the
# last partial batch) and ignored the actual number of samples found on disk.
lr_scheduler = OneCycleLR(
    optimizer,
    max_lr=base_lr,
    epochs=num_epochs,
    steps_per_epoch=len(train_loader),
    pct_start=warmup_ratio
)

# Sanity check: pull one batch from each loader and print the tensor shapes.
# Each batch is an (anchor, positive, negative) triplet, so `images` holds the
# anchors and `labels` holds the positives.
for images, labels, negative in train_loader:
    print(
        f"Batch size: {images.shape}, Labels: {labels.shape}, Negative: {negative.shape}"
    )
    break  # Stop after the first batch

for images, labels, negative in val_loader:
    print(
        f"Batch size: {images.shape}, Labels: {labels.shape}, Negative: {negative.shape}"
    )
    break  # Stop after the first batch

In [None]:
# SECURITY: a W&B API key used to be hard-coded on this line. Credentials must
# not live in source; the key that was committed should be revoked and
# regenerated. Provide the key through the WANDB_API_KEY environment variable
# (or a prior `wandb login`) before running this cell.
wandb.login()
wandb.init(project="review thesis project", name="experiment_8")
# To continue an earlier run instead:
# wandb.init(project="review thesis project", name="experiment_2", resume="allow")
torch.cuda.empty_cache()
trainer = SpamDetectorTrainer(
    model=model,
    criterion=criterion,
    train_loader=train_loader,
    valid_loader=val_loader,
    optimizer=optimizer,
    scheduler=lr_scheduler,
    device=device,
    epochs=num_epochs,
    log_writer=wandb,  # the wandb module itself provides .log(dict)
    patience=5
)

trainer.train()

wandb.finish()
