In [4]:
import os
from pathlib import Path

SEED = 42
DATASETS_DIR = Path(os.getcwd()).parent / "data" / "datasets"
SET_NAMES = ["train", "val", "test"]
DATASETS_DIR

WindowsPath('d:/Projects/synthetic-images-detection/data/datasets')

In [6]:
import concurrent.futures
from tqdm.notebook import tqdm
import math
from PIL import Image
from pathlib import Path
import pandas as pd
import numpy as np
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor


class DatasetUtil:
    def __init__(self, dataset_dir, set_names):
        self.dataset_dir = dataset_dir
        self.set_names = set_names
        _datasets = {
            name: {
                "df": self.read_dataset(dataset_dir.parent, file_name=name + ".csv")
            } 
            for name in self.set_names
        }
        for d in _datasets:
            _datasets[d]["emb_matrix"] = self.load_embedding_matrix(_datasets[d]["df"], workers=16)
        self.datasets = _datasets
    
    def get_dataset(self, name: str) -> tuple[pd.DataFrame, np.ndarray]:
        data = self.datasets[name]
        return data["df"], data["emb_matrix"]
    
    @staticmethod
    def load_embedding_matrix(
        df: pd.DataFrame,
        dtype: np.dtype = np.float32,
        dim: int = 768,
        workers: int = 16
    ) -> np.ndarray:
        memmap_paths = df["emb_memmap"]
        df["emb_idx"] = np.arange(len(memmap_paths), dtype=int)

        def read_emb(path: str) -> np.ndarray:
            return np.fromfile(path, dtype=dtype, count=dim)

        with ThreadPoolExecutor(max_workers=workers) as executor:
            embeddings = list(tqdm(
                executor.map(read_emb, memmap_paths),
                total=len(memmap_paths),
                desc="Loading embeddings"
            ))

        return np.stack(embeddings, axis=0)

    @staticmethod
    def read_dataset(dir_path: Path, file_name: str = "processed_with_embeddings.csv") -> pd.DataFrame:
        df = pd.read_csv(dir_path / file_name)
        df["fp"] = df["fp"].apply(lambda x: DATASETS_DIR / Path(x))
        df["emb_memmap"] = df["emb_memmap"].apply(lambda x: DATASETS_DIR / Path(x))
        return df

    @staticmethod
    def save_dataset(df: pd.DataFrame, target_path: Path) -> None:
        df["fp"] = df["fp"].apply(lambda x: Path(x).relative_to(DATASETS_DIR))
        df["lr_fp"] = df["lr_fp"].apply(lambda x: Path(x).relative_to(DATASETS_DIR))
        df["emb_memmap"] = df["emb_memmap"].apply(lambda x: Path(x).relative_to(DATASETS_DIR))
        df.to_csv(target_path, index=False)


In [7]:
datasets = DatasetUtil(DATASETS_DIR, SET_NAMES)

  df = pd.read_csv(dir_path / file_name)


Loading embeddings:   0%|          | 0/198272 [00:00<?, ?it/s]

Loading embeddings:   0%|          | 0/18084 [00:00<?, ?it/s]

Loading embeddings:   0%|          | 0/162773 [00:00<?, ?it/s]

In [8]:
import numpy as np
import pandas as pd
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)
from sklearn.preprocessing import StandardScaler

In [27]:
test_df, test_emb_matrix = datasets.get_dataset("test")
sota_test_df = test_df[test_df.model_name.isin(["FLUX1_dev", "FLUX1_pro"])]
sota_emb = test_emb_matrix[sota_test_df.emb_idx]
sota_test_df.emb_idx = range(len(sota_emb))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sota_test_df.emb_idx = range(len(sota_emb))


In [35]:
def extract_xy(df, emb_matrix):
    X = emb_matrix[df['emb_idx'].values]
    y = df['label'].values
    return X, y

X_train, y_train = extract_xy(*datasets.get_dataset("train"))
X_val, y_val   = extract_xy(*datasets.get_dataset("val"))
X_test, y_test  = extract_xy(sota_test_df, sota_emb)

In [36]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

In [37]:
clf = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    n_jobs=-1,
    random_state=42
)

start_time = time.time()
clf.fit(X_train_scaled, y_train)
print(f"Training completed in {(time.time() - start_time):.2f} seconds")

Training completed in 9.14 seconds


In [40]:
def evaluate(model, X, y, split_name):
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1]

    print(f"--- {split_name} Metrics ---")
    print(
        f"Accuracy  : {accuracy_score(y, y_pred):.4f}\n"
        f"Precision : {precision_score(y, y_pred):.4f}\n"
        f"Recall    : {recall_score(y, y_pred):.4f}\n"
        f"F1-score  : {f1_score(y, y_pred):.4f}\n"
        f"ROC AUC   : {roc_auc_score(y, y_prob):.4f}\n"
    )
    print("\nClassification Report:\n", classification_report(y, y_pred, digits=4))
    cm = confusion_matrix(y, y_pred)
    print("Confusion Matrix:\n", cm)

In [41]:
# Validation
evaluate(clf, X_val_scaled, y_val, 'Validation')

# Test
evaluate(clf, X_test_scaled, y_test, 'Test')

--- Validation Metrics ---
Accuracy  : 0.9904
Precision : 0.9868
Recall    : 0.9941
F1-score  : 0.9905
ROC AUC   : 0.9996


Classification Report:
               precision    recall  f1-score   support

           0     0.9941    0.9867    0.9904      9042
           1     0.9868    0.9941    0.9905      9042

    accuracy                         0.9904     18084
   macro avg     0.9905    0.9904    0.9904     18084
weighted avg     0.9905    0.9904    0.9904     18084

Confusion Matrix:
 [[8922  120]
 [  53 8989]]
--- Test Metrics ---
Accuracy  : 0.9988
Precision : 1.0000
Recall    : 0.9988
F1-score  : 0.9994
ROC AUC   : nan


Classification Report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     1.0000    0.9988    0.9994      9777

    accuracy                         0.9988      9777
   macro avg     0.5000    0.4994    0.4997      9777
weighted avg     1.0000    0.9988    0.9994      9777

Confusion Mat

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
