In [1]:
import sys
import os

# Register the parent of the current working directory on sys.path (once).
# NOTE: the result depends on where the kernel was started, because it is
# derived from os.getcwd() rather than from this notebook's location.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
root_pth = os.path.abspath(parent_of_cwd)
if sys.path.count(root_pth) == 0:
    sys.path.append(root_pth)

In [2]:
from sana_clf.pipeline import SanaClassifierPipeline, SanaClassifierParameters, SanaText2ImgParameters
from sana_clf.train import train

In [6]:
# Text prompt later passed as `prompt=` to SanaText2ImgParameters.
# It is three lines of period-terminated descriptors separated by "\n".
prompt = "\n".join(
    (
        "complex. detailed. simple. bokeh effect. abstract. photorealistic. artistic. stylized. aesthetic.",
        "cinematic. instagram filters. color correction. midjourney. ugly. distorted. blurry. rendering.",
        "AI-generated. synthetic. high quality. low quality. pixelated. low illumination.",
    )
)

In [7]:
import pandas as pd
from pathlib import PureWindowsPath, Path

def load_csv(csv_path, dir_path):
    """Read a CSV from ``dir_path`` and rebase its ``fp`` column onto that directory.

    Each ``fp`` entry is parsed as a Windows-style path (so backslash
    separators are understood), converted to forward slashes, joined onto
    ``dir_path`` and stored back as a forward-slash string.

    Args:
        csv_path: Path of the CSV file, relative to ``dir_path``.
        dir_path: Directory containing the CSV (``str`` or ``Path``); also
            used as the base directory for every ``fp`` entry.

    Returns:
        The loaded ``pandas.DataFrame`` with the ``fp`` column replaced by
        the rebased paths. All other columns are returned as read.
    """
    base_dir = Path(dir_path)
    frame = pd.read_csv(base_dir / csv_path)
    frame["fp"] = frame["fp"].apply(
        # PureWindowsPath accepts both "\\" and "/" as separators.
        lambda raw: (base_dir / PureWindowsPath(raw).as_posix()).as_posix()
    )
    return frame

def _sampled_split(csv_name, data_dir, n_rows):
    """Load one split, keep the ``fp``/``label`` columns and draw ``n_rows`` rows.

    Sampling uses ``random_state=42`` and the result gets a fresh 0..n-1 index.
    """
    fp_and_label = load_csv(csv_name, data_dir)[["fp", "label"]]
    return fp_and_label.sample(n_rows, random_state=42).reset_index(drop=True)


# Fixed-size random subsets of the train and validation CSVs.
train_df = _sampled_split("train.csv", "/workspace/train", 20000)
val_df = _sampled_split("val.csv", "/workspace/val", 1000)

In [8]:
# Build the two parameter objects up front so the train(...) call reads flat.
classifier_params = SanaClassifierParameters(
    proj_dim=512,
    hidden_dims=[512, 128],
    drop_p=0.1,
)
text2img_params = SanaText2ImgParameters(prompt=prompt, t=0.25)

# Single-epoch training run on the sampled train/val frames.
# Checkpoints are written under "test_run" and metrics go to the
# "sana-classifier" wandb project.
# NOTE(review): the meaning and ordering of `transformer_layers` is defined
# by sana_clf.train.train, which is not visible here. Confirm before editing.
pipe = train(
    train_df=train_df,
    val_df=val_df,
    pretrained_model="Efficient-Large-Model/Sana_600M_512px_diffusers",
    cfg_params=classifier_params,
    t2i_params=text2img_params,
    transformer_layers=[24, 25, 9, 23, 22],
    output_dir="test_run",
    batch_size=4,
    epochs=1,
    lr=1e-4,
    warmup_steps=0,
    logging_steps=10,
    save_steps=200,
    seed=42,
    device="cuda",
    wandb_project="sana-classifier",
    wandb_run_name="5layers_base_prompt",
    verbose=True,
    progress_bar=True,
    wandb_enabled=True,
)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch 1 [train]:   0%|          | 0/5000 [00:00<?, ?it/s]

Checkpoint сохранен: test_run/checkpoint-step-5000
Checkpoint сохранен: test_run/checkpoint-epoch-1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch 1 [val]:   0%|          | 0/250 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation results for epoch 1:
  Accuracy:  0.8230
  Precision: 0.8042
  Recall:    0.8736
  F1 Score:  0.8375
  ROC AUC:   0.9108
  Confusion Matrix:
[[367 111]
 [ 66 456]]
  Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.77      0.81       478
         1.0       0.80      0.87      0.84       522

    accuracy                           0.82      1000
   macro avg       0.83      0.82      0.82      1000
weighted avg       0.82      0.82      0.82      1000



0,1
step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇████
train/loss,▄▄▅▄▂▁▃▁▁▄▂▅▄▂▂▂▂▂▂▁▁▁▂▃▁▅▂▄▁▂▂▁▂▁█▆▁▂▂▂
val/accuracy,▁
val/epoch,▁
val/f1,▁
val/precision,▁
val/recall,▁
val/roc_auc,▁

0,1
step,5000.0
train/loss,0.12408
val/accuracy,0.823
val/epoch,1.0
val/f1,0.83747
val/precision,0.80423
val/recall,0.87356
val/roc_auc,0.91077
