In [2]:
# Mount Google Drive, then make the project folder the working directory so the
# relative paths (data/..., artifacts/..., src.*) used by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/qshing_guard/

Mounted at /content/drive
/content/drive/MyDrive/qshing_guard


## 0. PIP Install

In [None]:
# Refresh the apt package index, then install the libzbar0 system package.
# NOTE(review): libzbar0 is presumably the native library that pyzbar loads at
# runtime -- confirm against the pyzbar installation notes.
! apt-get update
! apt-get install -y libzbar0

# Python packages (quiet install). Versions are not pinned.
! pip install -q qrcode pyzbar

## 1. Dataset

In [None]:
# 1-1. Build the URL manifest (Fast Test: 2,000 | Normal: 20,000 | Large: 50,000)
# Reads the three raw CSVs (kisa, kakao, normal) and writes the manifest and
# splits to data/processed. The limits below are set to the "Fast Test" size.
# NOTE(review): with --split_by url the recorded run's audit still reports
# domain_reg overlap between splits (e.g. train-test=49); decide whether that
# level of domain sharing between train and test is acceptable.
! python -m src.data.build_manifest \
  --kisa_csv data/raw/kisa_db.csv \
  --kakao_csv data/raw/kakao_db.csv \
  --normal_csv data/raw/normal_urls.csv \
  --normal_limit 2000 \
  --phish_limit 2000 \
  --balance_ratio 1.0 \
  --dedup_by_url_norm \
  --split_by url \
  --out_dir data/processed \
  --seed 42

[STEP] build kisa...
Normalize URLs (kisa): 100% 27582/27582 [00:00<00:00, 81824.42row/s]
  kisa: 27567
[STEP] build kakao...
Extract URLs (kakao): 100% 19009/19009 [00:00<00:00, 112440.33row/s]
Normalize URLs (kakao): 100% 3045/3045 [00:00<00:00, 76705.26row/s]
  kakao: 3037
[STEP] build normal...
Normalize URLs (normal): 100% 2000/2000 [00:00<00:00, 79442.09row/s]
  normal: 2000 (limit=2000)
[STEP] merge...
[STEP] dedup by url_norm: 32604 -> 21505
[STEP] phish_limit=2000 -> phish=2000, benign=2484
[STEP] balance_ratio=1.0 -> phish=2000, benign=2000
Compute eTLD+1 (domain_reg): 100% 4000/4000 [00:00<00:00, 105933.49row/s]
[STEP] split...
[AUDIT] url_norm overlap train-val=0, train-test=0, val-test=0
[AUDIT] domain_reg overlap train-val=38, train-test=49, val-test=19
[OK] manifest written to: data/processed
  total: 4000 label1(phish): 2000 label0(benign): 2000
  train/val/test: 2800 400 800


In [None]:
# 1-2. Generate QR codes (takes about 5-15 minutes)
# Reads data/processed/manifest.csv, saves the QR images under data/qr_images,
# and writes an updated manifest to data/processed/manifest_with_qr.csv.
! python -m src.qr.generate_qr \
  --manifest_path data/processed/manifest.csv \
  --out_dir data/qr_images \
  --update_manifest_out data/processed/manifest_with_qr.csv \
  --ecc H --box_size 10 --border 4

Generate QR: 100% 4000/4000 [01:24<00:00, 47.46qr/s, saved=4000]
Saved QR images under: data/qr_images
Wrote updated manifest: data/processed/manifest_with_qr.csv


In [None]:
# 1-3. Build the URL + QR dataset
# Writes train_with_qr.csv, val_with_qr.csv and test_with_qr.csv into data/processed.
! python -m src.data.attach_qr_paths \
  --manifest_with_qr data/processed/manifest_with_qr.csv \
  --splits_dir data/processed \
  --out_dir data/processed

Wrote: data/processed/train_with_qr.csv
Wrote: data/processed/val_with_qr.csv
Wrote: data/processed/test_with_qr.csv


## 2. Real-World Data Augmentation

In [None]:
# 2-1. Basic real-world QR data augmentation module
# Light strength, no background context, 2 augmented images per source QR
# (the recorded run turned 4000 inputs into 8000 files under data/qr_images_aug).
! python -m src.qr.augment_qr \
  --input_dir data/qr_images \
  --out_dir data/qr_images_aug \
  --n_per_image 2 \
  --strength light \
  --context_mode none \
  --seed 42

Augment QR: 100% 4000/4000 [04:59<00:00, 13.37img/s, skipped=0, written=8000]
Wrote augmented images to: data/qr_images_aug
Total augmented files: 8000


In [None]:
# 2-2. Stronger augmentation module that includes "QR + real-world background compositing"
# Backgrounds are taken from assets/backgrounds; per-image metadata is saved to
# data/processed/qr_aug_meta.csv.
# With --decode_filter, the recorded run logs some variants as skipped ("decode_failed").
# NOTE(review): --out_dir is the same data/qr_images_aug used by step 2-1, so both
# runs write into one directory -- confirm whether this is intended.
! python -m src.qr.augment_qr \
  --input_dir data/qr_images \
  --out_dir data/qr_images_aug \
  --n_per_image 2 \
  --strength strong \
  --background_dir assets/backgrounds \
  --context_mode mix \
  --context_prob 0.75 \
  --output_size 512 \
  --decode_filter \
  --save_meta_csv data/processed/qr_aug_meta.csv \
  --seed 42

[BG] Found 40 backgrounds under assets/backgrounds
	i=2 f=-1(010) part=0
	i=13 f=-1(000) part=0
	i=15 f=-1(101) part=0
	i=29 f=-1(000) part=0
	i=3 f=-1(000) part=0
Augment QR:  27% 1061/4000 [07:01<16:15,  3.01img/s, skipped=0, written=2122][WARN] skipped 1ce83424ce46b45a.png aug0: decode_failed
Augment QR:  33% 1316/4000 [08:38<14:07,  3.17img/s, skipped=1, written=2631][WARN] skipped dd1213725bba680d.png aug1: decode_failed
	i=9 f=-1(101) part=0
Augment QR:  41% 1625/4000 [10:35<13:41,  2.89img/s, skipped=2, written=3248][WARN] skipped 9583bc0307d9f503.png aug0: decode_failed
	i=12 f=-1(101) part=0
	i=21 f=-1(101) part=0
Augment QR:  61% 2451/4000 [15:45<10:40,  2.42img/s, skipped=3, written=4899][WARN] skipped 9e5c0d5d2190b59a.png aug1: decode_failed
	i=21 f=-1(111) part=0
Augment QR:  73% 2920/4000 [18:31<10:07,  1.78img/s, skipped=4, written=5836][WARN] skipped c97f41043b67d0c3.png aug1: decode_failed
	i=3 f=-1(010) part=0
Augment QR:  89% 3553/4000 [22:24<04:00,  1.86img/s, skipp

## 3. Fusion Training Models

In [None]:
# 3-1. URL string model
# Trains on the plain URL splits (train/val/test.csv); the recorded run saved
# the model to artifacts/models/url/url_model.joblib.
! python -m src.train.train_url \
  --train_csv data/processed/train.csv \
  --val_csv data/processed/val.csv \
  --test_csv data/processed/test.csv \
  --out_dir artifacts/models/url \
  --class_weight balanced \
  --warn_fpr 0.01 \
  --block_fpr 0.001

val {'tn': 200, 'fp': 0, 'fn': 5, 'tp': 195, 'precision': 0.9999999999999949, 'recall_tpr': 0.9749999999999952, 'f1': 0.9873417721513938, 'fpr': 0.0, 'tpr': 0.9749999999999952, 'n': 400, 'roc_auc': 0.99795, 'pr_auc': 0.9981282814413853, 'ece': 0.03691139930830799}
test {'tn': 398, 'fp': 2, 'fn': 3, 'tp': 397, 'precision': 0.9949874686716766, 'recall_tpr': 0.9924999999999975, 'f1': 0.9937421777216501, 'fpr': 0.004999999999999987, 'tpr': 0.9924999999999975, 'n': 800, 'roc_auc': 0.99918125, 'pr_auc': 0.9993268164642503, 'ece': 0.038950447477173106}
Saved: artifacts/models/url/url_model.joblib


In [None]:
# 3-2. QR image + Context Branch model
# Uses the *_with_qr.csv splits; output directory is artifacts/models/qr.
! python -m src.train.train_qr \
  --train_csv data/processed/train_with_qr.csv \
  --val_csv data/processed/val_with_qr.csv \
  --test_csv data/processed/test_with_qr.csv \
  --out_dir artifacts/models/qr \
  --augment_strength light \
  --use_context \
  --balance_sampler \
  --calibrate \
  --warn_fpr 0.01 \
  --block_fpr 0.001

Device: cuda
Train ep1: 100% 44/44 [08:37<00:00, 11.77s/batch, loss=0.643]
[VAL ep1] {'tn': 129, 'fp': 71, 'fn': 47, 'tp': 153, 'precision': 0.6830357142857113, 'recall_tpr': 0.7649999999999962, 'f1': 0.7216981132070455, 'fpr': 0.35499999999999826, 'tpr': 0.7649999999999962, 'n': 400, 'roc_auc': 0.816625, 'pr_auc': 0.8484573711924459, 'ece': 0.10594689626246694}
Train ep2: 100% 44/44 [03:58<00:00,  5.43s/batch, loss=0.55]
[VAL ep2] {'tn': 200, 'fp': 0, 'fn': 167, 'tp': 33, 'precision': 0.9999999999999697, 'recall_tpr': 0.16499999999999918, 'f1': 0.28326180257486167, 'fpr': 0.0, 'tpr': 0.16499999999999918, 'n': 400, 'roc_auc': 0.883025, 'pr_auc': 0.903196732010873, 'ece': 0.3671856193174609}
Train ep3: 100% 44/44 [02:51<00:00,  3.91s/batch, loss=0.531]
[VAL ep3] {'tn': 197, 'fp': 3, 'fn': 78, 'tp': 122, 'precision': 0.9759999999999922, 'recall_tpr': 0.609999999999997, 'f1': 0.7507692307687528, 'fpr': 0.014999999999999925, 'tpr': 0.609999999999997, 'n': 400, 'roc_auc': 0.918825, 'pr_auc'

In [None]:
# 3-3. Fusion (URL string + QR image + Context Branch) model
# Uses the *_with_qr.csv splits with gated fusion; output directory is
# artifacts/models/fusion.
! python -m src.train.train_fusion \
  --train_csv data/processed/train_with_qr.csv \
  --val_csv data/processed/val_with_qr.csv \
  --test_csv data/processed/test_with_qr.csv \
  --out_dir artifacts/models/fusion \
  --augment_strength light \
  --use_context \
  --fusion_mode gated \
  --balance_sampler \
  --calibrate \
  --warn_fpr 0.01 \
  --block_fpr 0.001

Device: cuda
Train ep1: 100% 59/59 [02:18<00:00,  2.35s/batch, loss=0.646]
[VAL ep1] {'tn': 200, 'fp': 0, 'fn': 93, 'tp': 107, 'precision': 0.9999999999999907, 'recall_tpr': 0.5349999999999974, 'f1': 0.697068403908336, 'fpr': 0.0, 'tpr': 0.5349999999999974, 'n': 400, 'roc_auc': 0.981925, 'pr_auc': 0.9864840262654948, 'ece': 0.3093055955320597}
Train ep2: 100% 59/59 [02:19<00:00,  2.37s/batch, loss=0.381]
[VAL ep2] {'tn': 199, 'fp': 1, 'fn': 15, 'tp': 185, 'precision': 0.9946236559139732, 'recall_tpr': 0.9249999999999954, 'f1': 0.9585492227974232, 'fpr': 0.004999999999999975, 'tpr': 0.9249999999999954, 'n': 400, 'roc_auc': 0.9986, 'pr_auc': 0.9986334797331952, 'ece': 0.1255374425277114}
Train ep3: 100% 59/59 [02:18<00:00,  2.35s/batch, loss=0.0926]
[VAL ep3] {'tn': 199, 'fp': 1, 'fn': 4, 'tp': 196, 'precision': 0.9949238578680153, 'recall_tpr': 0.9799999999999951, 'f1': 0.987405541561208, 'fpr': 0.004999999999999975, 'tpr': 0.9799999999999951, 'n': 400, 'roc_auc': 0.9994, 'pr_auc': 0.99

In [None]:
# 3-4. Model Test Report
# Evaluates the fusion model from artifacts/models/fusion on the val/test splits
# and saves the report under artifacts/reports/fusion_eval. The recorded run
# prints the resulting warn/block thresholds.
! python -m src.eval.eval_fusion_operational \
  --val_csv data/processed/val_with_qr.csv \
  --test_csv data/processed/test_with_qr.csv \
  --model_dir artifacts/models/fusion \
  --out_dir artifacts/reports/fusion_eval \
  --fit_temperature_on_val \
  --warn_fpr 0.01 \
  --block_fpr 0.001 \
  --use_context

Device: cuda
[OK] Saved: artifacts/reports/fusion_eval
  mode: fit_temperature_on_val | fusion_mode: gated
  warn_thr: 0.5917623043060303 block_thr: 0.9981675148010254


## 4. Security-Game-Based Adversarial Training Model (Co-evolution Training)

- We formulate quishing detection as a co-evolutionary adversarial game, where a context-aware attacker and a defender are alternately optimized, enabling the detector to progressively adapt to increasingly realistic attack strategies.

In [None]:
# 4.1 Red Team (Attack) vs Blue Team (Defense) Co-evolution Training
# Runs 15 rounds with an attacker pool of 3 on the gated fusion detector;
# outputs go to artifacts/models/coevo_fusion_payload.
! python -m src.train.train_coevolution \
  --train_csv data/processed/train_with_qr.csv \
  --val_csv data/processed/val_with_qr.csv \
  --out_dir artifacts/models/coevo_fusion_payload \
  --detector_mode fusion \
  --fusion_mode gated \
  --use_context \
  --background_dir assets/backgrounds \
  --rounds 15 \
  --attacker_pool 3 \
  --k_attack 5 \
  --k_defense 100 \
  --batch_size 64 \
  --image_size 224 \
  --balance_sampler \
  --decode_filter \
  --payload_match \
  --payload_col url_norm \
  --decode_subset 16 \
  --decode_resample 3 \
  --decode_min_keep 4

Device: cuda


In [None]:
# 4.2 Robustness Curve
# Runs the robustness-curve evaluation on the test split over a grid of attack
# strengths, using the final-round (15) detector and attacker checkpoints.
# The checkpoints are read from artifacts/models/coevo_fusion_payload, the
# --out_dir of the co-evolution training cell, so a fresh run finds them.
# NOTE(review): the checkpoint file names are kept from the original cell;
# confirm they match what train_coevolution actually writes.
! python -m src.eval.eval_robustness_curve \
  --test_csv data/processed/test_with_qr.csv \
  --out_dir artifacts/reports/coevo_robustness \
  --detector_mode fusion \
  --fusion_mode gated \
  --use_context \
  --background_dir assets/backgrounds \
  --ckpt_detector artifacts/models/coevo_fusion_payload/detector_round15.pt \
  --ckpt_attacker artifacts/models/coevo_fusion_payload/attacker0_round15.pt \
  --strength_grid 0,0.2,0.4,0.6,0.8,1.0

## 5. Real-World Test

In [None]:
# 5-1. Generate 20 samples mapped to their real URLs
# Pairs QR images from data/qr_images with the URLs in test_with_qr.csv and
# writes the result to samples/fusion_demo.csv.
! python scripts/make_fusion_demo_csv.py \
  --input_dir data/qr_images \
  --mapping_csv data/processed/test_with_qr.csv \
  --out_csv samples/fusion_demo.csv \
  --limit 20

[OK] Wrote: samples/fusion_demo.csv
[OK] Rows: 20
                                                                  qr_path                   url_norm
/content/drive/MyDrive/qshing_guard/data/qr_images/1/e3880f63a7ed7ad7.png https://w3nu.yahwagsc.pro/
/content/drive/MyDrive/qshing_guard/data/qr_images/1/ddcac6ad475166e9.png        http://apiqido.top/
/content/drive/MyDrive/qshing_guard/data/qr_images/0/87b55241ffa82ded.png     https://1xbet-lua.top/
/content/drive/MyDrive/qshing_guard/data/qr_images/1/d6ac813bb19d6e3a.png     http://buly.kr/8IsrVQd
/content/drive/MyDrive/qshing_guard/data/qr_images/1/78301bf8a4a229fb.png https://05a9.yahwagsc.pro/
/content/drive/MyDrive/qshing_guard/data/qr_images/1/f347c8292d2c640a.png https://w6wm.yahwagsc.pro/
/content/drive/MyDrive/qshing_guard/data/qr_images/0/30668bf792181179.png           https://hhs.gov/
/content/drive/MyDrive/qshing_guard/data/qr_images/1/4b05b87275e159d2.png     https://s10.a1dv.skin/
/content/drive/MyDrive/qshing_guard/data/

In [None]:
# 5-2. Classification Predict
# Scores the demo CSV with the fusion model, using thresholds read from
# artifacts/reports/fusion_eval/thresholds.json; results go to artifacts/demo/fusion_run1.
# NOTE(review): the training and evaluation cells pass --use_context but this
# call does not -- confirm whether demo_fusion_predict needs it.
! python -m src.app.demo_fusion_predict \
  --input_csv samples/fusion_demo.csv \
  --model_dir artifacts/models/fusion \
  --thresholds_json artifacts/reports/fusion_eval/thresholds.json \
  --out_dir artifacts/demo/fusion_run1 \
  --fusion_mode gated

[OK] Saved: artifacts/demo/fusion_run1/predictions.csv
[OK] Per-item visualizations: artifacts/demo/fusion_run1/per_item
[OK] Summary plots: artifacts/demo/fusion_run1/decision_counts.png artifacts/demo/fusion_run1/prob_hist.png
