In [1]:
import os
import zipfile
import dotenv
import subprocess
import pandas as pd
from typing import Dict, Any, Optional
import json
import yaml
import time
import glob
import hashlib
import shutil
import logging
import pandas as pd
from PIL import Image

competition_name = "urban-reid-challenge"

################ Probably nothing has to be modified from now on ################
# Show INFO-level records emitted through the root logger.
logging.getLogger().setLevel(logging.INFO)

# Local folder holding the competition data: assets/datasets/<competition_name>
dataset_path = os.path.join('assets', 'datasets', competition_name)

# Fail fast when the .env file cannot be loaded or does not provide the
# Kaggle username.
assert dotenv.load_dotenv(dotenv_path='../../.env')
assert os.environ.get('KAGGLE_USERNAME')

# NOTE(review): these imports sit below the .env loading, presumably so the
# Kaggle client can see the credentials when it is imported - confirm before
# moving them to the top import block.
from kaggle.api.kaggle_api_extended import KaggleApi
from huggingface_hub import HfApi, snapshot_download

# Authenticated Kaggle client used by the download cells.
api = KaggleApi()
api.authenticate()

In [2]:
def huggingface_model_download(
    folder_models: str,
    model_repository: str,
    model_config: Optional[str] = None,
    token: Optional[str] = None,
    clean: bool = False,
) -> None:
    '''
    Download a model or specific model config from Hugging Face Hub.

    The snapshot is first fetched into a cache directory and its files are
    then copied into the destination folder. When the destination folder
    already exists (and ``clean`` is False) nothing is downloaded.

    Args:
        folder_models: Local directory to save the model
        model_repository: Hugging Face repository ID
        model_config: Specific model config to download (None for entire repository).
            When given, the files are stored in ``folder_models/model_config``.
        token: Hugging Face authentication token
        clean: If True, the folder will be deleted before downloading
    '''
    folder_model = os.path.join(folder_models, model_config) if model_config else folder_models
    if clean and os.path.exists(folder_model):
        shutil.rmtree(folder_model)
    if os.path.exists(folder_model):
        logging.info('Model already exists locally, skipping download')
        return

    folder_cache = '/tmp/huggingface_cache'
    os.makedirs(folder_cache, exist_ok=True)

    # Restrict the snapshot to the requested config sub-folder, if any.
    repo_path = snapshot_download(
        repo_id=model_repository,
        repo_type="model",
        token=token,
        allow_patterns=f"{model_config}/*" if model_config else None,
        cache_dir=folder_cache,
    )

    source_path = os.path.join(repo_path, model_config) if model_config else repo_path
    print(source_path)

    # The destination is created only once the snapshot is available, so a
    # failed download does not leave an empty folder behind that a later call
    # would mistake for an already downloaded model.
    os.makedirs(folder_model, exist_ok=True)
    for root, _, files in os.walk(source_path):
        for file in files:
            file_source_path = os.path.join(root, file)
            rel_path = os.path.relpath(file_source_path, start=source_path)
            target_path = os.path.join(folder_model, rel_path)
            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            # copy2 follows symlinks by default, so entries of the snapshot
            # that are links are written as regular files in the target.
            shutil.copy2(file_source_path, target_path)

In [3]:
# Download the dataset
if not os.path.exists(dataset_path):
    # Only the parent folders are created up front: the dataset folder itself
    # appears when the archive is extracted, so a failed download does not
    # leave an empty folder that the check above would take for the dataset.
    os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
    api.competition_download_files(competition_name, path="./assets")
    archive_path = f'./assets/{competition_name}.zip'
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(dataset_path)
    os.remove(archive_path)
    logging.info(f"Downloaded dataset for {competition_name}")

    # Each image folder is extracted as <split>/<split>/...: move the content
    # one level up and drop the duplicated folder. This is done relative to
    # dataset_path (no hard-coded competition name, no change of the kernel's
    # working directory) and raises if a move fails.
    for split in ('image_query', 'image_test', 'image_train'):
        split_dir = os.path.join(dataset_path, split)
        nested_dir = os.path.join(split_dir, split)
        if not os.path.isdir(nested_dir):
            continue
        for entry in os.listdir(nested_dir):
            shutil.move(os.path.join(nested_dir, entry), os.path.join(split_dir, entry))
        shutil.rmtree(nested_dir)
else:
    logging.info("Dataset already existed")

INFO:root:Dataset already existed


In [4]:
# Generate reduced dataset
# The string below is a bash recipe kept for reference only: it is a bare
# string expression (its display is suppressed by the trailing `;`), so this
# cell does not execute any of it. The recipe copies the dataset folder to
# `urban-reid-challenge-reduced/`, deletes the `image_query/0*.jpg` files whose
# number is above 3 and the `image_test/0*.jpg` files whose number is above 10,
# and truncates `query.csv` / `test.csv` to their first 4 / 11 lines.
# NOTE(review): presumably meant to be run by hand from `assets/datasets` - confirm.
'''
cp -r urban-reid-challenge/ urban-reid-challenge-reduced/
cd urban-reid-challenge-reduced/
for file in image_query/0*.jpg; do
    num=$(basename "$file" .jpg)  # Extract number
    num=$((10#$num))  # Convert to decimal
    if ((num > 3)); then
        rm "$file"
    fi
done
for file in image_test/0*.jpg; do
    num=$(basename "$file" .jpg)  # Extract number
    num=$((10#$num))  # Convert to decimal
    if ((num > 10)); then
        rm "$file"
    fi
done

sed -i '4q' query.csv
sed -i '11q' test.csv
''';

In [None]:
# Generate augmented dataset?
#!cp -r assets/datasets/urban-reid-challenge assets/datasets/urban-reid-challenge-augmented

In [5]:
# Download the model
os.makedirs('assets/models', exist_ok=True)
if not os.path.exists('assets/models/resnet50-19c8e357.pth'):
    !curl -o "assets/models/resnet50-19c8e357.pth" "https://download.pytorch.org/models/resnet50-19c8e357.pth"
else:
    logging.info('model already existed')

if not os.path.exists('assets/models/jx_vit_base_p16_224-80ecf9dd.pth'):
    !curl -L -o 'assets/models/jx_vit_base_p16_224-80ecf9dd.pth'  'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth'
else:
    logging.info('model already existed')

if not os.path.exists('assets/models/dsd_pretrained'):
    huggingface_model_download(
        'assets/models/dsd_pretrained',
        model_repository='primecai/dsd_model',
        clean=True,
    )
assert os.path.exists('assets/models/dsd_pretrained/pytorch_lora_weights.safetensors')
assert os.path.exists('assets/models/dsd_pretrained/transformer/config.json')
assert os.path.exists('assets/models/dsd_pretrained/transformer/diffusion_pytorch_model.safetensors')

INFO:root:model already existed
INFO:root:model already existed


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

pytorch_lora_weights.safetensors:   0%|          | 0.00/418M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/427 [00:00<?, ?B/s]

/tmp/huggingface_cache/models--primecai--dsd_model/snapshots/7a33f82fa3575453456c4cff8683f5aab25c6f6e


In [2]:
# Download third-party libraries that are not in pypi
os.makedirs('assets/libs', exist_ok=True)
if not os.path.exists('assets/libs/diffusion-self-distillation'):
    !git clone https://github.com/primecai/diffusion-self-distillation.git assets/libs/diffusion-self-distillation

Cloning into 'assets/libs/diffusion-self-distillation'...
remote: Enumerating objects: 78, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 78 (delta 36), reused 47 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (78/78), 11.33 MiB | 9.32 MiB/s, done.
Resolving deltas: 100% (36/36), done.
