### Necessary imports

In [None]:
import PIL
import clip
import torch
import shutil
from glob import glob
from pathlib import Path
from tqdm.auto import tqdm

### Enable GPU support

In [None]:
# Run on the GPU when torch reports a usable CUDA device, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Select and load the vision transformer model

In [None]:
# Name of the CLIP checkpoint to load; swap in the commented line below for a smaller model.
# NOTE(review): clip.load presumably only accepts the names listed by
# clip.available_models() -- confirm before picking a name from the link below.
model_name = 'ViT-L/14'
# model_name = 'ViT-B/32'  # Need more models? Have a look at: https://huggingface.co/openai

# clip.load hands back the network together with the preprocessing callable
# that is applied to every image before it is encoded.
model, preprocess = clip.load(model_name, device=device)

### Categorizing function

In [None]:
def _classify_and_copy(filepaths, labels, dest_root, pred_threshold, verbose):
    """Score each file against ``labels`` and copy the confident matches.

    Private helper of ``categorize_images``. It uses the notebook-level
    ``model``, ``preprocess`` and ``device`` objects.

    Args:
        filepaths: Paths of the candidate image files.
        labels: Non-empty list of class names.
        dest_root: ``Path`` of the directory holding one sub-folder per label.
        pred_threshold: Minimum score in percent; a file is copied only when
            its score is strictly greater than this value.
        verbose: When true, print the score of every label for every image.
    """
    # A bare ``import PIL`` does not guarantee the ``Image`` submodule is loaded.
    from PIL import Image

    # The prompts are the same for every image, so tokenize and encode them once.
    text_inputs = torch.cat([clip.tokenize(f"a photo of a {label}") for label in labels]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    for f in tqdm(filepaths):
        try:
            # Open one image at a time and close it right after preprocessing,
            # so a large folder cannot exhaust the process's file handles.
            with Image.open(f) as image:
                image_input = preprocess(image).unsqueeze(0).to(device)
        except OSError as err:
            # Unreadable, truncated or non-image files (PIL's
            # UnidentifiedImageError is an OSError) are reported and skipped
            # instead of aborting the whole run.
            print(f"Skipping {f}: {err}")
            continue

        with torch.no_grad():
            image_features = model.encode_image(image_input)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        # Rank all labels, best match first.
        values, indices = similarity[0].topk(len(labels))

        for value, index in zip(values.tolist(), indices.tolist()):
            pred_label = labels[index]
            percent = 100 * value

            if verbose:
                print(f"{pred_label}: {percent:.2f}%")

            # Compare the score rounded to two decimals, i.e. the value shown above.
            if round(percent, 2) > pred_threshold:
                shutil.copy(f, dest_root / pred_label / Path(f).name)


def categorize_images(labels, src_dir, img_extension='*.*', dest_folder=None, pred_threshold=60, verbose=False):
    """Sort the images of a directory into one sub-folder per predicted label.

    Every file in ``src_dir`` matching ``img_extension`` is compared with the
    prompt ``"a photo of a <label>"`` for each label. The scores are
    softmax-normalised over the labels, and the file is copied (never moved)
    into the folder of every label whose score exceeds ``pred_threshold``.

    Args:
        labels: Iterable of class names. One folder per name is created.
        src_dir: Directory containing the uncategorized images.
        img_extension: Glob pattern, relative to ``src_dir``, selecting the
            files to classify. Directories matching the pattern are ignored.
        dest_folder: Directory that receives the label folders. Defaults to
            ``src_dir`` when ``None``.
        pred_threshold: Minimum score in percent (0-100) required for a copy.
        verbose: When true, print per-label scores and a final "Done." banner.

    Raises:
        FileNotFoundError: If ``src_dir`` is not an existing directory.
    """
    src_path = Path(src_dir)
    if not src_path.is_dir():
        # Fail loudly: otherwise a typo silently creates an empty folder tree.
        raise FileNotFoundError(f'Source directory not found: {src_dir}')

    # Materialise so that any iterable (tuple, generator, ...) can be reused.
    labels = list(labels)
    dest_root = src_path if dest_folder is None else Path(dest_folder)

    # Joining with pathlib keeps the pattern valid on every OS.
    filepaths = [f for f in glob(str(src_path / img_extension)) if Path(f).is_file()]

    # Create the folders (that represent the predicted classes) if they do not exist yet.
    for label in labels:
        (dest_root / label).mkdir(parents=True, exist_ok=True)

    # Without files or labels there is nothing to score, so skip the model work.
    if labels and filepaths:
        _classify_and_copy(filepaths, labels, dest_root, pred_threshold, verbose)

    if verbose:
        print(f'\n{"-"*20}\nDone.')

### Demo

In [None]:
# Candidate classes; categorize_images creates one sub-folder per entry.
labels = ['cat', 'frog']

src_dir = r'PATH_TO_YOUR_UNCATEGORIZED_IMAGES'
# Build the destination with pathlib so the separator matches the host OS
# (a hard-coded backslash only yields a nested folder on Windows).
dest_dir = str(Path(src_dir) / 'categorized')
categorize_images(labels=labels, src_dir=src_dir, dest_folder=dest_dir)