### Necessary imports

In [None]:
import PIL
import clip
import torch
import shutil
from glob import glob
from pathlib import Path
from tqdm.auto import tqdm

### Enable GPU support

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Select and load the vision transformer model

In [None]:
# Name of the CLIP checkpoint to load.
# A lighter alternative is 'ViT-B/32'; `clip.available_models()` lists the
# names accepted by `clip.load`. See also: https://huggingface.co/openai
model_name = 'ViT-L/14'
# model_name = 'ViT-B/32'

# `model` encodes text and images; `preprocess` turns a PIL image into the
# tensor the model expects.
model, preprocess = clip.load(model_name, device=device)

### Categorizing function

In [None]:
def categorize_images(labels, src_dir, img_extension='*.*', dest_folder=None, pred_threshold=0.6, verbose=False):
    """Copy the images found in ``src_dir`` into one sub-folder per predicted label.

    Every label is embedded with the prompt ``'a photo of a {label}'`` and
    every image is compared against all prompts. An image is copied (never
    moved) into the folder of each label whose softmax probability is
    strictly greater than ``pred_threshold``.

    Uses the module-level ``model``, ``preprocess`` and ``device``.

    Args:
        labels: Sequence of words/phrases, one per target class. A folder
            named after each label is created under the destination.
        src_dir: Directory that is searched for images.
        img_extension: Glob pattern applied to ``src_dir`` (e.g. ``'*.jpg'``).
        dest_folder: Root directory for the label folders. Defaults to
            ``src_dir`` when ``None``.
        pred_threshold: Minimum probability (exclusive) a label needs for
            the image to be copied into its folder.
        verbose: When true, print the probability of every label for every
            image and a final "Done." marker.

    Returns:
        None. Files that cannot be opened as images are skipped with a warning.
    """
    import warnings

    dest_path = src_dir if dest_folder is None else dest_folder

    # Collect the paths only (no open file handles) and do it before the label
    # folders are created, so those folders can never be picked up as inputs.
    # Directories that happen to match the pattern are ignored as well.
    filepaths = [p for p in Path(src_dir).glob(img_extension) if p.is_file()]

    # Create the folders (= predicted classes) if they do not exist yet and
    # tokenize one prompt per label.
    input_tokens = []
    for label in labels:
        Path(dest_path, label).mkdir(parents=True, exist_ok=True)
        input_tokens.append(clip.tokenize(f'a photo of a {label}'))

    text_inputs = torch.cat(input_tokens).to(device)

    # Generate L2-normalised text features once; they are reused for all images.
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # Process and classify each image according to the given threshold.
    for f in tqdm(filepaths):
        # Open one image at a time and close it right after preprocessing so
        # that large folders do not exhaust the process's file handles.
        try:
            with PIL.Image.open(f) as image:
                image_input = preprocess(image).unsqueeze(0).to(device)
        except OSError as err:
            # Covers unreadable, truncated and non-image files (the default
            # pattern '*.*' matches those too); skip instead of aborting.
            warnings.warn(f'Skipping {f}: cannot be read as an image ({err})')
            continue

        # Generate image features and the per-label probabilities.
        with torch.no_grad():
            image_features = model.encode_image(image_input)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            similarity = (100 * image_features @ text_features.T).softmax(dim=-1)

        # Rank all labels for this image, most probable first.
        values, indices = similarity[0].topk(len(labels))

        for prob, index in zip(values.tolist(), indices.tolist()):
            pred_label = labels[index]

            if verbose:
                # Report the label's actual probability.
                print(f'Predicted as [{pred_label}] Confidence: {100 * prob:.2f}%')

            if prob > pred_threshold:
                destination = Path(dest_path, pred_label, f.name)
                shutil.copy(f, destination)

    if verbose:
        print(f'{"-"*40}\nDone.')

### Demo

In [None]:
# Each label is a word or phrase that describes the content of the images
# that should end up in its folder.
labels = ['label_1', 'label_2']
src_dir = r'PATH_TO_UNCATEGORIZED_IMAGES'

# Write the sorted copies into a 'categorized' sub-folder of the source directory.
dest_dir = Path(src_dir) / 'categorized'
categorize_images(labels=labels, src_dir=src_dir, dest_folder=dest_dir, verbose=True)