# DINOv2

In [4]:
from google.colab import drive
drive.mount('/content/drive')

import torch
from torchvision import datasets, transforms

# Load DINOv2 onto GPU
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14').to(device)
model.eval()

# Transform for DINOv2 (zero-padding + normalization)
dinov2_transform = transforms.Compose([
    transforms.Pad((96, 96)),  # (224-32)/2 = 96 pixels padding
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Transform for original CIFAR-10 (just ToTensor to get raw pixels)
original_transform = transforms.ToTensor()

# Load dataset twice (once for DINOv2, once for original)
cifar_dinov2 = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=dinov2_transform,
)

cifar_original = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=original_transform,
)

# Create DataLoaders
loader_dinov2 = torch.utils.data.DataLoader(
    cifar_dinov2,
    batch_size=512,
    num_workers=4,
    pin_memory=True,
)

loader_original = torch.utils.data.DataLoader(
    cifar_original,
    batch_size=512,
    num_workers=4,
    pin_memory=True,
)

# # Extract DINOv2 embeddings
# embeddings, labels = [], []
# with torch.no_grad():
#     for images, targets in loader_dinov2:
#         images = images.to(device, non_blocking=True)
#         embeddings.append(model(images).cpu())
#         labels.append(targets)

# embeddings = torch.cat(embeddings)  # Shape: [10000, 384]
# labels = torch.cat(labels)  # Shape: [10000]

# # Extract original images (32x32, no padding/normalization)
# original_images = []
# for images, _ in loader_original:
#     original_images.append(images)

# original_images = torch.cat(original_images)  # Shape: [10000, 3, 32, 32]

# # Save results (optional)
# torch.save({
#     'embeddings': embeddings,
#     'labels': labels,
#     'original_images': original_images,
# }, 'cifar10_dinov2_features_and_originals.pt')

# print("Shapes:")
# print(f"Embeddings: {embeddings.shape}")  # [10000, 384]
# print(f"Labels: {labels.shape}")  # [10000]
# print(f"Original Images: {original_images.shape}")  # [10000, 3, 32, 32]

Mounted at /content/drive


Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main


In [5]:
import numpy as np
from scipy.stats import pearsonr
import torch

def correlation_dissimilarity(emb1, emb2):
    """
    emb1 (np.array) : embedding in one feature space
    emb2 (np.array) : embedding in another feature space
    """
    dissim1 = 1. - np.corrcoef(emb1)
    dissim2 = 1. - np.corrcoef(emb2)

    triu_indices = np.triu_indices_from(dissim1, k=1)
    flat1 = dissim1[triu_indices]
    flat2 = dissim2[triu_indices]

    # Compute second-order similarity (Pearson correlation)
    r, _ = pearsonr(flat1, flat2)
    return r


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn import preprocessing

def train_linear_classifier(X, y, test_size=0.2, random_state=42, **kwargs):
    """
    Trains a linear classifier (Logistic Regression) and returns the model and accuracy.

    Parameters:
    X (array-like): Feature matrix
    y (array-like): Target vector
    test_size (float): Proportion of data to use for testing (default: 0.2)
    random_state (int): Random seed for reproducibility (default: 42)
    **kwargs: Additional arguments to pass to LogisticRegression

    Returns:
    tuple: (trained_model, accuracy_score)
    """
    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize and train the linear classifier
    model = LogisticRegression(**kwargs)
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return model, accuracy

def encode_set(encoder_function: callable, loader, original_loader, device="cpu"):
    all_embeddings = []
    all_labels = []
    all_original_images = []

    with torch.no_grad():
        for (images_dino, targets), (images_orig, _) in zip(loader, original_loader):
            images_dino = images_dino.to(device, non_blocking=True)
            embeddings = encoder_function(images_dino).cpu()
            all_embeddings.append(embeddings)
            all_labels.append(targets)
            all_original_images.append(images_orig)

    embeddings = torch.cat(all_embeddings)  # [N, D]
    labels = torch.cat(all_labels)
    original_images = torch.cat(all_original_images)
    original_images = original_images.reshape(original_images.shape[0], -1)

    return (embeddings.numpy(),
            labels.numpy(),
            original_images.numpy())



from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, ColorBar
from bokeh.transform import linear_cmap, factor_cmap
from bokeh.palettes import Viridis256
from bokeh.layouts import row

import plotly.express as px

def embedding_plotter(embedding, data=None, hue=None, hover=None, tools = None, nv_cat = 5, height = 400, width = 400, display_result=True):
    '''
    Рисовалка эмбеддинга. 2D renderer: bokeh. 3D renderer: plotly.
    Обязательные инструменты:
        - pan (двигать график)
        - box zoom
        - reset (вылезти из зума в начальное положение)

        embedding: something 2D/3D, slicable ~ embedding[:, 0] - валидно
            Эмбеддинг
        data: pd.DataFrame
            Данные, по которым был построен эмбеддинг
        hue: string
            Колонка из data, по которой красим точки. Поддерживает интерактивную легенду: по клику на каждое
                значение hue можно скрыть весь цвет.
        hover: string or list of strings
            Колонк[а/и] из data, значения которых нужно выводить при наведении мышки на точку
        nv_cat: int
            number of unique values to consider column categorical
        tools: iterable or string in form "tool1,tool2,..." or ["tool1", "tool2", ...]
            tools for the interactive plot
        height, width: int
            parameters of the figure
        display_result: boolean
            if the results are displayed or just returned

    '''
    if tools is None:
        tools = 'lasso_select,box_select,pan,zoom_in,zoom_out,reset,hover'
    else:
        if hover and not("hover" in tools):
            tools = 'hover,'+",".join(tools)


    if embedding.shape[1] == 3:
        if hover:
            hover_data = {h:True for h in hover}
        else:
            hover_data = None
        df = pd.DataFrame(embedding, columns = ['x', 'y', 'z'])
        df = pd.concat((df, data), axis=1)
        fig = px.scatter_3d(
            data_frame = df,
            x='x',
            y='y',
            z='z',
            color=df[hue],
            hover_data = hover_data
        )

        fig.update_layout(
            modebar_add=tools.split(","),
        )

        fig.update_traces(marker_size=1, selector=dict(type='scatter3d'))

        if display_result: fig.show()

    if embedding.shape[1] == 2:
        output_notebook()
        df = pd.DataFrame(embedding, columns = ['x', 'y'])
        df = pd.concat((df, data), axis=1)
        tooltips = [
            ('x, y', '$x, $y'),
            ('index', '$index')
        ]
        if hover:
            for col in hover:
                tooltips.append((col, "@"+col))
        fig = figure(tools=tools, width=width, height=height, tooltips=tooltips)
        if df[hue].nunique() < nv_cat or df[hue].dtype == "category":
            df[hue] = df[hue].astype(str)
            source = ColumnDataSource(df)
            color_mapper = factor_cmap(
            field_name=hue,
            palette='Category10_3',
            factors=df[hue].unique()
            )
            fig.scatter(
            x='x', y='y',
            color=color_mapper,
            source=source,
            legend_group=hue)

            fig.legend.location = 'bottom_left'
            fig.legend.click_policy = 'mute'
        else:
            source = ColumnDataSource(df)
            color_mapper = linear_cmap(
                field_name=hue,
                palette=Viridis256,
                low=min(df[hue]),
                high=max(df[hue]))
            fig.scatter(
                x='x', y='y',
                color=color_mapper,
                source=source)
            color_bar = ColorBar(color_mapper=color_mapper['transform'], width=8, location=(0,0), title = hue)
            fig.add_layout(color_bar, 'right')


        if display_result: show(fig)

    if embedding.shape[1] > 3:
        print("wrong species, doooooodes")
    else: return fig

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE

def get_tsne(embeddings):
    # Create t-SNE
    tsne = TSNE(n_components=2, random_state=1,
                init='pca', n_iter=5000,
                metric='cosine')

    # Fit and transform your data
    tsne_results = tsne.fit_transform(embeddings[:1000])

    # Prepare the data DataFrame correctly
    data_df = pd.DataFrame({
        'label': np.array(labels[:1000])  # Assuming you have labels
        # Add any other columns you want for hover information
    })

    # Call the plotting function correctly
    embedding_plotter(
        embedding=tsne_results,  # This should be your 2D t-SNE results (1000x2 array)
        data=data_df,            # This contains your labels and other metadata
        hue='label',             # Column name in data_df to use for coloring
    )
    data_df['tsne_x'] = tsne_results[:,0]
    data_df['tsne_y'] = tsne_results[:,1]

    return data_df


def run(encoder_function: callable, loader: torch.utils.data.DataLoader, original_loader: torch.utils.data.DataLoader,
        logger = None, device = "cpu", embeddings_np = None, labels_np = None, original_images_np = None, **kwargs):
    if embeddings_np is None or labels_np is None or original_images_np is None:
        embeddings_np, labels_np, original_images_np = encode_set(encoder_function, loader, original_loader, device)
    cl_acc = train_linear_classifier(embeddings_np, labels_np, **kwargs)[1]
    cor_diss = correlation_dissimilarity(embeddings_np, original_images_np)
    if not logger is None:
        logger.log({
            "classification_accuracy" : cl_acc,
            "second_order_similarity" : cor_diss
        })

    print(f"classification_accuracy : {cl_acc}, \nsecond_order_similarity : {cor_diss}")
    data_df = get_tsne(embeddings_np)
    return embeddings_np, labels_np, original_images_np, data_df

In [None]:
# embeddings_np = embeddings.detach().cpu().numpy()
# original_images_np = original_images.detach().cpu().numpy().reshape(original_images.shape[0], -1)
# # correlation_dissimilarity(embeddings_np[:1000], embeddings_np[1000:2000])
# correlation_dissimilarity(embeddings_np[:100], original_images_np[:100])
import wandb
run_name = f'untrained_dino_metric'
config = {
    "encoder" : "dino",
    "type_log" : "metric",
}

import os

file_path = "/content/drive/MyDrive/Data/DIM_counter.txt"

if not os.path.exists(file_path):
    with open(file_path, 'w') as file:
        file.write(f"0")
with open(file_path, 'r') as file:
    counter = int(file.read())+1

if counter > 1:
    df = pd.read_csv('/content/drive/MyDrive/Data/DIM_embedding.csv')
    embeddings, labels, original_images = df[[c for c in df.columns if 'emb' in c]].to_numpy(), df['label'].to_numpy(), df[[c for c in df.columns if 'or' in c]].to_numpy()
else:
    embeddings, labels, original_images = None, None, None

logger = wandb.init(project = 'CV_frameworks', config = config, name = run_name)

embeddings, labels, original_images, tsne_data = run(model, loader_dinov2, loader_original,logger = logger, device= device,
                                                     embeddings_np = embeddings, labels_np = labels, original_images_np = original_images,
                                                     max_iter = 5000)
logger.finish()

import pandas as pd

df = pd.DataFrame(np.hstack([embeddings, original_images]), columns = [f"emb_{i}" for i in range(embeddings.shape[1])]+[f"or_{i}" for i in range(original_images.shape[1])])
df['label'] = labels




df.to_csv(f'/content/drive/MyDrive/Data/DIM_embedding{counter}.csv')

tsne_data.to_csv(f'/content/drive/MyDrive/Data/DIM_tsne_embedding{counter}.csv')



In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, ColorBar
from bokeh.transform import linear_cmap, factor_cmap
from bokeh.palettes import Viridis256
from bokeh.layouts import row

import plotly.express as px

def embedding_plotter(embedding, data=None, hue=None, hover=None, tools = None, nv_cat = 5, height = 400, width = 400, display_result=True):
    '''
    Рисовалка эмбеддинга. 2D renderer: bokeh. 3D renderer: plotly.
    Обязательные инструменты:
        - pan (двигать график)
        - box zoom
        - reset (вылезти из зума в начальное положение)

        embedding: something 2D/3D, slicable ~ embedding[:, 0] - валидно
            Эмбеддинг
        data: pd.DataFrame
            Данные, по которым был построен эмбеддинг
        hue: string
            Колонка из data, по которой красим точки. Поддерживает интерактивную легенду: по клику на каждое
                значение hue можно скрыть весь цвет.
        hover: string or list of strings
            Колонк[а/и] из data, значения которых нужно выводить при наведении мышки на точку
        nv_cat: int
            number of unique values to consider column categorical
        tools: iterable or string in form "tool1,tool2,..." or ["tool1", "tool2", ...]
            tools for the interactive plot
        height, width: int
            parameters of the figure
        display_result: boolean
            if the results are displayed or just returned

    '''
    if tools is None:
        tools = 'lasso_select,box_select,pan,zoom_in,zoom_out,reset,hover'
    else:
        if hover and not("hover" in tools):
            tools = 'hover,'+",".join(tools)


    if embedding.shape[1] == 3:
        if hover:
            hover_data = {h:True for h in hover}
        else:
            hover_data = None
        df = pd.DataFrame(embedding, columns = ['x', 'y', 'z'])
        df = pd.concat((df, data), axis=1)
        fig = px.scatter_3d(
            data_frame = df,
            x='x',
            y='y',
            z='z',
            color=df[hue],
            hover_data = hover_data
        )

        fig.update_layout(
            modebar_add=tools.split(","),
        )

        fig.update_traces(marker_size=1, selector=dict(type='scatter3d'))

        if display_result: fig.show()

    if embedding.shape[1] == 2:
        output_notebook()
        df = pd.DataFrame(embedding, columns = ['x', 'y'])
        df = pd.concat((df, data), axis=1)
        tooltips = [
            ('x, y', '$x, $y'),
            ('index', '$index')
        ]
        if hover:
            for col in hover:
                tooltips.append((col, "@"+col))
        fig = figure(tools=tools, width=width, height=height, tooltips=tooltips)
        if df[hue].nunique() < nv_cat or df[hue].dtype == "category":
            df[hue] = df[hue].astype(str)
            source = ColumnDataSource(df)
            color_mapper = factor_cmap(
            field_name=hue,
            palette='Category10_3',
            factors=df[hue].unique()
            )
            fig.scatter(
            x='x', y='y',
            color=color_mapper,
            source=source,
            legend_group=hue)

            fig.legend.location = 'bottom_left'
            fig.legend.click_policy = 'mute'
        else:
            source = ColumnDataSource(df)
            color_mapper = linear_cmap(
                field_name=hue,
                palette=Viridis256,
                low=min(df[hue]),
                high=max(df[hue]))
            fig.scatter(
                x='x', y='y',
                color=color_mapper,
                source=source)
            color_bar = ColorBar(color_mapper=color_mapper['transform'], width=8, location=(0,0), title = hue)
            fig.add_layout(color_bar, 'right')


        if display_result: show(fig)

    if embedding.shape[1] > 3:
        print("wrong species, doooooodes")
    else: return fig

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE

# Assuming your embeddings are in a variable called 'embeddings'
# embeddings = np.random.rand(10000, 384)  # Example - replace with your actual embeddings

# Create t-SNE
tsne = TSNE(n_components=2, random_state=1,
            init='pca', n_iter=5000,
            metric='cosine')

# Fit and transform your data
tsne_results = tsne.fit_transform(embeddings[:1000])

# Prepare the data DataFrame correctly
data_df = pd.DataFrame({
    'label': np.array(labels[:1000])  # Assuming you have labels
    # Add any other columns you want for hover information
})

# Call the plotting function correctly
embedding_plotter(
    embedding=tsne_results,  # This should be your 2D t-SNE results (1000x2 array)
    data=data_df,            # This contains your labels and other metadata
    hue='label',             # Column name in data_df to use for coloring
)



# Theia

In [None]:
!pip install omegaconf

Collecting omegaconf
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting antlr4-python3-runtime==4.9.* (from omegaconf)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=960f759ef9e070d4770976dc1139e6ebf23f9bc9d9f9f29b7dbb22e1cec699c9
  Stored in directory: /root/.cache/pip/wheels/1a/97/32/461f837398029ad76911109f07047fde1d7b661a147c7c56d1
Successfull

In [None]:
from transformers import AutoModel
from PIL import Image
import torch
import numpy as np

import torch
from torchvision import datasets, transforms

device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

model = AutoModel.from_pretrained(
    "theaiinstitute/theia-base-patch16-224-cdiv",
    trust_remote_code=True
)
model.eval()

import torchvision.transforms as transforms
from torchvision import datasets
import torch

# Theia-compatible transform (expects uint8 [0-255] or normalized float32 [0-1])
theia_transform = transforms.Compose([
    transforms.Resize(224),  # Resize to Theia's expected input size
    transforms.ToTensor(),   # Converts to float32 [0,1] range
])

original_transform = transforms.ToTensor()

cifar_theia = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=theia_transform,
)

cifar_original = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=original_transform,
)

loader_theia = torch.utils.data.DataLoader(
    cifar_theia,
    batch_size=512,
    num_workers=4,
    pin_memory=True,
)


loader_original = torch.utils.data.DataLoader(
    cifar_original,
    batch_size=512,
    num_workers=4,
    pin_memory=True,
)

run(model, loader_theia, loader_original, device = device)


# image = Image.open("scene.jpg").resize((224, 224))
# input_tensor = torch.from_numpy(np.array(image)).unsqueeze(0)

# # 3. Get features
# theia_feature = model.forward_feature(input_tensor)  # Unified feature
# all_features = model(input_tensor)

Some weights of ViTModel were not initialized from the model checkpoint at facebook/deit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
