In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds


# Exploring Dataset Distance Heuristics for Computer Vision Problems

Suppose we have a model $f(\mathbf{x}; \theta)$ trained on a dataset $\mathcal{D}_A$, and we would like to estimate ahead of time how well it will perform on a new task and dataset $\mathcal{D}_B$. We would like a heuristic $h(\mathcal{D}_A, \mathcal{D}_B)$ that approximates how well the model transfers from $\mathcal{D}_A$ to $\mathcal{D}_B$, as measured by performance on the new task.

We would like to evaluate four heuristics on several datasets across several models to get a sense of how well these heuristics perform.

| Heuristic Name             | Model     | Dataset A | Dataset B       | Heuristic Score | Test Metric|
| ---------------------------| ----------|-----------|-----------------|-----------------|------------|
| Inter-Dataset CKA          | ResNet50  | ImageNet  | scene_parse150  |                 |            |
| Contrastive Loss           | ResNet50  | ImageNet  | scene_parse150  |                 |            |
| Feature Moments            | ResNet50  | ImageNet  | scene_parse150  |                 |            |
| Jensen-Shannon divergence  | ResNet50  | ImageNet  | scene_parse150  |                 |            |
| Inter-Dataset CKA          | ResNet50  | ImageNet  | celeb_a         |                 |            |
| Contrastive Loss           | ResNet50  | ImageNet  | celeb_a         |                 |            |
| Feature Moments            | ResNet50  | ImageNet  | celeb_a         |                 |            |
| Jensen-Shannon divergence  | ResNet50  | ImageNet  | celeb_a         |                 |            |
| Inter-Dataset CKA          | ResNet50  | ImageNet  | cars196         |                 |            |
| Contrastive Loss           | ResNet50  | ImageNet  | cars196         |                 |            |
| Feature Moments            | ResNet50  | ImageNet  | cars196         |                 |            |
| Jensen-Shannon divergence  | ResNet50  | ImageNet  | cars196         |                 |            |

## Inter-Dataset CKA
Dataset-wise CKA (Centered Kernel Alignment)

## Contrastive Loss
Dataset-wise Contrastive Loss

## Feature Moments
Simple calculation of the mean and std. dev. of the features

## Jensen-Shannon Divergence
Method for detecting feature drift by approximating the JS divergence.

More info:
* https://practicalml.net/Detecting-data-drift/
* https://www.tensorflow.org/tfx/data_validation/get_started#checking_data_skew_and_drift

In [2]:
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
import numpy as np

# A trailing comma in a `from ... import` list is only legal inside
# parentheses, so the list must end at the last imported name.
# intermediary_feature_moments and feature_moments are intentionally not
# imported yet.
from src.heuristics import cka_heuristic

# Basic procedure (outline only -- nothing is computed in this cell).

# Placeholders for the model and the source / destination datasets.
model = src_dataset = dest_dataset = None

# Planned steps:
#   1. score = dummy_heuristic(src_dataset, dest_dataset, model)
#   2. Freeze the model.
#   3. Run a linear evaluation on dest_dataset.
#   4. Compare the test metric with the heuristic score.
    

In [None]:
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model

import numpy as np

# Full ResNet50 classifier initialised with the ImageNet weights.
base_model = ResNet50(weights="imagenet")

# To list the available layer names:
#     for layer in base_model.layers:
#         print(layer.name)

# Feature extractor: shares the ResNet50 input and stops at the 'avg_pool'
# layer instead of the classification head.
model = Model(
    inputs=base_model.input,
    outputs=base_model.get_layer(name="avg_pool").output,
)


def _noise_batch_features(batch_size):
    """Return the global ``model``'s features for one batch of random "images".

    Draws integer pixels uniformly from [0, 255] with shape
    ``(batch_size, 224, 224, 3)``, applies ``preprocess_input`` and runs
    ``model.predict``; the prediction is flattened to one row per image.
    """
    # `high` is exclusive in randint, so 256 is required to include 255.
    pixels = np.random.randint(low=0, high=256, size=(batch_size, 224, 224, 3))
    # verbose=0 keeps the notebook output free of one progress bar per batch.
    features = np.asarray(model.predict(preprocess_input(pixels), verbose=0))
    return features.reshape(batch_size, -1)


def noise_cka(samples=50, batch_size=4):
    """CKA of random noise vectors processed in ResNet50.

    Two independent sets of uniform-random images are pushed through the
    global ``model`` and the resulting feature matrices are compared with
    ``cka_heuristic``.

    Args:
        samples: Number of random batches drawn for each of the two sets.
        batch_size: Number of random images per batch.

    Returns:
        Whatever ``cka_heuristic(features_x, features_y, "ResNet50")``
        returns, where each feature matrix has ``samples * batch_size`` rows.

    Raises:
        ValueError: If ``samples`` or ``batch_size`` is smaller than 1.
    """
    if samples < 1 or batch_size < 1:
        raise ValueError("samples and batch_size must be positive integers")

    batches_x = []
    batches_y = []
    for _ in range(samples):
        # Each set gets its own freshly drawn and separately preprocessed
        # batch; reusing one array for both would compare a batch with itself.
        batches_x.append(_noise_batch_features(batch_size))
        batches_y.append(_noise_batch_features(batch_size))

    features_x = np.concatenate(batches_x, axis=0)
    features_y = np.concatenate(batches_y, axis=0)
    return cka_heuristic(features_x, features_y, "ResNet50")
# Run with the default arguments; as the cell's last expression, the returned
# score is what the notebook displays.
noise_cka()
