# Metric notebook test 

This notebook show how to use domain_gap metrics with a notebook after installing DataQualityMetric (DQM) library. Those metrics aim to evaluate the gap between two datasets which would allow to estimated required finetuning while adapting the model, be used as a loss function in generative models or rate simulated data similarity with real life data. For now, metrics implementation are only image type compatible.

The computation of the metrics requires a configuration file in which all the parameters for the data processing, feature extractor model and method parameters are defined. Examples are available in dqm/domain_gap/cfg/{metric_name} folder. 

We authorize homemade models for the computation of features, however those model must be pytorch friendly and contain both architecture and weights in a single ".pt" file. Default model are retrieved from torch hub models with imagenet dataset pretrained weight.

Here is a list of the metrics:
- FID
- Wasserstein
- PAD
- KLMVN

!!! pay attention that the preprocessing applied to the images (parameters of the "DATA" part in the configuration file) is compatible with the model inputs, it may be necessary to check training prepocessor pipeline.

## FID: Frechet Inception Distance

In [None]:
from dqm.domain_gap.metrics import FID
import torch
import os
from PIL import Image

# don't show user warning
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Instanciate the metric class
fid = FID()

# Generate synthetic image dataset folders
def generate_image_dataset(num_images, height, width, folder_name):
    """
    Generate a set of random images saved to a specified folder.
    Each image will have 3 channels (RGB) with pixel values in the range [0, 255].
    """
    os.makedirs(folder_name, exist_ok=True)
    
    for i in range(num_images):
        # Create random image data with RGB values in the range [0, 255]
        img_array = torch.randint(0, 256, (height, width, 3), dtype=torch.uint8).numpy()
        img = Image.fromarray(img_array)
        
        # Save image to folder
        img.save(os.path.join(folder_name, f"img_{i:04d}.png"))

# Paths to synthetic image folders
source_folder = "./synthetic_source_images"
target_folder = "./synthetic_target_images"

# Generate synthetic datasets
generate_image_dataset(100, 299, 299, source_folder)
generate_image_dataset(100, 299, 299, target_folder)

# Define your own config file, you can find examples in dqm/domain_gap/cfg/{metric_name}
fid_config_json = {
	"DATA": {
		"batch_size": 32,                      # Features will be compute on {batch_size} images at the same time
		"height": 299,                         # Resize images height to {height} value
		"width": 299,                          # Resize images width to {width} value
		"norm_mean": [                         # Normalize images mean with {norm_mean} values for RGB channels
				0.485,
				0.456,
				0.406
			],
		"norm_std": [                          # Normalize images std with {norm_std} values for RGB cahnnels
				0.229,
				0.224,
				0.225
			],
		"source": source_folder,      # source images are retrieved from {source} path
		"target": target_folder       # target images are retrieved from {target} path
	},
	"MODEL": {
		"device": "cpu",                       # Metric will be computed in {device}
		"n_layer_feature": -2                  # the layer extractor feature will be the:
    	},                                     # i-th if int       |  {n_layer_feature} if str
	"METHOD": {
		"name": "fid"                          # Metric name, used only with CLI
	}
}

# Compute the metric
dist = fid.compute_image_distance(fid_config_json)
print("-"*80)
print(f"metric name: {fid_config_json['METHOD']["name"]}")
print(f"source folder: {source_folder} ({len(os.listdir(source_folder))} images)")
print(f"target folder: {target_folder} ({len(os.listdir(target_folder))} images)")
print("Preprocessing:")
print(" "*4+f"-image resize: ({fid_config_json["DATA"]["width"]},{fid_config_json["DATA"]["height"]})")
print(" "*4+"-image normalize:")
print(" "*8+f"-mean: {fid_config_json["DATA"]["norm_mean"]}")
print(" "*8+f"-std: {fid_config_json["DATA"]["norm_std"]}")
print(f"device: {fid_config_json["MODEL"]["device"]}")
print(f"feature extraction layer: {fid_config_json["MODEL"]["n_layer_feature"]}")
if "arch" not in fid_config_json["MODEL"].keys():
	print("(default) model architecture : InceptionV3")
print(f"fid score: {dist.item()}")
print("-"*80)
# remove generated source and target folder
!rm -r synthetic_source_images/ synthetic_target_images/


--------------------------------------------------------------------------------
metric name: fid
source folder: ./synthetic_source_images (100 images)
target folder: ./synthetic_target_images (100 images)
Preprocessing:
    -image resize: (299,299)
    -image normalize:
        -mean: [0.485, 0.456, 0.406]
        -std: [0.229, 0.224, 0.225]
device: cpu
feature extraction layer: -2
model architecture : InceptionV3 (default)
fid score: 3.927738982265243
--------------------------------------------------------------------------------


## Wasserstein

In [2]:
from dqm.domain_gap.metrics import Wasserstein
import torch
import os
from PIL import Image

# Instanciate the metric class
wass = Wasserstein()

# Generate synthetic image dataset folders
def generate_image_dataset(num_images, height, width, folder_name):
    """
    Generate a set of random images saved to a specified folder.
    Each image will have 3 channels (RGB) with pixel values in the range [0, 255].
    """
    os.makedirs(folder_name, exist_ok=True)
    
    for i in range(num_images):
        # Create random image data with RGB values in the range [0, 255]
        img_array = torch.randint(0, 256, (height, width, 3), dtype=torch.uint8).numpy()
        img = Image.fromarray(img_array)
        
        # Save image to folder
        img.save(os.path.join(folder_name, f"img_{i:04d}.png"))

# Paths to synthetic image folders
source_folder = "./synthetic_source_images"
target_folder = "./synthetic_target_images"

# Generate synthetic datasets
generate_image_dataset(100, 299, 299, source_folder)
generate_image_dataset(100, 299, 299, target_folder)

# Define your own config file, you can find examples in dqm/domain_gap/cfg/{metric_name}
wass_config_json = {
	"DATA": {
		"batch_size": 10,
		"height": 299,
		"width": 299,
		"norm_mean": [
				0.485,
				0.456,
				0.406
			],
		"norm_std": [
				0.229,
				0.224,
				0.225
			],
		"source": source_folder, 
		"target": target_folder  
	},
	"MODEL": {
        "arch": "resnet18",
		"device": "cpu",
		"n_layer_feature": -2
    	},
	"METHOD": {
		"name": "wasserstein",
		"dimension": "1D"
	}
}

# Compute the metric
dist = wass.compute_1D_distance(wass_config_json)

print("-"*80)
print(f"metric name: {wass_config_json['METHOD']["name"]}")
print(f"source folder: {source_folder} ({len(os.listdir(source_folder))} images)")
print(f"target folder: {target_folder} ({len(os.listdir(target_folder))} images)")
print("Preprocessing:")
print(" "*4+f"-image resize: ({fid_config_json["DATA"]["width"]},{fid_config_json["DATA"]["height"]})")
print(" "*4+"-image normalize:")
print(" "*8+f"-mean: {fid_config_json["DATA"]["norm_mean"]}")
print(" "*8+f"-std: {fid_config_json["DATA"]["norm_std"]}")
print(f"device: {fid_config_json["MODEL"]["device"]}")
print(f"feature extraction layer: {fid_config_json["MODEL"]["n_layer_feature"]}")
if "arch" not in fid_config_json["MODEL"].keys():
	print("model architecture : InceptionV3 (default)")
print(f"wasserstein score: {dist.item()}")
print("-"*80)

# remove generated source and target folder
!rm -r synthetic_source_images/ synthetic_target_images/


--------------------------------------------------------------------------------
metric name: wasserstein
source folder: ./synthetic_source_images (100 images)
target folder: ./synthetic_target_images (100 images)
Preprocessing:
    -image resize: (299,299)
    -image normalize:
        -mean: [0.485, 0.456, 0.406]
        -std: [0.229, 0.224, 0.225]
device: cpu
feature extraction layer: -2
model architecture : InceptionV3 (default)
wasserstein score: 0.012761476961261994
--------------------------------------------------------------------------------


## KLMVN: Kullback-Leibler for MultiVariate Normal distribution

In [3]:
from dqm.domain_gap.metrics import KLMVN
import torch
import os
from PIL import Image

# Instanciate the metric class
klmvn = KLMVN()

# Generate synthetic image dataset folders
def generate_image_dataset(num_images, height, width, folder_name):
    """
    Generate a set of random images saved to a specified folder.
    Each image will have 3 channels (RGB) with pixel values in the range [0, 255].
    """
    os.makedirs(folder_name, exist_ok=True)
    
    for i in range(num_images):
        # Create random image data with RGB values in the range [0, 255]
        img_array = torch.randint(0, 256, (height, width, 3), dtype=torch.uint8).numpy()
        img = Image.fromarray(img_array)
        
        # Save image to folder
        img.save(os.path.join(folder_name, f"img_{i:04d}.png"))

# Paths to synthetic image folders
source_folder = "./synthetic_source_images"
target_folder = "./synthetic_target_images"

# Generate synthetic datasets
generate_image_dataset(100, 299, 299, source_folder)
generate_image_dataset(100, 299, 299, target_folder)

# Define your own config file, you can find examples in dqm/domain_gap/cfg/{metric_name}
klmvn_config_json = {
	"DATA": {
		"batch_size": 10,
		"height": 28,
		"width": 28,
		"norm_mean": [
				0.485,
				0.456,
				0.406
			],
		"norm_std": [
				0.229,
				0.224,
				0.225
			],
		"source": source_folder, 
		"target": target_folder 
	},
	"MODEL": {
        "arch": "resnet18",
		"device": "cpu",
		"n_layer_feature": -2
    	},
	"METHOD": {
		"name": "klmvn"
	}
}

# Compute the metric
dist = klmvn.compute_image_distance(klmvn_config_json)

print("-"*80)
print(f"metric name: {klmvn_config_json['METHOD']["name"]}")
print(f"source folder: {source_folder} ({len(os.listdir(source_folder))} images)")
print(f"target folder: {target_folder} ({len(os.listdir(target_folder))} images)")
print("Preprocessing:")
print(" "*4+f"-image resize: ({fid_config_json["DATA"]["width"]},{fid_config_json["DATA"]["height"]})")
print(" "*4+"-image normalize:")
print(" "*8+f"-mean: {fid_config_json["DATA"]["norm_mean"]}")
print(" "*8+f"-std: {fid_config_json["DATA"]["norm_std"]}")
print(f"device: {fid_config_json["MODEL"]["device"]}")
print(f"feature extraction layer: {fid_config_json["MODEL"]["n_layer_feature"]}")
if "arch" not in fid_config_json["MODEL"].keys():
	print("model architecture : InceptionV3 (default)")
print(f"kullback-Leibler score: {dist.item()}")
print("-"*80)

# remove generated source and target folder
!rm -r synthetic_source_images/ synthetic_target_images/


--------------------------------------------------------------------------------
metric name: klmvn
source folder: ./synthetic_source_images (100 images)
target folder: ./synthetic_target_images (100 images)
Preprocessing:
    -image resize: (299,299)
    -image normalize:
        -mean: [0.485, 0.456, 0.406]
        -std: [0.229, 0.224, 0.225]
device: cpu
feature extraction layer: -2
model architecture : InceptionV3 (default)
kullback-Leibler score: 290.20548276043786
--------------------------------------------------------------------------------


## PAD: Proxy A Distance


In [4]:
from dqm.domain_gap.metrics import ProxyADistance
import torch
import os
from PIL import Image

# Instanciate the metric class
pad = ProxyADistance()

# Generate synthetic image dataset folders
def generate_image_dataset(num_images, height, width, folder_name):
    """
    Generate a set of random images saved to a specified folder.
    Each image will have 3 channels (RGB) with pixel values in the range [0, 255].
    """
    os.makedirs(folder_name, exist_ok=True)
    
    for i in range(num_images):
        # Create random image data with RGB values in the range [0, 255]
        img_array = torch.randint(0, 256, (height, width, 3), dtype=torch.uint8).numpy()
        img = Image.fromarray(img_array)
        
        # Save image to folder
        img.save(os.path.join(folder_name, f"img_{i:04d}.png"))

# Paths to synthetic image folders
source_folder = "./synthetic_source_images"
target_folder = "./synthetic_target_images"

# Generate synthetic datasets
generate_image_dataset(100, 299, 299, source_folder)
generate_image_dataset(100, 299, 299, target_folder)

# Define your own config file, you can find examples in dqm/domain_gap/cfg/{metric_name}
pad_config_json = {
	"DATA": {
		"height": 224,
		"width": 224,
		"batch_size": 10,
		"norm_mean": [
			0.485,
			0.456,
			0.406
		],
		"norm_std": [
			0.229,
			0.224,
			0.225
		],
		"source": source_folder, 
		"target": target_folder 
	},
	"MODEL": {
		"arch": ["efficientnet_b0","vgg16"],
		"device": "cpu",
		"n_layer_feature": -2
	},
	"METHOD": {
		"name": "proxy",
        "evaluator": "mse"
	}
}


# Compute the metric
dist = pad.compute_image_distance(pad_config_json)

print("-"*80)
print(f"metric name: {pad_config_json['METHOD']["name"]}")
print(f"source folder: {source_folder} ({len(os.listdir(source_folder))} images)")
print(f"target folder: {target_folder} ({len(os.listdir(target_folder))} images)")
print("Preprocessing:")
print(" "*4+f"-image resize: ({pad_config_json["DATA"]["width"]},{pad_config_json["DATA"]["height"]})")
print(" "*4+"-image normalize:")
print(" "*8+f"-mean: {pad_config_json["DATA"]["norm_mean"]}")
print(" "*8+f"-std: {pad_config_json["DATA"]["norm_std"]}")
print(f"device: {pad_config_json["MODEL"]["device"]}")
print(f"model architecture: {pad_config_json["MODEL"]["arch"]}")
print(f"feature extraction layer: {pad_config_json["MODEL"]["n_layer_feature"]}")
print(f"Proxy A Distance score: {dist.item()}")
print("-"*80)

# remove generated source and target folder
!rm -r synthetic_source_images/ synthetic_target_images/


--------------------------------------------------------------------------------
metric name: proxy
source folder: ./synthetic_source_images (100 images)
target folder: ./synthetic_target_images (100 images)
Preprocessing:
    -image resize: (224,224)
    -image normalize:
        -mean: [0.485, 0.456, 0.406]
        -std: [0.229, 0.224, 0.225]
device: cpu
model architecture: ['efficientnet_b0', 'vgg16']
feature extraction layer: -2
Proxy A Distance score: 0.9131710737349599
--------------------------------------------------------------------------------


## MMD: Maximum Mean Discrepancy

In [1]:
from dqm.domain_gap.metrics import MMD

import torch
import os
from PIL import Image

# Instanciate the metric class
mmd = MMD()

# Generate synthetic image dataset folders
def generate_image_dataset(num_images, height, width, folder_name):
    """
    Generate a set of random images saved to a specified folder.
    Each image will have 3 channels (RGB) with pixel values in the range [0, 255].
    """
    os.makedirs(folder_name, exist_ok=True)
    
    for i in range(num_images):
        # Create random image data with RGB values in the range [0, 255]
        img_array = torch.randint(0, 256, (height, width, 3), dtype=torch.uint8).numpy()
        img = Image.fromarray(img_array)
        
        # Save image to folder
        img.save(os.path.join(folder_name, f"img_{i:04d}.png"))

# Paths to synthetic image folders
source_folder = "/home/yoann/data/synthetic_source_images"
target_folder = "/home/yoann/data/synthetic_target_images"

# Generate synthetic datasets
generate_image_dataset(100, 299, 299, source_folder)
generate_image_dataset(100, 299, 299, target_folder)

# Define your own config file, you can find examples in dqm/domain_gap/cfg/{metric_name}
mmd_config_json = {
	"DATA": {
		"height": 224,
		"width": 224,
		"batch_size": 10,
		"norm_mean": [
			0.485,
			0.456,
			0.406
		],
		"norm_std": [
			0.229,
			0.224,
			0.225
		],
		"source": source_folder, 
		"target": target_folder 
	},
	"MODEL": {
        "arch": "resnet18",
		"device": "cpu",
		"n_layer_feature": -2
    	},
	"METHOD": {
		"name": "mmd",
		"kernel": "linear",
		"kernel_params": {
			"gamma": 1.0,
			"degree": 3.0,
			"coefficient0": 1.0 
		}
	}
}


# Compute the metric
dist = mmd.compute(mmd_config_json)

print("-"*80)
print(f"metric name: {mmd_config_json['METHOD']["name"]}")
print(f"source folder: {source_folder} ({len(os.listdir(source_folder))} images)")
print(f"target folder: {target_folder} ({len(os.listdir(target_folder))} images)")
print("Preprocessing:")
print(" "*4+f"-image resize: ({mmd_config_json["DATA"]["width"]},{mmd_config_json["DATA"]["height"]})")
print(" "*4+"-image normalize:")
print(" "*8+f"-mean: {mmd_config_json["DATA"]["norm_mean"]}")
print(" "*8+f"-std: {mmd_config_json["DATA"]["norm_std"]}")
print(f"device: {mmd_config_json["MODEL"]["device"]}")
print(f"model architecture: {mmd_config_json["MODEL"]["arch"]}")
print(f"feature extraction layer: {mmd_config_json["MODEL"]["n_layer_feature"]}")
print(f"Maximum Mean Discrepancy score: {dist}")
print("-"*80)

# remove generated source and target folder
# !rm -r synthetic_source_images/ synthetic_target_images/




same features
torch.Size([100, 512])
--------------------------------------------------------------------------------
metric name: mmd
source folder: /home/yoann/data/synthetic_source_images (100 images)
target folder: /home/yoann/data/synthetic_target_images (100 images)
Preprocessing:
    -image resize: (224,224)
    -image normalize:
        -mean: [0.485, 0.456, 0.406]
        -std: [0.229, 0.224, 0.225]
device: cpu
model architecture: resnet18
feature extraction layer: -2
Maximum Mean Discrepancy score: 0.05866938456892967
--------------------------------------------------------------------------------


# CMD: Central Moments Discrepancy

In [1]:
import warnings
warnings.filterwarnings('ignore')

from dqm.domain_gap.metrics import CMD

import torch
import os
from PIL import Image

# Instanciate the metric class
cmd = CMD()

# Generate synthetic image dataset folders
def generate_image_dataset(num_images, height, width, folder_name):
    """
    Generate a set of random images saved to a specified folder.
    Each image will have 3 channels (RGB) with pixel values in the range [0, 255].
    """
    os.makedirs(folder_name, exist_ok=True)
    
    for i in range(num_images):
        # Create random image data with RGB values in the range [0, 255]
        img_array = torch.randint(0, 256, (height, width, 3), dtype=torch.uint8).numpy()
        img = Image.fromarray(img_array)
        
        # Save image to folder
        img.save(os.path.join(folder_name, f"img_{i:04d}.png"))

# Paths to synthetic image folders
source_folder = "/home/yoann/data/synthetic_source_images_mini"
target_folder = "/home/yoann/data/synthetic_target_images_mini"

# Generate synthetic datasets
#generate_image_dataset(100, 299, 299, source_folder)
#generate_image_dataset(100, 299, 299, target_folder)

# Define your own config file, you can find examples in dqm/domain_gap/cfg/{metric_name}
cmd_config_json = {
	"DATA": {
		"height": 224,
		"width": 224,
		"batch_size": 10,
		"norm_mean": [
			0.485,
			0.456,
			0.406
		],
		"norm_std": [
			0.229,
			0.224,
			0.225
		],
		"source": source_folder,
		"target": target_folder
	},
	"MODEL": {
		"arch": "resnet18",
        "n_layer_feature" : [
            "maxpool",
            "layer1.1.relu_1",
            "layer2.1.relu_1", 
            "layer3.1.relu_1", 
            "layer4.1.relu_1"],
        "feature_extractors_layers_weights" : [1, 1, 1, 1, 1],
        "device": "cpu"
	},
	"METHOD": {
		"name": "cmd",
        "k": 5
	}
}


# Compute the metric
dist = cmd.compute(cmd_config_json)

print("-"*80)
print(f"metric name: {cmd_config_json['METHOD']["name"]}")
print(f"source folder: {source_folder} ({len(os.listdir(source_folder))} images)")
print(f"target folder: {target_folder} ({len(os.listdir(target_folder))} images)")
print("Preprocessing:")
print(" "*4+f"-image resize: ({cmd_config_json["DATA"]["width"]},{cmd_config_json["DATA"]["height"]})")
print(" "*4+"-image normalize:")
print(" "*8+f"-mean: {cmd_config_json["DATA"]["norm_mean"]}")
print(" "*8+f"-std: {cmd_config_json["DATA"]["norm_std"]}")
print(f"device: {cmd_config_json["MODEL"]["device"]}")
print(f"model architecture: {cmd_config_json["MODEL"]["arch"]}")
print(f"feature extraction layer: {cmd_config_json["MODEL"]["n_layer_feature"]}")
print(f"Central Moments Discrepancy score: {dist}")
print("-"*80)

# remove generated source and target folder
# !rm -r synthetic_source_images/ synthetic_target_images/


--------------------------------------------------------------------------------
metric name: cmd
source folder: /home/yoann/data/synthetic_source_images_mini (10 images)
target folder: /home/yoann/data/synthetic_target_images_mini (10 images)
Preprocessing:
    -image resize: (224,224)
    -image normalize:
        -mean: [0.485, 0.456, 0.406]
        -std: [0.229, 0.224, 0.225]
device: cpu
model architecture: resnet18
feature extraction layer: ['maxpool', 'layer1.1.relu_1', 'layer2.1.relu_1', 'layer3.1.relu_1', 'layer4.1.relu_1']
Central Moments Discrepancy score: 0.009210899472236633
--------------------------------------------------------------------------------


In [7]:
print(len(os.listdir(source_folder)))
print(len(os.listdir(target_folder)))

10
10


In [5]:
print(source_folder)

/home/yoann/data/synthetic_source_images_mini


In [None]:
# 
6650915840.0
105973.578125

2573.7099609375