In [None]:
import torch
from torch.optim import Adam
from torch.nn import functional as F

def feature_emd(student_feats, teacher_feats):
    """Return a differentiable 1-D earth mover's distance between two feature batches.

    torch.nn.functional has no ``emd_loss``, so the distance is implemented
    here. Each sample's features are flattened and treated as an
    equal-weight empirical distribution of scalar values. For two such
    distributions with the same number of values, the 1-D EMD equals the
    mean absolute difference between the sorted values.

    Args:
        student_feats: tensor whose first dimension indexes the samples.
        teacher_feats: tensor with the same number of samples and the same
            number of values per sample as ``student_feats``.

    Returns:
        Scalar tensor: the per-sample EMD averaged over the batch.

    Raises:
        ValueError: if the flattened shapes of the two inputs differ.
    """
    student_flat = student_feats.flatten(start_dim=1)
    teacher_flat = teacher_feats.flatten(start_dim=1)
    if student_flat.shape != teacher_flat.shape:
        raise ValueError(
            "student and teacher features must have the same number of "
            f"values per sample, got {tuple(student_flat.shape)} and "
            f"{tuple(teacher_flat.shape)}"
        )
    student_sorted, _ = torch.sort(student_flat, dim=1)
    teacher_sorted, _ = torch.sort(teacher_flat, dim=1)
    return (student_sorted - teacher_sorted).abs().mean()


# Define the teacher network and student network
# (TeacherNetwork / StudentNetwork must be defined before this cell runs)
teacher = TeacherNetwork()
student = StudentNetwork()

# Define the optimizer for the student network; only the student is trained
optimizer = Adam(student.parameters())

# Define a dataset of input examples (random stand-in data)
inputs = torch.randn(100, 3, 32, 32)

# Compute teacher features once, as fixed targets. Autograd is disabled so the
# loss does not carry a teacher graph that every backward() call would have to
# traverse again (a second traversal of an already-freed graph raises a
# RuntimeError).
with torch.no_grad():
    teacher_features = teacher.extract_features(inputs)

# Training loop (num_epochs must be defined before this cell runs)
for i in range(num_epochs):
    # Compute the student's predicted features
    student_features = student.extract_features(inputs)

    # Compute the EMD loss between the student's features and the teacher's features
    emd_loss = feature_emd(student_features, teacher_features)

    # Backpropagate the loss and update the student's parameters
    optimizer.zero_grad()
    emd_loss.backward()
    optimizer.step()


Here we use the extract_features function to get the feature maps from the teacher and student networks, and then calculate the EMD loss between them. As I've said before, PyTorch does not have a built-in function for EMD, so you will have to use one from another library or implement it yourself. It's also worth noting that this is just a simplified example, and you'll likely want to use more advanced techniques such as data augmentation and learning rate scheduling to improve performance. Also, the EMD loss alone may not be enough to train the student; it is common to use it in combination with other losses such as cross-entropy or MSE.

In [None]:
import torch
from torch.optim import Adam
from torch.nn import functional as F

def output_emd(student_logits, teacher_logits):
    """Return a differentiable EMD between two batches of class predictions.

    torch.nn.functional has no ``emd_loss``, so the distance is implemented
    here. Each row is turned into a probability distribution with softmax.
    For distributions over ordered bins with unit spacing, the EMD is the
    sum of absolute differences between the two cumulative distributions.

    NOTE(review): this assumes both networks return raw logits of shape
    (batch, num_classes) and that the class order is meaningful as a ground
    distance -- confirm against the model definitions.

    Args:
        student_logits: student network outputs; classes on the last axis.
        teacher_logits: teacher network outputs with the same shape.

    Returns:
        Scalar tensor: the per-sample EMD averaged over the batch.

    Raises:
        ValueError: if the two inputs have different shapes.
    """
    if student_logits.shape != teacher_logits.shape:
        raise ValueError(
            "student and teacher outputs must have the same shape, got "
            f"{tuple(student_logits.shape)} and {tuple(teacher_logits.shape)}"
        )
    student_cdf = torch.cumsum(F.softmax(student_logits, dim=-1), dim=-1)
    teacher_cdf = torch.cumsum(F.softmax(teacher_logits, dim=-1), dim=-1)
    return (student_cdf - teacher_cdf).abs().sum(dim=-1).mean()


# Define the teacher network and student network
# (TeacherNetwork / StudentNetwork must be defined before this cell runs)
teacher = TeacherNetwork()
student = StudentNetwork()

# Define the optimizer for the student network; only the student is trained
optimizer = Adam(student.parameters())

# Define a dataset of input examples (random stand-in data)
inputs = torch.randn(100, 3, 32, 32)

# The teacher's predictions are fixed targets. Autograd is disabled so the loss
# does not carry a teacher graph that every backward() call would have to
# traverse again (a second traversal of an already-freed graph raises a
# RuntimeError).
with torch.no_grad():
    targets = teacher(inputs)

# Training loop (num_epochs must be defined before this cell runs)
for i in range(num_epochs):
    # Compute the student's predicted outputs
    student_outputs = student(inputs)

    # Compute the EMD loss between the student's predictions and the teacher's predictions
    emd_loss = output_emd(student_outputs, targets)

    # Backpropagate the loss and update the student's parameters
    optimizer.zero_grad()
    emd_loss.backward()
    optimizer.step()


Please note that PyTorch does not have a built-in function for EMD, so you will have to use one from another library or implement it yourself.

It's also worth noting that this is just a simplified example, and you'll likely want to use more advanced techniques such as data augmentation and learning rate scheduling to improve performance. Also, the EMD loss alone may not be enough to train the student; it is common to use it in combination with other losses such as cross-entropy or MSE.

In [None]:
import torch
from torch.nn import functional as F

# SciPy's assignment solver is used to obtain the optimal flow below
from scipy.optimize import linear_sum_assignment

# Define the two sets of feature maps (random stand-ins for real activations)
student_features = torch.randn(100, 512, 8, 8)
teacher_features = torch.randn(100, 512, 8, 8)

# Define a distance matrix, which measures the dissimilarity between each pair of features.
# Entry (i, j) is the Euclidean distance between flattened student sample i and
# flattened teacher sample j. F.pairwise_distance only compares row i with
# row i (100 values), so torch.cdist is used to get the full 100 x 100 matrix.
distance_matrix = torch.cdist(student_features.view(100, -1), teacher_features.view(100, -1))

# Define a flow matrix, which represents the "flow" of probability mass between the two sets of features.
# Every sample carries the same mass (1/100) and both sets have the same size,
# so an optimal flow is a one-to-one matching: solve it as an assignment
# problem on the detached distances (the solver itself is not differentiable).
row_ind, col_ind = linear_sum_assignment(distance_matrix.detach().cpu().numpy())
flow_matrix = torch.zeros_like(distance_matrix)
flow_matrix[
    torch.as_tensor(row_ind, dtype=torch.long),
    torch.as_tensor(col_ind, dtype=torch.long),
] = 1.0 / distance_matrix.shape[0]

# Compute the EMD loss between the student's features and the teacher's features:
# total work = sum over all pairs of (mass moved) * (distance moved).
# Gradients flow through distance_matrix; flow_matrix is a constant.
emd_loss = torch.sum(flow_matrix * distance_matrix)


Earth Mover's Distance (EMD) is a measure of the difference between two probability distributions. It is also known as the Wasserstein distance, or the Mallows distance. The EMD is a way to measure the "distance" between two sets of points, where each point has a mass or weight associated with it. The EMD is the minimum amount of "work" required to transform one probability distribution into the other, where "work" is defined as the amount of mass moved multiplied by the distance it is moved. The EMD can be used in image processing, computer vision, and machine learning.

In [1]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def emd(p, q):
    """Return the cheapest total cost of pairing values of ``p`` with values of ``q``.

    The cost of pairing ``p[i]`` with ``q[j]`` is ``|p[i] - q[j]|``.
    ``linear_sum_assignment`` picks the one-to-one pairing with the smallest
    total cost, and that total is returned.
    """
    cost = np.zeros((len(p), len(q)))
    # Fill the whole cost table at once: entry (i, j) is |p[i] - q[j]|
    cost[...] = np.abs(np.subtract.outer(p, q))
    rows, cols = linear_sum_assignment(cost)
    return cost[rows, cols].sum()


# Example usage
p, q = [0.2, 0.3, 0.5], [0.1, 0.4, 0.5]
print(emd(p, q))


0.20000000000000004


This code creates two 1-dimensional arrays, p and q (each sums to 1, like a probability distribution), and calculates an EMD between them using the linear_sum_assignment function from the scipy library. Note that the function treats the entries as equally weighted points on the real line rather than as bin masses: the cost of pairing p[i] with q[j] is the absolute difference |p[i] - q[j]|, linear_sum_assignment finds the one-to-one pairing with the smallest total cost, and the function returns that total, which is the minimum amount of "work" required to move one set of points onto the other.

Note that this is a simple example that only works for 1D inputs; in practice you may want to use a library that handles higher-dimensional distributions and also supports a wide range of distance metrics.

In [2]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def emd(teacher, student):
    """Return the minimum total Euclidean cost of matching teacher rows to student rows.

    Each row is one vector. The cost of pairing teacher row ``i`` with
    student row ``j`` is the Euclidean distance between them, and
    ``linear_sum_assignment`` selects the one-to-one pairing with the
    smallest total cost. When the row counts differ, only
    ``min(m, n)`` pairs are matched.

    Args:
        teacher: 2-D array-like of shape (m, d).
        student: 2-D array-like of shape (n, d).

    Returns:
        The summed distance over the optimal pairing, as a NumPy float.

    Raises:
        ValueError: if either input is not 2-D, or the row widths differ.
    """
    # Accept nested lists as well as arrays
    teacher = np.asarray(teacher)
    student = np.asarray(student)
    if teacher.ndim != 2 or student.ndim != 2:
        raise ValueError(
            "teacher and student must be 2-D, got shapes "
            f"{teacher.shape} and {student.shape}"
        )
    # Without this check a width-1 input would silently broadcast against a
    # wider one and produce a meaningless cost matrix.
    if teacher.shape[1] != student.shape[1]:
        raise ValueError(
            "teacher and student rows must have the same length, got "
            f"{teacher.shape[1]} and {student.shape[1]}"
        )

    # Calculate the cost matrix: C[i, j] = ||teacher[i] - student[j]||
    C = np.linalg.norm(teacher[:, None, :] - student[None, :, :], axis=-1)

    # Use linear_sum_assignment to find the optimal one-to-one assignment of rows
    row_ind, col_ind = linear_sum_assignment(C)

    # Calculate the EMD as the sum of the costs of the optimal assignments
    return C[row_ind, col_ind].sum()


# Example usage
teacher_output = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.3]])
student_output = np.array([[0.2, 0.1], [0.4, 0.3], [0.3, 0.4]])
print(emd(teacher_output, student_output))


0.14142135623730953


This code creates two 2-dimensional arrays, teacher_output and student_output, whose rows are the vectors to compare, and calculates the EMD between them using the linear_sum_assignment function from the scipy library. The cost of pairing a teacher row with a student row is the distance between them; linear_sum_assignment finds the one-to-one pairing of rows with the smallest total cost, and the function returns that total, which is the minimum amount of "work" required to transform one set of rows into the other.

Note that in this example, the distance between the output of the teacher and the student is calculated as the Euclidean distance between the class probabilities. You can use any other distance metric that makes sense for your problem.

In [3]:
# Build a small example array and print its shape tuple
teacher = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.3]])
print(np.shape(teacher))

(3, 2)


The code in the next cell creates two sets of features, teacher_features and student_features, and calculates the EMD between them using the linear_sum_assignment function from the scipy library. The cost of pairing a teacher sample with a student sample is the distance between their feature vectors; linear_sum_assignment finds the one-to-one pairing of samples with the smallest total cost, and the function returns that total, which is the minimum amount of "work" required to transform one set of features into the other.

Note that in this example, the distance between the features of the teacher and the student is calculated as the Euclidean distance between the feature vectors. You can use any other distance metric that makes sense for your problem.

Also note that this code assumes that the features are already extracted and aligned; in real-world scenarios, you might need to preprocess the data and align the feature maps before calculating the EMD.

In [4]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def emd(teacher_features, student_features):
    """Return the minimum total Euclidean cost of matching teacher samples to student samples.

    Both arguments are 2-D arrays with one feature vector per row. The cost
    of pairing two rows is the Euclidean distance between them, and
    ``linear_sum_assignment`` selects the one-to-one pairing with the
    smallest total cost.
    """
    # Row counts of the two inputs (unpacking requires exactly two dimensions)
    num_teacher, _ = teacher_features.shape
    num_student, _ = student_features.shape

    # Cost table: entry (t, s) is the distance between teacher row t and student row s
    cost = np.zeros((num_teacher, num_student))
    for t, s in np.ndindex(num_teacher, num_student):
        cost[t, s] = np.linalg.norm(teacher_features[t] - student_features[s])

    # Optimal one-to-one pairing of samples
    matched_rows, matched_cols = linear_sum_assignment(cost)

    # The EMD is the total cost along the chosen pairing
    return cost[matched_rows, matched_cols].sum()


# Example usage
teacher_features = np.array([[1, 2], [3, 4], [4, 3]])
student_features = np.array([[2, 1], [4, 3], [3, 4]])
print(emd(teacher_features, student_features))


1.4142135623730951
