# MixUp variants: code overview

## Classic MixUp (V1)

### Training

In [None]:
for i, (x, y) in enumerate(loader):
    # Move the batch and its labels to the training device, as float tensors.
    x, y = x.to(self.device).float(), y.to(self.device).float()

    # The mix itself is built without gradient tracking.
    with torch.no_grad():
        # Pair every sample with another sample of the same batch.
        perm = torch.randperm(x.shape[0])
        x_shuffle, y_shuffle = x[perm], y[perm]
        # The mixed labels are discarded here: the criterion below receives
        # both label sets and the coefficient instead.
        x_mix, _ = self.mixup(x, x_shuffle, y, y_shuffle)

    self.optim.zero_grad()

    pred_mix = self.activation(self.model(x_mix), dim=1)

    # Coefficient sampled by the mixup call above.
    lambda_ = self.mixup.get_lambda()
    loss = self.criterion(pred_mix, y, y_shuffle, lambda_)
    loss.backward()
    self.optim.step()

    # skip metrics here...

### Criterion

In [None]:
class MixUpLoss(Module):
	"""
		MixUp criterion: combines the loss computed against each of the two label sets,
		weighted by the coefficient used during the mix.

		loss = lambda * criterion(pred, labels_a) + (1 - lambda) * criterion(pred, labels_b)
	"""

	def __init__(self, criterion: Optional[Callable] = None):
		"""
			:param criterion: Callable invoked as criterion(pred, labels).
				(default: None, in which case a new CrossEntropyWithVectors() is created)
		"""
		super().__init__()
		# The default criterion is created here rather than in the signature: a default
		# argument is evaluated once, when the class is defined, which would require
		# CrossEntropyWithVectors to exist at that point and would share one instance
		# between every MixUpLoss.
		if criterion is None:
			criterion = CrossEntropyWithVectors()
		self.criterion = criterion

	def forward(self, pred: Tensor, labels_a: Tensor, labels_b: Tensor, lambda_: float) -> Tensor:
		"""
			:param pred: Output of the model for the mixed batch.
			:param labels_a: True labels without shuffle.
			:param labels_b: True labels with shuffle.
			:param lambda_: Coefficient used during the mix.
			:returns: lambda_ * criterion(pred, labels_a) + (1 - lambda_) * criterion(pred, labels_b).
		"""
		loss_a = self.criterion(pred, labels_a)
		loss_b = self.criterion(pred, labels_b)
		return lambda_ * loss_a + (1.0 - lambda_) * loss_b

### MixUp
For supervised training, we create a MixUp module:

In [None]:
# lambda ~ Beta(0.4, 0.4), without forcing lambda = max(lambda, 1 - lambda).
MixUp(alpha=0.4, apply_max=False)

In [None]:
class MixUp(Module):
	"""
		MixUp module: mixes two batches and their labels with a coefficient lambda sampled from a Beta distribution.

		Code overview :

		lambda ~ Beta(alpha, alpha) \n
		lambda = max(lambda, 1 - lambda)  (only if apply_max is True) \n
		batch = batch_a * lambda + batch_b * (1 - lambda) \n
		label = label_a * lambda + label_b * (1 - lambda) \n

		Note:
			- if alpha <= 0, nothing is sampled and lambda is always 1.0 (the outputs equal batch_a and labels_a),
			- if alpha -> 0 and apply_max == True, lambda sampled near 1,
			- if alpha -> 1 and apply_max == True, lambda sampled from uniform distribution in [0.5, 1.0],
			- if alpha -> 0 and apply_max == False, lambda sampled near 1 or 0,
			- if alpha -> 1 and apply_max == False, lambda sampled from uniform distribution in [0.0, 1.0],
	"""

	def __init__(self, alpha: float = 0.75, apply_max: bool = True):
		"""
			Build the MixUp Module.

			:param alpha: Controls the Beta distribution used to sample the coefficient lambda. (default: 0.75)
				If alpha <= 0, no distribution is built and lambda is fixed to 1.0.
			:param apply_max: If True, apply the "lambda = max(lambda, 1 - lambda)" after the sampling of lambda. (default: True)
				This operation is useful for having a mixed batch near to the first batch passed as input.
				It was set to True in MixMatch training but not in MixUp training.
		"""
		super().__init__()
		# Beta(alpha, alpha) requires alpha > 0 and its constructor rejects other values when
		# argument validation is enabled, so it is only built when it will actually be sampled.
		self.beta = Beta(alpha, alpha) if alpha > 0.0 else None
		self.apply_max = apply_max
		self._lambda = 0.0

	def _sample(self) -> float:
		"""
			Draw a raw lambda: a sample of the Beta distribution, or 1.0 when there is none (alpha <= 0).
		"""
		if self.beta is None:
			return 1.0
		return self.beta.sample().item()

	def forward(self, batch_a: Tensor, batch_b: Tensor, labels_a: Tensor, labels_b: Tensor) -> (Tensor, Tensor):
		"""
			Apply MixUp to batches and labels.

			:param batch_a: First batch.
			:param batch_b: Second batch, must have the same shape as batch_a.
			:param labels_a: Labels of the first batch.
			:param labels_b: Labels of the second batch, must have the same shape as labels_a.
			:returns: The tuple (mixed batch, mixed labels).
			:raises RuntimeError: If the two batches or the two label tensors have different shapes.
		"""
		if batch_a.shape != batch_b.shape or labels_a.shape != labels_b.shape:
			# Plain "{}" fields: shapes do not support the "s" format spec, which made
			# this line fail with a TypeError instead of reporting the mismatch.
			raise RuntimeError("Invalid shapes for MixUp : ({} != {} or {} != {})".format(
				batch_a.shape, batch_b.shape, labels_a.shape, labels_b.shape))

		# A new coefficient is drawn at every call and kept for get_lambda().
		self._lambda = self._sample()
		if self.apply_max:
			self._lambda = max(self._lambda, 1.0 - self._lambda)

		batch_mix = batch_a * self._lambda + batch_b * (1.0 - self._lambda)
		labels_mix = labels_a * self._lambda + labels_b * (1.0 - self._lambda)

		return batch_mix, labels_mix

	def get_lambda(self) -> float:
		"""
			Returns the last lambda sampled. If no data has been passed to forward(), returns 0.0.
		"""
		return self._lambda

## MixUp mix labels (V2)
The criterion becomes the classic cross-entropy, and the training now uses the mixed labels.

### Training

In [None]:
for i, (x, y) in enumerate(loader):
    # Move the batch and its labels to the training device, as float tensors.
    x, y = x.to(self.device).float(), y.to(self.device).float()

    # The mix itself is built without gradient tracking.
    with torch.no_grad():
        # Pair every sample with another sample of the same batch.
        perm = torch.randperm(x.shape[0])
        x_shuffle, y_shuffle = x[perm], y[perm]
        # Both outputs are kept: the mixed labels are the targets of the loss below.
        x_mix, y_mix = self.mixup(x, x_shuffle, y, y_shuffle)

    self.optim.zero_grad()

    pred_mix = self.activation(self.model(x_mix), dim=1)

    loss = self.criterion(pred_mix, y_mix)
    loss.backward()
    self.optim.step()

    # skip metrics here...


### Criterion

In [None]:
class CrossEntropyWithVectors(Module):
    """
        Cross-entropy between a batch of predicted distributions and a batch of target distributions.
        Input and targets are expected to be (batch_size, nb_classes) tensors of probabilities.

        The loss is -sum(log(input) * targets) over the chosen dimension, passed to the reduction function.
    """
    def __init__(self, reduction: str = "batchmean", dim: Optional[int] = 1, log_input: bool = False):
        """
            :param reduction: Name given to get_reduction_from_name() to select the reduction. (default: "batchmean")
            :param dim: Dimension summed over when forward() receives no dim. (default: 1)
            :param log_input: If True, torch.log() is not applied to the input in forward(). (default: False)
        """
        super().__init__()
        self.log_input = log_input
        self.dim = dim
        self.reduce_fn = get_reduction_from_name(reduction)

    def forward(self, input_: Tensor, targets: Tensor, dim: Optional[int] = None) -> Tensor:
        """
            Compute the cross-entropy of the input with the targets.

            :param input_: Predictions; the log is applied to them unless log_input is True.
            :param targets: Target distributions, multiplied element-wise with the (log) input.
            :param dim: Dimension summed over; falls back to the dim given at construction when None.
            :returns: The result of the reduction function applied to the summed loss.
        """
        sum_dim = self.dim if dim is None else dim
        log_probs = input_ if self.log_input else torch.log(input_)
        summed = -torch.sum(log_probs * targets, dim=sum_dim)
        return self.reduce_fn(summed)

## MixUp smooth (V3)
Only the criterion changes; the rest is the same as in the classic MixUp (V1).

### Criterion

In [None]:
class MixUpLossSmooth(Module):
	"""
		MixUp criterion where each label set is also scaled by its own mixing weight before the criterion is applied.

		loss = lambda * criterion(pred, labels_a * lambda) + (1 - lambda) * criterion(pred, labels_b * (1 - lambda))
	"""

	def __init__(self, criterion: Optional[Callable] = None):
		"""
			:param criterion: Callable invoked as criterion(pred, labels).
				(default: None, in which case a new CrossEntropyWithVectors() is created)
		"""
		super().__init__()
		# The default criterion is created here rather than in the signature: a default
		# argument is evaluated once, when the class is defined, and the resulting
		# instance would be shared between every MixUpLossSmooth.
		if criterion is None:
			criterion = CrossEntropyWithVectors()
		self.criterion = criterion

	def forward(self, pred: Tensor, labels_a: Tensor, labels_b: Tensor, lambda_: float) -> Tensor:
		"""
			:param pred: Output of the model for the mixed batch.
			:param labels_a: True labels without shuffle.
			:param labels_b: True labels with shuffle.
			:param lambda_: Coefficient used during the mix.
			:returns: The weighted sum of the criterion applied to the two scaled label sets.
		"""
		weight_a = lambda_
		weight_b = 1.0 - lambda_
		loss_a = self.criterion(pred, labels_a * weight_a)
		loss_b = self.criterion(pred, labels_b * weight_b)
		return weight_a * loss_a + weight_b * loss_b