<a href="https://colab.research.google.com/github/Muzhi1920/awesome-models/blob/main/03-Loss%E4%B8%8E%E4%BC%98%E5%8C%96/00_loss_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import math
import numpy as np
from tensorflow.python.distribute import distribution_strategy_context as ds
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import candidate_sampling_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import custom_gradient
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import gen_array_ops  # pylint: disable=unused-import
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import gen_sparse_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import variables
from tensorflow.python.ops.losses import util as losses_util
from tensorflow.python.platform import device_context
from tensorflow.python.util import dispatch
from tensorflow.python.util.deprecation import deprecated_args
from tensorflow.python.util.deprecation import deprecated_argument_lookup
from tensorflow.python.util.tf_export import tf_export

## 0.准备工作

In [None]:
# Toy binary-classification batch: 8 random logits, shaped [8, 1].
logits = tf.random.normal(shape=[8,1])
# One Bernoulli draw per sample (counts=1). probs=[1.0] makes every draw 1,
# so all 8 labels are positive; the fixed seed keeps the draw reproducible.
binomial_samples = tf.random.stateless_binomial(shape=[8], seed=[1, 2], counts=[1]*8, probs=[1.0], output_dtype=dtypes.float32)
# Add a trailing axis so labels line up element-wise with the [8, 1] logits.
labels = tf.expand_dims(binomial_samples, axis=1)
logits,labels

(<tf.Tensor: shape=(8, 1), dtype=float32, numpy=
 array([[ 0.743362  ],
        [-0.04994878],
        [-0.12192969],
        [-0.6007877 ],
        [-0.9180826 ],
        [-0.53149897],
        [-2.1738625 ],
        [ 0.7629548 ]], dtype=float32)>,
 <tf.Tensor: shape=(8, 1), dtype=float32, numpy=
 array([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]], dtype=float32)>)

## 分类Loss

### 0.sigmoid_cross_entropy_with_logits
输入logits，输出sigmoid的交叉熵

- 逻辑回归的Loss公式：$x - x * z + log(1 + exp(-x))$
- 当 x < 0 且 |x| 很大时，exp(-x) 趋向 +∞，会导致数值上溢；此时将公式等价改写为：$- x * z + log(1 + exp(x))$
- 综合起来为：$relu(x) - x * z +log(1 + exp(-|x|))$


In [None]:
# Library reference: element-wise sigmoid cross-entropy computed from raw logits.
tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)

<tf.Tensor: shape=(8, 1), dtype=float32, numpy=
array([[0.38900542],
       [0.7184334 ],
       [0.7559693 ],
       [1.0379966 ],
       [1.2540432 ],
       [0.9938001 ],
       [2.2815838 ],
       [0.38273308]], dtype=float32)>

In [None]:
# Hand-written numerically stable form: relu(x) - x * z + log1p(exp(-|x|)).
# exp() only ever sees a non-positive argument, so it cannot overflow.
tf.nn.relu(logits) - logits * labels + math_ops.log1p(math_ops.exp(- tf.abs(logits)))

<tf.Tensor: shape=(8, 1), dtype=float32, numpy=
array([[0.38900542],
       [0.7184334 ],
       [0.7559693 ],
       [1.0379966 ],
       [1.2540432 ],
       [0.9938001 ],
       [2.2815838 ],
       [0.38273308]], dtype=float32)>

### 1.softmax_loss

#### softmax_cross_entropy_with_logits

- tf.nn.softmax_cross_entropy_with_logits
- tf.nn.softmax_cross_entropy_with_logits_v2

旧版（已弃用）在内部对labels做stop_gradient，梯度不会回传到labels；v2允许梯度同时回传到logits和labels，如不需要对labels求导，可先用tf.stop_gradient处理labels再传入

监督学习中labels通常是标记好的真值；但labels并不一定都是人工手动标注的，例如生成对抗网络（GAN）中label可以由网络生成，此时就需要梯度回传到labels。

In [None]:
# 8 samples x 3 classes of random logits.
logits = tf.random.normal(shape=[8,3])
# 24 independent Bernoulli(0.6) draws (counts=1), reshaped below to [8, 3];
# a row may therefore contain zero, one or several ones.
# NOTE(review): softmax cross-entropy expects each label row to be a valid
# probability distribution; these multi-hot rows are not -- confirm intent.
binomial_samples = tf.random.stateless_binomial(shape=[24], seed=[1, 2], counts=[1]*24, probs=[0.6], output_dtype=dtypes.float32)
labels = tf.reshape(binomial_samples, [8,3])
logits,labels

(<tf.Tensor: shape=(8, 3), dtype=float32, numpy=
 array([[ 0.5298147 ,  1.4655572 , -1.7060746 ],
        [ 0.38383943,  1.6901616 , -0.04928871],
        [-0.84147036, -1.469868  , -0.9128455 ],
        [-1.7267729 ,  0.70714355, -0.3341877 ],
        [ 2.1018481 , -1.8824366 , -2.1326375 ],
        [ 0.7706794 , -0.58798075, -0.56795937],
        [ 0.07821601, -0.4862161 , -0.89811707],
        [ 0.0670175 ,  0.62969047,  0.74955124]], dtype=float32)>,
 <tf.Tensor: shape=(8, 3), dtype=float32, numpy=
 array([[0., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 1.],
        [1., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [1., 1., 0.]], dtype=float32)>)

In [None]:
# Softmax cross-entropy over the class axis; yields one loss value per sample.
tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)

<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([0.       , 2.1085496, 1.5304103, 4.2054744, 4.049399 , 1.7768488,
       1.2298858, 2.5469708], dtype=float32)>

#### sparse_softmax_cross_entropy_with_logits

In [None]:
# 8 samples x 3 classes of random logits.
logits = tf.random.normal(shape=[8,3])
# Sparse labels: one integer class id per sample. Binomial(n=2, p=0.4) yields
# values in {0, 1, 2}, i.e. valid indices for 3 classes. np.random is not
# seeded here, so the ids change between runs.
binomial_samples = np.random.binomial(2, 0.4, size=8)
labels = tf.reshape(binomial_samples, [8])
logits,labels

(<tf.Tensor: shape=(8, 3), dtype=float32, numpy=
 array([[-0.6791042 ,  0.8999842 , -0.15620121],
        [ 1.5036064 , -1.6424059 ,  1.0729109 ],
        [-0.78484875,  0.08258355, -1.0332958 ],
        [-0.22827785, -0.03214472, -0.36922324],
        [-0.16294631,  0.83794194,  1.0936366 ],
        [ 1.3484765 ,  0.8223789 , -0.48319885],
        [-1.1132156 , -1.5437346 ,  2.0449705 ],
        [-0.5141856 ,  0.17654698, -1.1332694 ]], dtype=float32)>,
 <tf.Tensor: shape=(8,), dtype=int64, numpy=array([1, 1, 1, 1, 1, 1, 1, 1])>)

In [None]:
# Same loss as the dense softmax variant, but labels are class indices
# (shape [8]) instead of per-class rows.
tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)

<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([0.4407955 , 3.6725616 , 0.55827534, 0.9304917 , 0.9779167 ,
       1.0863141 , 3.6564918 , 0.5715886 ], dtype=float32)>

### 2.负采样Loss

仅考虑召回场景，labels是next item id

#### 1.nce loss
- _compute_sampled_logits
  1. weights：所有类别（item）的embedding矩阵 [num_classes, 64]
  2. biases：所有类别的biases [num_classes,]
  3. labels：[1024, 1]，batch_size内的item id（正例）
  4. inputs：[1024, 64]，NN网络的输出
  5. num_sampled：每batch的负例随机采样数
  6. num_classes：类别（item）总数
  7. num_true=1：每个样本的target class个数为1
  8. sampled_values=None：不传时由内部的候选采样器生成负例
- call -> sigmoid_cross_entropy_with_logits

In [None]:
# Toy sampled-loss setup: 8 classes, each with a 6-dim embedding row ...
weights = tf.random.normal(shape=[8,6])
# ... and one bias per class.
bias = tf.random.normal(shape=[8])
# Batch of 4 examples whose true class ids are 0..3, shaped [4, 1].
labels = tf.reshape(tf.constant([0,1,2,3]),[4,1])
# 6-dim input vector per example (same width as the class embeddings).
inputs = tf.random.normal(shape=[4,6])
weights,bias,labels,inputs

(<tf.Tensor: shape=(8, 6), dtype=float32, numpy=
 array([[ 1.5868555e+00, -1.3844897e+00,  9.1051930e-01,  5.9027994e-01,
          3.2574993e-01, -1.2678149e+00],
        [-3.1594810e-01,  6.1188310e-01,  2.2674493e-01, -9.4877654e-01,
         -4.3090087e-01,  4.0671283e-01],
        [-4.7653633e-01,  6.5542066e-01, -1.1961337e-02,  2.7360048e-02,
          1.9327186e-02, -1.1802855e+00],
        [-1.4216665e+00,  6.9922519e-01, -1.1515529e+00, -1.8073394e+00,
          1.1251889e-03, -1.3351833e+00],
        [-9.4146651e-01, -2.1295372e-01,  5.1290399e-01,  2.3094016e-01,
         -1.3172195e+00, -7.6489991e-01],
        [-1.7267352e+00,  5.3442007e-01,  4.4312009e-01,  4.3158332e-01,
         -6.2142098e-01, -4.2784286e-01],
        [ 1.3126055e+00, -7.5113978e-03,  2.9865471e-01, -4.7692153e-01,
          4.0922251e-01,  1.5048288e+00],
        [-8.8414259e-02, -4.5963374e-01, -1.2771840e-01, -3.8791093e-01,
         -4.7713992e-01,  2.9816005e-01]], dtype=float32)>,
 <tf.Tensor: 

In [None]:
# NCE loss per example, sampling 3 negative classes out of the 8 available.
tf.nn.nce_loss(weights=weights,biases=bias,labels=labels,inputs=inputs,num_sampled=3,num_classes=8)

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([4.79131  , 5.286718 , 5.2074556, 2.206083 ], dtype=float32)>

In [None]:
from tensorflow.python.ops.nn_impl import _compute_sampled_logits as csl

In [None]:
# First stage of the sampled losses: build logits/labels of shape [4, 1 + 3].
# Column 0 holds the true class, the remaining 3 columns the sampled negatives.
logits,s_labels = csl(weights=weights,biases=bias,labels=labels,inputs=inputs,num_sampled=3,num_classes=8)
logits,s_labels

(<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[ 3.3078656 , -2.2302356 , -0.25395274,  0.8061598 ],
        [-1.6487408 , -1.6487408 ,  0.7910253 ,  3.9377856 ],
        [-0.5043384 , -0.6995781 , -0.5043384 ,  1.9978073 ],
        [ 7.0577607 ,  1.2053163 , -0.33884823,  4.0117917 ]],
       dtype=float32)>, <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]], dtype=float32)>)

In [None]:
def _sum_rows(x):
  """Collapses a 2-D tensor into a 1-D tensor of per-row sums.

  The reduction is written as a matrix product with a column of ones,
  followed by a flatten.

  Args:
    x: a rank-2 `Tensor`.

  Returns:
    A rank-1 `Tensor` whose i-th entry is the sum of row i of `x`.
  """
  num_columns = array_ops.shape(x)[1]
  # A [num_columns, 1] column of ones in the same dtype as the input.
  column_of_ones = array_ops.ones(
      array_ops.stack([num_columns, 1]), x.dtype)
  row_totals = math_ops.matmul(x, column_of_ones)
  # Drop the trailing unit dimension left by the matmul.
  return array_ops.reshape(row_totals, [-1])
# Second stage of NCE: sigmoid cross-entropy on every (true + sampled) logit,
# then summed per row to give one loss value per example.
sampled_losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=s_labels, logits=logits, name="sampled_losses")
_sum_rows(sampled_losses)

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([1.8876141, 7.1225023, 3.9775438, 6.035969 ], dtype=float32)>

由于每次执行，随机负采样的样本不同，故计算结果不同

#### 2.sampled_softmax_loss
与NCE的区别仅在于计算loss时使用softmax而不是sigmoid，其他相同
- _compute_sampled_logits
- call -> softmax_cross_entropy_with_logits_v2

In [None]:
# Same sampling stage as NCE: logits/labels of shape [4, 1 + 3] with the true
# class in column 0. Negatives are re-drawn on every call.
logits,s_labels = csl(weights=weights,biases=bias,labels=labels,inputs=inputs,num_sampled=3,num_classes=8)
logits,s_labels

(<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[ 3.3613472 ,  3.3613472 , -0.6248181 , -0.9899025 ],
        [-1.8918233 ,  2.71918   ,  0.42015994, -0.79662657],
        [-0.8752037 ,  2.0896254 , -0.8752037 , -0.10070443],
        [ 6.615677  , -6.2109795 , -0.7097136 ,  1.2547957 ]],
       dtype=float32)>, <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]], dtype=float32)>)

In [None]:
# Block gradients into the labels, then apply softmax cross-entropy across the
# 1 + 3 candidate columns of each row.
s_labels = array_ops.stop_gradient(s_labels, name="labels_stop_gradient")
sampled_losses = nn_ops.softmax_cross_entropy_with_logits_v2(labels=s_labels, logits=logits)
sampled_losses

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.7087555, 4.7420516, 3.1595883, 0.0053438], dtype=float32)>

#### 3.对比学习InfoNCE loss
定义损失：
$$L(i,j) = -\log \frac{ e^{s_{i,j}/\tau} }{ \sum_{k=1}^{2N} \mathbb{1}_{[k \neq i]} \, e^{s_{i,k}/\tau} }$$

得到总的Loss为：
$$Loss =  \frac{1}{2N} \sum_{k=1}^{N} [L(2k-1,2k) + L(2k,2k-1)] $$

参考[对比学习loss](https://github.com/google-research/simclr/blob/master/objective.py)

In [None]:
# Subtracted from self-similarity logits so that softmax assigns them ~0 weight.
LARGE_NUM = 1e9


def _tpu_cross_replica_concat(tensor, tpu_context, replica_id):
  """Concatenates `tensor` from every TPU replica along axis 0.

  Follows `tpu_cross_replica_concat` from the SimCLR reference implementation.

  NOTE(review): assumes `tpu_context` exposes `num_replicas` and that `tensor`
  has a fully defined static shape -- confirm against the caller.

  Args:
    tensor: the local `Tensor` to gather.
    tpu_context: context information for tpu.
    replica_id: int32 scalar `Tensor`, the id of the current replica.

  Returns:
    `tensor` unchanged when there is at most one replica; otherwise a `Tensor`
    whose leading dimension is `num_replicas * tensor.shape[0]`.
  """
  num_replicas = tpu_context.num_replicas
  if num_replicas <= 1:
    return tensor

  # Put the local values into this replica's slot of a zero-filled
  # [num_replicas, ...] tensor.
  ext_tensor = tf.scatter_nd(
      indices=[[replica_id]],
      updates=[tensor],
      shape=[num_replicas] + tensor.shape.as_list())
  # Every slot is non-zero on exactly one replica, so summing across replicas
  # yields the full tensor everywhere.
  ext_tensor = tf.compat.v1.tpu.cross_replica_sum(ext_tensor)
  # Flatten the replica dimension into the batch dimension.
  return tf.reshape(ext_tensor, [-1] + ext_tensor.shape.as_list()[2:])


def add_contrastive_loss(hidden,
                         hidden_norm=True,
                         temperature=1.0,
                         tpu_context=None,
                         weights=1.0):
  """Compute the contrastive (InfoNCE / NT-Xent) loss for paired views.

  Args:
    hidden: hidden vector (`Tensor`) of shape (2 * bsz, dim). It is split in
      half along axis 0; row i of the first half and row i of the second half
      form a positive pair.
    hidden_norm: whether or not to use normalization on the hidden vector.
    temperature: a `floating` number for temperature scaling.
    tpu_context: context information for tpu. When None, only the local batch
      supplies negatives.
    weights: a weighting number, or a vector of shape (bsz,), multiplied into
      the per-example loss.

  Returns:
    The per-example loss, a `Tensor` of shape (bsz,): the two directions
      (first view -> second view, second view -> first view) added together.
      It is not reduced to a scalar.
    The logits for contrastive prediction task (first view vs. second view).
    The labels for contrastive prediction task.
  """
  # Get (normalized) hidden1 and hidden2.
  if hidden_norm:
    hidden = tf.math.l2_normalize(hidden, -1)
  hidden1, hidden2 = tf.split(hidden, 2, 0)
  batch_size = tf.shape(hidden1)[0]

  # Gather hidden1/hidden2 across replicas and create local labels.
  if tpu_context is not None:
    # Imported lazily: only the TPU path needs the XLA ops.
    from tensorflow.compiler.tf2xla.python import xla  # pylint: disable=g-import-not-at-top
    # TODO(iamtingchen): more elegant way to convert u32 to s32 for replica_id.
    replica_id = tf.cast(tf.cast(xla.replica_id(), tf.uint32), tf.int32)
    hidden1_large = _tpu_cross_replica_concat(hidden1, tpu_context, replica_id)
    hidden2_large = _tpu_cross_replica_concat(hidden2, tpu_context, replica_id)
    enlarged_batch_size = tf.shape(hidden1_large)[0]
    # Local row i sits at offset replica_id * batch_size in the gathered batch.
    labels_idx = tf.range(batch_size) + replica_id * batch_size
    labels = tf.one_hot(labels_idx, enlarged_batch_size * 2)
    masks = tf.one_hot(labels_idx, enlarged_batch_size)
  else:
    hidden1_large = hidden1
    hidden2_large = hidden2
    labels = tf.one_hot(tf.range(batch_size), batch_size * 2)
    masks = tf.one_hot(tf.range(batch_size), batch_size)

  # Same-view similarities; the mask removes each example's similarity with
  # itself so it cannot act as a candidate.
  logits_aa = tf.matmul(hidden1, hidden1_large, transpose_b=True) / temperature
  logits_aa = logits_aa - masks * LARGE_NUM
  logits_bb = tf.matmul(hidden2, hidden2_large, transpose_b=True) / temperature
  logits_bb = logits_bb - masks * LARGE_NUM
  # Cross-view similarities; the positive pair lies on the labelled column.
  logits_ab = tf.matmul(hidden1, hidden2_large, transpose_b=True) / temperature
  logits_ba = tf.matmul(hidden2, hidden1_large, transpose_b=True) / temperature

  loss_a = tf.nn.softmax_cross_entropy_with_logits(
      labels, tf.concat([logits_ab, logits_aa], 1))
  loss_b = tf.nn.softmax_cross_entropy_with_logits(
      labels, tf.concat([logits_ba, logits_bb], 1))
  # Apply the caller's weighting; the default of 1.0 leaves the loss unchanged.
  loss = (loss_a + loss_b) * tf.cast(weights, loss_a.dtype)

  return loss, logits_ab, labels

In [None]:
# Reuse the [4, 4] sampled-logits tensor as hidden vectors: it is split into
# two views of 2 examples each, with 4-dim features.
add_contrastive_loss(hidden=logits)

(<tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.9669977, 3.9188712], dtype=float32)>,
 <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
 array([[ 0.39735496,  0.01248601],
        [ 0.8449203 , -0.9734093 ]], dtype=float32)>,
 <tf.Tensor: shape=(2, 4), dtype=float32, numpy=
 array([[1., 0., 0., 0.],
        [0., 1., 0., 0.]], dtype=float32)>)

##### 代码细节

In [None]:
# Step-by-step replay of add_contrastive_loss with its default settings.
hidden = logits
hidden_norm = True
temperature = 1.0

In [None]:
# L2-normalize each row so the dot products below are cosine similarities.
if hidden_norm:
  hidden = tf.math.l2_normalize(hidden, -1)
# Halve along axis 0: first half is view 1, second half is view 2.
hidden1, hidden2 = tf.split(hidden, 2, 0)
batch_size = tf.shape(hidden1)[0]
hidden1,hidden2

(<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
 array([[ 0.68659556,  0.68659556, -0.1276266 , -0.20219949],
        [-0.5511028 ,  0.7921183 ,  0.12239586, -0.23206352]],
       dtype=float32)>, <tf.Tensor: shape=(2, 4), dtype=float32, numpy=
 array([[-0.36005217,  0.85965604, -0.36005217, -0.04142904],
        [ 0.72002465, -0.67597896, -0.07724248,  0.1365671 ]],
       dtype=float32)>)

In [None]:
# Non-TPU path: the candidate set is just the local batch.
hidden1_large = hidden1
hidden2_large = hidden2
# Row i's positive is column i of a [batch, 2 * batch] logits matrix.
labels = tf.one_hot(tf.range(batch_size), batch_size * 2)
# Identity-like mask marking each example's similarity with itself.
masks = tf.one_hot(tf.range(batch_size), batch_size)
labels,masks

(<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
 array([[1., 0., 0., 0.],
        [0., 1., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
 array([[1., 0.],
        [0., 1.]], dtype=float32)>)

In [None]:
# View-1 vs view-1 similarities, scaled by the temperature.
logits_aa = tf.matmul(hidden1, hidden1_large, transpose_b=True) / temperature
# Push the diagonal (self-similarity) to about -1e9 so softmax ignores it.
logits_aa = logits_aa - masks * LARGE_NUM
logits_aa

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-1.0000000e+09,  1.9678229e-01],
       [ 1.9678229e-01, -1.0000000e+09]], dtype=float32)>

In [None]:
# View-2 vs view-2 similarities, with the diagonal masked out the same way.
logits_bb = tf.matmul(hidden2, hidden2_large, transpose_b=True) / temperature
logits_bb = logits_bb - masks * LARGE_NUM
logits_bb

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-1.000000e+09, -8.182024e-01],
       [-8.182024e-01, -1.000000e+09]], dtype=float32)>

In [None]:
# Cross-view similarities; the positive pairs lie on the diagonal.
# logits_ba is the transpose of logits_ab.
logits_ab = tf.matmul(hidden1, hidden2_large, transpose_b=True) / temperature
logits_ba = tf.matmul(hidden2, hidden1_large, transpose_b=True) / temperature
logits_ab,logits_ba

(<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
 array([[ 0.39735496,  0.01248601],
        [ 0.8449203 , -0.9734093 ]], dtype=float32)>,
 <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
 array([[ 0.39735496,  0.8449203 ],
        [ 0.01248601, -0.9734093 ]], dtype=float32)>)

In [None]:
# Each row is classified over [cross-view | same-view] candidates. The labels
# select column i of the cross-view block; the same-view block adds negatives.
loss_a = tf.nn.softmax_cross_entropy_with_logits(labels, tf.concat([logits_ab, logits_aa], 1))
loss_b = tf.nn.softmax_cross_entropy_with_logits(labels, tf.concat([logits_ba, logits_bb], 1))
loss_a,loss_b

(<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.9158114, 2.3402822], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.0511863, 1.578589 ], dtype=float32)>)

In [None]:
# Per-example total over both directions (not averaged over the batch).
loss = loss_a + loss_b
loss

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.9669977, 3.9188712], dtype=float32)>

### 比较召回中的nce-loss、softmax-loss、infonce-loss
都包含sampled负采样阶段，扩增负例，计算loss

1. softmax-loss：多分类问题，计算多分类loss（见识所有不同类的样本）
2. nce-loss：多分类问题，通过负采样转为计算二分类问题，是softmax的优化版，word2vec中大幅降低计算量（见识非己样本）
3. infoNce：提高相关及其衍生样本的相似度，降低非相关及其衍生样本的相似度。（见识同类及其相似样本，和不同类及其自相似样本）

## normalize

In [None]:
# Norm of the whole tensor via the library op (no axis given) ...
norm1 = linalg_ops.norm(logits)
# ... and by hand: square root of the sum of squares over all elements.
norm2 = tf.sqrt(tf.reduce_sum(tf.math.pow(logits,2.0)))
# Exact float equality; this holds in the recorded run but may be fragile.
norm1==norm2

<tf.Tensor: shape=(), dtype=bool, numpy=True>

In [None]:
# Scale every element by the single global norm computed above.
(logits / norm2)

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[ 0.29935524,  0.29935524, -0.05564512, -0.08815885],
       [-0.16848221,  0.24216504,  0.03741865, -0.07094606],
       [-0.07794399,  0.18609808, -0.07794399, -0.00896855],
       [ 0.5891797 , -0.5531381 , -0.06320575,  0.11174973]],
      dtype=float32)>

### L2_normalize

In [None]:
# Sum of squares over all elements (no axis is passed).
square_sum = math_ops.reduce_sum(math_ops.square(logits))
# Reciprocal square root; clamping at 1e-12 guards against a zero sum.
x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, 1e-12))
# Exact float comparison against the plain 1 / sqrt form.
square_sum, x_inv_norm == 1.0/tf.sqrt(square_sum)

(<tf.Tensor: shape=(), dtype=float32, numpy=126.08199>,
 <tf.Tensor: shape=(), dtype=bool, numpy=True>)

In [None]:
# Multiplying by the inverse norm gives the L2-normalized tensor.
logits * x_inv_norm

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[ 0.29935524,  0.29935524, -0.05564512, -0.08815885],
       [-0.1684822 ,  0.24216504,  0.03741865, -0.07094605],
       [-0.07794399,  0.18609808, -0.07794399, -0.00896855],
       [ 0.5891797 , -0.5531381 , -0.06320575,  0.11174973]],
      dtype=float32)>