In [2]:
import TensorFlow

In [16]:
// Random test data: `logits` has shape [10, 5] and is drawn from a normal distribution with
// mean 0 and standard deviation 4; `targets` holds one Int32 class index per row, drawn
// uniformly with lowerBound 0 and upperBound 5.
// Neither value is mutated by the cells that use it, so both are declared with `let`.
let logits = Tensor<Float32>(randomNormal: [10, 5], mean: Tensor(0), standardDeviation: Tensor(4))
let targets = Tensor<Int32>(randomUniform: [10], lowerBound: Tensor(0), upperBound: Tensor(5))

In [17]:
// Show the randomly generated logits and target class indices.
print(logits)
print(targets)

[[ -7.5606837,  -4.1594152,  -1.4371437,  0.94998556,  -2.1296244],
 [   5.460545,   0.9486486,   5.6935763,   0.7171767,   3.3408916],
 [  -5.106072,  -1.2702762,  -1.3153108,   -4.035973,  -5.5275283],
 [0.109969355,  -3.3563774,  -11.882913,    5.777429,  -1.5817682],
 [ -3.3135924,    0.683405,  -4.6655736,   1.4658132,   4.1020246],
 [   3.543782,    8.338613,   3.8994873,  0.21737371,   1.3145567],
 [  1.1342677,   2.9519236,    2.331409,   -5.044607,   -1.913505],
 [   2.484174,   1.2623852,   2.4720688,   -5.913093,   -6.056792],
 [   -8.37429,   -4.443255,   -5.749257,  -1.6235662,   5.7547674],
 [  3.2606332,   2.0625463,   -6.767269,  -1.5690302,  -1.1145483]]
[1, 4, 0, 2, 1, 1, 3, 1, 3, 3]


In [18]:
// One-hot encode the arg-max class of every row of `logits` (axis 1); the encoding depth is
// the size of that same axis.
let oneHotLogits = Tensor<Float32>(
    oneHotAtIndices: logits.argmax(squeezingAxis: 1),
    depth: logits.shape[1]
)
// Trailing expression so the notebook displays the tensor.
oneHotLogits

[[0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0, 0.0, 0.0]]


In [19]:
// Evaluate the sparse-label softmax cross-entropy at the raw logits and differentiate it
// with respect to them.
let (loss, gradients) = valueWithGradient(at: logits) { rawScores in
    softmaxCrossEntropy(logits: rawScores, labels: targets)
}
// Only the first example is printed; its gradient row is negated before printing.
print("logits: \(logits[0]) \nLabel:\(targets[0]) \nLoss:\(loss) \nGradients:\(-gradients[0])")

logits: [-7.5606837, -4.1594152, -1.4371437, 0.94998556, -2.1296244] 
Label:1 
Loss:5.7067485 
Gradients:[-1.7595223e-05,    0.099472106,   -0.008031833,    -0.08740408,   -0.004018594]


In [20]:
// Repeat the loss/gradient computation, this time feeding the one-hot encoded predictions
// to `softmaxCrossEntropy` in place of the raw logits.
let (loss, gradients) = valueWithGradient(at: oneHotLogits) { encodedScores in
    softmaxCrossEntropy(logits: encodedScores, labels: targets)
}
// Only the first example is printed; its gradient row is negated before printing.
print("logits: \(oneHotLogits[0]) \nLabel:\(targets[0]) \nLoss:\(loss) \nGradients:\(-gradients[0])")

logits: [0.0, 0.0, 0.0, 1.0, 0.0] 
Label:1 
Loss:1.8048325 
Gradients:[-0.01488476,  0.08511525, -0.01488476, -0.04046097, -0.01488476]


**While the gradients work fine in both cases, the raw logits result in a significantly higher loss than the one-hot encoded logits.**

## Finding ways to implement `ignore_index` for crossEntropy

In [60]:
// Fresh random inputs for the `ignore_index` experiment: [10, 5] normally distributed logits
// and 10 Int32 target indices drawn with lowerBound 0 and upperBound 5.
var logits = Tensor<Float32>(
    randomNormal: [10, 5],
    mean: Tensor(0),
    standardDeviation: Tensor(4)
)
var targets = Tensor<Int32>(
    randomUniform: [10],
    lowerBound: Tensor(0),
    upperBound: Tensor(5)
)

In [61]:
// Show the regenerated logits and target class indices used in the rest of this section.
print(logits)
print(targets)

[[  -7.816966,   1.6490363,  -2.5067804,   1.1694168,   0.8058538],
 [ -1.4318359,  0.91529536,   3.3752975,  -0.6105987,  -3.2968643],
 [  11.477498,  -0.2612766, -0.92973423,   -5.894342,  -3.5515406],
 [  2.6089673,   -3.037576,  -4.1986403,     3.46875,   -5.185842],
 [  4.8170567,    3.301774,    1.725311,   3.6389813,  -1.3453578],
 [ -2.3146663,  -6.5037622,  -2.8066723,    5.053336,  0.74045706],
 [  1.9974203,   -2.865681,  -3.5518906, 0.030817203,  -2.1184027],
 [  -2.112357,   -2.725534,     2.77681,   1.4542646,   3.3758972],
 [  5.8977785,   3.1237755,     -6.5468,    6.299514,    4.218277],
 [ -1.7487556,   -5.027978,   3.2570636,  0.73814887,   1.3552209]]
[1, 1, 4, 4, 4, 4, 1, 0, 3, 3]


I looked for a cleaner solution similar to PyTorch's, but unfortunately there isn't one. (There was one [issue](https://github.com/keras-team/keras/issues/6118) on the Keras repo, which didn't receive any positive feedback; also, this is not supported by tensorflow_ops in the first place.)


Let me try doing it in a contrived way. I have `logits` with shape (10, 5) and `targets` with shape (10) (initially sparse), which I'm one-hot encoding so that I can work with class indices. The shapes indicate that I have 10 examples with 5 classes. Now, suppose I want to ignore index `2` in the loss; we need to mask our inputs accordingly.

In [58]:
/// Prints a short loss/gradient report for the first example of a batch.
///
/// Evaluates `softmaxCrossEntropy(logits:probabilities:)` at `logits`, differentiates it with
/// respect to `logits`, and prints the first row of `logits`, the first row of `targets`,
/// the loss value, and the negated first row of the gradient.
///
/// - Parameters:
///   - logits: Scores at which the loss is evaluated and differentiated.
///   - targets: Passed as the `probabilities` argument of the loss; the notebook supplies
///     one-hot encoded targets here.
public func debugGradients<T: TensorFlowFloatingPoint>(logits: Tensor<T>,targets: Tensor<T>) {
    let (lossValue, logitGradients) = valueWithGradient(at: logits) { scores in
        softmaxCrossEntropy(logits: scores, probabilities: targets)
    }
    // The gradient row is negated before printing.
    let negatedFirstRow = -logitGradients[0]
    print("logits: \(logits[0]) \nLabel:\(targets[0]) \nLoss:\(lossValue) \nGradients:\(negatedFirstRow)")
}

In [80]:
// One-hot encode the sparse targets. The depth is taken from the class axis of `logits`
// (axis 1) rather than hard-coded, so the encoding always matches the logits' width.
var oneHotTargets = Tensor<Float32>(oneHotAtIndices: targets, depth: logits.shape[1])

In [81]:
// Baseline report: loss and gradients computed over all class columns, before any masking.
debugGradients(logits: logits, targets: oneHotTargets)

logits: [ -7.816966,  1.6490363, -2.5067804,  1.1694168,  0.8058538] 
Label:[0.0, 1.0, 0.0, 0.0, 0.0] 
Loss:5.265192 
Gradients:[-3.7499424e-06,    0.051576387, -0.00075894274,   -0.029975135,    -0.02083856]


We're ignoring index `2` here

In [78]:
// Candidate class ids 0..<5; never mutated, so declared with `let`.
let ids = Tensor(rangeFrom: 0, to: 5, stride: 1)
// Positions where the id differs from the ignored class (2). `where_` output is squeezed
// along axis 1 to obtain a flat index vector.
let indices = _Raw.where_(_Raw.notEqual(ids, Tensor(2))).squeezingShape(at: 1)
print("Indices after exclude: \(indices)")

Indices after exclude: [0, 1, 3, 4]


In [82]:
// Keep only the class columns listed in `indices` (axis 1) in both the one-hot targets and
// the logits, then report loss and gradients on the reduced tensors.
let maskedTargets = oneHotTargets.gathering(
    atIndices: indices,
    alongAxis: 1
)
let maskedLogits = logits.gathering(
    atIndices: indices,
    alongAxis: 1
)

debugGradients(logits: maskedLogits, targets: maskedTargets)

logits: [-7.816966, 1.6490363, 1.1694168, 0.8058538] 
Label:[0.0, 1.0, 0.0, 0.0] 
Loss:4.828433 
Gradients:[-3.7786199e-06,    0.051206063,   -0.030204369,   -0.020997923]


If you look at the gradients, we got rid of the gradient at index 2, which affects the total loss as well.