In [1]:
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Lambda
from tensorflow.keras.layers import Dropout, Flatten
from keras.layers.normalization import BatchNormalization
from tensorflow.keras.layers import MaxPooling2D

import tensorflow.keras.backend as K
import tensorflow_addons as tfa
from keras import optimizers
import matplotlib.pyplot as plt
import numpy as np
import keras
from keras.datasets import mnist
import random
import tensorflow as tf 
# import tensorflow_addons as tfa

In [2]:
## loading the data

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train/ 255
x_test = x_test / 255

X = np.concatenate((x_train,x_test))
y = np.concatenate((y_train, y_test))

from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [3]:
trainX = np.expand_dims(trainX, axis=-1)
testX = np.expand_dims(testX, axis=-1)

In [4]:
print(trainX.shape)
print(testX.shape) 
print(trainY.shape)
print(testY.shape)

(49000, 28, 28, 1)
(21000, 28, 28, 1)
(49000,)
(21000,)


In [5]:
def make_pair_of_two(X,y):
  anchor = []
  positive_or_negative = []
  labels = []

  for i in range(len(X)):
    anchor.append(X[i])
    positive_or_negative.append(X[random.choice(np.where(y==y[i])[0])])
    labels.append(1)

    anchor.append(X[i])
    positive_or_negative.append(X[random.choice(np.where(y != y[i])[0])])
    labels.append(0)

  return np.array(anchor), np.array(positive_or_negative), np.array(labels)


In [6]:
def make_pair_of_three(X,y):
  anchor = []
  positive = []
  negative = []
  labels = []

  for i in range(len(X)):
    anchor.append(X[i])
    positive.append(X[random.choice(np.where(y==y[i])[0])])
    negative.append(X[random.choice(np.where(y != y[i])[0])])
    labels.append(0)

  return np.array(anchor), np.array(positive), np.array(negative), np.array(labels)

In [13]:
def define_model(input_shape, embedding_dimension):

  model=Sequential()
  model.add(Conv2D(128, 5, padding='same', activation='relu', input_shape=input_shape))
  model.add(BatchNormalization())
  model.add(MaxPooling2D(pool_size=2,strides=None, padding="same"))
  model.add(Dropout(0.3))

  model.add(Conv2D(64, 5, padding='same', activation='relu', input_shape=input_shape))
  model.add(BatchNormalization())
  model.add(MaxPooling2D(pool_size=2,strides=None, padding="same"))
  model.add(Dropout(0.3))

  model.add(Conv2D(32, 5, padding='same', activation='relu', input_shape=input_shape))
  model.add(BatchNormalization())
  model.add(MaxPooling2D(pool_size=2,strides=None, padding="same"))
  model.add(Dropout(0.3))

  model.add(Flatten())
  model.add(tf.keras.layers.Dense(units= 64,activation="relu"))
  model.add(tf.keras.layers.Dense(units= embedding_dimension,activation='softmax'))

  return model

In [8]:
def distance_measure(features):
  if measure == "triplet":
    (f_pos, anch, f_neg) = features
    x = K.pow(K.abs((f_pos - anch),2))
    y = K.pow(K.abs((f_neg - anch),2))
    margin = 0.2
    distance = x - y + margin
    # K.pow()
    return max(distance,0.0)
  else:
    (featsA, featsB) = features
  #   sumSquared = K.sum(K.square(featsA - featsB), axis=1,
  #     keepdims=True)
  #   temp = K.sqrt(K.maximum(sumSquared, K.epsilon()))
    # print(temp.get_value())
    # print(K.eval(temp))
    # if temp>0.5:
    #   return 0
    # return 1
    # return temp
    # (f1, f2) = features
    return K.abs(featsA-featsB)

In [9]:
def triplet_loss(y_true,y_pred):
  return K.mean(y_pred)

def threshold(y_true,y_pred):
  if y_pred>0.5:
    return 1
  else:
    return 0

def constructive_loss(y_true,y_pred):
  y_true = tf.cast(y_true, y_pred.dtype)
  squaredPreds = K.square(y_pred)
  squaredMargin = K.square(K.maximum(1 - y_pred, 0))
  loss = K.mean(y_true * squaredMargin + (1 - y_true) * squaredPreds)
  return loss 

## Using Contrastive loss

#### Using `Adam` optimizer

In [24]:
pair0_train,pair1_train,labels_train = make_pair_of_two(trainX,trainY)
pair0_test,pair1_test,labels_test = make_pair_of_two(testX,testY)

imgA = Input(shape=(28,28,1))
imgB = Input(shape=(28,28,1))
featureExtractor = define_model((28,28,1),32)
featsA = featureExtractor(imgA)
featsB = featureExtractor(imgB)


measure="binary"
distance = Lambda(distance_measure)([featsA, featsB])
outputs = Dense(1, activation="softmax")(distance)
model = Model(inputs=[imgA, imgB], outputs=outputs)

model.compile(loss=constructive_loss, optimizer="adam",
	metrics=["accuracy"])

history = model.fit(
	[pair0_train, pair1_train], labels_train,
	validation_data=([pair0_test, pair1_test], labels_test),
	batch_size=32, 
	epochs=5) 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
scores_constructive_adam = model.evaluate([pair0_test, pair1_test], labels_test, verbose=0)
print("Accuracy: %.2f%%" % (scores_constructive_adam[1]*100)) 

Accuracy: 50.00%


#### Using RMSProp

In [27]:
imgA = Input(shape=(28,28,1))
imgB = Input(shape=(28,28,1))
featureExtractor = define_model((28,28,1),32)
featsA = featureExtractor(imgA)
featsB = featureExtractor(imgB)


measure="binary"
distance = Lambda(distance_measure)([featsA, featsB])
outputs = Dense(1, activation="softmax")(distance)
model = Model(inputs=[imgA, imgB], outputs=outputs)

opt = keras.optimizers.RMSprop()
model.compile(loss=constructive_loss, optimizer=opt,
	metrics=["accuracy"])


history = model.fit(
	[pair0_train, pair1_train], labels_train,
	validation_data=([pair0_test, pair1_test], labels_test),
	batch_size=32, 
	epochs=5)

scores_constructive_rms = model.evaluate([pair0_test, pair1_test], labels_test, verbose=0)
print("Accuracy: %.2f%%" % (scores_constructive_rms[1]*100)) 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 50.00%


#### Using mini batch gradient descent

In [28]:
imgA = Input(shape=(28,28,1))
imgB = Input(shape=(28,28,1))
featureExtractor = define_model((28,28,1),32)
featsA = featureExtractor(imgA)
featsB = featureExtractor(imgB)

measure="binary"
distance = Lambda(distance_measure)([featsA, featsB])
outputs = Dense(1, activation="softmax")(distance)
model = Model(inputs=[imgA, imgB], outputs=outputs)

opt = keras.optimizers.SGD()
model.compile(loss=constructive_loss, optimizer=opt,
	metrics=["accuracy"])

history = model.fit(
	[pair0_train, pair1_train], labels_train,
	validation_data=([pair0_test, pair1_test], labels_test),
	batch_size=32, 
	epochs=5)

scores_constructive_mgd = model.evaluate([pair0_test, pair1_test], labels_test, verbose=0)
print("Accuracy: %.2f%%" % (scores_constructive_mgd[1]*100)) 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 50.00%


## Using regularized cross entropy

#### Using Adam optimizer

In [26]:
measure="binary"
distance = Lambda(distance_measure)([featsA, featsB])
outputs = Dense(1, activation="softmax")(distance)
model_binary = Model(inputs=[imgA, imgB], outputs=outputs)


model_binary.compile(loss="categorical_crossentropy", optimizer="adam",
	metrics=["accuracy"])

history = model_binary.fit(
	[pair0_train, pair1_train], labels_train,
	validation_data=([pair0_test, pair1_test], labels_test),
	batch_size=64, 
	epochs=5)

scores_regularized_adam = model_binary.evaluate([pair0_test, pair1_test], labels_test)
print("Accuracy: %.2f%%" % (scores_regularized_adam[1]*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 50.00%


#### Using RMSProp optimizer

In [29]:
measure="binary"
distance = Lambda(distance_measure)([featsA, featsB])
outputs = Dense(1, activation="softmax")(distance)
model_binary = Model(inputs=[imgA, imgB], outputs=outputs)

opt = keras.optimizers.RMSprop()
model_binary.compile(loss="categorical_crossentropy", optimizer=opt,
	metrics=["accuracy"])

history = model_binary.fit(
	[pair0_train, pair1_train], labels_train,
	validation_data=([pair0_test, pair1_test], labels_test),
	batch_size=64, 
	epochs=5)

scores_regularized_rms = model_binary.evaluate([pair0_test, pair1_test], labels_test)
print("Accuracy: %.2f%%" % (scores_regularized_rms[1]*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 50.00%


#### Using Mini-batch gradient descent

In [30]:
measure="binary"
distance = Lambda(distance_measure)([featsA, featsB])
outputs = Dense(1, activation="softmax")(distance)
model_binary = Model(inputs=[imgA, imgB], outputs=outputs)

opt = keras.optimizers.SGD()
model_binary.compile(loss="categorical_crossentropy", optimizer=opt,
	metrics=["accuracy"])

history = model_binary.fit(
	[pair0_train, pair1_train], labels_train,
	validation_data=([pair0_test, pair1_test], labels_test),
	batch_size=64, 
	epochs=5)

scores_regularized_sgd = model_binary.evaluate([pair0_test, pair1_test], labels_test)
print("Accuracy: %.2f%%" % (scores_regularized_sgd[1]*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 50.00%


## Using triplet loss

#### Using Adam optimizer

In [10]:
def triplet_distance(x, alpha = 0.2):
 anchor = x[0]
 positive = x[1]
 negative = x[2]
 pos_dist = K.abs(anchor-positive)
 neg_dist = K.abs(anchor-negative)
 basic_loss = pos_dist-neg_dist+alpha
 loss = K.maximum(basic_loss,0.0)
 return loss

In [15]:
pair0,pair1,pair2,labels_train = make_pair_of_three(trainX,trainY)
pair0_test,pair1_test,pair2_test,labels_test = make_pair_of_three(testX,testY)


imgA = Input(shape=(28,28,1))
imgB = Input(shape=(28,28,1))
imgC = Input(shape=(28,28,1))
featureExtractor = define_model((28,28,1),32)
featsA = featureExtractor(imgA)
featsB = featureExtractor(imgB)
featsC = featureExtractor(imgC)


measure="triplet"
distance = Lambda(triplet_distance)([featsA, featsB, featsC])
outputs = Dense(1, activation="sigmoid")(distance)
model_binary = Model(inputs=[imgA, imgB, imgC], outputs=outputs)

model_binary.compile(loss=triplet_loss, optimizer="adam",
	metrics=["accuracy"])

history = model_binary.fit(
	[pair0,pair1,pair2],  labels_train,
	validation_data=([pair0_test,pair1_test,pair2_test],labels_test),
	batch_size=64, 
	epochs=5)

scores_triplet_sgd = model_binary.evaluate([pair0_test,pair1_test,pair2_test], labels_test)
print("Accuracy: %.2f%%" % (scores_triplet_sgd[1]*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 100.00%


#### Using RMSProp optimizer

In [18]:
measure="triplet"
distance = Lambda(triplet_distance)([featsA, featsB, featsC])
outputs = Dense(1, activation="sigmoid")(distance)
model_binary = Model(inputs=[imgA, imgB, imgC], outputs=outputs)

opt = keras.optimizers.RMSprop()
model_binary.compile(loss=triplet_loss, optimizer=opt,
	metrics=["accuracy"])

history = model_binary.fit(
	[pair0,pair1,pair2],  labels_train,
	validation_data=([pair0_test,pair1_test,pair2_test],labels_test),
	batch_size=64, 
	epochs=5)

scores_triplet_rms = model_binary.evaluate([pair0_test,pair1_test,pair2_test], labels_test)
print("Accuracy: %.2f%%" % (scores_triplet_rms[1]*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 100.00%


#### Using mini-batch gradient descent

In [20]:
measure="triplet"
distance = Lambda(triplet_distance)([featsA, featsB, featsC])
outputs = Dense(1, activation="sigmoid")(distance)
model_binary = Model(inputs=[imgA, imgB, imgC], outputs=outputs)

opt = keras.optimizers.SGD()
model_binary.compile(loss=triplet_loss, optimizer=opt,
	metrics=["accuracy"])

history = model_binary.fit(
	[pair0,pair1,pair2],  labels_train,
	validation_data=([pair0_test,pair1_test,pair2_test],labels_test),
	batch_size=64, 
	epochs=5)

scores_triplet_sgd = model_binary.evaluate([pair0_test,pair1_test,pair2_test], labels_test)
print("Accuracy: %.2f%%" % (scores_triplet_sgd[1]*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 100.00%


## Comparing the loss and optimizers

In [34]:
from tabulate import tabulate

table = [["","Adam","Mini batch","RMSProp"],["Constructive",scores_constructive_adam[1], scores_constructive_mgd[1], scores_constructive_rms[1]],["Regularized",scores_regularized_adam[1], scores_regularized_sgd[1], scores_regularized_rms[1]],
          ["Triplet_loss",scores_triplet_sgd[1],scores_triplet_sgd[1],scores_triplet_rms[1]]]

print(tabulate(table))

------------  ----  ----------  -------
              Adam  Mini batch  RMSProp
Constructive  0.5   0.5         0.5
Regularized   0.5   0.5         0.5
Triplet_loss  1.0   1.0         1.0
------------  ----  ----------  -------


- One advantage of the triplet loss is that it tries to be less “greedy” than the contrastive loss (which considers pairwise examples). This is because the triplet loss takes an anchor example and tries to bring positive examples closer while also pushing away negative example. 
- The contrastive loss, on the other hand, only considers pairwise examples at a time, so in a sense it is more “greedy.” The triplet loss is still too greedy however, since it heavily depends on the selection of the anchor, negative, and positive examples.
- All the optimizers are giving same probabilities

## Pros and Cons

__Pros__:

- More Robust to class Imbalance
- Nice to an ensemble with the best classifier
- Learning from Semantic Similarity

__Cons__:

- Needs more training time than normal networks
- Doesn’t output probabilities

Reference: https://www.quora.com/What-is-the-pros-and-cons-of-Siamese-Network-comparing-with-others-There-are-many-good-results-produced-by-SN-but-why-it-seems-that-people-still-prefer-to-use-network-with-common-structures-like-n-outputs-for-n