# Download dataset

In [24]:
# Download the dataset archive from Google Drive and unpack it in the working directory.
# The inner wget fetches the download page (saving session cookies) and sed extracts the
# `confirm=` token from it; the outer wget then repeats the request with that token and
# the saved cookies, writing the archive to a file named `data`. The cookie file is
# removed afterwards.
# NOTE(review): the inner request passes --no-check-certificate, i.e. it skips TLS
# certificate verification — consider dropping that flag.
!wget -q --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1p1wjaqpTh_5RHfJu4vUh8JJCdKwYMHCp' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1p1wjaqpTh_5RHfJu4vUh8JJCdKwYMHCp" -O data && rm -rf /tmp/cookies.txt
# Extract the archive quietly, then delete it.
!unzip -q data && rm data

In [30]:
# Count the entries in one person's image folder.
# NOTE(review): the recorded output (5749) looks far too large for a single person's
# folder (the Adam_Sandler folder listed in the next cell holds 4 images) — presumably
# the output was produced by listing lfw2/lfw2/ itself; re-run this cell to confirm.
!ls lfw2/lfw2/Aaron_Eckhart/ | wc -l

5749


In [2]:
import glob

# glob.glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the listing below is reproducible across runs and machines.
files = sorted(glob.glob('lfw2/lfw2/Adam_Sandler/*.jpg'))
print(len(files))
files

4


['lfw2/lfw2/Adam_Sandler/Adam_Sandler_0001.jpg',
 'lfw2/lfw2/Adam_Sandler/Adam_Sandler_0002.jpg',
 'lfw2/lfw2/Adam_Sandler/Adam_Sandler_0003.jpg',
 'lfw2/lfw2/Adam_Sandler/Adam_Sandler_0004.jpg']

# Imports

In [97]:
# TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.layers import Dropout, Dense, Lambda, Multiply, Subtract, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam

# Utils
import pandas as pd
import numpy as np
import cv2
import os
import joblib
import time

data_path = 'lfw2/lfw2/'

In [19]:
def get_image_name(name, index, root=None):
    """Build the path of one image file inside the dataset directory.

    Args:
        name: Person folder name, e.g. ``'Adam_Sandler'``; it is also the
            prefix of the image file name.
        index: Image number. It is converted with ``int()`` before being
            zero-padded to four digits, so integral floats such as ``4.0``
            (which pandas can produce when reading spreadsheets) give
            ``0004`` instead of a malformed ``'04.0'``.
        root: Directory prefix, including its trailing slash. When ``None``
            (the default) the module-level ``data_path`` is used.

    Returns:
        The path string ``'<root><name>/<name>_<NNNN>.jpg'``.
    """
    if root is None:
        root = data_path
    return f'{root}{name}/{name}_{int(index):04d}.jpg'

In [35]:
# Preview the 'train_diff' sheet of the split spreadsheet (pairs whose two
# images are listed under separate Name1/Name2 columns).
pd.read_excel('datasplit.xls', sheet_name='train_diff')

Unnamed: 0,Name1,ID1,Name2,ID2
0,AJ_Cook,1,Marsha_Thomason,1
1,Aaron_Sorkin,2,Frank_Solich,5
2,Abdel_Nasser_Assidi,2,Hilary_McKay,1
3,Abdoulaye_Wade,4,Linda_Dano,1
4,Abdul_Rahman,1,Magui_Serna,1
...,...,...,...,...
1038,Tom_Vilsack,1,Wayne_Ferreira,5
1039,Trisha_Meili,1,Vladimiro_Montesinos,3
1040,Ty_Votaw,1,Wayne_Allard,1
1041,Vytas_Danelius,1,Zaini_Abdullah,1


In [79]:
def _load_image(path):
    """Read one image with OpenCV and divide its pixel values by 255."""
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # rather than raising; fail here with the offending path instead of
        # an opaque "NoneType / float" TypeError further down.
        raise FileNotFoundError(f'Could not read image: {path}')
    return img / 255.


def read_dataset_part(set_type, part_type):
    """Load every image pair listed in one sheet of ``datasplit.xls``.

    Args:
        set_type: Split name used as the first half of the sheet name
            (the notebook uses ``'train'``, ``'val'`` and ``'test'``).
        part_type: ``'same'`` or ``'diff'``, the second half of the sheet
            name. ``'same'`` sheets hold a single ``Name`` column shared by
            both images of a pair; any other value is read through the
            ``Name1`` / ``Name2`` columns.

    Returns:
        A two-element list ``[first_images, second_images]`` of numpy arrays;
        entry ``i`` of each array belongs to pair ``i`` of the sheet.

    Raises:
        FileNotFoundError: If an image referenced by the sheet cannot be read.
    """
    is_same = part_type == 'same'
    name1_col = 'Name' if is_same else 'Name1'
    name2_col = 'Name' if is_same else 'Name2'

    part_meta = pd.read_excel('datasplit.xls', sheet_name=f'{set_type}_{part_type}')

    first_images, second_images = [], []
    for _, row in part_meta.iterrows():
        first_images.append(_load_image(get_image_name(row[name1_col], row['ID1'])))
        second_images.append(_load_image(get_image_name(row[name2_col], row['ID2'])))

    return [np.array(first_images), np.array(second_images)]

def read_dataset(set_type):
    """Assemble the model inputs and labels for one split.

    The 'same' pairs are placed first and labelled 0; the 'diff' pairs follow
    and are labelled 1.

    Args:
        set_type: Split name forwarded to ``read_dataset_part``.

    Returns:
        ``(x_data, y_data)`` where ``x_data`` is a list of arrays (one per
        side of the pair) and ``y_data`` is the matching 1-D label array.
    """
    same_pairs = read_dataset_part(set_type, 'same')
    diff_pairs = read_dataset_part(set_type, 'diff')

    # Stack the 'same' and 'diff' images side by side, one array per pair side.
    x_data = []
    for same_side, diff_side in zip(same_pairs, diff_pairs):
        x_data.append(np.concatenate([same_side, diff_side]))

    n_same = same_pairs[0].shape[0]
    n_diff = diff_pairs[0].shape[0]
    y_data = np.concatenate([np.zeros(n_same), np.ones(n_diff)])

    return x_data, y_data

In [92]:
# Load the train, validation and test splits, in that order.
(train_x, train_y), (val_x, val_y), (test_x, test_y) = (
    read_dataset(split) for split in ('train', 'val', 'test')
)

# Siamese network

In [106]:
from tensorflow.keras import backend as K

def cosine_distance(vests):
    """Negative cosine similarity between two batches of vectors.

    Args:
        vests: A pair ``(x, y)`` of tensors whose last axis is the feature
            axis.

    Returns:
        A tensor with the last axis reduced to size 1 (``keepdims=True``)
        holding ``-cos(x, y)``: -1 for vectors pointing the same way, +1 for
        opposite vectors.
    """
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    # The dot product of two unit vectors is their cosine, so the element-wise
    # products must be *summed*. Averaging them (as this function used to do)
    # divides the cosine by the feature count and shrinks it towards zero.
    return -K.sum(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    """Output shape for the cosine-distance Lambda layer.

    Args:
        shapes: The pair of input shapes handed to the layer.

    Returns:
        ``(batch, 1)``, where ``batch`` is the first entry of the first shape.
    """
    first_shape, _second_shape = shapes
    batch_dim = first_shape[0]
    return batch_dim, 1

In [113]:
# Counts how many encoders have been built; used to give each one its own name.
sm = 0


def siamese_model(length):
    """Build the convolutional encoder applied to each image of a pair.

    Five Conv2D + 2x2 MaxPooling2D stages are followed by a final Conv2D and
    a Flatten layer. Every call increments the global counter ``sm`` and
    names the returned model ``siamese_model_<sm>``.

    Args:
        length: Input shape passed to ``Input(shape=...)``.

    Returns:
        A ``Model`` mapping an input of that shape to a flat feature vector.
    """
    global sm

    # (filters, kernel_size) of each convolution that is followed by pooling.
    pooled_stages = ((64, 10), (128, 7), (128, 4), (256, 4), (512, 3))

    inp = Input(shape=length)
    features = inp
    for n_filters, kernel_size in pooled_stages:
        features = Conv2D(n_filters, kernel_size, activation='relu')(features)
        features = MaxPooling2D(pool_size=2)(features)

    # The last convolution is not pooled; its output is flattened directly.
    features = Conv2D(256, 3, activation='relu')(features)
    features = Flatten()(features)

    sm += 1
    model_name = f'siamese_model_{sm}'

    return Model(inp, features, name=model_name)

In [117]:
# Build a throwaway encoder only to print its layer summary. The input shape is
# the shape of the loaded training images with the leading (sample) axis dropped.
# Note: each call to siamese_model increments the global `sm` name counter.
siamese_model(train_x[0].shape[1:]).summary()

Model: "siamese_model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_24 (InputLayer)        [(None, 250, 250, 3)]     0         
_________________________________________________________________
conv2d_29 (Conv2D)           (None, 241, 241, 64)      19264     
_________________________________________________________________
max_pooling2d_20 (MaxPooling (None, 120, 120, 64)      0         
_________________________________________________________________
conv2d_30 (Conv2D)           (None, 114, 114, 128)     401536    
_________________________________________________________________
max_pooling2d_21 (MaxPooling (None, 57, 57, 128)       0         
_________________________________________________________________
conv2d_31 (Conv2D)           (None, 54, 54, 128)       262272    
_________________________________________________________________
max_pooling2d_22 (MaxPooling (None, 27, 27, 128)   

In [133]:
def init_siamese_model(image_shape, output_shape=1):
    """Build and compile the siamese pair classifier.

    Both inputs go through the same encoder instance (shared weights). The two
    feature vectors ``v1`` and ``v2`` are combined into four feature groups,
    concatenated in this order:

    1. ``(v1 - v2) ** 2``
    2. ``v1 ** 2 - v2 ** 2``
    3. the output of ``cosine_distance(v1, v2)``
    4. ``|v1 - v2|``

    A 512-unit ReLU layer and a sigmoid output layer named ``'out'`` follow.

    Args:
        image_shape: Shape of a single input image.
        output_shape: Number of units in the sigmoid output layer.

    Returns:
        The model, compiled with binary cross-entropy and a default ``Adam``
        optimizer.
    """
    input_1 = Input(shape=image_shape)
    input_2 = Input(shape=image_shape)

    # A single encoder applied to both inputs. It is called `encoder` rather
    # than `sm` so it does not shadow the module-level counter of that name.
    encoder = siamese_model(image_shape)
    vector_1 = encoder(input_1)
    vector_2 = encoder(input_2)

    diff = Subtract()([vector_1, vector_2])
    # |v1 - v2| expressed as max(v1 - v2, v2 - v1) with built-in Keras layers.
    # Calling a raw TensorFlow op such as tf.math.abs on a symbolic Keras
    # tensor relies on implicit op wrapping, which not every Keras version
    # supports; a plain layer keeps the graph portable and serialisable.
    reversed_diff = Subtract()([vector_2, vector_1])
    abs_diff = tf.keras.layers.Maximum()([diff, reversed_diff])
    squared_diff = Multiply()([diff, diff])

    squared_1 = Multiply()([vector_1, vector_1])
    squared_2 = Multiply()([vector_2, vector_2])
    diff_of_squares = Subtract()([squared_1, squared_2])

    cos_feature = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([vector_1, vector_2])

    conc = Concatenate(axis=-1)([squared_diff, diff_of_squares, cos_feature, abs_diff])

    x = Dense(512, activation="relu")(conc)
    out = Dense(output_shape, activation="sigmoid", name='out')(x)

    model = Model([input_1, input_2], out)
    model.compile(loss='binary_crossentropy', optimizer=Adam())

    return model

In [95]:
def get_callbacks(model_name):
    """Create the training callbacks, all driven by the validation loss.

    Args:
        model_name: Base name of the checkpoint file written to
            ``./models/<model_name>.h5``.

    Returns:
        ``[checkpoint, reduceLR, earlystop]``:
        a checkpoint that keeps only the best model, a learning-rate halving
        after 5 epochs without improvement (floor 1e-6), and early stopping
        after 6 epochs without improvement.
    """
    # Quantity watched by every callback, and the direction that is "better".
    watch = dict(monitor='val_loss', mode='min')

    checkpoint = ModelCheckpoint(fr'./models/{model_name}.h5', save_best_only=True, **watch)
    earlystop = EarlyStopping(verbose=1, patience=6, **watch)
    reduceLR = ReduceLROnPlateau(patience=5, factor=0.5, min_lr=1e-6, verbose=1, **watch)

    return [checkpoint, reduceLR, earlystop]

In [104]:
def train_model(model_gen, train_data, val_data, batch_size=16, use_saved=False, epochs=20):
    """Train a model (or reuse a previous run) and return the saved checkpoint.

    Args:
        model_gen: Function that builds a compiled model from an image shape.
            Its name, minus a leading ``init_`` if present, becomes the base
            name of the files under ``./models``.
        train_data: ``(x, y)`` training data; ``x`` is a list of arrays and
            the image shape is taken from ``x[0].shape[1:]``.
        val_data: ``(x, y)`` validation data passed to ``fit``.
        batch_size: Batch size passed to ``fit``.
        use_saved: When true, skip training and load the stored history
            instead; the checkpoint file must already exist.
        epochs: Maximum number of training epochs (the callbacks may stop
            training earlier).

    Returns:
        ``(model, history)``: the model loaded from
        ``./models/<name>.h5`` and the history dict of the run.
    """
    os.makedirs('./models', exist_ok=True)

    # Strip the 'init_' prefix only when it is really there, instead of
    # blindly dropping the first five characters of any function name.
    prefix = 'init_'
    gen_name = model_gen.__name__
    model_name = gen_name[len(prefix):] if gen_name.startswith(prefix) else gen_name

    history_path = fr'./models/{model_name}_history.sav'

    if use_saved:
        # joblib.load unpickles the file: only use history files written by
        # this function, never files from an untrusted source.
        history = joblib.load(history_path)
    else:
        callbacks = get_callbacks(model_name)

        train_x, train_y = train_data
        model = model_gen(train_x[0].shape[1:])
        fit_result = model.fit(
            x=train_x,
            y=train_y,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=val_data,
            callbacks=callbacks,
        )

        history = fit_result.history
        joblib.dump(history, history_path)

    # Always return the checkpoint on disk rather than the in-memory model,
    # so both branches hand back the same saved weights.
    model = load_model(fr'./models/{model_name}.h5')

    return model, history

In [134]:
# Train the siamese model from scratch, recording wall-clock timestamps
# immediately before and after the run.
start = time.time()
siam_model, siam_history = train_model(
    init_siamese_model,
    train_data=(train_x, train_y),
    val_data=(val_x, val_y),
    batch_size=32,
    use_saved=False,
)
end = time.time()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00010: early stopping


In [135]:
# Print the layer-by-layer architecture of the model returned by train_model
# (i.e. the one reloaded from the saved .h5 checkpoint).
siam_model.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_46 (InputLayer)           [(None, 250, 250, 3) 0                                            
__________________________________________________________________________________________________
input_47 (InputLayer)           [(None, 250, 250, 3) 0                                            
__________________________________________________________________________________________________
siamese_model_10 (Model)        (None, 2304)         3567680     input_46[0][0]                   
                                                                 input_47[0][0]                   
__________________________________________________________________________________________________
subtract_24 (Subtract)          (None, 2304)         0           siamese_model_10[1][0]    