# Preparing dataset and generating triplets

## Grouping images by person id and saving this mapping in a csv file

**About data set**

This is a public data set downloaded from https://cedar.buffalo.edu/NIJ/data/signatures.rar.
1. Folder full_forg: Contains 1320 forgery signatures (24 forgeries for each of 55 writers).
2. Folder full_org: Contains 1320 genuine signatures (24 genuines for each of 55 writers). 

Filename pattern:    

**original_X_Y.png** and **forgeries_X_Y.png**
* X - ID number of a person who has done the signature.
* Y - Image sample number

For example, the file with name *original_1_6.png* is the 6th signature sample of person 1 and it is a genuine signature.
On the other hand, the file named *forgeries_1_4.png* is the 4th signature sample of person 1 and it is a forged signature.

In [1]:
# Both signature folders sit under the same dataset root (relative to this notebook).
_signatures_root = '../../dataset/signatures/'
genuine_signatures_path = _signatures_root + 'full_org/'    # genuine samples
forged_signatures_path = _signatures_root + 'full_forg/'    # forged samples

In [2]:
def chuncks( input_list, n ):
    """Yield consecutive slices of `input_list`, each holding at most `n` items.

    The final slice is shorter when len(input_list) is not a multiple of `n`.
    """
    # Start offsets of every slice: 0, n, 2n, ...
    starts = range(0, len(input_list), n)
    yield from (input_list[start:start + n] for start in starts)

In [3]:
from os import walk

# Number of signature samples per writer in each folder.
n = 24


def _list_png_files( directory ):
    """Return the sorted names of the .png files located directly inside `directory`."""
    # Only the top level is needed, so take the first entry walk() yields.
    # The default covers a missing/unreadable directory, for which walk() yields nothing.
    _, _, filenames = next( walk(directory), (directory, [], []) )
    # Keeping only images drops 'Thumbs.db' wherever the OS happens to list it
    # (and never drops a real signature when that file is absent). Sorting makes
    # the grouping below independent of the filesystem's listing order.
    return sorted( name for name in filenames if name.lower().endswith('.png') )


forged_signatures = _list_png_files( forged_signatures_path )
genuine_signatures = _list_png_files( genuine_signatures_path )

# In sorted order the files of one writer are contiguous, so slicing
# into runs of n yields one list of file names per writer.
genuine_signatures = list( chuncks( genuine_signatures, n) )
forged_signatures = list( chuncks(forged_signatures, n) )

In [4]:
# Number of per-writer groups found in each folder.
print( f"forged_signatures: {len(forged_signatures)} , genuine_signatures: {len(genuine_signatures)}" )

forged_signatures: 55 , genuine_signatures: 55


In [5]:
import pandas as pd

_mapping_columns = ['PersonID', 'GenuineSignaturesSample', 'ForgedSignaturesSamples']

# Build every row first and create the frame once, instead of enlarging it
# row by row through .loc (slow, and easy to get wrong on re-runs).
# NOTE(review): PersonID is the position of the group in the listing, not the
# writer number embedded in the file names - confirm this is what consumers expect.
_mapping_records = [
    {
        'PersonID': i,
        'GenuineSignaturesSample': genuine_signatures[i],
        'ForgedSignaturesSamples': forged_signatures[i],
    }
    for i in range( len(forged_signatures) )
]

# The resulting index is already 0..len-1 in order, so no sort is required.
sample_signatures_df = pd.DataFrame( _mapping_records, columns = _mapping_columns )

In [6]:
# Preview the first rows of the per-person mapping (values are still lists here).
sample_signatures_df.head()

Unnamed: 0,PersonID,GenuineSignaturesSample,ForgedSignaturesSamples
0,0,"[original_10_1.png, original_10_10.png, origin...","[forgeries_10_1.png, forgeries_10_10.png, forg..."
1,1,"[original_11_1.png, original_11_10.png, origin...","[forgeries_11_1.png, forgeries_11_10.png, forg..."
2,2,"[original_12_1.png, original_12_10.png, origin...","[forgeries_12_1.png, forgeries_12_10.png, forg..."
3,3,"[original_13_1.png, original_13_10.png, origin...","[forgeries_13_1.png, forgeries_13_10.png, forg..."
4,4,"[original_14_1.png, original_14_10.png, origin...","[forgeries_14_1.png, forgeries_14_10.png, forg..."


In [7]:
# Display the mapping table as a visual sanity check.
# NOTE(review): this renders the whole frame; `.head()` or `.sample()` would keep the output short.
sample_signatures_df

Unnamed: 0,PersonID,GenuineSignaturesSample,ForgedSignaturesSamples
0,0,"[original_10_1.png, original_10_10.png, origin...","[forgeries_10_1.png, forgeries_10_10.png, forg..."
1,1,"[original_11_1.png, original_11_10.png, origin...","[forgeries_11_1.png, forgeries_11_10.png, forg..."
2,2,"[original_12_1.png, original_12_10.png, origin...","[forgeries_12_1.png, forgeries_12_10.png, forg..."
3,3,"[original_13_1.png, original_13_10.png, origin...","[forgeries_13_1.png, forgeries_13_10.png, forg..."
4,4,"[original_14_1.png, original_14_10.png, origin...","[forgeries_14_1.png, forgeries_14_10.png, forg..."
5,5,"[original_15_1.png, original_15_10.png, origin...","[forgeries_15_1.png, forgeries_15_10.png, forg..."
6,6,"[original_16_1.png, original_16_10.png, origin...","[forgeries_16_1.png, forgeries_16_10.png, forg..."
7,7,"[original_17_1.png, original_17_10.png, origin...","[forgeries_17_1.png, forgeries_17_10.png, forg..."
8,8,"[original_18_1.png, original_18_10.png, origin...","[forgeries_18_1.png, forgeries_18_10.png, forg..."
9,9,"[original_19_1.png, original_19_10.png, origin...","[forgeries_19_1.png, forgeries_19_10.png, forg..."


In [49]:
def to_one_string( list_of_string, separator = ","):
    """Convert a list of strings into one string using the specified separator.

    Parameters
    ----------
    list_of_string : iterable of str, or str
        Strings to join. A value that is already a single string is returned
        unchanged, so applying this function twice to the same column does not
        interleave separators between its characters.
    separator : str
        Text placed between consecutive items.

    Returns
    -------
    str
        The joined string.
    """
    if isinstance(list_of_string, str):
        # Already joined: str.join would iterate over the individual characters.
        return list_of_string
    return separator.join(list_of_string)

In [50]:
# Flatten each person's list of genuine file names into one comma-separated string.
sample_signatures_df['GenuineSignaturesSample'] = sample_signatures_df['GenuineSignaturesSample'].apply( to_one_string )

In [51]:
# Flatten each person's list of forged file names into one comma-separated string.
sample_signatures_df['ForgedSignaturesSamples'] = sample_signatures_df['ForgedSignaturesSamples'].apply( to_one_string )

In [52]:
# Preview the mapping after both sample columns were joined into comma-separated strings.
sample_signatures_df.head()

Unnamed: 0,PersonID,GenuineSignaturesSample,ForgedSignaturesSamples
0,0,"original_10_1.png,original_10_10.png,original_...","forgeries_10_1.png,forgeries_10_10.png,forgeri..."
1,1,"original_11_1.png,original_11_10.png,original_...","forgeries_11_1.png,forgeries_11_10.png,forgeri..."
2,2,"original_12_1.png,original_12_10.png,original_...","forgeries_12_1.png,forgeries_12_10.png,forgeri..."
3,3,"original_13_1.png,original_13_10.png,original_...","forgeries_13_1.png,forgeries_13_10.png,forgeri..."
4,4,"original_14_1.png,original_14_10.png,original_...","forgeries_14_1.png,forgeries_14_10.png,forgeri..."


In [54]:
# Persist the per-person mapping. The separator is passed by keyword because
# pandas deprecates positional arguments to to_csv other than the path.
sample_signatures_df.to_csv( "../../dataset/signatures/sample_signatures.csv", sep = ',', index = False )

## Generating triplets

A triplet is composed of one anchor (A), one positive sample (P) and one negative sample (N).   

Reference: https://omoindrot.github.io/triplet-loss

In [60]:
! pip install more-itertools



In [149]:
import itertools
import numpy as np

# Small demo on three items: combinations ignore order, permutations do not.
r = 2
demo_items = range(3)
combinations = [ *itertools.combinations( demo_items, r ) ]
permutations = [ *itertools.permutations( demo_items, r ) ]

print( 'combinations:', combinations )
print( 'permutations',  permutations )

combinations: [(0, 1), (0, 2), (1, 2)]
permutations [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]


**Note:** Given that we are dealing with two different lists, genuine and forged signatures respectively,    
d( genuine[0], genuine[1] ) != d( genuine[1], forged[0] ).    
So, we can make triplets like these:
* (0, 1, 1), (0, 2, 2), ..., (0, 23, 23)
* (1, 0, 0), (1, 2, 2), ..., (1, 23, 23)
* (2, 0, 0), (2, 1, 1), (2, 3, 3), ..., (2, 23, 23)
* ...
* (23, 0, 0), (23, 1, 1), ..., (23, 22, 22)

In [151]:
# Every ordered pair of distinct sample indices among 24 samples, in ascending order.
permutations = sorted( itertools.permutations( range(24), 2 ) )
print( len(permutations) )

552


In [152]:
from tqdm import tqdm

triplets = []

for idx in tqdm(sample_signatures_df.index):

    # Per-person sample lists. Local names are used so the notebook-level
    # `genuine_signatures` / `forged_signatures` lists built in an earlier
    # cell are not overwritten, which keeps those cells re-runnable.
    person_genuine = sample_signatures_df.GenuineSignaturesSample[idx].split(',')
    person_forged = sample_signatures_df.ForgedSignaturesSamples[idx].split(',')

    if len(person_genuine) != len(person_forged):
        print( "ERROR: Set with diferent sizes.")
        continue

    sample_count = len(person_genuine)

    # `permutations` holds ordered index pairs (anchor, other) with anchor != other.
    for anchor_idx, other_idx in permutations:
        # Skip pairs that point past this person's samples instead of raising IndexError.
        if anchor_idx >= sample_count or other_idx >= sample_count:
            continue
        # triplet = (Anchor, Positive, Negative); the negative is the forgery
        # that shares its sample index with the positive.
        triplet = ( person_genuine[anchor_idx], person_genuine[other_idx], person_forged[other_idx] )
        triplets.append( triplet )

100%|████████████████████████████████████████████████████████████████████████████████| 55/55 [00:00<00:00, 1277.62it/s]


In [153]:
# Show the first ten (Anchor, Positive, Negative) triplets.
triplets[:10]

[('original_10_1.png', 'original_10_10.png', 'forgeries_10_10.png'),
 ('original_10_1.png', 'original_10_11.png', 'forgeries_10_11.png'),
 ('original_10_1.png', 'original_10_12.png', 'forgeries_10_12.png'),
 ('original_10_1.png', 'original_10_13.png', 'forgeries_10_13.png'),
 ('original_10_1.png', 'original_10_14.png', 'forgeries_10_14.png'),
 ('original_10_1.png', 'original_10_15.png', 'forgeries_10_15.png'),
 ('original_10_1.png', 'original_10_16.png', 'forgeries_10_16.png'),
 ('original_10_1.png', 'original_10_17.png', 'forgeries_10_17.png'),
 ('original_10_1.png', 'original_10_18.png', 'forgeries_10_18.png'),
 ('original_10_1.png', 'original_10_19.png', 'forgeries_10_19.png')]

In [154]:
# Total number of triplets generated over all persons.
print( f"Number of triplets: {len(triplets)}" )

Number of triplets: 30360


**Splitting triplets list into random train and test subsets**

In [155]:
from sklearn.model_selection import train_test_split

# The triplets carry no labels; `y` is an all-zero placeholder passed alongside
# them, and its two split halves are discarded.
# NOTE(review): the split is over individual triplets, so the same person's
# signatures can land in both subsets - confirm that is intended.
y = np.zeros( len(triplets) )
split_parts = train_test_split( triplets, y, random_state=42, test_size=0.33 )
X_train, X_test, _, _ = split_parts

In [156]:
# Sizes of the two subsets produced by the split.
print( f"Number of triplets for training: {len(X_train)}" )
print( f"Number of triplets for testing: {len(X_test)}" )

Number of triplets for training: 20341
Number of triplets for testing: 10019


**Saving training and testing datasets in csv files**

In [157]:
import csv


def _write_triplets_csv( path, rows ):
    """Write `rows` of (Anchor, Positive, Negative) file names to `path`, preceded by a header row."""
    # newline='' lets the csv module control line endings; without it every
    # row is followed by an empty line on Windows.
    with open(path, 'w', newline='') as csv_file:
        writer = csv.writer( csv_file )
        writer.writerow( ['Anchor', 'Positive', 'Negative'] )
        writer.writerows(rows)


_write_triplets_csv( '../../dataset/signatures/X_train.csv', X_train )
_write_triplets_csv( '../../dataset/signatures/X_test.csv', X_test )

In [158]:
# Read the training file back and preview it to check the header and rows were written as expected.
pd.read_csv( '../../dataset/signatures/X_train.csv' ).head()

Unnamed: 0,Anchor,Positive,Negative
0,original_14_17.png,original_14_1.png,forgeries_14_1.png
1,original_6_4.png,original_6_5.png,forgeries_6_5.png
2,original_17_8.png,original_17_12.png,forgeries_17_12.png
3,original_42_3.png,original_42_14.png,forgeries_42_14.png
4,original_1_18.png,original_1_10.png,forgeries_1_10.png
