# Split the APY-GZSL dataset into ZSL format

This notebook splits the APY dataset into trainval (training + validation) classes and test classes, as required for evaluation in the zero-shot learning (ZSL) scenario. The class splits were taken from [this paper](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/research/zero-shot-learning/zero-shot-learning-the-good-the-bad-and-the-ugly/).

In [1]:
import pandas as pd
import os
import shutil

In [2]:
# Split configuration. The train/test folder paths end with '/' on purpose:
# file names are appended to them with plain string concatenation.

# Source data: the original APY train / test folders and the class-name lists
# that define the new split.
apy_root = './Data/APY/'
dataset1 = apy_root + 'train/'
dataset2 = apy_root + 'test/'

trainval_classes_file = apy_root + 'trainvalclasses.txt'
test_classes_file = apy_root + 'testclasses.txt'

# Destination of the re-split dataset.
new_dataset_folder = './Data/APY_Zero'
new_trainval_folder_name = new_dataset_folder + '/train/'
new_test_folder_name = new_dataset_folder + '/test/'

In [3]:
# Create the output folder hierarchy, parent folder first.
# os.makedirs also creates any missing intermediate folders (os.mkdir fails with
# FileNotFoundError when the parent does not exist yet), and exist_ok=True keeps
# this cell safe to re-run on an already prepared folder tree.
for output_folder in (new_dataset_folder, new_trainval_folder_name, new_test_folder_name):
    os.makedirs(output_folder, exist_ok=True)

# Load both train and test sets for APY

In [4]:
# Stack the tables of the original APY train and test folders into one table each.
# Every source file is read with its first column as the index, so a plain concat
# would carry both files' own index labels over (presumably each starting at 0,
# i.e. repeated labels -- verify against the data). ignore_index=True renumbers the
# rows 0..n-1 so that an index label is also the row position; the split cells
# further down collect index labels and feed them to .iloc.
irevnet_im_embeddings = pd.concat(
    [pd.read_csv(source_folder + 'irevnet_image_embeddings.txt', index_col=0, header=0)
     for source_folder in (dataset1, dataset2)],
    ignore_index=True)

files_labels = pd.concat(
    [pd.read_csv(source_folder + 'filenames_labels.txt', index_col=0, header=0)
     for source_folder in (dataset1, dataset2)],
    ignore_index=True)

In [5]:
# Class table: the first row is the header and the first column becomes the
# index (later cells read that index as the class id and use the 'label' column).
classes = pd.read_csv(dataset1 + 'classes.txt', header=0, index_col=0)

# Split definitions: header-less files; later cells read column 0 as the class name.
trainval_classes, test_classes = (
    pd.read_csv(split_file, header=None)
    for split_file in (trainval_classes_file, test_classes_file)
)

# Separate image embeddings

In [7]:
classes_of_interest = trainval_classes

# Map every class name of this split to its class id, i.e. its index label in `classes`.
class_ids = []
for class_of_interest in classes_of_interest.iloc[:, 0].values:
    matching_ids = classes.index[classes['label'] == class_of_interest]
    if len(matching_ids) == 0:
        raise ValueError('Class %r is not listed in classes.txt' % (class_of_interest,))
    class_ids.append(matching_ids[0])

# Select rows with a positional boolean mask instead of index labels:
# `files_labels` is a concatenation of two tables, so its index labels may repeat
# and must not be handed to .iloc as if they were row positions.
row_mask = files_labels['class_id'].isin(class_ids).values
if len(row_mask) != len(irevnet_im_embeddings):
    raise ValueError('Image embeddings and file labels have different numbers of rows')

# The mask keeps the original row order; rows are renumbered 0..n-1 as sample_id.
new_im_emb = irevnet_im_embeddings.iloc[row_mask].reset_index(drop=True)
new_im_emb.index.name = 'sample_id'
new_im_emb.to_csv(new_trainval_folder_name + 'irevnet_image_embeddings.txt')

new_file_labels = files_labels.iloc[row_mask].reset_index(drop=True)
new_file_labels.index.name = 'sample_id'
new_file_labels.to_csv(new_trainval_folder_name + 'filenames_labels.txt')

In [8]:
classes_of_interest = test_classes

# Map every class name of this split to its class id, i.e. its index label in `classes`.
class_ids = []
for class_of_interest in classes_of_interest.iloc[:, 0].values:
    matching_ids = classes.index[classes['label'] == class_of_interest]
    if len(matching_ids) == 0:
        raise ValueError('Class %r is not listed in classes.txt' % (class_of_interest,))
    class_ids.append(matching_ids[0])

# Select rows with a positional boolean mask instead of index labels:
# `files_labels` is a concatenation of two tables, so its index labels may repeat
# and must not be handed to .iloc as if they were row positions.
row_mask = files_labels['class_id'].isin(class_ids).values
if len(row_mask) != len(irevnet_im_embeddings):
    raise ValueError('Image embeddings and file labels have different numbers of rows')

# The mask keeps the original row order; rows are renumbered 0..n-1 as sample_id.
new_im_emb = irevnet_im_embeddings.iloc[row_mask].reset_index(drop=True)
new_im_emb.index.name = 'sample_id'
new_im_emb.to_csv(new_test_folder_name + 'irevnet_image_embeddings.txt')

new_file_labels = files_labels.iloc[row_mask].reset_index(drop=True)
new_file_labels.index.name = 'sample_id'
new_file_labels.to_csv(new_test_folder_name + 'filenames_labels.txt')

# Copy the rest of the files

In [9]:
# The class table is taken from the original train folder and placed,
# unchanged, in both new split folders.
for destination_folder in (new_trainval_folder_name, new_test_folder_name):
    shutil.copyfile(dataset1 + 'classes.txt', destination_folder + 'classes.txt')

In [10]:
# The GloVe embedding file is taken from the original train folder and placed,
# unchanged, in both new split folders.
for destination_folder in (new_trainval_folder_name, new_test_folder_name):
    shutil.copyfile(dataset1 + 'glove_embeddings_300.txt',
                    destination_folder + 'glove_embeddings_300.txt')

In [11]:
# The class predicate file is taken from the original train folder and placed,
# unchanged, in both new split folders.
for destination_folder in (new_trainval_folder_name, new_test_folder_name):
    shutil.copyfile(dataset1 + 'class_predicates.txt',
                    destination_folder + 'class_predicates.txt')