## Data Preparation
This Notebook helps to transform the data to my desired format. Use this Notebook here (https://github.com/pedropro/TACO) to create a .csv with all the labels and the corresponding image names.

The images are divided in 15 folders with ambiguous names, this problem is resolved by adding the folder name (batch_*) as a prefix. The images are also saved in this format in the annotations.json

In [8]:
import os
import numpy as np
import pandas as pd
import json
import shutil

In [4]:
# Path to the image directory and annotation file
dataset_path = '../data'
anns_file_path = dataset_path + '/' + 'annotations.json'



In [18]:
# Creating a csv File with the image names and the labels
# Merge two df, create an array with 0/1 depending on the labels
def create_image_label_df(anns_file_path):
    with open(anns_file_path, 'r') as f:
        dataset = json.loads(f.read())

    anns = dataset['annotations']
    imgs = dataset['images']
    categories = dataset['categories']

    df_anns_raw = pd.DataFrame(anns)
    df_imgs = pd.DataFrame(imgs)

    df_anns = df_anns_raw.groupby('image_id').agg({'category_id':list}).reset_index()

    df_anns['category_id'] = df_anns['category_id'].apply(set)

    img_label_df = pd.merge(df_anns[['image_id', 'category_id']], 
                            df_imgs[['id', 'file_name','flickr_url']], 
                            left_on='image_id',
                            right_on = 'id')

    array_labels = lambda x: np.array([1 if i+1 in x else 0 for i in range(60)])
    img_label_df['labels'] = img_label_df['category_id'].apply(array_labels)


    #Change the array to columns

    to_dict = lambda x: dict(zip([f'{i}' for i in range(len(x))], x))
    new_df = pd.DataFrame(img_label_df['labels'].apply(to_dict).tolist())

    img_label_df = pd.concat([img_label_df.drop('labels', axis=1), new_df], axis=1)

    # Rename the columns according to the category
    cat = {str(d['id']): d['name'].replace(' ', '_') for d in categories}
    img_label_df = img_label_df.rename(columns = cat)
    
    
    img_label_df = img_label_df.drop(['category_id','image_id','id','flickr_url'], axis = 1)
    img_label_df['file_name'] = img_label_df['file_name'].str.replace('/', '_')
    
    #img_label_df.to_csv('labels.csv',index=False)

    return img_label_df

create_image_label_df(anns_file_path)

Unnamed: 0,file_name,Aluminium_foil,Battery,Aluminium_blister_pack,Carded_blister_pack,Other_plastic_bottle,Clear_plastic_bottle,Glass_bottle,Plastic_bottle_cap,Metal_bottle_cap,...,Pop_tab,Rope_&_strings,Scrap_metal,Shoe,Squeezable_tube,Plastic_straw,Paper_straw,Styrofoam_piece,Unlabeled_litter,Cigarette
0,batch_1_000006.jpg,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,batch_1_000008.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,batch_1_000010.jpg,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,batch_1_000019.jpg,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,batch_1_000026.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,batch_9_000095.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1496,batch_9_000096.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1497,batch_9_000097.jpg,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1498,batch_9_000098.jpg,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Change the image names that they match the csv file

directory_path = '../data'
folder_names = ['batch_' + str(i) for i in range(1, 16)]

def add_folder_prefix_to_image_names(directory_path, folder_names):
    """
    Adds the folder name as a prefix to the image names in each folder in the specified directory.
    
    Args:
        directory_path (str): The path to the directory containing the folders.
        folder_names (list of str): A list of folder names in the directory.
    """
    # loop through each folder in the directory
    for folder_name in folder_names:
        # get the full path of the folder
        folder_path = os.path.join(directory_path, folder_name)

        # loop through each file in the folder
        for file_name in os.listdir(folder_path):
            # get the full path of the file
            file_path = os.path.join(folder_path, file_name)

            # add the folder name as a prefix to the file name
            new_file_name = folder_name + '_' + file_name

            # rename the file with the new file name
            os.rename(file_path, os.path.join(folder_path, new_file_name))


#add_folder_prefix_to_image_names(directory_path, folder_names)

In [21]:
#Create a csv where the names of the images match the images in the all_images directory
IMAGE_DIR = '/Users/mjs/Desktop/Dev/TACO/data/all_images'
labels_df = pd.read_csv('./data/labels.csv')

file_list = os.listdir(IMAGE_DIR)
labels_list = list(labels_df['file_name'])


common_elements = list(set(file_list).intersection(set(labels_list)))

count = len(common_elements)

print(f'There are {count} images that match.')

labels_filtered = labels_df[labels_df['file_name'].isin(common_elements)]


#labels_filtered.to_csv('filtered_labels.csv',index=False)



len(file_list)

There are 833 images that match.


1501

### Changes on the folder structure of the images & image name


In [10]:
# create the new folder that contains all the images
def collect_images(directory_path, folder_names):
    new_folder_path = os.path.join(directory_path, 'all_images')
    if not os.path.exists(new_folder_path):
        os.mkdir(new_folder_path)

    # loop through each folder in the directory
    for folder_name in folder_names:
        # get the full path of the folder
        folder_path = os.path.join(directory_path, folder_name)

        # loop through each file in the folder
        for file_name in os.listdir(folder_path):
            # get the full path of the file
            file_path = os.path.join(folder_path, file_name)

            # copy the file to the new folder
            shutil.copy(file_path, os.path.join(new_folder_path, file_name))
#collect_images(directory_path, folder_names)

In [14]:
# Unify all the image names (lower-case)


directory_path = '/Users/mjs/Desktop/Dev/TACO/data/all_images'

for filename in os.listdir(directory_path):
    if filename.endswith(".JPG"):
        new_filename = filename.lower()
        os.rename(os.path.join(directory_path, filename), os.path.join(directory_path, new_filename))


In [24]:
label = pd.read_csv('./data/labels.csv')
file_list = os.listdir(IMAGE_DIR)


In [20]:
df_label = label.sort_values('file_name')

df_label.iloc[700:800,]

Unnamed: 0,file_name,Aluminium_foil,Battery,Aluminium_blister_pack,Carded_blister_pack,Other_plastic_bottle,Clear_plastic_bottle,Glass_bottle,Plastic_bottle_cap,Metal_bottle_cap,...,Pop_tab,Rope_&_strings,Scrap_metal,Shoe,Squeezable_tube,Plastic_straw,Paper_straw,Styrofoam_piece,Unlabeled_litter,Cigarette
699,batch_2_000017.JPG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
700,batch_2_000018.JPG,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
701,batch_2_000019.JPG,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
702,batch_2_000020.JPG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
703,batch_2_000021.JPG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862,batch_3_IMG_4881.JPG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
826,batch_3_IMG_4883.JPG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
827,batch_3_IMG_4887.JPG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
828,batch_3_IMG_4889.JPG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
