## Data Preparation
This Notebook helps to transform the data to my desired format. Apply this Notebook here (https://github.com/pedropro/TACO) to create a .csv with all the labels and the corresponding image names.

The images are divided in 15 folders with ambiguous names, this problem is resolved by adding the folder name (batch_*) as a prefix. The images are also saved in this format in the annotations.json


#### Section 1
Create a csv file with the labels and the images

#### Section 2
Rename the images (unique names) and create one folder with all the images

#### Section 3
Feature engineering

#### Section 4
Split and balance the data

In [1]:
import os
import numpy as np
import pandas as pd
import json
import shutil

In [11]:
# Path to the image directory and annotation file
dataset_path = './data'
anns_file_path = dataset_path + '/' + 'splits/annotations.json'


In [78]:
with open(anns_file_path, 'r') as f:
    dataset = json.loads(f.read())
    
anns = dataset['annotations']
imgs = dataset['images']
categories = dataset['categories']
scenes = dataset['scene_annotations']
scene_cat = dataset['scene_categories']

df_anns_raw = pd.DataFrame(anns)
df_imgs = pd.DataFrame(imgs)
scenes = pd.DataFrame(scenes)

print(len(scene_cat), len(categories))

categories[1]

unique_supercategories = set()

for my_dict in categories:
    unique_supercategories.add(my_dict['supercategory'])

print(len(unique_supercategories))

7 60
28


In [123]:
scenes = dataset['scene_annotations']
scene_cat = dataset['scene_categories']

df_scenes = pd.DataFrame(scenes)

df_labels.columns
df_scenes['image_id'].value_counts()
df_scenes[df_scenes['image_id']==603]
df_scenes = df_scenes.drop_duplicates(subset='image_id')
df_scenes.shape

(1496, 2)

#### Section 1: Create the csv

In [128]:
# Creating a csv File with the image names and the labels
# Merge two df, create an array with 0/1 depending on the labels
# categories_level = 'name' for all 60 labels and categories_level = 'supercategory' for the reduced 28 labels
# add_scene = 'scene' if you want to add the scene annotations
def create_image_label_df(anns_file_path, categories_level = 'name', add_scene = 'scene'):
    with open(anns_file_path, 'r') as f:
        dataset = json.loads(f.read())

    anns = dataset['annotations']
    imgs = dataset['images']
    categories = dataset['categories']
    scenes = dataset['scene_annotations']
    scene_cat = dataset['scene_categories']


    df_anns_raw = pd.DataFrame(anns)
    df_imgs = pd.DataFrame(imgs)
    df_scenes = pd.DataFrame(scenes)
    df_scenes = df_scenes.drop_duplicates(subset='image_id')
    

    df_anns = df_anns_raw.groupby('image_id').agg({'category_id':list}).reset_index()

    df_anns['category_id'] = df_anns['category_id'].apply(set)
    df_scenes['background_ids'] = df_scenes['background_ids'].apply(set)
    
    
    # Add the file names to the category IDs - you can add more columns like the boxes by adding the column names
    img_label_df = pd.merge(df_anns[['image_id', 'category_id']], 
                            df_imgs[['id', 'file_name']], 
                            left_on='image_id',
                            right_on = 'id')
    
   
    # One hot encode all labels with 0 or 1 for all the categories
    array_labels = lambda x: np.array([1 if i in x else 0 for i in range(len(categories))])
    img_label_df['labels'] = img_label_df['category_id'].apply(array_labels)


    #Change the array to columns
    to_dict = lambda x: dict(zip([f'{i}' for i in range(len(x))], x))
    new_df = pd.DataFrame(img_label_df['labels'].apply(to_dict).tolist())

    img_label_df = pd.concat([img_label_df.drop('labels', axis=1), new_df], axis=1)

    # Rename the columns according to the category
    if categories_level == 'supercategory':
        cat = {str(d['id']): d['supercategory'].replace(' ', '_') for d in categories}
        img_label_df = img_label_df.rename(columns = cat)
    else:
        cat = {str(d['id']): d['name'].replace(' ', '_') for d in categories}
        img_label_df = img_label_df.rename(columns = cat)
    
    
    img_label_df = img_label_df.drop(['category_id','id'], axis = 1)
    img_label_df['file_name'] = img_label_df['file_name'].str.replace('/', '_')
    
    img_label_df['file_name'] = img_label_df['file_name'].str.lower()


 
    # Do the same for scenes
    if add_scene == 'scene':
        col_temp = img_label_df.columns
        
        img_labels_scene_df = pd.merge(img_label_df[col_temp], 
                                df_scenes[['image_id', 'background_ids']], 
                                left_on='image_id',
                                right_on = 'image_id')
        
        # One hot encode all labels with 0 or 1 for all the categories
        array_scenes = lambda x: np.array([1 if i in x else 0 for i in range(len(scene_cat))])
        img_labels_scene_df['scenes'] = img_labels_scene_df['background_ids'].apply(array_scenes)
        
        #Change the array to columns
        to_dict = lambda x: dict(zip([f'{i}' for i in range(len(x))], x))
        new_df = pd.DataFrame(img_labels_scene_df['scenes'].apply(to_dict).tolist())
        img_labels_scene_df = pd.concat([img_labels_scene_df.drop('scenes', axis=1), new_df], axis=1)
        
        # Rename the columns according to the scenes
        cat = {str(d['id']): d['name'].replace(' ', '_') for d in scene_cat}
        img_labels_scene_df = img_labels_scene_df.rename(columns = cat)
        
        img_labels_scene_df = img_labels_scene_df.drop(['image_id','background_ids'], axis = 1)
        img_label_df = img_labels_scene_df
    else:
        add_scene = '_'
        print('No scene annotations added')

        
        
    # Drop columns that are double, and create a binary df again
    if categories_level == 'supercategory':
        img_label_df = img_label_df.groupby(img_label_df.columns, axis=1).sum()
        
        def binarize(x):
            if x == 0:
                return 0
            else:
                return 1
        img_label_df.iloc[:,:-1] = img_label_df.iloc[:,:-1].applymap(binarize)
        
        #If you use this, personalize your directories
        dir_path = 'data/splits/'
        df_name = f'sup_cat_{add_scene}_labels.csv'
        dir_df = dir_path+df_name
        img_label_df.to_csv(dir_df,index=False)
    else:
        dir_path = 'data/splits/'
        df_name = f'{add_scene}_labels.csv'
        dir_df = dir_path+df_name
        img_label_df.to_csv(dir_df,index=False)
        

    
    return img_label_df

df = create_image_label_df(anns_file_path, categories_level = 'supercategory',add_scene = 'scene')

(1500, 2) (1500, 9)


#### Section 2: Changes on the folder structure of the images & image name


In [None]:
# Change the image names that they match the csv file. Use this function in the GitHub TACO data folder

directory_path = '../data'
folder_names = ['batch_' + str(i) for i in range(1, 16)]

def add_folder_prefix_to_image_names(directory_path, folder_names):
    """
    Adds the folder name as a prefix to the image names in each folder in the specified directory.
    
    Args:
        directory_path (str): The path to the directory containing the folders.
        folder_names (list of str): A list of folder names in the directory.
    """
    for folder_name in folder_names:
        folder_path = os.path.join(directory_path, folder_name)
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            new_file_name = folder_name + '_' + file_name
            os.rename(file_path, os.path.join(folder_path, new_file_name))


#add_folder_prefix_to_image_names(directory_path, folder_names)

# Unify all the image names (lower-case)

directory_path = '/Users/mjs/Desktop/Dev/TACO/data/all_images'

for filename in os.listdir(directory_path):
    if filename.endswith(".JPG"):
        new_filename = filename.lower()
        os.rename(os.path.join(directory_path, filename), os.path.join(directory_path, new_filename))

In [10]:
# create the new folder that contains all the images
def collect_images(directory_path, folder_names):
    new_folder_path = os.path.join(directory_path, 'all_images')
    if not os.path.exists(new_folder_path):
        os.mkdir(new_folder_path)

    for folder_name in folder_names:
        folder_path = os.path.join(directory_path, folder_name)
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            shutil.copy(file_path, os.path.join(new_folder_path, file_name))
            
            
#collect_images(directory_path, folder_names)

#### Section 3: Feature engineering
There are 60 categories and 28 super categories given. 

For easier use I choose new labels for the dataset, namely packaging materials (packaging_recycle), paper, glass, others and toxic materials

In [129]:
labels = pd.read_csv('./data/splits/sup_cat_scene_labels.csv')
labels.columns
labels.shape

(1496, 36)

In [132]:
#labels = pd.read_csv('./data/splits/labels.csv')

def feature_engineering(input_df, merging_features, new_feature_name):
    '''
    New definition of a super category:
    1. Input the original labels df
    2. Enter a list of labels that should be merged together to a new super category
    3. Give the new super category a name
    '''
    output_df = input_df.copy(deep=True)
    
    output_df[str(new_feature_name)] = output_df[merging_features].any(axis=1)
    
    return output_df[str(new_feature_name)]

petroleum_based = ['Plastic_bag_&_wrapper',
                   'Bottle_cap',
                   'Other_plastic',
                   'Straw',
                   'Lid',
                   'Plastic_container',
                   'Plastic_utensils',
                   'Rope_&_strings',
                   'Blister_pack',
                   'Plastic_glooves']
                   
                   
                   

bottle = ['Bottle',
          'Can',
          'Glass_jar'
          ]


other = ['Unlabeled_litter',
         'Cigarette',
         'Styrofoam_piece',
         'Pop_tab',
         'Broken_glass',
         'Scrap_metal',
         'Food_waste',
         'Shoe',
         'Squeezable_tube',
         'Battery']
         
         


paper = ['Carton',
         'Cup',
         'Paper',
         'Paper_bag'
         ]

scene_1 = ['Clean']
scene_2 = ['Indoor,_Man-made']
scene_3 = ['Pavement']
scene_4 = ['Sand,_Dirt,_Pebbles']
scene_5 = ['Trash']
scene_6 = ['Vegetation']
scene_7 = ['Water']


recycle_scenes_dict = {
    'petroleum_based': petroleum_based,
    'bottle': bottle,
    'other': other,
    'paper':paper,
    'Clean':scene_1,
    'Indoor,_Man-made':scene_2,
    'Pavement':scene_3,
    'Sand,_Dirt,_Pebbles':scene_4,
    'Trash':scene_5,
    'Vegetation':scene_6,
    'Water':scene_7
        
}


def recycle_labels(input_df, feature_definitions_dict):
    '''
    1. Create a dict with the new label name as the key and the merging features (list) as values
    2. Give the csv a new name
    
    '''
    
    list_labels = ['file_name']
    df_new_labels = input_df.copy(deep=True)
    
    for key, value in feature_definitions_dict.items():
        df_feature = feature_engineering(input_df, merging_features=value, new_feature_name=key)
        df_new_labels[key] = df_feature
        list_labels.append(key)
        

    
    output_df = df_new_labels[list_labels]
    
    def binarize(x):
        if x == True:
            return 1
        else:
            return 0
    output_df.loc[:,1:] = output_df.iloc[:,1:].applymap(binarize)
 
    
    output_df.to_csv('data/splits/optimized_scenes_labels.csv',index=False)
    
    return output_df
    
rec_labels = recycle_labels(labels,recycle_scenes_dict)   
rec_labels.head()

  output_df.loc[:,1:] = output_df.iloc[:,1:].applymap(binarize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df.loc[:,1:] = output_df.iloc[:,1:].applymap(binarize)


Unnamed: 0,file_name,petroleum_based,bottle,other,paper,Clean,"Indoor,_Man-made",Pavement,"Sand,_Dirt,_Pebbles",Trash,Vegetation,Water
0,batch_1_000006.jpg,0,1,0,0,0,1,0,0,0,0,0
1,batch_1_000008.jpg,0,0,0,1,0,1,0,0,0,0,0
2,batch_1_000010.jpg,1,1,0,0,0,0,1,0,0,0,0
3,batch_1_000019.jpg,1,1,0,0,0,0,0,0,0,1,0
4,batch_1_000026.jpg,0,1,0,0,0,0,0,0,0,1,0


In [131]:
rec_labels.shape

(1496, 12)

#### Section 4: Split and balance the data

In [85]:
scene_cat

[{'id': 0, 'name': 'Clean'},
 {'id': 1, 'name': 'Indoor, Man-made'},
 {'id': 2, 'name': 'Pavement'},
 {'id': 3, 'name': 'Sand, Dirt, Pebbles'},
 {'id': 4, 'name': 'Trash'},
 {'id': 5, 'name': 'Vegetation'},
 {'id': 6, 'name': 'Water'}]

In [133]:
#Check which column is the file_name column
rec_df = pd.read_csv('./data/splits/optimized_scenes_labels.csv')
labels_orig = list(rec_df.columns)[1:]

def stratified_split (df, train_split, val_split, columns):
    assert (train_split + val_split) == 1
    
    df_sample = df.sample(frac=1, random_state=42)

    grouped_df = df_sample.groupby(columns)
    arr_list = [np.split(g, [int(train_split * len(g))]) for i, g in grouped_df]
    
    train_df = pd.concat([t[0] for t in arr_list])
    val_df = pd.concat([t[1] for t in arr_list])
         
    return train_df, val_df


train_df, test_df = stratified_split(df = rec_df, train_split = 0.85, val_split = 0.15, columns = labels_orig )
train_df, val_df = stratified_split(df = train_df, train_split = 0.85, val_split = 0.15, columns = labels_orig )




def class_balancing (input_df, classes_to_balance):

    mygrpCounts = input_df.groupby(classes_to_balance)["file_name"].count()
    myindxs = mygrpCounts.index.to_numpy()
    vals = mygrpCounts.values.astype(np.float32)
    #vals2=np.minimum(np.sqrt(vals.max()/(vals)), 7)
    vals2 = (vals.max()/(vals))**0.3 # this limits the oversampling since 1:1 would lead to a 8x increase in size
    vals2 = vals2.round().astype(np.int32)
    output_df = input_df.copy(deep=True)
    
    for index in range(len(vals2)):
        locconds=myindxs[index]
        locval=vals2[index]
        locDf=input_df
        for colmnInd in range(len(classes_to_balance)):
            locDf=locDf[locDf[classes_to_balance[colmnInd]]==locconds[colmnInd]]
        if vals2[index]!=1:
            locDf2=pd.concat([locDf]*(locval-1), ignore_index=True)
            output_df=pd.concat((output_df, locDf2), ignore_index=True)

    output_df=output_df.sample(frac=1)
    #newcounts=output_df.groupby(classes_to_balance)["name"].count()
    #print(newcounts)
    #print(mygrpCounts)
    
    return output_df

train_rec_bal = class_balancing(input_df = train_df, classes_to_balance = labels_orig)


CSVPathIn = './data/splits/'

testPathOut=f'{CSVPathIn}test_opt_scene_2023-05-08.csv'
trainbalPathOut=f'{CSVPathIn}train_opt_scene_partially_balanced_2023-05-08.csv'
trainPathOut=f'{CSVPathIn}train_opt_scene_2023-05-08.csv'
valPathOut=f'{CSVPathIn}val_opt_scene_2023-05-08.csv'

test_df.to_csv(testPathOut,index=False)
train_rec_bal.to_csv(trainbalPathOut,index=False)
train_df.to_csv(trainPathOut,index=False)
val_df.to_csv(valPathOut,index=False)

print(f'''
    Original train: \t {train_df.shape[0]} rows
    Original validation:   {val_df.shape[0]} rows
    Original test: \t   {test_df.shape[0]} rows
    
    --Row count summary--
    Stratified split 85-15
    Train  \t {train_df.shape[0]} rows
    Validation    {val_df.shape[0]} rows
    
    Partial balancing
    Train bal\t {train_rec_bal.shape[0]} rows
    
    
    ''')



    Original train: 	 908 rows
    Original validation:   243 rows
    Original test: 	   345 rows
    
    --Row count summary--
    Stratified split 85-15
    Train  	 908 rows
    Validation    243 rows
    
    Partial balancing
    Train bal	 1518 rows
    
    
    


### K-Fold - increasing the dataset

In [5]:
train_bal_mat = pd.read_csv('./data/splits/train_material_partially_balanced_2023-04-13.csv')


In [6]:
CSVPathIn = './data/splits/'
trainbalx3PathOut=f'{CSVPathIn}train_material_partially_balanced_3fold_2023-04-14.csv'
trainbalx5PathOut=f'{CSVPathIn}train_material_partially_balanced_5_fold_2023-04-14.csv'



train_balx3_df = pd.concat([train_bal_mat]*3, ignore_index=True)
train_balx5_df = pd.concat([train_bal_mat]*5, ignore_index=True)


train_balx3_df.to_csv(trainbalx3PathOut,index=False)
train_balx5_df.to_csv(trainbalx5PathOut,index=False)