## Data Preparation
This Notebook helps to transform the data to my desired format. Apply this Notebook here (https://github.com/pedropro/TACO) to create a .csv with all the labels and the corresponding image names.

The images are divided in 15 folders with ambiguous names, this problem is resolved by adding the folder name (batch_*) as a prefix. The images are also saved in this format in the annotations.json


#### Section 1
Create a csv file with the labels and the images

#### Section 2
Rename the images (unique names) and create one folder with all the images

#### Section 3
Feature engineering

#### Section 4
Split and balance the data

In [1]:
import os
import numpy as np
import pandas as pd
import json
import shutil

In [2]:
# Path to the image directory and annotation file
dataset_path = './data'
anns_file_path = dataset_path + '/' + 'splits/annotations.json'


#### Section 1: Create the csv

In [4]:
# Creating a csv File with the image names and the labels
# Merge two df, create an array with 0/1 depending on the labels
# categories_level = 'name' for all 60 labels and categories_level = 'supercategory' for the reduced 30 labels
def create_image_label_df(anns_file_path, categories_level = 'name'):
    with open(anns_file_path, 'r') as f:
        dataset = json.loads(f.read())

    anns = dataset['annotations']
    imgs = dataset['images']
    categories = dataset['categories']

    df_anns_raw = pd.DataFrame(anns)
    df_imgs = pd.DataFrame(imgs)

    df_anns = df_anns_raw.groupby('image_id').agg({'category_id':list}).reset_index()

    df_anns['category_id'] = df_anns['category_id'].apply(set)

    img_label_df = pd.merge(df_anns[['image_id', 'category_id']], 
                            df_imgs[['id', 'file_name','flickr_url']], 
                            left_on='image_id',
                            right_on = 'id')

    array_labels = lambda x: np.array([1 if i+1 in x else 0 for i in range(len(categories))])
    img_label_df['labels'] = img_label_df['category_id'].apply(array_labels)


    #Change the array to columns

    to_dict = lambda x: dict(zip([f'{i}' for i in range(len(x))], x))
    new_df = pd.DataFrame(img_label_df['labels'].apply(to_dict).tolist())

    img_label_df = pd.concat([img_label_df.drop('labels', axis=1), new_df], axis=1)

    # Rename the columns according to the category
    if categories_level == 'supercategory':
        cat = {str(d['id']): d['supercategory'].replace(' ', '_') for d in categories}
        img_label_df = img_label_df.rename(columns = cat)
    else:
        cat = {str(d['id']): d['name'].replace(' ', '_') for d in categories}
        img_label_df = img_label_df.rename(columns = cat)
    
    
    img_label_df = img_label_df.drop(['category_id','image_id','id','flickr_url'], axis = 1)
    img_label_df['file_name'] = img_label_df['file_name'].str.replace('/', '_')
    
    img_label_df['file_name'] = img_label_df['file_name'].str.lower()
    

    # Drop columns that are double, and create a binary df again
    if categories_level == 'supercategory':
        img_label_df = img_label_df.groupby(img_label_df.columns, axis=1).sum()
        
        def binarize(x):
            if x == 0:
                return 0
            else:
                return 1
        img_label_df.iloc[:,:-1] = img_label_df.iloc[:,:-1].applymap(binarize)
        img_label_df.to_csv('data/splits/sup_cat_labels.csv',index=False)
    else:
        img_label_df.to_csv('data/splits/labels.csv',index=False)

    
    return img_label_df

create_image_label_df(anns_file_path, categories_level = 'name')

Unnamed: 0,file_name,Aluminium_foil,Battery,Aluminium_blister_pack,Carded_blister_pack,Other_plastic_bottle,Clear_plastic_bottle,Glass_bottle,Plastic_bottle_cap,Metal_bottle_cap,...,Pop_tab,Rope_&_strings,Scrap_metal,Shoe,Squeezable_tube,Plastic_straw,Paper_straw,Styrofoam_piece,Unlabeled_litter,Cigarette
0,batch_1_000006.jpg,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,batch_1_000008.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,batch_1_000010.jpg,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,batch_1_000019.jpg,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,batch_1_000026.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,batch_9_000095.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1496,batch_9_000096.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1497,batch_9_000097.jpg,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1498,batch_9_000098.jpg,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Section 2: Changes on the folder structure of the images & image name


In [None]:
# Change the image names that they match the csv file. Use this function in the GitHub TACO data folder

directory_path = '../data'
folder_names = ['batch_' + str(i) for i in range(1, 16)]

def add_folder_prefix_to_image_names(directory_path, folder_names):
    """
    Adds the folder name as a prefix to the image names in each folder in the specified directory.
    
    Args:
        directory_path (str): The path to the directory containing the folders.
        folder_names (list of str): A list of folder names in the directory.
    """
    for folder_name in folder_names:
        folder_path = os.path.join(directory_path, folder_name)
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            new_file_name = folder_name + '_' + file_name
            os.rename(file_path, os.path.join(folder_path, new_file_name))


#add_folder_prefix_to_image_names(directory_path, folder_names)

# Unify all the image names (lower-case)

directory_path = '/Users/mjs/Desktop/Dev/TACO/data/all_images'

for filename in os.listdir(directory_path):
    if filename.endswith(".JPG"):
        new_filename = filename.lower()
        os.rename(os.path.join(directory_path, filename), os.path.join(directory_path, new_filename))

In [10]:
# create the new folder that contains all the images
def collect_images(directory_path, folder_names):
    new_folder_path = os.path.join(directory_path, 'all_images')
    if not os.path.exists(new_folder_path):
        os.mkdir(new_folder_path)

    for folder_name in folder_names:
        folder_path = os.path.join(directory_path, folder_name)
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            shutil.copy(file_path, os.path.join(new_folder_path, file_name))
            
            
#collect_images(directory_path, folder_names)

#### Section 3: Feature engineering
There are 60 categories and 28 super categories given. 

For easier use I choose new labels for the dataset, namely packaging materials (packaging_recycle), paper, glass, others and toxic materials

In [10]:
labels = pd.read_csv('./data/splits/labels.csv')

def feature_engineering(input_df, merging_features, new_feature_name):
    '''
    New definition of a super category:
    1. Input the original labels df
    2. Enter a list of labels that should be merged together to a new super category
    3. Give the new super category a name
    '''
    output_df = input_df.copy(deep=True)
    
    output_df[str(new_feature_name)] = output_df[merging_features].any(axis=1)
    
    return output_df[str(new_feature_name)]

packaging_recycle = ['Aluminium_foil',
                                'Aluminium_blister_pack',
                                'Carded_blister_pack', 
                                'Clear_plastic_bottle',
                                'Other_plastic_bottle',
                                'Plastic_bottle_cap',
                                'Metal_bottle_cap',
                                'Aerosol', 
                                'Drink_can',
                                'Food_Can',
                                'Drink_carton',
                                'Meal_carton',
                                'Disposable_plastic_cup', 
                                'Foam_cup',
                                'Other_plastic_cup',
                                'Plastic_lid',
                                'Metal_lid',
                                'Plastified_paper_bag', 
                                'Garbage_bag',
                                'Single-use_carrier_bag',
                                'Polypropylene_bag', 
                                'Plastic_film',
                                'Six_pack_rings',
                                'Crisp_packet',
                                'Other_plastic_wrapper',
                                'Spread_tub', 
                                'Tupperware',
                                'Disposable_food_container',
                                'Foam_food_container',
                                'Other_plastic_container',
                                'Plastic_glooves', 
                                'Plastic_utensils',
                                'Pop_tab',
                                'Rope_&_strings',
                                'Scrap_metal',
                                'Squeezable_tube',
                                'Plastic_straw',
                                'Styrofoam_piece', 
                                'Other_plastic']

aluminum = ['Aluminium_foil',
           'Aluminium_blister_pack',
            'Metal_bottle_cap',
            'Aerosol', 
            'Drink_can',
            'Food_Can',
            'Metal_lid',
            'Scrap_metal']


plastic = ['Carded_blister_pack', 
            'Clear_plastic_bottle',
            'Other_plastic_bottle',
            'Plastic_bottle_cap',
            'Drink_carton',
            'Disposable_plastic_cup', 
            'Foam_cup',
            'Other_plastic_cup',
            'Plastic_lid',
            'Plastified_paper_bag', 
            'Garbage_bag',
            'Single-use_carrier_bag',
            'Polypropylene_bag', 
            'Plastic_film',
            'Six_pack_rings',
            'Crisp_packet',
            'Other_plastic_wrapper',
            'Spread_tub', 
            'Tupperware',
            'Disposable_food_container',
            'Foam_food_container',
            'Other_plastic_container',
            'Plastic_glooves', 
            'Plastic_utensils',
            'Pop_tab',
            'Rope_&_strings',
            'Squeezable_tube',
            'Plastic_straw',
            'Styrofoam_piece', 
            'Other_plastic']


paper = ['Corrugated_carton',
        'Meal_carton',
        'Egg_carton',
        'Pizza_box', 
        'Toilet_tube',
        'Other_carton',
        'Paper_cup',
        'Normal_paper',
        'Tissues', 
        'Wrapping_paper',
        'Magazine_paper',
        'Paper_bag',
        'Paper_straw'
                       ]


glass = ['Glass_bottle',
        'Broken_glass',
        'Glass_cup', 
        'Glass_jar'
                       ]



other_objects = ['Cigarette',
                 'Shoe',
                 'Unlabeled_litter', 
                 'Food_waste'
                       ]



toxic = ['Battery']

recycle_dict = {
    'aluminum': ['Aluminium_foil', 'Aluminium_blister_pack', 'Metal_bottle_cap','Aerosol', 
                 'Drink_can', 'Food_Can', 'Metal_lid', 'Scrap_metal'],
    
    'plastic': ['Carded_blister_pack', 'Clear_plastic_bottle','Other_plastic_bottle',
            'Plastic_bottle_cap', 'Drink_carton', 'Disposable_plastic_cup', 'Foam_cup',
            'Other_plastic_cup', 'Plastic_lid', 'Plastified_paper_bag', 'Garbage_bag',
            'Single-use_carrier_bag','Polypropylene_bag', 'Plastic_film', 'Six_pack_rings',
            'Crisp_packet', 'Other_plastic_wrapper','Spread_tub', 'Tupperware',
            'Disposable_food_container','Foam_food_container', 'Other_plastic_container',
            'Plastic_glooves', 'Plastic_utensils','Pop_tab','Rope_&_strings',
            'Squeezable_tube','Plastic_straw','Styrofoam_piece', 'Other_plastic'],
    
    'paper': ['Corrugated_carton', 'Meal_carton', 'Egg_carton', 'Pizza_box', 
        'Toilet_tube', 'Other_carton', 'Paper_cup', 'Normal_paper',
        'Tissues', 'Wrapping_paper', 'Magazine_paper', 'Paper_bag','Paper_straw'],
    
    'glass':  ['Glass_bottle', 'Broken_glass', 'Glass_cup', 'Glass_jar'],
    
    'other_objects': ['Cigarette', 'Shoe', 'Unlabeled_litter', 'Food_waste'],
    
    'toxic': ['Battery']
}


def recycle_labels(input_df, feature_definitions_dict):
    '''
    1. Create a dict with the new label name as the key and the merging features (list) as values
    2. Give the csv a new name
    
    '''
    
    list_labels = ['file_name']
    df_new_labels = input_df.copy(deep=True)
    
    for key, value in feature_definitions_dict.items():
        df_feature = feature_engineering(input_df, merging_features=value, new_feature_name=key)
        df_new_labels[key] = df_feature
        list_labels.append(key)
        

    
    output_df = df_new_labels[list_labels]
    
    def binarize(x):
        if x == True:
            return 1
        else:
            return 0
    output_df.loc[:,1:] = output_df.iloc[:,1:].applymap(binarize)
 
    
    #output_df.to_csv('data/splits/material_labels.csv',index=False)
    
    return output_df
    
rec_labels = recycle_labels(labels,recycle_dict)   
rec_labels.head()

  output_df.loc[:,1:] = output_df.iloc[:,1:].applymap(binarize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df.loc[:,1:] = output_df.iloc[:,1:].applymap(binarize)


Unnamed: 0,file_name,aluminum,plastic,paper,glass,other_objects,toxic
0,batch_1_000006.jpg,0,1,0,0,0,0
1,batch_1_000008.jpg,0,0,1,0,0,0
2,batch_1_000010.jpg,0,1,0,1,0,0
3,batch_1_000019.jpg,1,1,0,1,0,0
4,batch_1_000026.jpg,1,0,0,0,0,0


#### Section 4: Split and balance the data

In [12]:
list(recycle_dict.keys())

['aluminum', 'plastic', 'paper', 'glass', 'other_objects', 'toxic']

In [13]:
labels_rec = list(recycle_dict.keys())
rec_df = pd.read_csv('./data/splits/material_labels.csv')

def stratified_split (df, train_split, val_split, columns):
    assert (train_split + val_split) == 1
    
    df_sample = df.sample(frac=1, random_state=42)

    grouped_df = df_sample.groupby(columns)
    arr_list = [np.split(g, [int(train_split * len(g))]) for i, g in grouped_df]
    
    train_df = pd.concat([t[0] for t in arr_list])
    val_df = pd.concat([t[1] for t in arr_list])
         
    return train_df, val_df


train_df, test_df = stratified_split(df = rec_df, train_split = 0.85, val_split = 0.15, columns = labels_rec )
train_df, val_df = stratified_split(df = train_df, train_split = 0.85, val_split = 0.15, columns = labels_rec )




def class_balancing (input_df, classes_to_balance):

    mygrpCounts = input_df.groupby(classes_to_balance)["file_name"].count()
    myindxs = mygrpCounts.index.to_numpy()
    vals = mygrpCounts.values.astype(np.float32)
    #vals2=np.minimum(np.sqrt(vals.max()/(vals)), 7)
    vals2 = (vals.max()/(vals))**0.7 # this limits the oversampling since 1:1 would lead to a 8x increase in size
    vals2 = vals2.round().astype(np.int32)
    output_df = input_df.copy(deep=True)
    
    for index in range(len(vals2)):
        locconds=myindxs[index]
        locval=vals2[index]
        locDf=input_df
        for colmnInd in range(len(classes_to_balance)):
            locDf=locDf[locDf[classes_to_balance[colmnInd]]==locconds[colmnInd]]
        if vals2[index]!=1:
            locDf2=pd.concat([locDf]*(locval-1), ignore_index=True)
            output_df=pd.concat((output_df, locDf2), ignore_index=True)

    output_df=output_df.sample(frac=1)
    #newcounts=output_df.groupby(classes_to_balance)["name"].count()
    #print(newcounts)
    #print(mygrpCounts)
    
    return output_df

train_rec_bal = class_balancing(input_df = train_df, classes_to_balance = labels_rec)


CSVPathIn = './data/splits/'

testPathOut=f'{CSVPathIn}test_material_2023-04-13.csv'
trainbalPathOut=f'{CSVPathIn}train_material_partially_balanced_2023-04-13.csv'
trainPathOut=f'{CSVPathIn}train_material_2023-04-13.csv'
valbalPathOut=f'{CSVPathIn}val_material_2023-04-13.csv'

test_df.to_csv(testPathOut,index=False)
train_rec_bal.to_csv(trainbalPathOut,index=False)
train_df.to_csv(trainPathOut,index=False)
val_df.to_csv(valbalPathOut,index=False)

print(f'''
    Original train: \t {train_df.shape[0]} rows
    Original validation:   {val_df.shape[0]} rows
    Original test: \t   {test_df.shape[0]} rows
    
    --Row count summary--
    Stratified split 85-15
    Train  \t {train_df.shape[0]} rows
    Validation    {val_df.shape[0]} rows
    
    Partial balancing
    Train bal\t {train_rec_bal.shape[0]} rows
    
    
    ''')



    Original train: 	 1053 rows
    Original validation:   202 rows
    Original test: 	   245 rows
    
    --Row count summary--
    Stratified split 85-15
    Train  	 1053 rows
    Validation    202 rows
    
    Partial balancing
    Train bal	 4234 rows
    
    
    


### K-Fold - increasing the dataset

In [5]:
train_bal_mat = pd.read_csv('./data/splits/train_material_partially_balanced_2023-04-13.csv')


In [6]:
CSVPathIn = './data/splits/'
trainbalx3PathOut=f'{CSVPathIn}train_material_partially_balanced_3fold_2023-04-14.csv'
trainbalx5PathOut=f'{CSVPathIn}train_material_partially_balanced_5_fold_2023-04-14.csv'



train_balx3_df = pd.concat([train_bal_mat]*3, ignore_index=True)
train_balx5_df = pd.concat([train_bal_mat]*5, ignore_index=True)


train_balx3_df.to_csv(trainbalx3PathOut,index=False)
train_balx5_df.to_csv(trainbalx5PathOut,index=False)