In [389]:
import numpy as np
import pandas as pd
import shutil
from roboflow import Roboflow
import os
import re

In [390]:
def clean_up(integral=False):
    """Remove artifacts left behind by a previous run of this notebook.

    The downloaded dataset directory 'GARBAGE-CLASSIFICATION-3-2' is always
    deleted when it exists.

    Parameters
    ----------
    integral : bool, default False
        If True, delete the whole 'data' directory tree.
        If False, delete only the regular files located directly inside
        'data'; sub-directories and their contents are kept.

    Nothing is raised when 'data' does not exist yet (e.g. on a first run).
    """
    if os.path.exists('GARBAGE-CLASSIFICATION-3-2'):
        shutil.rmtree('GARBAGE-CLASSIFICATION-3-2')
    if integral:
        if os.path.exists('data'):
            shutil.rmtree('data')
    elif os.path.isdir('data'):
        # Guarded: os.listdir would raise FileNotFoundError on a fresh checkout.
        for name in os.listdir('data'):
            path = os.path.join('data', name)
            if os.path.isfile(path):
                os.remove(path)

def move_files_recursively(src,dest='data',files=None):
    """Move image files from `src` into the flat directory `dest`.

    Parameters
    ----------
    src : str
        Directory to take files from.
    dest : str, default 'data'
        Target directory; created if it does not exist.
    files : iterable of str, optional
        If None, `src` is walked recursively and every file whose name ends
        with '.jpg' is moved. Otherwise only the given names that exist
        directly inside `src` are moved; names not present are ignored.

    Files that already sit directly in `dest` are left alone, so the call can
    be repeated even when `dest` is nested inside `src`.

    Raises
    ------
    shutil.Error
        Propagated from `shutil.move` when two different source files share a
        name, because the destination path already exists.
    """
    def _canonical(path):
        return os.path.normcase(os.path.abspath(path))

    os.makedirs(dest, exist_ok=True)
    dest_key = _canonical(dest)

    if files is None:
        # Collect everything first so the moves do not disturb the walk.
        files_to_move = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(src)
            if _canonical(root) != dest_key  # already in place
            for name in names
            if name.endswith('.jpg')
        ]
    elif _canonical(src) == dest_key:
        files_to_move = []
    else:
        present = set(files) & set(os.listdir(src))
        # sorted() makes the move order deterministic across runs.
        files_to_move = [os.path.join(src, name) for name in sorted(present)]

    for path in files_to_move:
        shutil.move(path, dest)

In [391]:
# The Roboflow API key is a credential, so it is read from the environment
# instead of being stored in the notebook.
# NOTE(review): a key used to be hardcoded in this cell; if the notebook was
# ever shared or committed, that key should be revoked.
roboflow_api_key = os.environ.get("ROBOFLOW_API_KEY")
if not roboflow_api_key:
    raise RuntimeError(
        "Set the ROBOFLOW_API_KEY environment variable before running this cell."
    )

# Wipe earlier artifacts only once we know the download can be attempted.
clean_up(True)
rf = Roboflow(api_key=roboflow_api_key)
project = rf.workspace("material-identification").project("garbage-classification-3")
dataset = project.version(2).download("tensorflow")

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in GARBAGE-CLASSIFICATION-3-2 to tensorflow:: 100%|██████████| 204830/204830 [00:06<00:00, 33822.03it/s]




Extracting Dataset Version Zip to GARBAGE-CLASSIFICATION-3-2 in tensorflow:: 100%|██████████| 10472/10472 [00:00<00:00, 11028.00it/s]


In [392]:
# Read the annotation table of each split from the downloaded dataset, then
# stack the three tables into one frame with a fresh 0..n-1 index.
train, test, val = (
    pd.read_csv(f'GARBAGE-CLASSIFICATION-3-2/{split_dir}/_annotations.csv')
    for split_dir in ('train', 'test', 'valid')
)
combined_data = pd.concat([train, test, val], ignore_index=True)

In [393]:
# Gather every '.jpg' found under the downloaded dataset tree into the default
# 'data' directory (no explicit file list is passed, so the tree is walked).
move_files_recursively('GARBAGE-CLASSIFICATION-3-2')

In [394]:
# Raw class name -> integer label. METAL, CARDBOARD and PLASTIC share label 0.
map_classes = {
    'METAL': 0,
    'CARDBOARD': 0,
    'PLASTIC': 0,
    'PAPER': 1,
    'GLASS': 2,
    'BIODEGRADABLE': 3
}
def get_class(c):
    """Return the integer label for class name `c`.

    Values that are not keys of `map_classes` (including labels that were
    already mapped) are returned unchanged, so applying the function twice is
    harmless. The lookup is case-sensitive and `c` must be hashable.
    """
    return map_classes.get(c, c)

def remove_until_char(input_string,remove_char,include_char=True):
    """Drop the part of `input_string` that precedes the first match of `remove_char`.

    Parameters
    ----------
    input_string : str
        Text to trim.
    remove_char : str
        Pattern located with `re.search`, so regex metacharacters keep their
        special meaning.
    include_char : bool, default True
        If True, the first matched character is removed as well and the
        result starts right after it. If False, the result starts at the
        match, i.e. the character is kept.

    Returns
    -------
    str
        The trimmed string, or `input_string` unchanged when nothing matches.
    """
    match = re.search(remove_char, input_string)
    if match is None:
        return input_string
    # Parenthesised on purpose: the earlier `start + 1 if flag else 0` sliced
    # from 0 when include_char was False and returned the whole string.
    cut = match.start() + (1 if include_char else 0)
    return input_string[cut:]

def edit_row(row):
    """Swap the row's 'class' entry for its mapped label and return the row.

    The row object passed in is updated in place and that same object is
    returned. An earlier draft also rewrote 'filename' and renamed the image
    under 'data/' to match; that step is disabled and nothing on disk is
    touched here.
    """
    row['class'] = get_class(row['class'])
    return row

# Map the raw class names to integer labels in a single pass over the column.
# The former row-by-row loop (iterrows + .loc assignment through edit_row)
# applied exactly the same mapping and was redundant with this line.
combined_data['class'] = combined_data['class'].apply(get_class)

# Display names indexed by integer label, in the order defined by map_classes:
# 0 = METAL/CARDBOARD/PLASTIC, 1 = PAPER, 2 = GLASS, 3 = BIODEGRADABLE.
# NOTE(review): the list previously started with 'PAPER', which did not match
# the labels; confirm no later cell relied on the old order.
classes = ['METAL_CARDBOARD_PLASTIC', 'PAPER', 'GLASS', 'BIODEGRADABLE']

# Number of annotation rows per label.
classes_occurences = combined_data['class'].value_counts()

In [395]:
# Fixed seed so that Restart & Run All selects the same balanced subset.
BALANCE_SEED = 42

print(f'Dataframe shape: {combined_data.shape}')
# sample(frac=1) returns a shuffled copy; the name is rebound to that copy.
combined_data = combined_data.sample(frac=1, random_state=BALANCE_SEED).reset_index(drop=True)
print(f'Dataframe shape shuffled: {combined_data.shape}')

# Undersample: keep, for every label, as many rows as the rarest label has.
rows_per_class = combined_data['class'].value_counts().min()
# Build all per-class slices first and concatenate once, instead of growing
# a frame inside the loop.
balanced_data = pd.concat(
    [
        combined_data[combined_data['class'] == class_label].head(rows_per_class)
        for class_label in combined_data['class'].unique()
    ],
    ignore_index=True,
)
print(f'Balanced dataframe shape: {balanced_data.shape}')
# The slices were stacked class by class, so shuffle again to interleave them.
balanced_data = balanced_data.sample(frac=1, random_state=BALANCE_SEED).reset_index(drop=True)
print(f'Balanced dataframe shape shuffled: {balanced_data.shape}')
balanced_data.to_csv('data/balanced_annotations.csv', index=False)

Dataframe shape: (74090, 8)
Dataframe shape shuffled: (74090, 8)
Balanced dataframe shape: (17560, 8)
Balanced dataframe shape shuffled: (17560, 8)


In [396]:
# Target shares of the TRAIN, TEST and VAL sets, in that order.
ratios=[0.6, 0.25, 0.15]


def get_set_name():
    """Pick 'TRAIN', 'TEST' or 'VAL' at random, weighted by `ratios`.

    A single uniform draw from NumPy's global generator is compared against
    the cumulative shares: up to ratios[0] gives 'TRAIN', up to
    ratios[0] + ratios[1] gives 'TEST', anything above gives 'VAL'.
    """
    draw = np.random.uniform()
    train_cutoff = ratios[0]
    if draw <= train_cutoff:
        return 'TRAIN'
    if draw <= train_cutoff + ratios[1]:
        return 'TEST'
    return 'VAL'

# Seed NumPy's global generator (the one get_set_name draws from) so the
# TRAIN/TEST/VAL assignment is the same on every Restart & Run All.
np.random.seed(42)
# One independent draw per row; the row contents play no part in the choice.
# NOTE(review): the draw is per annotation row, so if one image file has
# several rows they can end up in different sets - confirm this is intended.
balanced_data['set'] = [get_set_name() for _ in range(len(balanced_data))]

In [397]:
# Persist the balanced annotations, now including the 'set' column, without
# writing the index.
balanced_data.to_csv('data/annotations.csv', index=False)
# Bare last expression: the notebook renders the frame as this cell's output.
balanced_data

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax,set
0,paper183_jpg.rf.0a02cd08383d2d3c1c5ff1b465b3ad...,416,416,1,107,220,414,414,TEST
1,cardboard442_jpg.rf.a316f85a031314fcbb04af7d7d...,416,416,0,4,52,404,361,TRAIN
2,cardboard1299_jpg.rf.8dc041efa9d5eb17dcdccb772...,416,416,0,52,104,221,364,TRAIN
3,paper2003_jpg.rf.ed53d988f025a95cd0ddc9a388b8c...,416,416,1,105,0,333,377,TEST
4,biodegradable456_jpg.rf.6796979e1bde834fe6d26c...,416,416,3,300,371,397,415,TEST
...,...,...,...,...,...,...,...,...,...
17555,biodegradable1644_jpg.rf.12fa4ef84042e676e4ae5...,416,416,3,166,18,200,59,TRAIN
17556,glass1743_jpg.rf.386f202fe224a7bb2f2223cdb504d...,416,416,2,222,30,362,387,VAL
17557,glass1584_jpg.rf.5de7d2435d1ee4c4872de7b67ec5a...,416,416,2,154,304,320,411,TRAIN
17558,paper1025_jpg.rf.6081505db67655fcd54d3a0da076c...,416,416,1,305,125,395,320,TRAIN
