In [22]:
import numpy as np
import pandas as pd
import shutil
from roboflow import Roboflow
import os
import re

In [23]:
def clean_up(integral=False):
    if os.path.exists('GARBAGE-CLASSIFICATION-3-2'):
        shutil.rmtree('GARBAGE-CLASSIFICATION-3-2')
    if integral:
        if os.path.exists('data'):
            shutil.rmtree('data')
    else:
        for file in os.listdir('data'):
            file = os.path.join('data', file)
            if os.path.isfile(file):
                os.remove(file)

def move_files_recursively(src,dest='data',files=None):
    if not os.path.exists(dest):
        os.makedirs(dest)
    files_to_move = []
    if files is None:
        for root, dirs, files in os.walk(src):
            for file in files:
                if file.endswith('.jpg'):
                    files_to_move.append(os.path.join(root, file))
    else:
        interesting_files = list(set(files) & set(os.listdir(src)))
        for file in interesting_files:
            files_to_move.append(os.path.join(src, file))
    for file in files_to_move:
        shutil.move(file, dest)

In [24]:
clean_up(True)
rf = Roboflow(api_key="PYZngL70ijcagLdXpb6l")
project = rf.workspace("material-identification").project("garbage-classification-3")
dataset = project.version(2).download("tensorflow")

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in GARBAGE-CLASSIFICATION-3-2 to tensorflow:: 100%|██████████| 204830/204830 [00:07<00:00, 28045.29it/s]




Extracting Dataset Version Zip to GARBAGE-CLASSIFICATION-3-2 in tensorflow:: 100%|██████████| 10472/10472 [00:00<00:00, 11414.68it/s]


In [25]:
train = pd.read_csv('GARBAGE-CLASSIFICATION-3-2/train/_annotations.csv')
test = pd.read_csv('GARBAGE-CLASSIFICATION-3-2/test/_annotations.csv')
val = pd.read_csv('GARBAGE-CLASSIFICATION-3-2/valid/_annotations.csv')
combined_data = pd.concat([train, test, val], ignore_index=True)

In [26]:
move_files_recursively('GARBAGE-CLASSIFICATION-3-2')

In [27]:
map_classes = {
    'METAL': 'METAL_CARDBOARD_PLASTIC',
    'CARDBOARD': 'METAL_CARDBOARD_PLASTIC',
    'PLASTIC': 'METAL_CARDBOARD_PLASTIC',
    'PAPER': 'PAPER',
    'GLASS': 'GLASS',
    'BIODEGRADABLE': 'BIODEGRADABLE'
}
def get_class(c):
    for key in map_classes.keys():
        if key==c:
            return map_classes.get(key)
    return c

def remove_until_char(input_string,remove_char,include_char=True):
    match = re.search(remove_char, input_string)
    if match:
        return input_string[match.start() + 1 if include_char else 0:]
    return input_string

def get_set_name(ratios=None):
    if ratios is None:
        ratios = [0.6, 0.25, 0.15]
    rand = np.random.uniform()
    if rand <= ratios[0]:
        set_name = 'TRAIN'
    elif rand <= ratios[0] + ratios[1]:
        set_name = 'TEST'
    else:
        set_name = 'VAL'
    return set_name

def edit_row(row):
    row['class'] = get_class(row['class'])
    return row

for index, row in combined_data.iterrows():
    combined_data.loc[index] = edit_row(row)

classes_occurences = combined_data['class'].value_counts()    
classes_occurences

class
BIODEGRADABLE              45407
METAL_CARDBOARD_PLASTIC    16484
GLASS                       7809
PAPER                       4390
Name: count, dtype: int64

In [28]:
balanced_data = pd.DataFrame(columns=combined_data.columns)
for class_label in combined_data['class'].unique():
    class_samples = combined_data[combined_data['class'] == class_label].head(np.min(classes_occurences))
    balanced_data = pd.concat([balanced_data, class_samples], ignore_index=True)
print(f'Balanced dataframe shape: {balanced_data.shape}')
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)
print(f'Balanced dataframe shape shuffled: {balanced_data.shape}')

Balanced dataframe shape: (17560, 8)
Balanced dataframe shape shuffled: (17560, 8)


In [29]:
ratios = [0.6, 0.25, 0.15]
cleansed_data = pd.DataFrame(columns=['set','filename','label','xmin_relative','ymin_relative','1','2','xmax_relative','ymax_relative','3','4'])
for index, row in balanced_data.iterrows():
    width = row['width']
    height = row['height']
    row['label']=row['class']
    row['xmin_relative'] = row['xmin'] / width
    row['ymin_relative'] = row['ymin'] / height
    row['xmax_relative'] = row['xmax'] / width
    row['ymax_relative'] = row['ymax'] / height
    row['set'] = get_set_name(ratios)
    row['1'] = None
    row['2'] = None
    row['3'] = None
    row['4'] = None
    cleansed_data.loc[index] = row[['set','filename','label','xmin_relative','ymin_relative','1','2','xmax_relative','ymax_relative','3','4']]
    if index % 500 == 0:
        print(f'Processed {index+1}/{balanced_data.shape[0]}')

Processed 1/17560
Processed 501/17560
Processed 1001/17560
Processed 1501/17560
Processed 2001/17560
Processed 2501/17560
Processed 3001/17560
Processed 3501/17560
Processed 4001/17560
Processed 4501/17560
Processed 5001/17560
Processed 5501/17560
Processed 6001/17560
Processed 6501/17560
Processed 7001/17560
Processed 7501/17560
Processed 8001/17560
Processed 8501/17560
Processed 9001/17560
Processed 9501/17560
Processed 10001/17560
Processed 10501/17560
Processed 11001/17560
Processed 11501/17560
Processed 12001/17560
Processed 12501/17560
Processed 13001/17560
Processed 13501/17560
Processed 14001/17560
Processed 14501/17560
Processed 15001/17560
Processed 15501/17560
Processed 16001/17560
Processed 16501/17560
Processed 17001/17560
Processed 17501/17560


In [30]:
cleansed_data.to_csv('data/annotations.csv', index=False)
cleansed_data

Unnamed: 0,set,filename,label,xmin_relative,ymin_relative,1,2,xmax_relative,ymax_relative,3,4
0,TEST,glass2710_jpg.rf.1ae554fb0a41febc4e2e5b97c0f18...,GLASS,0.790865,0.009615,,,0.911058,0.091346,,
1,TRAIN,paper1412_jpg.rf.53a29822cb375d05cdd7c05cc5553...,PAPER,0.822115,0.100962,,,0.997596,0.646635,,
2,TEST,glass2756_jpg.rf.254540aa14cdda626e9b677f459ee...,GLASS,0.004808,0.387019,,,0.076923,0.444712,,
3,VAL,paper304_jpg.rf.2bc2335d0e32fb9b33748efe72f483...,PAPER,0.002404,0.002404,,,0.802885,0.987981,,
4,TRAIN,glass1838_jpg.rf.2b2c18924702d507b8c5863883c54...,METAL_CARDBOARD_PLASTIC,0.401442,0.634615,,,0.509615,0.846154,,
...,...,...,...,...,...,...,...,...,...,...,...
17555,TEST,biodegradable2190_jpeg.rf.15de0aa38cb90682a7b9...,BIODEGRADABLE,0.634615,0.485577,,,0.670673,0.528846,,
17556,TRAIN,glass690_jpg.rf.0f15979ae57e2c8d5d32dacb494f69...,GLASS,0.036058,0.036058,,,0.319712,0.396635,,
17557,TRAIN,glass1027_jpg.rf.7758a9a4304396a1d723a98284cdb...,GLASS,0.3125,0.694712,,,0.377404,0.728365,,
17558,VAL,paper1589_jpeg.rf.ae3d5de11a89a5d112c1b9c83085...,PAPER,0.367788,0.584135,,,0.71875,0.995192,,
