In [None]:
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import seaborn as sns
DIR_OUTPUT = Path('./testOutput')

In [None]:
# Show the file name of every annotation text file in the output directory
dir_contents = DIR_OUTPUT.glob('*.txt')
for txt_path in dir_contents:
    print(txt_path.name)

In [None]:
def parse_synthetic_txt(path):
    '''
    Parses the text files generated by the synthetic data pipeline that are
    found directly inside the given directory.

    Structure of text file:
        - each row corresponds to a tool in the object
        - values in row are whitespace separated
        - first entry in row is the class label
        - rest of entries are integer bounding box coordinates in COCO format (x, y, w, h)
        - blank rows are ignored
    - empty file represent background image

    Input:
        - path (Path object or str): directory where text files are found

    Output
        - data_dict (dict): bounding box coordinates and labels for each image
            - Key: file_name of image w/o extension
                - sub-dictionary with Keys:
                - labels (list of str)
                - bbox (list of tuples of int)
            - files without any annotation row (background images) get no entry
        - unique_labels (list): sorted unique labels found in directory; sorted so
          that the order is the same on every run

    Raises
        - ValueError: if a bounding box entry cannot be converted to an integer
    '''
    data_dict = dict()
    unique_labels = set()

    # glob order depends on the file system, so sort for a reproducible key order
    for txt_file in sorted(Path(path).glob('*.txt')):
        image_key = txt_file.stem

        with open(txt_file, 'r') as f:
            # note: an empty file has no rows, so it never gets an entry
            for line in f:
                # split() with no argument tolerates repeated spaces and tabs
                meta_tool = line.split()
                if not meta_tool:
                    # blank row: nothing to record
                    continue

                label = meta_tool[0]
                bbox = tuple(int(num) for num in meta_tool[1:])

                entry = data_dict.setdefault(image_key, {'labels': [], 'bbox': []})
                entry['labels'].append(label)
                entry['bbox'].append(bbox)
                unique_labels.add(label)

    # a set has no stable order; sort so any mapping built from the list is repeatable
    return data_dict, sorted(unique_labels)

# Parse every annotation file in DIR_OUTPUT, then print the per-image boxes/labels
# and the list of distinct labels that were found
data_dict, unique_labels = parse_synthetic_txt(DIR_OUTPUT)
print(data_dict)
print(unique_labels)

In [None]:
def synthetic_labels2ints(data, unique_labels):
    '''
    YOLO model requires class labels to be integers, so swap every label for
    the integer given by its position in unique_labels.

    Input:
        - data (dict): bounding box coordinates and labels for each image
            - Key: file_name of image w/o extension
                - sub-dictionary with Keys:
                - labels
                - bbox
        - unique_labels (iterable): the distinct labels; the position of a
          label in this iterable becomes its integer class

    Output:
        - (dict): a new dictionary, the input is left untouched
            - Key: file_name of image w/o extension
                - sub-dictionary with Keys:
                - labels (tuple of int)
                - bbox (tuple)

    Raises:
        - KeyError: if an image carries a label that is not in unique_labels
    '''
    # label -> integer class, taken from the label's position
    label_to_int = {}
    for position, name in enumerate(unique_labels):
        label_to_int[name] = position

    # rebuild each image entry with the labels swapped for their integers
    converted = {}
    for image_key, annotation in data.items():
        int_labels = tuple(label_to_int[name] for name in annotation['labels'])
        converted[image_key] = {
            'labels': int_labels,
            'bbox': tuple(annotation['bbox']),
        }
    return converted


# Convert the string labels in data_dict to integer classes. unique_labels is printed
# as well because a label's position in that list is the integer it was mapped to.
data_dict2 = synthetic_labels2ints(data_dict, unique_labels)
print(data_dict2)
print(unique_labels)

# Explore Distribution of Bounding Boxes

In [None]:
# https://stackoverflow.com/questions/10715965/create-a-pandas-dataframe-by-appending-one-row-at-a-time
### convert Data Dict into Dataframe ###
# Collect one record per bounding box and build the frame in a single call
# (cheaper than appending to a DataFrame row by row).
rows_list = []
for image_name, annotation in data_dict.items():
    image_labels = annotation['labels']
    for i, box in enumerate(annotation['bbox']):
        rows_list.append({'image': image_name,
                          'label': image_labels[i],
                          'x': box[0],
                          'y': box[1],
                          'width': box[2],
                          'height': box[3]
                          })

# Explicit columns keep the schema stable even when no boxes were parsed;
# without them an empty frame has no 'label'/'image' columns to group on.
df = pd.DataFrame(rows_list, columns=['image', 'label', 'x', 'y', 'width', 'height'])

In [None]:
# Display the table that holds one row per bounding box (image, label, x, y, width, height)
df

#### Number of Labels

In [None]:
### Number of Labels ###
# Group the bounding-box rows by label; counting the 'image' column then gives
# the number of (non-null) rows that carry each label.
rows_by_label = df.groupby('label')
count_labels = rows_by_label['image'].count()
count_labels

In [None]:
# Bar chart: one bar per label, bar height = number of bounding boxes with that label
fig, ax = plt.subplots(figsize=(12, 8))
ax = sns.barplot(x=count_labels.index, y=count_labels.values, ax=ax)
ax.set(title='Total Labels in Run', xlabel='Label', ylabel='Total')

# print each total slightly above its bar
for bar_pos, total in enumerate(count_labels.values):
    ax.text(bar_pos, total + 1, total, color='black', ha="center")

#### Number of Labels per Tray

In [None]:
# Step 1: per image, count() puts the number of bounding-box rows into every column
per_image_counts = df.groupby('image').count()
# Step 2: index by that per-image label count, then count how many images share
# each value; any remaining column ('x' here) holds that number
images_per_count = per_image_counts.set_index('label').groupby('label').count()
num_labs_tray = images_per_count['x']
num_labs_tray

In [None]:
# Bar chart: x = number of labels on a tray, bar height = how many trays have that many
fig, ax = plt.subplots(figsize=(12, 8))
ax = sns.barplot(x=num_labs_tray.index, y=num_labs_tray.values, ax=ax)
ax.set(title='Count of Number of Trays by Number of Labels',
       xlabel='Total Labels per Tray',
       ylabel='Number of Trays')

# print each count slightly above its bar
for bar_pos, tray_total in enumerate(num_labs_tray.values):
    ax.text(bar_pos, tray_total + 0.5, tray_total, color='black', ha="center")

In [None]:
### more straightforward way to get at the counts than converting to a dataframe ###
from collections import Counter

label_id_tally = Counter()    # integer label -> number of boxes with that label
tray_size_tally = Counter()   # number of labels on a tray -> number of trays
combo_tally = Counter()       # '-'-joined sorted integer labels -> number of trays
for annotation in data_dict2.values():
    tray_label_ids = sorted(annotation['labels'])
    label_id_tally.update(tray_label_ids)
    tray_size_tally[len(tray_label_ids)] += 1
    combo_tally['-'.join(map(str, tray_label_ids))] += 1

# replace integer key with label (the integer is the label's index in unique_labels)
count_labels = {unique_labels[label_id]: total for label_id, total in label_id_tally.items()}
num_labs_tray = tray_size_tally
total_combinations = combo_tally

print(count_labels)
print(num_labs_tray)
print(total_combinations)


#### Total Combinations

In [None]:
# Bar chart of how often each combination of labels occurs. total_combinations holds
# one count per image, keyed by that image's sorted integer labels joined with '-'.
total_combinations_df = pd.Series(total_combinations).sort_index()
fig, ax = plt.subplots(figsize=(12, 8))
# x and y must be keywords: seaborn >= 0.12 accepts only `data` positionally
ax = sns.barplot(x=total_combinations_df.index, y=total_combinations_df.values, palette='deep', ax=ax)
ax.set_title('Count of Combinations of Labels')
ax.set_xlabel('Combinations')
# each count is a number of images (trays), not a number of labels
ax.set_ylabel('Number of Trays')
# combination keys can be long, so turn the tick labels vertical
for tick in ax.get_xticklabels():
    tick.set_rotation(90)

In [None]:
# One "icefire" colour for every unique label
n_label_colors = len(unique_labels)
palette = sns.color_palette("icefire", n_colors=n_label_colors)
palette

In [None]:
# Pair every label with the palette colour at the same position
color_dict = dict(zip(unique_labels, palette))
color_dict

#### Distribution of Bounding Boxes on Trays

In [None]:
from PIL import Image
empty_tray = Image.open('emptyTrays/RelineCore1LevelB_crop.jpeg')

# Tall, narrow figure with the axes hidden
fig, ax = plt.subplots(figsize=(8, 24))
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
ax.set_title('Distribution of Bounding Boxes on Tray')

# Faded tray photo as the backdrop for the box outlines
ax.imshow(empty_tray, alpha=0.25)

# Outline every bounding box of every image, coloured by its label
for annotation in data_dict.values():
    for tool_label, (x, y, w, h) in zip(annotation['labels'], annotation['bbox']):
        outline = patches.Rectangle((x, y), w, h, linewidth=1,
                                    edgecolor=color_dict[tool_label],
                                    facecolor='none', alpha=0.5)
        ax.add_patch(outline)
