In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import os
import csv

Read ground truth from the input and compare it with the output.

In [2]:
ground_truth = pd.read_csv('../../data_files/train-annotations-human-imagelabels-boxable.csv')
neo4j_data = pd.read_csv('../../data_files/export.csv')

In [3]:
print(ground_truth.head())
ground_truth.shape

            ImageID        Source  LabelName  Confidence
0  000002b66c9c498e  verification  /m/014j1m           0
1  000002b66c9c498e  verification  /m/014sv8           1
2  000002b66c9c498e  verification   /m/01599           0
3  000002b66c9c498e  verification   /m/015p6           0
4  000002b66c9c498e  verification  /m/015x4r           0


(8996795, 4)

In [4]:
source = ground_truth['Source'].unique()
print(source)
conf = ground_truth['Confidence'].unique()
print(conf)

['verification' 'crowdsource-verification']
[0 1]


Remove all row with confidence = 0 and remove column confidence.

In [5]:
ground_truth_filtered = ground_truth[ground_truth['Confidence'] == 1]
print('Old Shape: ' + str(ground_truth.shape))
print('New Shape: ' + str(ground_truth_filtered.shape))
ground_truth_filtered.drop(['Confidence'], axis=1, inplace=True)
print(ground_truth_filtered.head())

Old Shape: (8996795, 4)
New Shape: (6622219, 4)
             ImageID        Source  LabelName
1   000002b66c9c498e  verification  /m/014sv8
8   000002b66c9c498e  verification  /m/01bl7v
9   000002b66c9c498e  verification  /m/01d40f
11  000002b66c9c498e  verification  /m/01g317
15  000002b66c9c498e  verification  /m/01mzpv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ground_truth_filtered.drop(['Confidence'], axis=1, inplace=True)


In [6]:
ground_truth_filtered.drop(['Source'], axis=1, inplace=True)
print(ground_truth_filtered.head())

             ImageID  LabelName
1   000002b66c9c498e  /m/014sv8
8   000002b66c9c498e  /m/01bl7v
9   000002b66c9c498e  /m/01d40f
11  000002b66c9c498e  /m/01g317
15  000002b66c9c498e  /m/01mzpv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ground_truth_filtered.drop(['Source'], axis=1, inplace=True)


In [7]:
class_descriptions = pd.read_csv('../../data_files/class-descriptions-boxable.csv')
print(class_descriptions.head())

     /m/011k07    Tortoise
0  /m/011q46kg   Container
1    /m/012074      Magpie
2    /m/0120dh  Sea turtle
3    /m/01226z    Football
4    /m/012n7d   Ambulance


In [8]:
# Step 1: Save the current column names as they're actually data
first_row = pd.DataFrame([class_descriptions.columns.tolist()], columns=['LabelName', 'Description'])

print(first_row)

# Step 2: Rename the columns properly
class_descriptions.columns = ['LabelName', 'Description']

# Step 3: Concatenate the saved data row with the original dataframe
class_descriptions = pd.concat([first_row, class_descriptions], ignore_index=True)

print(class_descriptions.head())

   LabelName Description
0  /m/011k07    Tortoise
     LabelName Description
0    /m/011k07    Tortoise
1  /m/011q46kg   Container
2    /m/012074      Magpie
3    /m/0120dh  Sea turtle
4    /m/01226z    Football


In [9]:
#convert class to number
sorted(class_descriptions['Description'].unique())
print(class_descriptions.head())

     LabelName Description
0    /m/011k07    Tortoise
1  /m/011q46kg   Container
2    /m/012074      Magpie
3    /m/0120dh  Sea turtle
4    /m/01226z    Football


In [10]:
def convert_descriptions_to_numbers(df, description_column='Description'):
    # Get unique descriptions and sort alphabetically
    all_descriptions = []
    
    # Extract all descriptions (handling comma-separated values if needed)
    for desc in df[description_column]:
        if isinstance(desc, str) and ',' in desc:
            all_descriptions.extend([d.strip() for d in desc.split(',')])
        else:
            all_descriptions.append(desc)
    
    # Get unique values and sort
    unique_descriptions = sorted(set(all_descriptions))
    
    # Create mapping dictionary (1-indexed)
    desc_to_num = {desc: i+1 for i, desc in enumerate(unique_descriptions)}
    
    # Create a new column with the numeric values
    df['Description_Num'] = df[description_column].apply(
        lambda x: [desc_to_num[d.strip()] for d in x.split(',')] if isinstance(x, str) and ',' in x else desc_to_num[x]
    )
    
    return df, desc_to_num


In [11]:
df, mapping =  convert_descriptions_to_numbers(class_descriptions)
print(df.head())
print(mapping)

     LabelName Description  Description_Num
0    /m/011k07    Tortoise              545
1  /m/011q46kg   Container              131
2    /m/012074      Magpie              321
3    /m/0120dh  Sea turtle              445
4    /m/01226z    Football              202
{'Accordion': 1, 'Adhesive tape': 2, 'Aircraft': 3, 'Airplane': 4, 'Alarm clock': 5, 'Alpaca': 6, 'Ambulance': 7, 'Animal': 8, 'Ant': 9, 'Antelope': 10, 'Apple': 11, 'Armadillo': 12, 'Artichoke': 13, 'Auto part': 14, 'Axe': 15, 'Backpack': 16, 'Bagel': 17, 'Baked goods': 18, 'Balance beam': 19, 'Ball': 20, 'Balloon': 21, 'Banana': 22, 'Band-aid': 23, 'Banjo': 24, 'Barge': 25, 'Barrel': 26, 'Baseball bat': 27, 'Baseball glove': 28, 'Bat (Animal)': 29, 'Bathroom accessory': 30, 'Bathroom cabinet': 31, 'Bathtub': 32, 'Beaker': 33, 'Bear': 34, 'Bed': 35, 'Bee': 36, 'Beehive': 37, 'Beer': 38, 'Beetle': 39, 'Bell pepper': 40, 'Belt': 41, 'Bench': 42, 'Bicycle': 43, 'Bicycle helmet': 44, 'Bicycle wheel': 45, 'Bidet': 46, 'Billboard':

In [12]:
ground_truth_filtered = pd.merge(ground_truth_filtered, df, left_on='LabelName', right_on='LabelName')
print(ground_truth_filtered.head())

            ImageID  LabelName Description  Description_Num
0  000002b66c9c498e  /m/014sv8   Human eye              264
1  000002b66c9c498e  /m/01bl7v         Boy               64
2  000002b66c9c498e  /m/01d40f       Dress              170
3  000002b66c9c498e  /m/01g317      Person              382
4  000002b66c9c498e  /m/01mzpv       Chair              105


In [13]:
dataset_id = json.load(open('../../data_files/img_hist.json'))
print(dataset_id[0])

{'ID': '0000253ea4ecbf19', 'Histogram': {'R': [[16218.0], [81872.0], [102812.0], [87187.0], [68959.0], [48212.0], [30968.0], [20341.0], [13956.0], [10808.0], [8943.0], [7782.0], [7143.0], [6687.0], [6091.0], [5837.0], [5554.0], [5662.0], [5594.0], [5553.0], [5727.0], [5846.0], [6036.0], [6352.0], [6948.0], [7575.0], [8382.0], [10874.0], [15664.0], [19988.0], [30502.0], [116359.0]], 'G': [[15489.0], [62008.0], [87836.0], [76381.0], [71294.0], [53350.0], [38654.0], [27393.0], [19388.0], [14189.0], [11656.0], [9716.0], [8669.0], [7725.0], [7012.0], [6599.0], [6167.0], [6079.0], [5800.0], [5802.0], [5824.0], [5876.0], [6031.0], [6337.0], [6765.0], [7456.0], [8077.0], [9153.0], [11139.0], [16531.0], [32234.0], [129802.0]], 'B': [[25453.0], [128156.0], [139637.0], [83363.0], [45725.0], [26615.0], [15913.0], [11312.0], [9078.0], [7713.0], [7013.0], [6432.0], [5851.0], [5529.0], [5294.0], [5176.0], [5217.0], [5100.0], [5136.0], [5197.0], [5325.0], [5459.0], [5667.0], [6133.0], [6480.0], [7176.

In [14]:
dataset_id = pd.DataFrame(dataset_id)
print(dataset_id.head())

                 ID                                          Histogram
0  0000253ea4ecbf19  {'R': [[16218.0], [81872.0], [102812.0], [8718...
1  0000271195f2c007  {'R': [[67798.0], [60600.0], [42859.0], [35802...
2  0000286a5c6a3eb5  {'R': [[14589.0], [45916.0], [51335.0], [34853...
3  00003bfccf5f36c2  {'R': [[38459.0], [27541.0], [23055.0], [21334...
4  000045257f66b9e2  {'R': [[12283.0], [35896.0], [55813.0], [50520...


In [15]:
dataset_id_copy = dataset_id.copy()
dataset_id_copy.rename(columns={'ID': 'ImageID'}, inplace=True)
dataset_id_copy.drop(['Histogram'], axis=1, inplace=True)
print(dataset_id_copy.head())

            ImageID
0  0000253ea4ecbf19
1  0000271195f2c007
2  0000286a5c6a3eb5
3  00003bfccf5f36c2
4  000045257f66b9e2


In [16]:
dataset_id_copy.count()

ImageID    105185
dtype: int64

In [17]:
new_filtered_ground_truth = ground_truth_filtered[ground_truth_filtered['ImageID'].isin(neo4j_data['image_Id'])]
print('Old Shape: ' + str(ground_truth_filtered.shape))
print('New Shape: ' + str(new_filtered_ground_truth.shape))
print(new_filtered_ground_truth.head())

Old Shape: (6622219, 4)
New Shape: (373952, 4)
              ImageID  LabelName Description  Description_Num
218  0000253ea4ecbf19   /m/01lrl   Carnivore               92
219  0000253ea4ecbf19  /m/035r7c   Human leg              270
220  0000253ea4ecbf19   /m/04rky      Mammal              322
221  0000253ea4ecbf19   /m/05s2s       Plant              394
222  0000253ea4ecbf19   /m/07j7r        Tree              554


In [18]:
# Group by imageid and collect descriptions in lists
result = (new_filtered_ground_truth
          .groupby('ImageID')
          .agg({
              'LabelName': 'first',  # Keep first labelname for each imageid
              'Description': lambda x: x.tolist(),  # Collect all descriptions in a list
              'Description_Num': lambda x: x.tolist()  # Collect all descriptions in a list
          })
          .reset_index())

In [19]:
result.drop(['LabelName'], axis=1, inplace=True)
print(result.head())

            ImageID                                        Description  \
0  0000253ea4ecbf19  [Carnivore, Human leg, Mammal, Plant, Tree, Hu...   
1  0000286a5c6a3eb5  [Human eye, Sunglasses, Shorts, Person, Human ...   
2  00003bfccf5f36c2  [Person, Lantern, Chair, Table, Tree, Furnitur...   
3  000045257f66b9e2  [Boy, Person, Cowboy hat, Hat, Fedora, Human b...   
4  0000530c47410921                    [Toy, Bird, Duck, Tire, Animal]   

                                     Description_Num  
0                   [92, 270, 322, 394, 554, 265, 8]  
1  [264, 506, 457, 382, 271, 221, 270, 263, 267, ...  
2            [382, 304, 105, 515, 554, 213, 71, 588]  
3  [64, 382, 139, 244, 189, 262, 505, 270, 595, 2...  
4                             [548, 50, 175, 537, 8]  


In [20]:
result.shape

(97273, 3)

In [21]:
result.to_csv('../../data_files/filtered_ground_truth_num_ver_real.csv', index=False)

In [105]:
mapping_df = pd.DataFrame(list(mapping.items()), columns=['Description', 'Description_Num'])
print(mapping_df.head())
mapping_df.to_csv('../../data_files/mapping.csv', index=False)

     Description  Description_Num
0      Accordion                1
1  Adhesive tape                2
2       Aircraft                3
3       Airplane                4
4    Alarm clock                5
