In [15]:
# need to get all  images on perf out of s3, save them, then develop a mapping between the image name and the perf data
import boto3
import os
import pandas as pd
import shutil 
from tqdm.notebook import tqdm
import hashlib

In [4]:
# load in the object_id_to_image_id.csv file
full_mapping = pd.read_csv('/home/jtyo/Repos/PersonalRepos/deep-person-reid/reid-data/performance/object_id_to_image_id.csv')
# now for each image id, get the image from s3

In [5]:
full_mapping

Unnamed: 0,id,image_id,s3_path
0,1,4,https://labeling-detected-objects-bucket.s3.am...
1,2,3,https://labeling-detected-objects-bucket.s3.am...
2,3,2,https://labeling-detected-objects-bucket.s3.am...
3,4,4,https://labeling-detected-objects-bucket.s3.am...
4,5,2,https://labeling-detected-objects-bucket.s3.am...
...,...,...,...
250414,282507,152227,
250415,282508,152228,
250416,282509,152227,
250417,282510,152230,


In [10]:
s3 = boto3.client("s3")
original_imgs_dir = '/home/jtyo/tmp/mudd_originals'
errors = 0
for s3_url in tqdm(full_mapping['s3_path']):
    try:
        key = s3_url.split('/')[-1]
        if os.path.exists(os.path.join(original_imgs_dir, key)):
            continue
        response = s3.get_object(Bucket='labeling-detected-objects-bucket', Key=key)['Body']
        with open(os.path.join(original_imgs_dir, key), 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
    except Exception as e:
        errors += 1

  0%|          | 0/250419 [00:00<?, ?it/s]

In [11]:
print('errors', errors)

errors 2424


In [12]:
mudd_processed_base_path = '/home/jtyo/Repos/PersonalRepos/deep-person-reid/reid-data/performance'
final_folders = [
    'bounding_box_test', 
    'bounding_box_train',
    'gallery_all',
    'gallery_mud',
    'gallery_no_mud',
    'query',
    'query_all',
    'query_mud',
    'query_no_mud'
]

In [16]:
def get_image_checksum(img_path):
    with open(img_path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()

In [17]:
# Now to make things more efficient and quick, lets get a checksum of all images, store in a dict, and then we can easily check if a mudd image exists in the original set
# for every image in original_imgs_dir, compute the checksum and save in memory
original_imgs_checksums = {}
for img in tqdm(os.listdir(original_imgs_dir)):
    original_imgs_checksums[get_image_checksum(os.path.join(original_imgs_dir, img))] = img

  0%|          | 0/247995 [00:00<?, ?it/s]

In [18]:
# now for each image in the mudd_processed_base_path, check if it exists in the original_imgs_checksums
# then we will save a json file that maps the mudd_processed image name to the original image name
mudd_processed_to_original = {}
missed_images = {}
for folder in tqdm(final_folders):
    for img in os.listdir(os.path.join(mudd_processed_base_path, folder)):
        img_checksums = get_image_checksum(os.path.join(mudd_processed_base_path, folder, img))
        if img_checksums in original_imgs_checksums:
            mudd_processed_to_original[os.path.join(folder, img)] = original_imgs_checksums[img_checksums]
        else:
            print(f'Could not find {img} in original images')
            missed_images[img] = img_checksums

print(f'Found {len(mudd_processed_to_original)} images in the original set')
print(f'Missed {len(missed_images)} images in the original set')

  0%|          | 0/9 [00:00<?, ?it/s]

Found 5385 images in the original set
Missed 0 images in the original set


In [22]:
# now save the mapping to a json file
import json
with open(os.path.join(mudd_processed_base_path, 'mudd_processed_to_original.json'), 'w') as f:
    json.dump(mudd_processed_to_original, f)

In [21]:
for i, (k, v) in enumerate(mudd_processed_to_original.items()):
    print(k, v)
    if i > 10:
        break

bounding_box_test/41_91905_1528.png 91905.png
bounding_box_test/25_162073_925.png 162073.png
bounding_box_test/36_266159_3830.png 266159.png
bounding_box_test/41_87791_1438.png 87791.png
bounding_box_test/43_178306_1048.png 178306.png
bounding_box_test/36_266102_3826.png 266102.png
bounding_box_test/36_265909_3806.png 265909.png
bounding_box_test/38_35697_2224.png 35697.png
bounding_box_test/39_177197_1036.png 177197.png
bounding_box_test/39_176626_1032.png 176626.png
bounding_box_test/43_3748_2192.png 3748.png
bounding_box_test/40_93369_1543.png 93369.png


In [23]:
# see if the text 28749223 is anywhere in mudd_processed_to_original 
for k, v in mudd_processed_to_original.items():
    if '28749223' in k or '28749223' in v:
        print(k, v)

bounding_box_train/28749223_131645_845.png 131645.png
bounding_box_train/28749223_131666_850.png 131666.png
bounding_box_train/28749223_131626_842.png 131626.png
bounding_box_train/28749223_27215_29.png 27215.png
bounding_box_train/28749223_26822_26.png 26822.png
bounding_box_train/28749223_133564_852.png 133564.png
bounding_box_train/28749223_131649_849.png 131649.png
bounding_box_train/28749223_26819_25.png 26819.png
bounding_box_train/28749223_131655_843.png 131655.png
bounding_box_train/28749223_131640_848.png 131640.png
bounding_box_train/28749223_131646_847.png 131646.png
bounding_box_train/28749223_1745_2210.png 1745.png
bounding_box_train/28749223_131664_846.png 131664.png
bounding_box_train/28749223_131647_844.png 131647.png


In [24]:
# now make sure what I'm using as the detected object id and the object_id always lign up
for k, v in mudd_processed_to_original.items():
    if k.split('_')[-2] != v.split('.')[0]:
        print(k, v)

bounding_box_train/87_42469_2258.png 37818.png


In [26]:
# we need to do some analysis. Basically, we want to know: 
# Instead of looking at the person_id statistics, we wanna know:
# How many "photos" are there? 
# How many "people" are there in each photo?

In [30]:
# Read in the cvs file into a pandas dataframe 
with open('/home/jtyo/Repos/PersonalRepos/deep-person-reid/reid-data/performance/object_id_to_image_id.csv', 'r') as f:
    df = pd.read_csv(f)
df

Unnamed: 0,id,image_id,s3_path
0,1,4,https://labeling-detected-objects-bucket.s3.am...
1,2,3,https://labeling-detected-objects-bucket.s3.am...
2,3,2,https://labeling-detected-objects-bucket.s3.am...
3,4,4,https://labeling-detected-objects-bucket.s3.am...
4,5,2,https://labeling-detected-objects-bucket.s3.am...
...,...,...,...
250414,282507,152227,
250415,282508,152228,
250416,282509,152227,
250417,282510,152230,


In [31]:
# now get the number of unique id's and image_id's
print(f'Number of unique object_id\'s: {len(df["id"].unique())}')
print(f'Number of unique image_id\'s: {len(df["image_id"].unique())}')

Number of unique object_id's: 250419
Number of unique image_id's: 124596


In [32]:
# new get the average, mean, median, max, etc. of the number of id's per image_id
# first group by image_id, then count the number of id's per image_id
# then get the mean, median, max, min, etc. of the number of id's per image_id
stats = {}
for image_id, group in df.groupby('image_id'):
    stats[image_id] = {
        'num_ids': len(group['id'].unique()),
    }

# now print the statisitcs 
print(f'Average number of id\'s per image_id: {sum([v["num_ids"] for k, v in stats.items()]) / len(stats)}')
print(f'Median number of id\'s per image_id: {sorted([v["num_ids"] for k, v in stats.items()])[len(stats) // 2]}')
print(f'Max number of id\'s per image_id: {max([v["num_ids"] for k, v in stats.items()])}')
print(f'Min number of id\'s per image_id: {min([v["num_ids"] for k, v in stats.items()])}')


Average number of id's per image_id: 2.0098478281806798
Median number of id's per image_id: 1
Max number of id's per image_id: 7
Min number of id's per image_id: 1


In [54]:
# now do this again, but filter for only detected objects that also appear in mudd_processed_to_original

stats = {}
ids_we_care_about = set([int(x.split('.')[0].strip()) for x in mudd_processed_to_original.values()])
all_images_we_could_care_about = {}
for image_id, group in df.groupby('image_id'):
    these_selected_ids = []
    is_image_we_care_about = False
    for g in group['id']:
        if g in ids_we_care_about:
            these_selected_ids.append(g)
            is_image_we_care_about = True
    if len(these_selected_ids) > 0: 
        stats[image_id] = {
            'num_ids': len(these_selected_ids),
            'ids': these_selected_ids
        }
    if is_image_we_care_about:
        s3_urls_to_pull = []
        for g in group['id']:
            if g not in ids_we_care_about:
                s3_urls_to_pull.append(f"https://labeling-detected-objects-bucket.s3.amazonaws.com/{g}.png")
        all_images_we_could_care_about[image_id] = {
            'num_ids': len(group['id'].unique()), 
            'ids': group['id'].unique(),
            's3_urls': group['s3_path'].unique(),
            's3_urls_need_pulled': s3_urls_to_pull
        }

print('number of images considered', len(stats))
# now print the statisitcs 
print(f'Average number of id\'s per image_id: {sum([v["num_ids"] for k, v in stats.items()]) / len(stats)}')
print(f'Median number of id\'s per image_id: {sorted([v["num_ids"] for k, v in stats.items()])[len(stats) // 2]}')
print(f'Max number of id\'s per image_id: {max([v["num_ids"] for k, v in stats.items()])}')
print(f'Min number of id\'s per image_id: {min([v["num_ids"] for k, v in stats.items()])}')

total_ids_in_stats = sum([v["num_ids"] for k, v in stats.items()])
total_ids_in_all_we_care_about = sum([v["num_ids"] for k, v in all_images_we_could_care_about.items()])

print(f'If we pull all images associated with the images we care about, we get {total_ids_in_all_we_care_about} ids')
print("Here are the statitics for all we care about")
print(f'Average number of id\'s per image_id: {total_ids_in_all_we_care_about / len(all_images_we_could_care_about)}')
print(f'Median number of id\'s per image_id: {sorted([v["num_ids"] for k, v in all_images_we_could_care_about.items()])[len(all_images_we_could_care_about) // 2]}')
print(f'Max number of id\'s per image_id: {max([v["num_ids"] for k, v in all_images_we_could_care_about.items()])}')
print(f'Min number of id\'s per image_id: {min([v["num_ids"] for k, v in all_images_we_could_care_about.items()])}')

print(f'so if we were to pull the images missing from stats, we would get an extra {total_ids_in_all_we_care_about - total_ids_in_stats} objects')

print('\n\n')
print(f'to verify, we have {sum([len(v["s3_urls_need_pulled"]) for k, v in all_images_we_could_care_about.items()])} images to pull')
pull_me = []
for k, v in all_images_we_could_care_about.items():
    for s3_url in v['s3_urls_need_pulled']:
        pull_me.append(s3_url)

number of images considered 3355
Average number of id's per image_id: 1.1639344262295082
Median number of id's per image_id: 1
Max number of id's per image_id: 6
Min number of id's per image_id: 1
If we pull all images associated with the images we care about, we get 9087 ids
Here are the statitics for all we care about
Average number of id's per image_id: 2.70849478390462
Median number of id's per image_id: 2
Max number of id's per image_id: 7
Min number of id's per image_id: 1
so if we were to pull the images missing from stats, we would get an extra 5182 objects



to verify, we have 5182 images to pull


In [56]:
s3 = boto3.client("s3")
original_imgs_dir = '/home/jtyo/Repos/PersonalRepos/deep-person-reid/reid-data/performance/mil_images'
errors = 0
for s3_url in tqdm(pull_me):
    try:
        key = s3_url.split('/')[-1]
        if os.path.exists(os.path.join(original_imgs_dir, key)):
            continue
        response = s3.get_object(Bucket='labeling-detected-objects-bucket', Key=key)['Body']
        with open(os.path.join(original_imgs_dir, key), 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
    except Exception as e:
        errors += 1


  0%|          | 0/5182 [00:00<?, ?it/s]