In [None]:
# need to get all  images on perf out of s3, save them, then develop a mapping between the image name and the perf data
import boto3
import os
import pandas as pd
import shutil 
from tqdm.notebook import tqdm
import hashlib

In [4]:
# load in the object_id_to_image_id.csv file
full_mapping = pd.read_csv('/home/jtyo/Repos/PersonalRepos/deep-person-reid/reid-data/performance/object_id_to_image_id.csv')
# now for each image id, get the image from s3

In [5]:
full_mapping

Unnamed: 0,id,image_id,s3_path
0,1,4,https://labeling-detected-objects-bucket.s3.am...
1,2,3,https://labeling-detected-objects-bucket.s3.am...
2,3,2,https://labeling-detected-objects-bucket.s3.am...
3,4,4,https://labeling-detected-objects-bucket.s3.am...
4,5,2,https://labeling-detected-objects-bucket.s3.am...
...,...,...,...
250414,282507,152227,
250415,282508,152228,
250416,282509,152227,
250417,282510,152230,


In [None]:
s3 = boto3.client("s3")
original_imgs_dir = '/home/jtyo/tmp/mudd_originals'

for s3_url in tqdm(full_mapping['s3_path']):
    key = s3_url.split('/')[-1]
    response = s3.get_object(Bucket='labeling-detected-objects-bucket', Key=key)['Body']
    with open(os.path.join(original_imgs_dir, key), 'wb') as out_file:
            shutil.copyfileobj(response, out_file)    

  0%|          | 0/250419 [00:00<?, ?it/s]

In [ ]:
mudd_processed_base_path = '/home/jtyo/Repos/PersonalRepos/deep-person-reid/reid-data/performance'
final_folders = [
    'bounding_box_test', 
    'bounding_box_train',
    'gallery_all',
    'gallery_mud',
    'gallery_no_mud',
    'query',
    'query_all',
    'query_mud',
    'query_no_mud'
]

In [ ]:
def get_image_checksum(img_path):
    with open(img_path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()

In [ ]:
# Now to make things more efficient and quick, lets get a checksum of all images, store in a dict, and then we can easily check if a mudd image exists in the original set
# for every image in original_imgs_dir, compute the checksum and save in memory
original_imgs_checksums = {}
for img in tqdm(os.listdir(original_imgs_dir)):
    original_imgs_checksums[get_image_checksum(os.path.join(original_imgs_dir, img))] = img

In [ ]:
# now for each image in the mudd_processed_base_path, check if it exists in the original_imgs_checksums
# then we will save a json file that maps the mudd_processed image name to the original image name
mudd_processed_to_original = {}
missed_images = {}
for folder in tqdm(final_folders):
    for img in os.listdir(os.path.join(mudd_processed_base_path, folder)):
        img_checksums = get_image_checksum(os.path.join(mudd_processed_base_path, folder, img))
        if img_checksums in original_imgs_checksums:
            mudd_processed_to_original[os.path.join(folder, img)] = original_imgs_checksums[img_checksums]
        else:
            print(f'Could not find {img} in original images')
            missed_images[img] = img_checksums

print(f'Found {len(mudd_processed_to_original)} images in the original set')
print(f'Missed {len(missed_images)} images in the original set')