# Make the noteboook auto reload

In [2]:
%load_ext autoreload
%autoreload 2

# DEPICT COCO Preprocessing Instructions:
- Run each cell sequentially in order to produce the `DEPICT_coco.csv`

- Inputs: None! All downloads will be handeled by the notebook internally.  
- Outputs: `DEPICT_coco.csv` (the base dataframe used to replicate DEPICT results)

# Download Necessary Coco Files (uncomment upon first run of cell!)
- This cell will create two new folders: images/ and annotations/
- `images/` will hold the downloaded coco images
- `annocations/` will hold the concept labels from coco

In [4]:
# !chmod +x get_coco.sh
# !./get_coco.sh

--2024-08-20 13:14:57--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.97.4, 54.231.162.217, 54.231.132.65, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.97.4|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2024-08-20 13:16:02 (3.71 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]

Archive:  annotations_trainval2017.zip
  inflating: annotations/instances_train2017.json  
  inflating: annotations/instances_val2017.json  
  inflating: annotations/captions_train2017.json  
  inflating: annotations/captions_val2017.json  
  inflating: annotations/person_keypoints_train2017.json  
  inflating: annotations/person_keypoints_val2017.json  


# Imports

In [3]:
import os 
import pandas as pd
import json
import numpy
import random
import numpy as np
import torch
from tqdm import tqdm
import yaml
random.seed(88)
np.random.seed(88)

- The cell below will extract information from the coco_train2017.json and coco_val2017.json files into more easily usable datastructures 

In [3]:
# Builds 'cat_dict': key category_id | val: name
#category_id corresponds to a concept's object id in the origional coco dataset
train_file = open(os.path.join('annotations','instances_train2017.json'))
train_data = json.load(train_file)

val_file = open(os.path.join('annotations','instances_val2017.json'))
val_data = json.load(val_file)

annotations_allInfo = train_data['annotations'] + val_data['annotations']
categories = train_data['categories']

# 'images' (list) will store filename, id, and split for each image used later
images =[]

for img in train_data['images']:
    images.append({'file_name': img['file_name'],
                   'id': img['id'],
                  'split':'train'})

for img in val_data['images']:
    images.append({'file_name': img['file_name'],
                   'id': img['id'],
                  'split':'val'})

#cat_dict: key category_id | val: name
cat_dict = {}
for obj in tqdm(categories):
    if obj['id'] not in cat_dict:
        cat_dict[obj['id']] = obj['name']

#Builds dictionary mapping image_id --> concepts in image
#annotations: key: image_id | val:[category_id,...,category_id]
annotations = {}
for ann in tqdm(annotations_allInfo):
    image_id = ann['image_id']
    category_id = ann['category_id']
    if image_id not in annotations:
        annotations[image_id] = [category_id]
    else:
        annotations[image_id].append(category_id)

100%|██████████| 80/80 [00:00<00:00, 543831.96it/s]
100%|██████████| 896782/896782 [00:00<00:00, 1534458.98it/s]


- Build DataFrame of Concepts

In [4]:
concept_list = list(cat_dict.values())
feature_to_init_concept_idx = {name:i for i,name in enumerate(cat_dict.values())}
toDf = []

# Loop through each image and build its row in the dataframe including the image_id, filename, split, and concept counts
for i, img in enumerate(tqdm(images)):
    id = img['id']
    file_name = os.getcwd()+ f"/images/{img['split']}2017/" + img['file_name']
    if id in annotations:
        category_ids = annotations[id]
        unique_ids, counts = np.unique(category_ids, return_counts=True)
        unique_names = [cat_dict[cat_id] for cat_id in unique_ids]
        init_concepts = np.zeros(80,dtype=np.int8).tolist()
        caption = ""
        for count, name in zip(counts, unique_names):
            caption += f"{count} {name}, "
            init_concepts[feature_to_init_concept_idx[name]] = count
            
        #remove unneeded final comma and space from caption
        caption = caption[:-2]
        #Default all coco_train2017 images to train, will set test split in cells below
        toDf.append([id, file_name, caption, *init_concepts, img['split']])

  0%|          | 0/123287 [00:00<?, ?it/s]

100%|██████████| 123287/123287 [00:03<00:00, 31193.00it/s]


Verify Created Dataframe has correct number of rows

In [5]:
df = pd.DataFrame(toDf,columns=["image_id","file_name","text",*concept_list,"split"])
print("Note: The df we are building doesn't yet have the scene label columns present in df_final.") 
print(f"Current number of rows:",len(df))
assert(122218 == len(df))
print("Yay! The df we are building has the correct number of rows!")

Note: The df we are building doesn't yet have the scene label columns present in df_final.
Current number of rows: 122218
Yay! The df we are building has the correct number of rows!


# Verify current train split size (soon to be split into train/test). Expect 117,266 images currently in train split.

In [None]:
cur_train_split_size = len(df.loc[df['split']=='train'])
print(cur_train_split_size)

# Generate Final Train/Test Split: currently train/test images are all marked as "train" in their split column. Among these images, we will randomly select 10k to be in the test set with a seed set. 

In [None]:
# Set Seed
random.seed(88)
# Sample 10k index id's among all image_id's (117,266 image_id's in total)
# +1 as right bound is exclusive
test_idxs = random.sample(range(0,cur_train_split_size+1),10000)
# Label corresponding idxs split as test
df.loc[test_idxs,'split'] = 'test'
# Get split counts
counts = df['split'].value_counts()

#load in official coco train/val/test image ids for comparison
df_DEPICT_train_ids = pd.read_csv("DEPICT_splits/DEPICT_coco_train_img_ids.csv")
df_DEPICT_val_ids = pd.read_csv("DEPICT_splits/DEPICT_coco_val_img_ids.csv")
df_DEPICT_test_ids = pd.read_csv("DEPICT_splits/DEPICT_coco_test_img_ids.csv")

#Verify the train/val/test image ids generated by notebook match official DEPICT splits
assert(set((df.loc[df['split']=='train'].image_id)) == set(df_DEPICT_train_ids.id))
assert(set((df.loc[df['split']=='val'].image_id)) == set(df_DEPICT_val_ids.id))
assert(set((df.loc[df['split']=='test'].image_id)) == set(df_DEPICT_test_ids.id))
print("Yay! The train/val/test image id splits match the official DEPICT splits.")
print(counts)

Yay! The train/val/test image id splits match the official DEPICT splits.
split
train    107266
test      10000
val        4952
Name: count, dtype: int64


# Save Coco Concepts DataFrame Intermediate for Scene Classifier

In [8]:
df.to_csv("DEPICT_coco_concepts.csv",index=False)

# Set Device as apporiate for your machine

In [9]:
device = ('cuda' if torch.cuda.is_available()  else 'cpu')

# Download Scene Ids from MIT Places365 & Format Scene Id DataFrame

In [10]:
!wget -c https://raw.githubusercontent.com/CSAILVision/places365/master/categories_places365.txt

df_sceneIdxs = pd.read_csv("categories_places365.txt",header=None,)
df_sceneIdxs.columns = ["category"]
df_sceneIdxs["category"] = df_sceneIdxs["category"].apply(lambda x:x.split()[0])
df_sceneIdxs.to_csv("sceneIdxs.csv",index=False)
!rm -rf categories_places365.txt

--2024-08-20 15:30:34--  https://raw.githubusercontent.com/CSAILVision/places365/master/categories_places365.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6833 (6.7K) [text/plain]
Saving to: ‘categories_places365.txt’


2024-08-20 15:30:34 (55.8 MB/s) - ‘categories_places365.txt’ saved [6833/6833]



# Generate Scene Labels for Coco Images Using MIT Places365 ResNet

In [None]:
import torchvision.models as models
from torch.nn import functional as F
import os
from PIL import Image
import pandas as pd
import numpy as np
import dataset
import sys


parent_output = "scene_info"
os.makedirs(parent_output,exist_ok=True)
# ############################################################
# code used from: https://github.com/CSAILVision/places365/blob/master/run_placesCNN_basic.py
# boxed code is from that page
############################################################
# th architecture to use (they have lots of other ones to try on that link above??)
arch = 'resnet18'

#load the pre-trained weights
model_file = '%s_places365.pth.tar' % arch
if not os.access(model_file, os.W_OK):
    weight_url = 'http://places2.csail.mit.edu/models_places365/' + model_file
    os.system('wget ' + weight_url)


model = models.__dict__[arch](num_classes=365)
checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
state_dict = {str.replace(k,'module.',''): v for k,v in checkpoint['state_dict'].items()}
model.load_state_dict(state_dict)
model.eval()
model.to(device)
################################################################################

names = []
for i in range(1,366):
    names.append(str(i))

pred_out = []
certainty_out = []

#index corresponds with model output index
#Ex) scene_idxs[0] = airfield
scene_idxs = pd.read_csv('sceneIdxs.csv')['category'].to_numpy()

print("generating labels")
# loader uses the image transformations that used by MIT Places model
loader = dataset.get_loader(128)
with torch.no_grad():
    softmax = torch.nn.Softmax(dim=0)
    j = 1
    for ids, imgs in tqdm(loader):
        print(f"starting itteration: {j}")
        j +=1
        imgs = imgs.to(device)
        preds = model(imgs)

        #idxs sorted high --> low predictions
        preds = preds.cpu().numpy()
        sorted_idxs = np.argsort(preds, axis=1)
        sorted_idxs = np.fliplr(sorted_idxs)
        for i,single_sort in enumerate(sorted_idxs):
            scene_rankings = scene_idxs[single_sort]
            pred_out.append([ids[i].item(),*scene_rankings])
            row_preds = softmax(torch.tensor(preds[i][single_sort]))
            certainty_out.append([ids[i].item(),*(row_preds.tolist())])

pred_out_df = pd.DataFrame(data=pred_out,columns=['image_id',*names])
certainty_out_df = pd.DataFrame(data=certainty_out,columns=['image_id',*names])

# Save MIT Places CNN Scene predictions
pred_out_df.to_csv(f'{parent_output}/id_to_scene.csv',index=False)

# Save Prediction certainty for each image's predicted scene
certainty_out_df.to_csv(f'{parent_output}/id_to_pred.csv', index=False)

print("saved to: id_to_scene.csv")
print("done")

# Download the MIT Places Scene Hierarchy information as csv
- View the official MIT Places365 Scene Hierarchy Spreadsheet [HERE](https://docs.google.com/spreadsheets/d/1H7ADoEIGgbF_eXh9kcJjCs5j_r3VJwke4nebhkdzksg/edit?gid=142478777#gid=142478777)
- For DEPICT, we are interested in the Level 2 Indoor Scenes Consisting of: 
    1) shopping and dining
    2) workplace (office building, factory, lab, etc.)
    3) home or hotel
    4) transportation (vehicle interiors, stations, etc.)
    5) sports and leisure
    6) cultural (art, education, religion, millitary, law, politics, etc.)


In [11]:
if not os.path.isfile("sceneHierarchy.csv"):
    !wget --output-document=sceneHierarchy.csv "https://docs.google.com/spreadsheets/d/1H7ADoEIGgbF_eXh9kcJjCs5j_r3VJwke4nebhkdzksg/export?format=csv&gid=142478777"
else:
    print("already have sceneHierarchy.csv")

already have sceneHierarchy.csv


# The cell below:
1) formats the sceneHierarch.csv
2) reads in the scene predictions generated by MIT Places365 Resnet
3) joins scene predictions with corresponding scene hierarchy for prediction

In [14]:
mother_df = pd.read_csv('sceneHierarchy.csv')
mother_df = mother_df.rename({'Unnamed: 0':"scene"},axis=1)
mother_df.columns = mother_df.iloc[0]
mother_df.drop(0,axis=0,inplace=True)
mother_df['category'] = mother_df['category'].apply(lambda x:x.split("'")[1])
mother_df.set_index('category',inplace=True)

# NOTE: 'parent_output' is set in the previous cell
id_to_sc_df = pd.read_csv(f'{parent_output}/id_to_scene.csv')
id_to_sc_df = id_to_sc_df[['image_id','1']]
id_to_sc_df = id_to_sc_df.rename({'1':'category'},axis=1)
id_to_sc_df.set_index('category',inplace=True)

id_to_sceneInfo_df = id_to_sc_df.join(mother_df)
id_to_sceneInfo_df.reset_index(inplace=True)
id_to_sceneInfo_df['image_id'] = id_to_sceneInfo_df['image_id'].astype(int)
id_to_sceneInfo_df = id_to_sceneInfo_df[['image_id', 'category'] + id_to_sceneInfo_df.columns[2:].tolist()]
#Set binary columns to int
for col in id_to_sceneInfo_df.columns[2:]:
    id_to_sceneInfo_df[col] = id_to_sceneInfo_df[col].astype(int)
id_to_sceneInfo_df.to_csv(f'{parent_output}/id_to_sceneInfo.csv',index=False)

# Merge scene and coco concept info a DataFrame and save final DataFrame as `coco_DEPICT.csv`

In [15]:
#Set indexes for join
df_img_id = df.set_index("image_id")
id_to_sceneInfo_df_img_id = id_to_sceneInfo_df.set_index("image_id")

#Join the scene info with coco concept info
df_merged = df_img_id.join(id_to_sceneInfo_df_img_id)
df_merged.reset_index(inplace=True)
df_merged.to_csv("DEPICT_coco.csv",index=False)
print('df_merged shape:', df_merged.shape)
df_merged.head()

df_merged shape: (122218, 104)


Unnamed: 0,image_id,file_name,text,person,bicycle,car,motorcycle,airplane,bus,train,...,"water, ice, snow","mountains, hills, desert, sky","forest, field, jungle",man-made elements,"transportation (roads, parking, bridges, boats, airports, etc.)","cultural or historical building/place (millitary, religious)","sports fields, parks, leisure spaces",industrial and construction,"houses, cabins, gardens, and farms","commercial buildings, shops, markets, cities, and towns"
0,391895,/data2/diffusion/notebook/coco_preprocessing/i...,"2 person, 1 bicycle, 1 motorcycle",2,1,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,522418,/data2/diffusion/notebook/coco_preprocessing/i...,"1 person, 1 knife, 1 cake, 1 sink",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,184613,/data2/diffusion/notebook/coco_preprocessing/i...,"14 person, 9 cow, 1 umbrella",14,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,1,0
3,318219,/data2/diffusion/notebook/coco_preprocessing/i...,"2 person, 3 tv, 4 mouse, 2 keyboard",2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,554625,/data2/diffusion/notebook/coco_preprocessing/i...,"5 person, 5 tv, 5 mouse, 4 keyboard",5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# You've now generated the base dataframe needed to replicate results from DEPICT!

In [4]:
df_og = pd.read_csv("DEPICT_coco.csv")

In [6]:
#Load config and setup environment 
def load_config(config_path):
    with open(config_path, 'r') as file:
        config = yaml.safe_load(file)
    return config
path_to_config = 'config.yaml'
# paths variable holds all paths from the config
paths = load_config(path_to_config)
#Dictionary: key: idx | value: concept name
# idx corresponds here to the index after only selecting columns[3:83] of a given row
idx_to_name = {i:name for i,name in enumerate(df_og.columns[3:83])}

# Create 25 permuted captions for each of the 15 coco concepts considered in DEPICT

In [7]:
def generate_permuted_captions(df_in,num_permutations):
    df = df_in.copy()
    # loop thorugh each concept
    for concept in tqdm(paths['coco_concepts']):
        #permute each concept 25 times
        for n_perm in tqdm(range(num_permutations)):
            temp_df = df.copy()
            #permute concept column
            temp_df[concept] = np.random.permutation(temp_df[concept])

            #list to store each new caption
            new_captions = []
            for i in range(len(temp_df)):
                #get concepts an image has
                non_zero_concept_idxs = np.nonzero(temp_df.loc[i][3:83])[0]
                caption = ""
                #for all present concepts append to the current image's caption
                for idx in non_zero_concept_idxs:
                    name = idx_to_name[idx]
                    count = temp_df.loc[i,name]
                    caption += str(count) + f" {name}, "
                #remove final comma and space
                caption = caption[:-2]
                new_captions.append(caption)
            #define new column name and add new column to df
            new_col_name = f"permute_{concept}_{n_perm}"
            df[new_col_name] = new_captions
    return df

In [None]:
df_with_permuted = generate_permuted_captions(df_og,25)