This notebook has two main uses:
- Reduce the number of train and test images **before downloading CLOC** (39 million images might be too many, so random sampling is used to keep only a small portion). This outputs new metadata files for the reduced train and test sets.
- Clean up the metadata for the CLOC dataset **after downloading CLOC**. The CLOC dataset provides a huge list of image URLs hosted on Flickr servers. However, some of these URLs may no longer be valid. This notebook removes the entries for these invalid URLs from the metadata of the training and test sets.

In [None]:
import pandas as pd
import torch
import os
from tqdm import tqdm
import random

## Get a small portion of test and training images

In [None]:
# Sizes of the reduced sets: number of lines kept from the test CSV and number
# of entries kept from the training metadata.
NUM_TEST_IMAGES = 10000 
NUM_TRAIN_IMAGES = 500000

# Folder the full metadata is read from, and folder the reduced metadata is
# written to. Both are used as plain string prefixes (path + file name), so
# they must end with a path separator.
original_metadata_path = "/mnt/hdd/continuallearning/CLOC/data_preparation/release/"
new_metadata_path = "/mnt/hdd/cloc/CLOC/data_preparation/release/small/"
# File names that are appended to the two folders above.
test_file = "yfcc100m_metadata_with_labels_usedDataRatio0.05_t110000_t250_valid_files_2004To2014_compact_val.csv"
download_link_and_locations = "download_link_and_locations.csv"

In [None]:
def generate_test_file():
    """Write a random subset of the test-set CSV to the reduced metadata folder.

    Reads every line of ``original_metadata_path + test_file``, draws
    ``NUM_TEST_IMAGES`` of them without replacement and writes them to
    ``new_metadata_path + test_file`` (the folder is created if missing).

    Raises:
        AssertionError: if the source file has fewer than
            ``NUM_TEST_IMAGES`` lines.
    """
    with open(original_metadata_path + test_file, "r") as file:
        lines = file.readlines()
    assert NUM_TEST_IMAGES <= len(lines), "NUM_TEST_IMAGES is more than number of lines in test_file"

    # A file whose last line has no trailing newline would otherwise get that
    # line glued to the following one whenever sampling moves it away from
    # the end of the output.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"

    # NOTE(review): every line is treated as a data row. If the CSV starts
    # with a header row, it is sampled like any other line (and may be dropped
    # or moved), while the test-set cleanup cell reads this file with
    # pd.read_csv's default header handling -- confirm which applies.
    selected_lines = random.sample(lines, NUM_TEST_IMAGES)

    # The reduced-metadata folder may not exist yet on a fresh setup.
    output_file = new_metadata_path + test_file
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "w") as file:
        file.writelines(selected_lines)

    print(f"{NUM_TEST_IMAGES} random lines have been written to small/{test_file}")

def generate_train_files():
    """Sample ``NUM_TRAIN_IMAGES`` training entries and save the reduced metadata.

    Loads the five ``train_*.torchSave`` files from ``original_metadata_path``,
    picks one shared set of random positions, and saves the selected entries
    of every file (as Python lists) under the same file names in
    ``new_metadata_path`` (the folder is created if missing).

    Raises:
        AssertionError: if ``train_labels.torchSave`` holds fewer than
            ``NUM_TRAIN_IMAGES`` entries.
    """
    file_names = (
        'train_labels.torchSave',
        'train_time.torchSave',
        'train_user.torchSave',
        'train_userID.torchSave',
        'train_store_loc.torchSave',
    )

    # Create the destination first so a missing folder is reported before the
    # expensive load/sample work rather than at the very end.
    if new_metadata_path:
        os.makedirs(new_metadata_path, exist_ok=True)

    print("Loading train data files...")
    loaded = {name: torch.load(original_metadata_path + name) for name in file_names}
    print(f"Sampling {NUM_TRAIN_IMAGES} images from the train data files...")

    # The labels file defines how many entries exist; the same positions are
    # then used for every file so the entries stay aligned with each other.
    num_train_images = len(loaded['train_labels.torchSave'])
    assert NUM_TRAIN_IMAGES <= num_train_images, "NUM_TRAIN_IMAGES is more than number of images in train set"
    # NOTE(review): random.sample returns the positions in random order, so
    # the saved subset does not keep the ordering of the source files. If
    # downstream code relies on that ordering, sort the indices -- confirm.
    random_indices = random.sample(range(num_train_images), NUM_TRAIN_IMAGES)

    subsets = {
        name: [values[i] for i in random_indices]
        for name, values in loaded.items()
    }

    print("Saving to new train data files...")

    # Save the selected data under the same file names in the new folder
    for name, subset in subsets.items():
        torch.save(subset, new_metadata_path + name)

def generate_download_link_and_locations():
    """Write a download list restricted to the reduced train and test sets.

    Builds the set of image locations referenced by the reduced metadata in
    ``new_metadata_path`` (last comma-separated field of each test-CSV line,
    plus every ``train_store_loc`` entry), then copies to
    ``new_metadata_path + download_link_and_locations`` only those lines of
    the original download list whose location is in that set.

    Only the labels and store-location files are loaded; the other train
    files are not needed to build the list.

    Raises:
        AssertionError: if the reduced train set does not hold exactly
            ``NUM_TRAIN_IMAGES`` entries, or if a download-list line does not
            split into exactly two comma-separated fields.
    """
    print("Loading train data files...")
    labels = torch.load(new_metadata_path + 'train_labels.torchSave')
    store_loc = torch.load(new_metadata_path + 'train_store_loc.torchSave')
    assert len(labels) == NUM_TRAIN_IMAGES

    # strip() already removes the line terminator; slicing off the last
    # character first would truncate the location on a final line that has
    # no trailing newline.
    with open(new_metadata_path + test_file, "r") as file:
        valid = {line.split(",")[-1].strip() for line in file}
    valid.update(store_loc[i].strip() for i in range(len(labels)))

    print("Saving to new download csv file...")
    # The second field is assumed to start with this prefix; it is removed by
    # length, not by matching -- TODO confirm against the download list format.
    prefix = "images/"
    total_lines = 0
    selected_lines = []
    # Stream the (potentially very large) download list instead of holding
    # all of its lines in memory at once.
    with open(original_metadata_path + download_link_and_locations, "r") as file:
        for line in file:
            total_lines += 1
            fields = line.split(",")
            assert len(fields) == 2
            if fields[1][len(prefix):].strip() in valid:
                selected_lines.append(line)
    print(total_lines)
    print(len(selected_lines))
    # Write the selected lines to the new CSV file
    with open(new_metadata_path + download_link_and_locations, "w") as output_file:
        output_file.writelines(selected_lines)

In [None]:
# Sample NUM_TEST_IMAGES lines of the test CSV into new_metadata_path.
generate_test_file()

In [None]:
# Requires roughly 10 GB of free RAM.
# Samples NUM_TRAIN_IMAGES training entries into new_metadata_path.
generate_train_files()

In [None]:
# Requires roughly 8 GB of free RAM.
# Run after the two cells above: this reads the reduced train and test
# metadata that they write to new_metadata_path.
generate_download_link_and_locations()

In [None]:
# Folder the downloaded images live in; stored image locations are appended
# to it as-is, so it must end with a path separator.
root = "/mnt/hdd/cloc/CLOC/data_preparation/download_images/dataset/images/"
# Folder whose train/test metadata files are cleaned and overwritten in place.
metadata_path = "/mnt/hdd/cloc/CLOC/data_preparation/release/small/"

In [None]:
# Stop early if either path was left at its template placeholder value.
assert root != "/path/to/CLOC/release/dataset/images/", "Please provide a valid path"
assert metadata_path != "/path/to/CLOC/metadata/", "Please provide a valid path"

# Training set

In [None]:
# Load the five training metadata files; the cells below index all of them
# with the same positions.
labels, time_taken, user, userID, store_loc = (
    torch.load(metadata_path + file_name)
    for file_name in (
        'train_labels.torchSave',
        'train_time.torchSave',
        'train_user.torchSave',
        'train_userID.torchSave',
        'train_store_loc.torchSave',
    )
)

In [None]:
# Number of training entries before filtering.
len(labels)

In [None]:
# Collect the positions of the training entries whose image file is present
# under root; entries without a file on disk are left out.
index_list = [
    idx
    for idx in tqdm(range(len(labels)))
    if os.path.isfile(root + store_loc[idx].strip())
]

In [None]:
# Keep, for every metadata list, only the positions recorded in index_list.
labels_clean, time_taken_clean, user_clean, userID_clean, store_loc_clean = (
    [column[i] for i in index_list]
    for column in (labels, time_taken, user, userID, store_loc)
)

In [None]:
# Number of training entries that remain after filtering.
len(labels_clean)

In [None]:
# Guard against overwriting the metadata with an empty set, which is what
# happens when no image file is found under root.
assert len(labels_clean) > 0, "Something went wrong, ensure that the root path is valid."

In [None]:
# This overwrites the metadata files in metadata_path with the filtered
# lists, so run any sanity checks you want before executing this cell.
for file_name, cleaned in (
    ('train_labels.torchSave', labels_clean),
    ('train_time.torchSave', time_taken_clean),
    ('train_user.torchSave', user_clean),
    ('train_userID.torchSave', userID_clean),
    ('train_store_loc.torchSave', store_loc_clean),
):
    torch.save(cleaned, metadata_path + file_name)

# Test set

In [None]:
# Name of the test-set CSV inside metadata_path.
test_set_file = "yfcc100m_metadata_with_labels_usedDataRatio0.05_t110000_t250_valid_files_2004To2014_compact_val.csv"
# NOTE(review): read_csv uses its default header handling here, so the first
# line of the file becomes the column names. The sampling step earlier in
# this notebook treats every line of this file as a data row -- confirm
# whether the CSV really starts with a header row.
df = pd.read_csv(metadata_path + test_set_file)

In [None]:
# Number of test rows before filtering.
len(df)

In [None]:
# Preview the first rows; the cells below read the image location from the
# column at position 4.
df.head()

In [None]:
# Collect the row positions of the test entries whose image file is present
# under root; the image location is read from the column at position 4.
image_column = df.iloc[:, 4]
index_list = [
    row
    for row in tqdm(range(len(image_column)))
    if os.path.isfile(root + image_column.iloc[row].strip())
]

In [None]:
# Keep only the rows whose image file was found on disk.
df_clean = df.iloc[index_list,:]

In [None]:
# Number of test rows that remain after filtering.
len(index_list)

In [None]:
# Guard against overwriting the test CSV with an empty table, which is what
# happens when no image file is found under root.
assert len(index_list) > 0, "Something went wrong, ensure that the root path is valid."

In [None]:
# Overwrite the test CSV in metadata_path with the filtered rows; the
# DataFrame index is not written.
df_clean.to_csv(metadata_path + test_set_file, index=False) 