In [2]:
import json
import cv2
import urllib.request
import os
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
import requests

In [2]:
def read_dataset(filename="updated_dataset.json"):
    """Load the dataset stored as JSON in ``filename`` and return it.

    The file is opened in binary mode so that ``json`` itself detects the
    encoding (UTF-8, UTF-16 or UTF-32, with or without a BOM) instead of
    relying on the platform's default text encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, "rb") as f:
        return json.load(f)


def save(dataset, filename="updated_dataset.json"):
    """Write ``dataset`` to ``filename`` as JSON indented by 4 spaces.

    The dataset is serialized before the file is opened, so a dataset that
    cannot be serialized raises without truncating an existing file.

    Args:
        dataset: JSON-serializable object to store.
        filename: Path of the file to (over)write.

    Raises:
        TypeError: If ``dataset`` contains a non-serializable object.
        OSError: If the file cannot be written.
    """
    # Serialize first: opening with "w" truncates the file immediately
    serialized = json.dumps(dataset, indent=4)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)

In [6]:
def download_picture_from_url(url, path):
    """Download the resource at ``url`` and store it at ``path``.

    The parent directory of ``path`` is created when missing.

    Args:
        url: Address of the picture to download.
        path: Destination file path.

    Raises:
        urllib.error.URLError: If the download fails.
        OSError: If the directory or the file cannot be created.
    """
    directory = os.path.dirname(path)
    # A bare filename has no directory part, and makedirs("") would raise
    if directory:
        # exist_ok avoids a FileExistsError when several threads create
        # the same directory at the same time
        os.makedirs(directory, exist_ok=True)

    urllib.request.urlretrieve(url, path)


def download_thumbnail_from_video_url(url, path, n_frames=10):
    """Save an early frame of the video at ``url`` as an image at ``path``.

    Up to ``n_frames`` frames are read from the start of the video and the
    last one that was read successfully is written to ``path``. When the
    video has at least ``n_frames`` readable frames this is frame number
    ``n_frames``; shorter videos fall back to their last readable frame.
    Nothing is written when no frame could be read at all.

    Args:
        url: Address (or local path) of the video.
        path: Destination image path; its parent directory is created
            when missing.
        n_frames: Number of frames to read before picking the thumbnail.
    """
    directory = os.path.dirname(path)
    # A bare filename has no directory part, and makedirs("") would raise
    if directory:
        # exist_ok avoids a FileExistsError when several threads create
        # the same directory at the same time
        os.makedirs(directory, exist_ok=True)

    video = cv2.VideoCapture(url)
    image = None
    try:
        for _ in range(n_frames):
            success, frame = video.read()
            if success:
                # remember the most recent good frame
                image = frame
    finally:
        # always free the capture handle, even if reading raised
        video.release()

    if image is not None:
        cv2.imwrite(path, image)


In [3]:
# Google Drive download link of the dataset file
url = "https://drive.google.com/u/1/uc?id=1Zq7nzWhzoJkfQQ2GnbQ_rw5djVfzecAZ&export=download"
# The timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving the error body as the dataset
response.raise_for_status()
# Refuse to overwrite the dataset with something that is not JSON
# (for example an HTML page returned in place of the file)
json.loads(response.content)

with open("updated_dataset.json", "wb") as f:
    f.write(response.content)

In [7]:
# Load the dataset from the JSON file in the working directory
dataset = read_dataset(filename="updated_dataset.json")

In [8]:
# Number of accounts processed in parallel; adjust to taste
n_threads = 6
# Account-level pool: runs one download_pictures task per account
executor = ThreadPoolExecutor(max_workers=n_threads)

# Account-level progress bar, advanced once per processed account
pbar_outer = tqdm(desc="Iterating over accounts", total=len(dataset))

def download_pictures(username, details):
    """Download the media of every post of one account.

    Each post in ``details['posts']`` that has a ``media_url`` is fetched on
    a worker thread and stored as ``images/<username>/<post id>.jpg``. Posts
    whose ``media_type`` is ``'VIDEO'`` are stored as a thumbnail frame; any
    other (or missing) media type is downloaded as a picture. Posts without
    a ``media_url`` are skipped.

    A failed download does not stop the others: failures are collected and
    summarized with a printed message once the account is done. The inner
    progress bar advances once per post (skipped, failed or downloaded) and
    the module-level ``pbar_outer`` advances once per account.

    Args:
        username: Account name, used for the target directory and the
            progress bar label.
        details: Mapping with a ``'posts'`` entry that maps post ids to
            post dictionaries.
    """
    posts = details['posts']
    pbar_inner = tqdm(total=len(posts), desc=f"Downloading photos of {username}' posts", leave=True)
    # (url, exception) pairs; list.append is safe to call from worker threads
    failures = []

    def job(media_type, url, path):
        try:
            if media_type == 'VIDEO':
                download_thumbnail_from_video_url(url, path)
            else:
                download_picture_from_url(url, path)
        except Exception as error:
            # keep the error: an exception raised inside a future that
            # nobody inspects would otherwise vanish silently
            failures.append((url, error))
        finally:
            pbar_inner.update(1)

    try:
        # leaving the with-block waits for every submitted job
        with ThreadPoolExecutor() as pool:
            for post_id, post in posts.items():
                media_url = post.get('media_url', None)
                if media_url is None:
                    # nothing to download; still counts as a processed post
                    pbar_inner.update(1)
                    continue

                path = f'images/{username}/{post_id}.jpg'
                pool.submit(job, post.get('media_type'), media_url, path)
    finally:
        # finish progress bar
        pbar_inner.close()
        # one more account processed
        pbar_outer.update(1)

    if failures:
        first_url, first_error = failures[0]
        print(
            f"{username}: {len(failures)} of {len(posts)} downloads failed, "
            f"e.g. {first_url}: {first_error!r}"
        )

# Submit one download_pictures task per account and keep the futures,
# so that a task that crashed can be reported afterwards
futures = {
    executor.submit(download_pictures, username, details): username
    for username, details in dataset.items()
}

# Wait for all tasks to finish
executor.shutdown()

# Close the outer progress bar
pbar_outer.close()

# An exception raised inside a task is stored on its future; without this
# check a failed account would go unnoticed
for future, username in futures.items():
    error = future.exception()
    if error is not None:
        print(f"Failed to process {username}: {error!r}")


Iterating over accounts:   0%|          | 0/264 [00:00<?, ?it/s]

Downloading photos of muradosmann' posts:   0%|          | 0/100 [00:00<?, ?it/s]

Downloading photos of yamashitaphoto' posts:   0%|          | 0/100 [00:00<?, ?it/s]

Downloading photos of jimmychin' posts:   0%|          | 0/100 [00:00<?, ?it/s]

Downloading photos of benlowy' posts:   0%|          | 0/100 [00:00<?, ?it/s]

Downloading photos of asasjostromphotography' posts:   0%|          | 0/100 [00:00<?, ?it/s]

Downloading photos of petesouza' posts:   0%|          | 0/100 [00:00<?, ?it/s]