In [None]:
# Mount Google Drive at /content/drive; every dataset path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import json
import PIL
from io import BytesIO
from PIL import Image
import os
import random
import numpy as np
import matplotlib.pyplot as plt

# pycurl

In [None]:
try:
  import pycurl
except ImportError:
  %pip install pycurl
  import pycurl
import certifi
try:
  import datasets
except ImportError:
  %pip install datasets
  import datasets
from datasets import load_dataset

# **DALL-E 2 Image Gallery Scraping**

In [None]:
def get_image_ids_in_page(page: int) -> list[str]:
  """Retrieves the image IDs found on the given page.

  Requests ``/api/images/aggregated?page={page}`` from dalle2.gallery, decodes
  the JSON response and flattens the ``'Ids'`` entry of every item in it.

  Args:
    page: Page number sent as the ``page`` query parameter.

  Returns:
    A flat list with the IDs of every item, in response order.

  Raises:
    pycurl.error: If the transfer fails.
    json.JSONDecodeError: If the response body is not valid JSON.
  """
  # pycurl does not allocate storage for the response body, so collect it here.
  buffer = BytesIO()
  c = pycurl.Curl()
  try:
    c.setopt(c.URL, f'https://dalle2.gallery/api/images/aggregated?page={page}')
    c.setopt(c.WRITEDATA, buffer)
    # Certificate bundle shipped with certifi.
    c.setopt(c.CAINFO, certifi.where())
    # NOTE(review): peer/host verification is switched off, so the CA bundle
    # above is effectively unused and the connection is not authenticated.
    # Kept as-is to preserve behavior; confirm whether it can be re-enabled.
    c.setopt(pycurl.SSL_VERIFYPEER, 0)
    c.setopt(pycurl.SSL_VERIFYHOST, 0)
    c.perform()
  finally:
    # Always release the handle, even when the transfer raises.
    c.close()
  payload = json.loads(buffer.getvalue().decode())
  return [image_id for item in payload for image_id in item['Ids']]

def aggregate_ids_in_pages(pages) -> list[str]:
  """Collects the image IDs of every page in ``pages`` into one flat list.

  Pages are fetched in the order given; passing them in decreasing order is
  recommended.
  """
  # Fetch every page first, then flatten the per-page lists in order.
  ids_per_page = []
  for page in pages:
    ids_per_page.append(get_image_ids_in_page(page))

  collected = []
  for page_ids in ids_per_page:
    collected.extend(page_ids)
  return collected

def get_image_from_id(image_id) -> Image.Image:
  """Gets an image from an id.

  Downloads ``/api/images/{image_id}/generated`` from dalle2.gallery and opens
  the response body with PIL.

  Args:
    image_id: ID of the image, as returned by ``get_image_ids_in_page``.

  Returns:
    The image opened from the downloaded bytes.

  Raises:
    pycurl.error: If the transfer fails.
    Exception: Whatever ``Image.open`` raises when the body is not an image.
  """
  # pycurl does not allocate storage for the response body, so collect it here.
  buffer = BytesIO()
  c = pycurl.Curl()
  try:
    c.setopt(c.URL, f'https://dalle2.gallery/api/images/{image_id}/generated')
    c.setopt(c.WRITEDATA, buffer)
    # Certificate bundle shipped with certifi.
    c.setopt(c.CAINFO, certifi.where())
    # NOTE(review): peer/host verification is switched off, so the CA bundle
    # above is effectively unused and the connection is not authenticated.
    # Kept as-is to preserve behavior; confirm whether it can be re-enabled.
    c.setopt(pycurl.SSL_VERIFYPEER, 0)
    c.setopt(pycurl.SSL_VERIFYHOST, 0)
    c.perform()
  finally:
    # Always release the handle, even when the transfer raises.
    c.close()
  return Image.open(buffer)

In [None]:
def download_dalle_images(filepath='/content/drive/My Drive/ISYE 6740/DALL-E/', num_pages=500):
  """Downloads the gallery images of pages ``num_pages - 1`` .. 0 into ``filepath``.

  The collected IDs are written comma-separated to ``DALL-E_ids.txt`` and each
  image is saved as ``dalle_{i}.png``, where ``i`` is its position in the ID list.

  Args:
    filepath: Destination folder, ending with a path separator.
    num_pages: Number of gallery pages to scan, visited in decreasing order.
  """
  agg_ids = aggregate_ids_in_pages(range(num_pages - 1, -1, -1))
  # Persist the IDs before downloading so the list survives a failed download.
  with open(f'{filepath}DALL-E_ids.txt', 'w') as f:
    f.write(','.join(agg_ids))
  for i, image_id in enumerate(agg_ids):
    get_image_from_id(image_id).save(f'{filepath}dalle_{i}.png')

In [None]:
# Sanity check: how many PNG files ended up in the DALL-E folder?
files = os.listdir('/content/drive/My Drive/ISYE 6740/DALL-E/')
png_count = sum(1 for name in files if name.endswith(".png"))
print(png_count)

In [None]:
# Size in bytes of every entry of the DALL-E folder (`files` comes from the listing cell).
sizes = [
    os.path.getsize(os.path.join('/content/drive/My Drive/ISYE 6740/DALL-E/', name))
    for name in files
]

In [None]:
# Total size of the folder contents in megabytes (bytes / 10^6).
sum(sizes)/1000000

# **Midjourney Images**

In [None]:
# Peek at one attachment URL inside an exported Midjourney JSON file.
# `filepath` and `midjourney_files` are defined here so the cell runs on a
# fresh kernel instead of relying on names left over from earlier sessions.
filepath = '/content/drive/My Drive/ISYE 6740/Midjourney/'
midjourney_files = [file for file in os.listdir(filepath) if file.endswith('json')]
with open(filepath + midjourney_files[0]) as fh:
  test = fh.read()
test_json = json.loads(test)
# Path: ['messages'][message index][0 = dict in list]['attachments'][0 = first attachment]['url'].
# 3113 is the last index of the 3114-message list noted when this cell was written.
test_url = test_json['messages'][3113][0]['attachments'][0]['url']

In [None]:
def get_image_from_url(url):
  """Downloads ``url`` and opens the response body as a PIL image.

  Args:
    url: Address of the image to fetch.

  Returns:
    The opened image, or ``None`` when the response body is empty or the
    server answered with an HTTP error status (>= 400), so callers can skip
    the URL instead of failing on a non-image error page.

  Raises:
    pycurl.error: If the transfer fails.
  """
  # pycurl does not allocate storage for the response body, so collect it here.
  buffer = BytesIO()
  c = pycurl.Curl()
  try:
    c.setopt(c.URL, url)
    # Send a browser-like User-Agent with the request.
    custom_headers = ['User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0/8mqLkJuL-86']
    c.setopt(pycurl.HTTPHEADER, custom_headers)
    c.setopt(c.WRITEDATA, buffer)
    # Certificate bundle shipped with certifi.
    c.setopt(c.CAINFO, certifi.where())
    # NOTE(review): peer/host verification is switched off, so the CA bundle
    # above is effectively unused and the connection is not authenticated.
    # Kept as-is to preserve behavior; confirm whether it can be re-enabled.
    c.setopt(pycurl.SSL_VERIFYPEER, 0)
    c.setopt(pycurl.SSL_VERIFYHOST, 0)
    c.perform()
    # Must be read before the handle is closed.
    status = c.getinfo(pycurl.RESPONSE_CODE)
  finally:
    # Always release the handle, even when the transfer raises.
    c.close()
  if status >= 400 or not buffer.getvalue():
    return None
  return Image.open(buffer)

In [None]:
def download_midjourney_images(filepath='/content/drive/My Drive/ISYE 6740/Midjourney/', max_images=10000):
  """Collects attachment URLs from the exported JSON files and downloads the images.

  Every ``*json`` file in ``filepath`` is read, the URL of the first attachment
  of each message that has attachments is collected, and the shuffled URL list
  is written comma-separated to ``Midjourney_URLs.txt``. Images are then
  fetched in that order and saved as ``midjourney_{n}.png``, where ``n`` counts
  successfully saved images; URLs that yield no image are reported and skipped.

  Args:
    filepath: Folder holding the JSON exports, ending with a path separator;
      also the destination for the URL list and the images.
    max_images: Stop after this many images have been saved.
  """
  urls = []
  midjourney_files = [file for file in os.listdir(filepath) if file.endswith('json')]
  for i, mjf in enumerate(midjourney_files):
    print(f"file {i}")
    with open(filepath + mjf) as json_fp:
      json_file = json.load(json_fp)
    for msg in json_file['messages']:
      attachments = msg[0]['attachments']
      if attachments:
        urls.append(attachments[0]['url'])

  random.shuffle(urls)
  # Persist the shuffled order so the download can be repeated from the file.
  with open(f'{filepath}Midjourney_URLs.txt', 'w') as f:
    f.write(','.join(urls))
  print(f"{len(urls)} images found")

  saved = 0
  for u in urls:
    if saved >= max_images:
      break
    image = get_image_from_url(u)
    if image is None:
      print(f"skipped {u}")
      continue

    image.save(f'{filepath}midjourney_{saved}.png')
    if saved % 50 == 0:
      print(f'image {saved} saved')
    saved += 1

In [None]:
# Collect the attachment URLs and save up to 10000 Midjourney images to Drive.
download_midjourney_images()

In [None]:
# Reload the saved URL list. `filepath` is set here so the cell also works on a
# fresh kernel, and the file is closed as soon as it has been read.
filepath = '/content/drive/My Drive/ISYE 6740/Midjourney/'
with open(f'{filepath}Midjourney_URLs.txt') as f:
  urls = f.read().split(",")
urls[86]

In [None]:
# Download the images for the reloaded `urls`, numbering saved files from 0 and
# stopping once 10000 images have been written.
filepath = '/content/drive/My Drive/ISYE 6740/Midjourney/'
i = 0
for u in urls:
  image = get_image_from_url(u)
  if image is not None:
    image.save(f'{filepath}midjourney_{i}.png')
    if i % 50 == 0:
      print(f'image {i} saved')
    i += 1
    if i >= 10000:
      break
  else:
    # Nothing usable came back for this URL; report it and move on.
    print(f"skipped {u}")

In [None]:
# Fetch a single URL from the list to inspect what comes back for it.
get_image_from_url(urls[86])

# **Stable Diffusion Images**

In [None]:
def save_ddb_images():
  """Saves every image of the DiffusionDB ``2m_random_10k`` train split as a PNG.

  Files are written to the Stable Diffusion Drive folder as
  ``stablediff_{index}.png``, where ``index`` is the row position in the split.
  """
  filepath = '/content/drive/My Drive/ISYE 6740/Stable Diffusion/'
  train_split = load_dataset('poloclub/diffusiondb', '2m_random_10k')['train']
  total = len(train_split)
  for index in range(total):
    record = train_split[index]
    record['image'].save(f'{filepath}stablediff_{index}.png')

In [None]:
# Export the DiffusionDB sample to the Stable Diffusion folder on Drive.
save_ddb_images()

# Downsampling Images

In [None]:
# Drive folders holding the downloaded images, one per image generator.
# Each path ends with '/' because the code below builds file names by plain
# string concatenation.
dalle_path = '/content/drive/My Drive/ISYE 6740/DALL-E/'
midjourney_path = '/content/drive/My Drive/ISYE 6740/Midjourney/'
stablediff_path = '/content/drive/My Drive/ISYE 6740/Stable Diffusion/'

folders = [dalle_path, midjourney_path, stablediff_path]

def image_to_json(folder: str) -> None:
  """Writes a 64x64 JSON encoding of every PNG in ``folder`` to ``Encoded Images.txt``.

  The PNG files are processed in sorted file-name order. Each one is resized to
  64x64, converted to a nested list through NumPy and written as one JSON
  document per line. The index of every 100th image is printed as progress.

  Args:
    folder: Directory path ending with a path separator; it is used as a plain
      string prefix for both the input images and the output file.
  """
  to_json = json.JSONEncoder()
  images = [im for im in sorted(os.listdir(folder)) if im.endswith('png')]
  with open(f'{folder}Encoded Images.txt', 'w') as f:
    for i, image in enumerate(images):
      # Close each source image as soon as its pixels have been extracted.
      with Image.open(folder + image) as img:
        arr = np.asarray(img.resize((64, 64))).tolist()
      f.write(to_json.encode(arr) + '\n')
      # Push every line out immediately so the file on disk tracks progress.
      f.flush()
      if i % 100 == 0:
        print(i)

In [None]:
# Write the 64x64 encodings of the Midjourney PNGs to that folder's 'Encoded Images.txt'.
image_to_json(midjourney_path)

In [None]:
# Write the 64x64 encodings of the Stable Diffusion PNGs to that folder's 'Encoded Images.txt'.
image_to_json(stablediff_path)

# Reconstructing Images

In [None]:
# Rebuild the first encoded DALL-E image and enlarge it for viewing.
# The file is closed as soon as its lines have been read.
with open('/content/drive/My Drive/ISYE 6740/DALL-E/Encoded Images.txt') as r:
  lines = r.readlines()
# NEAREST resampling keeps the individual 64x64 pixels visible when enlarged.
Image.fromarray(np.asarray(json.loads(lines[0])).astype('uint8'), 'RGB').resize((1000, 1000), Image.NEAREST)

In [None]:
# Number of lines read from the encoded-images file in the previous cell.
len(lines)

In [None]:
# Count the lines of the Stable Diffusion encoding file, closing it after reading.
with open('/content/drive/My Drive/ISYE 6740/Stable Diffusion/Encoded Images.txt') as r:
  lines = r.readlines()
len(lines)

In [None]:
# Preview one Midjourney image at the 64x64 resolution used for encoding.
# (A leading `len(lines)` was dropped: only a cell's last expression is shown.)
Image.open(midjourney_path + 'midjourney_2499.png').resize((64, 64))

In [None]:
# Count the lines of the Midjourney encoding file, closing it after reading.
with open('/content/drive/My Drive/ISYE 6740/Midjourney/Encoded Images.txt') as r:
  lines = r.readlines()
len(lines)