In [1]:
import aiohttp
import requests
import os
from pathlib import Path
import pandas as pd
import utils
import asyncio
import math

In [2]:
# Make sure the image output directory exists before any download starts.
# `exist_ok=True` keeps this cell safe to re-run, while genuine failures
# (e.g. permission errors, or the path being a regular file) are raised
# instead of being silently swallowed by a bare `except`.
os.makedirs(utils.DATASETS_AMAZON_IMAGES_PATH, exist_ok=True)

In [14]:
# Read only the two columns the downloader needs, then discard every row that
# is missing either the cover URL ('image') or the book title ('Title').
df = (
    pd.read_csv(Path('datasets/amazon/books_data.csv'), usecols=['Title', 'image'])
    .dropna(subset=['image', 'Title'])
)

In [15]:
async def load__and_save_img(arr, session):
    """Download one cover image and store it on disk, unless it is already there.

    Args:
        arr: A two-item sequence ``(title, image_url)``.
        session: An object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``read()``
            (an ``aiohttp.ClientSession`` in this notebook).

    Returns:
        None. The image bytes are written to the path that
        ``utils.convert_str_to_datasets_amazon_images_path(title)`` returns.

    Raises:
        Exception: If the server answers with any status other than 200. The
            message names the URL, title and status so a failure among many
            concurrent downloads can be traced back to its row.
    """
    title, image_url = arr
    title_path = utils.convert_str_to_datasets_amazon_images_path(title)
    # A file that already exists is treated as downloaded, so re-running the
    # notebook only fetches what is still missing.
    if title_path.is_file():
        return
    async with session.get(image_url) as response:
        status = response.status
        if status != 200:
            # Check the status before reading so the body of an error
            # response is never downloaded.
            print(f'Status code : {status}')
            raise Exception(
                f'Failed to download {image_url!r} for title {title!r}: HTTP status {status}'
            )
        content = await response.read()
    with open(title_path, 'wb') as file:
        file.write(content)

In [16]:
async def save_images(start, count, session):
    """Download the images for one slice of ``df``, one row after another.

    Args:
        start: Index of the first row of ``df.values`` to process.
        count: Maximum number of rows to process from ``start`` onwards.
        session: HTTP session passed through to ``load__and_save_img``.

    Returns:
        None. Any exception raised by ``load__and_save_img`` propagates and
        stops the remaining rows of this slice.
    """
    stop = start + count
    for record in df.values[start:stop]:
        # Awaited one at a time: rows within a slice are fetched sequentially.
        await load__and_save_img(record, session)

In [17]:
async def run_tasks(batch_size=1000):
    """Download every image referenced by ``df``, split into concurrent batches.

    ``df`` is cut into consecutive slices of ``batch_size`` rows. Each slice is
    handled by one ``save_images`` coroutine; all of them share a single
    ``aiohttp.ClientSession`` and are awaited together with ``asyncio.gather``.

    Args:
        batch_size: Number of rows per batch. A smaller value creates more
            batches and therefore more coroutines running concurrently.
            Defaults to 1000.

    Returns:
        The list produced by ``asyncio.gather``: one entry per batch, in batch
        order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size}')
    # Round up so a final, partially filled batch is not dropped.
    batch_count = math.ceil(df.shape[0] / batch_size)
    async with aiohttp.ClientSession() as session:
        tasks = [
            save_images(index * batch_size, batch_size, session)
            for index in range(batch_count)
        ]
        return await asyncio.gather(*tasks)

# Start the download of all images. Top-level `await` is not valid in a plain
# Python script; this line relies on being executed as a notebook (IPython) cell.
# The list returned by run_tasks() is bound to `_` rather than used.
_ = await run_tasks()

In [18]:
# Save the placeholder image shown for books that have no cover.
# `exist_ok=True` keeps this cell safe to re-run, while genuine failures
# (e.g. permission errors) are raised instead of being hidden by a bare `except`.
os.makedirs(utils.DATASETS_AMAZON_STATIC_IMAGES_PATH, exist_ok=True)

response = requests.get('https://books.google.nl/googlebooks/images/no_cover_thumb.gif')
if response.status_code == 200:
    # NOTE(review): the downloaded file is a GIF but is stored under a .jpg
    # name. The name is kept as-is because other code may load it by this
    # exact path - confirm before renaming.
    with open(utils.DATASETS_AMAZON_STATIC_IMAGES_PATH / "no_cover.jpg", 'wb') as file:
        file.write(response.content)
else:
    # Nothing is written on failure; the status is only reported.
    print(f"RESPONSE CODE {response.status_code}")