In [1]:
from io import BytesIO
from PIL import Image
import time

import numpy as np
from pandarallel import pandarallel
import pandas as pd
import requests
from skimage.transform import resize

In [3]:
# Left join of clean_images & full
clean_images = pd.read_csv("../data/landmarks_washington_clean_images.csv")
full_images = pd.read_csv("../data/landmarks_washington_full.csv")
clean_left = pd.merge(clean_images, full_images, on="landmark_id", how="left")

clean_left

Unnamed: 0,landmark_id,image_id,url,name,supercategory,location,latitude,longitude,category
0,262,1272ef793ebba7a2,https://upload.wikimedia.org/wikipedia/commons...,Comet Falls,waterfall,"Mount Rainier National Park,Lewis County,Washi...",46.7958,121.780,http://commons.wikimedia.org/wiki/Category:Com...
1,262,3f509df1d66997a0,https://upload.wikimedia.org/wikipedia/commons...,Comet Falls,waterfall,"Mount Rainier National Park,Lewis County,Washi...",46.7958,121.780,http://commons.wikimedia.org/wiki/Category:Com...
2,262,a263751cf3b9d364,https://upload.wikimedia.org/wikipedia/commons...,Comet Falls,waterfall,"Mount Rainier National Park,Lewis County,Washi...",46.7958,121.780,http://commons.wikimedia.org/wiki/Category:Com...
3,262,9981810cd64b2e5b,https://upload.wikimedia.org/wikipedia/commons...,Comet Falls,waterfall,"Mount Rainier National Park,Lewis County,Washi...",46.7958,121.780,http://commons.wikimedia.org/wiki/Category:Com...
4,262,172999b2cc578a66,https://upload.wikimedia.org/wikipedia/commons...,Comet Falls,waterfall,"Mount Rainier National Park,Lewis County,Washi...",46.7958,121.780,http://commons.wikimedia.org/wiki/Category:Com...
...,...,...,...,...,...,...,...,...,...
4629,203087,1ee045e5a3bc9568,https://upload.wikimedia.org/wikipedia/commons...,Jack Block Park,park,"Seattle,King County,Washington,Pacific Northwe...",47.5831,122.371,http://commons.wikimedia.org/wiki/Category:Jac...
4630,203087,adb2a9e25454e0d1,https://upload.wikimedia.org/wikipedia/commons...,Jack Block Park,park,"Seattle,King County,Washington,Pacific Northwe...",47.5831,122.371,http://commons.wikimedia.org/wiki/Category:Jac...
4631,203087,3d904df7f6c9f92a,https://upload.wikimedia.org/wikipedia/commons...,Jack Block Park,park,"Seattle,King County,Washington,Pacific Northwe...",47.5831,122.371,http://commons.wikimedia.org/wiki/Category:Jac...
4632,203087,3895947116add663,https://upload.wikimedia.org/wikipedia/commons...,Jack Block Park,park,"Seattle,King County,Washington,Pacific Northwe...",47.5831,122.371,http://commons.wikimedia.org/wiki/Category:Jac...


In [3]:
# selects the important fields
url_matrix = clean_left[["image_id", "name", "url"]]

In [5]:
# pandarallel requires functions to be self-contained
# imports have to be *in* the function

# Get image data from the url
def get_image_from_url(url):    
    def image_bytes_to_ndarray(image_bytes):
        imgSize = (224, 224, 3)
        
        """
        Converts image bytes to a NumPy ndarray.
        """
        image_buffer = BytesIO(image_bytes.content)
        img = Image.open(image_buffer)

        return resize(np.array(img), imgSize, anti_aliasing=True)
        
    header = {
        "User-Agent": "MyImageScraper/1.0 (ivaldivi@uw.edu)"
    }

    time.sleep(0.5)
    try:
        image_bytes = requests.get(url, headers=header)
        image_ndarray = image_bytes_to_ndarray(image_bytes)
        
        return image_ndarray
    except Exception as e:
        print(e)
        print(url)

        # maybe replace with None
        return 0

In [6]:
pandarallel.initialize(progress_bar=True)
url_matrix["image_data"] = url_matrix["url"].parallel_apply(get_image_from_url)

url_matrix

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=616), Label(value='0 / 616'))), HB…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  url_matrix["image_data"] = url_matrix["url"].parallel_apply(get_image_from_url)


Unnamed: 0,image_id,name,url,image_data
0,1272ef793ebba7a2,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.045279937331769195, 0.05800154655315033, ..."
1,3f509df1d66997a0,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.10766877129296296, 0.11505473672808664, 0..."
2,a263751cf3b9d364,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.3236976619132381, 0.38716444778770953, 0...."
3,9981810cd64b2e5b,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.17229581146720963, 0.20344506205558716, 0..."
4,172999b2cc578a66,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.10171972078135477, 0.1445214264506818, 0...."
...,...,...,...,...
4921,1ee045e5a3bc9568,Jack Block Park,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.7919118090282239, 0.803956406834169, 0.83..."
4922,adb2a9e25454e0d1,Jack Block Park,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.697539078858799, 0.70100082725661, 0.7260..."
4923,3d904df7f6c9f92a,Jack Block Park,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.7672072532084186, 0.7914357164157996, 0.8..."
4924,3895947116add663,Jack Block Park,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.8056955604115225, 0.7691280361152044, 0.7..."


In [7]:
# Null images are stored as int(0), so it filters them out
# also filters out any potential NA entries from the data scraping
url_matrix_cleaned = url_matrix.loc[url_matrix['image_data'].apply(lambda x: type(x) != int)]
url_matrix_cleaned = url_matrix_cleaned.loc[url_matrix_cleaned['image_data'].notna()]

In [8]:
url_matrix_cleaned

Unnamed: 0,image_id,name,url,image_data
0,1272ef793ebba7a2,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.045279937331769195, 0.05800154655315033, ..."
1,3f509df1d66997a0,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.10766877129296296, 0.11505473672808664, 0..."
2,a263751cf3b9d364,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.3236976619132381, 0.38716444778770953, 0...."
3,9981810cd64b2e5b,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.17229581146720963, 0.20344506205558716, 0..."
4,172999b2cc578a66,Comet Falls,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.10171972078135477, 0.1445214264506818, 0...."
...,...,...,...,...
4921,1ee045e5a3bc9568,Jack Block Park,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.7919118090282239, 0.803956406834169, 0.83..."
4922,adb2a9e25454e0d1,Jack Block Park,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.697539078858799, 0.70100082725661, 0.7260..."
4923,3d904df7f6c9f92a,Jack Block Park,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.7672072532084186, 0.7914357164157996, 0.8..."
4924,3895947116add663,Jack Block Park,https://upload.wikimedia.org/wikipedia/commons...,"[[[0.8056955604115225, 0.7691280361152044, 0.7..."


In [None]:
def download_image(row):
    name = row['name']
    img_id = row['image_id']

    path = f"data/{name}"
    filename = f"{path}/{img_id}.jpg"

    image = row['image_data']
    
    if not os.path.exists(path):
        os.mkdir(path)

    # images stored as 224x224,3 of (0 - 255)
    # multiply the original array (stored as 0-1) to scale for (0-255)
    im = Image.fromarray((image * 255).astype(np.uint8))
    im.save(filename)

In [None]:
url_matrix_cleaned.apply(lambda x: download_image(x), axis=1)