In [1]:
# !pip install requests-html selenium arsenic pandas

##  Sync vs Async

The Chess Game Analogy

Consecutive vs Concurrent

In [2]:
%%time

import time

iteration_times = [1, 3, 2, 4]


def sleeper(seconds, i=-1):
    if i != -1:
        print(f"{i}\t{seconds}s")
    time.sleep(seconds)


def run():
    for i, second in enumerate(iteration_times):
        sleeper(second, i=i)
    
# run()

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 8.11 µs


In [3]:
start = time.time()
iteration_times = [1, 3, 2, 1]
import asyncio

async def a_sleeper(seconds, i=-1):
    if i != -1:
        print(f"{i}\t{seconds}s")
    await asyncio.sleep(seconds) # coroutine
    
    ellap = time.time() - start
    print(f"{i} done {ellap}")
    return "abc"

async def a_run():
    results = []
    for i, second in enumerate(iteration_times):
        results.append(
            asyncio.create_task(a_sleeper(second, i=i))
        )
    return results
    
results = await a_run()
print(results)
end = time.time() - start

print(end)

[<Task pending name='Task-2' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>, <Task pending name='Task-3' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>, <Task pending name='Task-4' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>, <Task pending name='Task-5' coro=<a_sleeper() running at <ipython-input-3-b040379a2390>:5>>]
0.0005028247833251953
0	1s
1	3s
2	2s
3	1s


## Blocking & Timeouts

In [4]:
def sleeper(seconds, i=-1):
    if i != -1:
        print(f"{i}\t{seconds}s")
    time.sleep(seconds)

sleeper(12)

In [5]:
async def asleeper(seconds, i=-1):
    # time.sleep(seconds)
    if i != -1:
        print(f"a{i}\t{seconds}s")
    await asyncio.sleep(seconds)
    
await asleeper(12)

0 done 12.01803994178772
3 done 12.018109798431396
2 done 12.01813292503357
1 done 12.018151760101318


In [6]:
print("hello word")

hello word


In [7]:
loop = asyncio.get_event_loop()
# loop = asyncio.new_event_loop()
# aysncio.run()


loop.create_task(asleeper(123))

<Task pending name='Task-7' coro=<asleeper() running at <ipython-input-5-f7aa28347698>:1>>

In [8]:
print("hello word")

hello word


In [9]:
done, pending = await asyncio.wait([asleeper(1), asleeper(123)], timeout=2)
done, pending

({<Task finished name='Task-10' coro=<asleeper() done, defined at <ipython-input-5-f7aa28347698>:1> result=None>},
 {<Task pending name='Task-9' coro=<asleeper() running at <ipython-input-5-f7aa28347698>:5> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7fb3101746a0>()]>>})

In [10]:
done

{<Task finished name='Task-10' coro=<asleeper() done, defined at <ipython-input-5-f7aa28347698>:1> result=None>}

In [11]:
pending

{<Task pending name='Task-9' coro=<asleeper() running at <ipython-input-5-f7aa28347698>:5> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7fb3101746a0>()]>>}

In [12]:
# await asyncio.wait(pending)

In [13]:
# await asyncio.wait_for(asleeper(5), timeout=3)

In [14]:
try:
    await asyncio.wait_for(asleeper(5), timeout=3)
except asyncio.TimeoutError:
    print("Task failed")

Task failed


In [15]:
async def asleeper_timeout(seconds, i=-1, timeout=4):
    # time.sleep(seconds)
    if i != -1:
        print(f"a{i}\t{seconds}s")
    await asyncio.wait_for(asyncio.sleep(seconds), timeout=timeout)
    
# await asleeper_timeout(12, timeout=1)

## Scraping with Selenium - Synchronous
New to selenium and web scraping? Watch [this series](https://kirr.co/dwy90n).

In [16]:
url = 'https://www.spoonflower.com/en/shop?on=fabric'

In [17]:
import re
import requests
from requests_html import HTML
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [18]:
def scraper(url):
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    return driver.page_source


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']

In [19]:
content = scraper(url)

In [20]:
html_r = HTML(html=content)

fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]

datas = []
for path in fabric_links:
    id_, slug_ = extract_id_slug(path)
    print(id_, slug_)
    data = {
        "id": id_,
        "slug": slug_,
        "path": path,
        "scraped": 0 # True / False -> 1 / 0 
    }
    datas.append(data)

3840217 nurse-theme-by-hot4tees_bg-yahoo_com
8619105 flight-feathers-painted-by-xoxotique
7662668 mermaid-music-by-ceciliamok
5312944 thank-being-friend-small-by-elladorine
4893900 half-scale-m81-woodland-camo-by-ricraynor
509390 spoonflower-color-map-by-spoonflower_help
8039248 forest-animal-hot-air-balloon-night-adventure-by-at_the_cottage
2623675 black-white-music-notes-by-inspirationz
7812388 dnd-pattern-by-neonborealis
5839396 spectacular-cats-by-cynthia_arre
4352750 loteria-by-jellymania
5964319 hearts-on-grey-linen-valentines-day-by-littlearrowdesign
7580754 ibd-gracie-grace-golden-jumbo-by-indybloomdesign
4888888 flowers-skulls-by-elladorine
7216659 rainbow-stars-watercolor-abstract-small-by-crystal_walen
5247883 hexo-blue-med-by-nouveau_bohemian
6650975 love-nurse-charcoal-gray-by-phyllisdobbs
5513692 salon-barber-hairdresser-pattern-by-cloudycapevintage
7790777 turtles-aqua-blue-by-gingerlique
6650888 love-nurse-whimsy-blue-by-phyllisdobbs
7698482 scrubs-dr-stetho-scope-by-ad

In [21]:
df = pd.DataFrame(datas)
df.head()

Unnamed: 0,id,slug,path,scraped
0,3840217,nurse-theme-by-hot4tees_bg-yahoo_com,/en/fabric/3840217-nurse-theme-by-hot4tees_bg-...,0
1,8619105,flight-feathers-painted-by-xoxotique,/en/fabric/8619105-flight-feathers-painted-by-...,0
2,7662668,mermaid-music-by-ceciliamok,/en/fabric/7662668-mermaid-music-by-ceciliamok,0
3,5312944,thank-being-friend-small-by-elladorine,/en/fabric/5312944-thank-being-friend-small-by...,0
4,4893900,half-scale-m81-woodland-camo-by-ricraynor,/en/fabric/4893900-half-scale-m81-woodland-cam...,0


In [22]:
df.to_csv("local.csv", index=False)

In [23]:
pd.read_csv("local.csv")

Unnamed: 0,id,slug,path,scraped
0,3840217,nurse-theme-by-hot4tees_bg-yahoo_com,/en/fabric/3840217-nurse-theme-by-hot4tees_bg-...,0
1,8619105,flight-feathers-painted-by-xoxotique,/en/fabric/8619105-flight-feathers-painted-by-...,0
2,7662668,mermaid-music-by-ceciliamok,/en/fabric/7662668-mermaid-music-by-ceciliamok,0
3,5312944,thank-being-friend-small-by-elladorine,/en/fabric/5312944-thank-being-friend-small-by...,0
4,4893900,half-scale-m81-woodland-camo-by-ricraynor,/en/fabric/4893900-half-scale-m81-woodland-cam...,0
...,...,...,...,...
79,9453318,african-american-girls-retro-pop-art-by-whimsi...,/en/fabric/9453318-african-american-girls-retr...,0
80,6327300,call-mountains-evergreen-med-by-nouveau_bohemian,/en/fabric/6327300-call-mountains-evergreen-me...,0
81,6263258,navy-blue-watercolor-herringbone-by-laurapol,/en/fabric/6263258-navy-blue-watercolor-herrin...,0
82,6715163,8-wild-heart-florals-white-by-shopcabin,/en/fabric/6715163-8-wild-heart-florals-white-...,0


## Asynchronous Scraping with `chromedriver` and `arsenic`

[arsenic Docs](https://arsenic.readthedocs.io/en/latest/)

In [24]:
# !pip install arsenic

In [25]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']



async def get_links(body_content):
    html_r = HTML(html=body_content)
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        # print(body)
        return body


async def store_links_as_df_pickle(datas=[], name='links.pkl'):
    df = pd.DataFrame(datas)
    df.set_index('id', drop=True, inplace=True)
    df.to_pickle(name)
    return df
    
    
async def run(url):
    body_content = await scraper(url)
    links = await get_links(body_content)
    df = await store_links_as_df_pickle(links)
    return links
    
if __name__ == "__main__":
    url = 'https://www.spoonflower.com/en/shop?on=fabric'
    results = asyncio.run(run(url))
    print(results)


Overwriting async_scrape.py


In [26]:
!python async_scrape.py

Starting ChromeDriver 84.0.4147.30 (48b3e868b4cc0aa7e8149519690b6f6949e110a8-refs/branch-heads/4147@{#310}) on port 62144
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
2020-08-04 15:50.55 request                        body={"desiredCapabilities": {"browserName": "chrome", "chromeOptions": {"args": ["--headless", "--disable-gpu"]}}} method=POST url=http://localhost:62144/session
2020-08-04 15:50.55 response                       body={"desiredCapabilities": {"browserName": "chrome", "chromeOptions": {"args": ["--headless", "--disable-gpu"]}}} data={'sessionId': '1b7e29660a3faa0f5ec1ac138652f429', 'status': 0, 'value': {'acceptInsecureCerts': False, 'acceptSslCerts': False, 'applicationCacheEnabled': False, 'browserConnectionEnabled': False, 'browserName': 'chrome', 'chrome': {'chromedriverVersion': '84.0.4147.30 (48b3e868b4cc0aa7e8149519690b6f6

<CIMultiDictProxy('Content-Length': '259377', 'Content-Type': 'application/json; charset=utf-8')>
 url=http://localhost:62144/session/1b7e29660a3faa0f5ec1ac138652f429/source
2020-08-04 15:50.58 request                        body=None method=DELETE url=http://localhost:62144/session/1b7e29660a3faa0f5ec1ac138652f429


2020-08-04 15:50.58 response                       body=None data={'sessionId': '1b7e29660a3faa0f5ec1ac138652f429', 'status': 0, 'value': None} method=DELETE response=<ClientResponse(http://localhost:62144/session/1b7e29660a3faa0f5ec1ac138652f429) [200 OK]>
<CIMultiDictProxy('Content-Length': '72', 'Content-Type': 'application/json; charset=utf-8')>
 url=http://localhost:62144/session/1b7e29660a3faa0f5ec1ac138652f429
[{'id': '6650888', 'slug': 'love-nurse-whimsy-blue-by-phyllisdobbs', 'path': '/en/fabric/6650888-love-nurse-whimsy-blue-by-phyllisdobbs', 'scraped': 0}, {'id': '4995555', 'slug': 'heart-health-awareness-light-gray-large-by-ohdarkthirty', 'path': '/en/fabric/4995555-heart-health-awareness-light-gray-large-by-ohdarkthirty', 'scraped': 0}, {'id': '6079351', 'slug': 'josie-meadow-floral-by-sweeterthanhoney', 'path': '/en/fabric/6079351-josie-meadow-floral-by-sweeterthanhoney', 'scraped': 0}, {'id': '1112778', 'slug': 'rosie-riveter-by-spacefem', 'path': '/en/fabric/1112778-ros

In [27]:
name = 'links.pkl'
df = pd.read_pickle(name)
df.head()

Unnamed: 0_level_0,slug,path,scraped
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6650888,love-nurse-whimsy-blue-by-phyllisdobbs,/en/fabric/6650888-love-nurse-whimsy-blue-by-p...,0
4995555,heart-health-awareness-light-gray-large-by-ohd...,/en/fabric/4995555-heart-health-awareness-ligh...,0
6079351,josie-meadow-floral-by-sweeterthanhoney,/en/fabric/6079351-josie-meadow-floral-by-swee...,0
1112778,rosie-riveter-by-spacefem,/en/fabric/1112778-rosie-riveter-by-spacefem,0
9060289,saints-fleur-de-lis-new-orleans-saints-footbal...,/en/fabric/9060289-saints-fleur-de-lis-new-orl...,0


In [28]:
df.shape

(84, 3)

## Hide `arsenic` Logs

In [29]:
!pip install structlog



In [30]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib

import logging
import structlog # pip install structlog

def set_arsenic_log_level(level = logging.WARNING):
    # Create logger
    logger = logging.getLogger('arsenic')

    # We need factory, to return application-wide logger
    def logger_factory():
        return logger

    structlog.configure(logger_factory=logger_factory)
    logger.setLevel(level)


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']



async def get_links(body_content):
    html_r = HTML(html=body_content)
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        # print(body)
        return body


async def store_links_as_df_pickle(datas=[], name='links.pkl'):
    df = pd.DataFrame(datas)
    df.set_index('id', drop=True, inplace=True)
    df.to_pickle(name)
    return df
    
    
async def run(url):
    body_content = await scraper(url)
    links = await get_links(body_content)
    df = await store_links_as_df_pickle(links)
    return links
    
if __name__ == "__main__":
    set_arsenic_log_level()
    url = 'https://www.spoonflower.com/en/shop?on=fabric'
    results = asyncio.run(run(url))
    print(results)


Overwriting async_scrape.py


In [31]:
!python async_scrape.py

Starting ChromeDriver 84.0.4147.30 (48b3e868b4cc0aa7e8149519690b6f6949e110a8-refs/branch-heads/4147@{#310}) on port 62211
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
Traceback (most recent call last):
  File "async_scrape.py", line 80, in <module>
    results = asyncio.run(run(url))
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/runners.py", line 43, in run
    return loop.run_until_complete(main)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/base_events.py", line 616, in run_until_complete
    return future.result()
  File "async_scrape.py", line 74, in run
    df = await store_links_as_df_pickle(links)
  File "async_scrape.py", line 66, in store_links_as_df_pickle
    df.set_index('id', drop=True, inplace=True)
  File "/Users/cfe/.local/share/virtualenvs/supercharged-tjIFpWC

## Async Data with Pandas

In [32]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib

import logging
import structlog # pip install structlog

def set_arsenic_log_level(level = logging.WARNING):
    # Create logger
    logger = logging.getLogger('arsenic')

    # We need factory, to return application-wide logger
    def logger_factory():
        return logger

    structlog.configure(logger_factory=logger_factory)
    logger.setLevel(level)


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']



async def get_links(body_content):
    html_r = HTML(html=body_content)
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        # print(body)
        return body


def store_links_as_df_pickle(datas=[], name='links.pkl'):
    new_df = pd.DataFrame(datas)
    og_df = pd.DataFrame([{'id': 0}])
    if pathlib.Path(name).exists():
        og_df = pd.read_pickle(name) # read_csv
    df = pd.concat([og_df, new_df])
    df.reset_index(inplace=True, drop=False)
    df = df[['id', 'slug', 'path', 'scraped']]
    df = df.loc[~df.id.duplicated(keep='first')]
    # df.set_index('id', inplace=True, drop=True)
    df.dropna(inplace=True)
    df.to_pickle(name)
    return df
    
    
async def run(url):
    body_content = await scraper(url)
    links = await get_links(body_content)
    return links
    
if __name__ == "__main__":
    set_arsenic_log_level()
    url = 'https://www.spoonflower.com/en/shop?on=fabric'
    name = "link.pkl"
    results = asyncio.run(run(url))
    df = store_links_as_df_pickle(results, name=name)
    print(df.head())


Overwriting async_scrape.py


In [33]:
!python async_scrape.py

Starting ChromeDriver 84.0.4147.30 (48b3e868b4cc0aa7e8149519690b6f6949e110a8-refs/branch-heads/4147@{#310}) on port 62271
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
        id  ... scraped
0  4981816  ...     0.0
1  6178734  ...     0.0
2  6852245  ...     0.0
3  4995555  ...     0.0
4  4352750  ...     0.0

[5 rows x 4 columns]


In [34]:
df = pd.read_pickle("link.pkl")
df.shape

(84, 4)

In [35]:
df.head(n=10)

Unnamed: 0,id,slug,path,scraped
0,4981816,black-white-dogs-by-littleislandcompany,/en/fabric/4981816-black-white-dogs-by-littlei...,0.0
1,6178734,fable-floral-blush-med-by-nouveau_bohemian,/en/fabric/6178734-fable-floral-blush-med-by-n...,0.0
2,6852245,cute-nurse-love-black-no-gradient-by-jannasalak,/en/fabric/6852245-cute-nurse-love-black-no-gr...,0.0
3,4995555,heart-health-awareness-light-gray-large-by-ohd...,/en/fabric/4995555-heart-health-awareness-ligh...,0.0
4,4352750,loteria-by-jellymania,/en/fabric/4352750-loteria-by-jellymania,0.0
5,7137786,genevieve-floral-by-crystal_walen,/en/fabric/7137786-genevieve-floral-by-crystal...,0.0
6,4270747,happy-hair-stylist-friends-blue-by-clayvision_...,/en/fabric/4270747-happy-hair-stylist-friends-...,0.0
7,6079351,josie-meadow-floral-by-sweeterthanhoney,/en/fabric/6079351-josie-meadow-floral-by-swee...,0.0
8,7216659,rainbow-stars-watercolor-abstract-small-by-cry...,/en/fabric/7216659-rainbow-stars-watercolor-ab...,0.0
9,2920223,m81-woodland-camo-by-ricraynor,/en/fabric/2920223-m81-woodland-camo-by-ricraynor,0.0


## Prepare to Scrape Multiple URLs

In [36]:
start = time.time()
iteration_times = [1, 3, 2, 1]


async def asleeper_timeout(seconds, i=-1, timeout=4):
    # time.sleep(seconds)
    if i != -1:
        print(f"a{i}\t{seconds}s")
    await asyncio.wait_for(asyncio.sleep(seconds), timeout=timeout)
    ellap = time.time() - start
    print(f"{i} done {ellap}")


async def a_run():
    results = []
    for i, second in enumerate(iteration_times):
        results.append(
            asyncio.create_task(asleeper_timeout(second, i=i))
        )
    return results
    
results = await a_run()
print(results)
end = time.time() - start

print(end)

[<Task pending name='Task-14' coro=<asleeper_timeout() running at <ipython-input-36-3aebb3aea940>:5>>, <Task pending name='Task-15' coro=<asleeper_timeout() running at <ipython-input-36-3aebb3aea940>:5>>, <Task pending name='Task-16' coro=<asleeper_timeout() running at <ipython-input-36-3aebb3aea940>:5>>, <Task pending name='Task-17' coro=<asleeper_timeout() running at <ipython-input-36-3aebb3aea940>:5>>]
0.0006699562072753906
a0	1s
a1	3s
a2	2s
a3	1s


In [37]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib

import logging
import structlog # pip install structlog



def store_links_as_df_pickle(datas=[], name='links.pkl'):
    new_df = pd.DataFrame(datas)
    og_df = pd.DataFrame([{'id': 0}])
    if pathlib.Path(name).exists():
        og_df = pd.read_pickle(name) # read_csv
    df = pd.concat([og_df, new_df])
    df.reset_index(inplace=True, drop=False)
    df = df[['id', 'slug', 'path', 'scraped']]
    df = df.loc[~df.id.duplicated(keep='first')]
    # df.set_index('id', inplace=True, drop=True)
    df.dropna(inplace=True)
    df.to_pickle(name)
    return df


def set_arsenic_log_level(level = logging.WARNING):
    # Create logger
    logger = logging.getLogger('arsenic')

    # We need factory, to return application-wide logger
    def logger_factory():
        return logger

    structlog.configure(logger_factory=logger_factory)
    logger.setLevel(level)


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, url_path)
    if not group:
        return None, None
    return group['id'], group['slug']



async def get_links(body_content):
    html_r = HTML(html=body_content)
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url, i=-1, timeout=60, start=None):
    service = services.Chromedriver()
    browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        body = await session.get_page_source()
        links = await get_links(body)
        if start != None:
            end = time.time() - start
            print(f'{i} took {end} seconds')
        # print(body)
        return links


async def run(urls, timeout=60, start=None):
    results = []
    for i, url in enumerate(urls):
        results.append(
            asyncio.create_task(scraper(url, i=i, timeout=60, start=start))
        )
    list_of_links = await asyncio.gather(*results)
    return list_of_links

if __name__ == "__main__":
    set_arsenic_log_level()
    start = time.time()
    urls = ['https://www.spoonflower.com/en/shop?on=fabric', 
            'https://www.spoonflower.com/en/fabric/6444170-catching-fireflies-by-thestorysmith']
    name = "link.pkl"
    results = asyncio.run(run(urls, start=start))
    print(len(results))
    end = time.time() - start
    print(f'total time is {end}')
#     df = store_links_as_df_pickle(results, name=name)
#     print(df.head())


Overwriting async_scrape.py


In [38]:
!python async_scrape.py

Starting ChromeDriver 84.0.4147.30 (48b3e868b4cc0aa7e8149519690b6f6949e110a8-refs/branch-heads/4147@{#310}) on port 62330
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
Starting ChromeDriver 84.0.4147.30 (48b3e868b4cc0aa7e8149519690b6f6949e110a8-refs/branch-heads/4147@{#310}) on port 62331
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
ChromeDriver was started successfully.
1 took 2.9974570274353027 seconds
0 took 4.026594161987305 seconds
2
total time is 4.082810878753662
0 done 5.038886070251465
1 done 5.038977146148682
2 done 5.03901219367981
3 done 5.039041042327881


## Extract Product Data

In [104]:
import re
import requests
from requests_html import HTML
import pandas as pd
from urllib.parse import urlparse
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def scraper(url):
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    time.sleep(5)
    return driver.page_source

def extract_id_slug(url_path):
    path = url_path
    if path.startswith('http'):
        parsed_url = urlparse(path)
        path = parsed_url.path
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, path)
    if not group:
        return None, None, path
    return group['id'], group['slug'], path

In [105]:
url = 'https://www.spoonflower.com/en/fabric/6444170-catching-fireflies-by-thestorysmith'

html_str = scraper(url)
content = HTML(html=html_str)

In [115]:
def get_product_data(url, content):
    id_, slug_, path = extract_id_slug(url)
    titleEl = content.find(".design-title", first=True)
    data = {
        'id': id_,
        'slug': slug_,
        'path': path,
    }
    title = None
    if titleEl == None:
        return data
    title = titleEl.text
    data['title'] = title
    sizeEl = content.find("#fabric-size", first=True)
    size = None
    if sizeEl != None:
        size = sizeEl.text
    data['size'] = size
    price_parent_el = content.find('.b-item-price', first=True)
    price_el = price_parent_el.find('.visuallyhidden', first=True)
    for i in price_el.element.iterchildren():
        attrs = dict(**i.attrib)
        try:
            del attrs['itemprop']
        except:
            pass
        attrs_keys = list(attrs.keys())
        data[i.attrib['itemprop']] = i.attrib[attrs_keys[0]]
    return data

In [116]:
get_product_data(url, content)

{'id': '6444170',
 'slug': 'catching-fireflies-by-thestorysmith',
 'path': '/en/fabric/6444170-catching-fireflies-by-thestorysmith',
 'title': 'Catching Fireflies',
 'size': 'Fat Quarter 21" x 18"',
 'price': '10.75',
 'priceCurrency': 'USD',
 'priceValidUntil': '2030-01-01'}

In [112]:
# url2 = "https://www.spoonflower.com/en/fabric/7175195-golden-watercolor-rainbow-rows-5-by-anniemontgomerydesign?fabric=petal_signature_cotton"
# html_str2 = scraper(url2)
# content2 = HTML(html=html_str2)
# get_product_data(url2, content2)


{'id': '7175195',
 'slug': 'golden-watercolor-rainbow-rows-5-by-anniemontgomerydesign',
 'path': '/en/fabric/7175195-golden-watercolor-rainbow-rows-5-by-anniemontgomerydesign',
 'title': 'Golden Watercolor Rainbow Rows .5"'}

## Async Product Data Extraction

In [124]:
%%writefile async_scrape.py

import os
import asyncio
from arsenic import get_session, keys, browsers, services
import pandas as pd
from requests_html import HTML
import itertools
import re
import time
import pathlib
from urllib.parse import urlparse

import logging
import structlog # pip install structlog



def store_links_as_df_pickle(datas=[], name='links.pkl'):
    new_df = pd.DataFrame(datas)
    og_df = pd.DataFrame([{'id': 0}])
    if pathlib.Path(name).exists():
        og_df = pd.read_pickle(name) # read_csv
    df = pd.concat([og_df, new_df])
    df.reset_index(inplace=True, drop=False)
    df = df[['id', 'slug', 'path', 'scraped']]
    df = df.loc[~df.id.duplicated(keep='first')]
    # df.set_index('id', inplace=True, drop=True)
    df.dropna(inplace=True)
    df.to_pickle(name)
    return df


def set_arsenic_log_level(level = logging.WARNING):
    # Create logger
    logger = logging.getLogger('arsenic')

    # We need factory, to return application-wide logger
    def logger_factory():
        return logger

    structlog.configure(logger_factory=logger_factory)
    logger.setLevel(level)


# /en/fabric/7137786-genevieve-floral-by-crystal_walen
async def extract_id_slug(url_path):
    path = url_path
    if path.startswith('http'):
        parsed_url = urlparse(path)
        path = parsed_url.path
    regex = r"^[^\s]+/(?P<id>\d+)-(?P<slug>[\w_-]+)$"
    group = re.match(regex, path)
    if not group:
        return None, None, path
    return group['id'], group['slug'], path



async def get_product_data(url, content):
    id_, slug_, path = await extract_id_slug(url)
    titleEl = content.find(".design-title", first=True)
    data = {
        'id': id_,
        'slug': slug_,
        'path': path,
    }
    title = None
    if titleEl == None:
        return data
    title = titleEl.text
    data['title'] = title
    sizeEl = content.find("#fabric-size", first=True)
    size = None
    if sizeEl != None:
        size = sizeEl.text
    data['size'] = size
    price_parent_el = content.find('.b-item-price', first=True)
    price_el = price_parent_el.find('.visuallyhidden', first=True)
    for i in price_el.element.iterchildren():
        attrs = dict(**i.attrib)
        try:
            del attrs['itemprop']
        except:
            pass
        attrs_keys = list(attrs.keys())
        data[i.attrib['itemprop']] = i.attrib[attrs_keys[0]]
    return data

async def get_parsable_html(body_html_str):
    return HTML(html=body_html_str)

async def get_links(html_r):
    fabric_links = [x for x in list(html_r.links) if x.startswith("/en/fabric")]
    datas = []
    for path in fabric_links:
        id_, slug_, _ = await extract_id_slug(path)
        data = {
            "id": id_,
            "slug": slug_,
            "path": path,
            "scraped": 0 # True / False -> 1 / 0 
        }
        datas.append(data)
    return datas

async def scraper(url, i=-1, timeout=60, start=None):
    service = services.Chromedriver()
    browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        await asyncio.sleep(10)
        body = await session.get_page_source() # save this locally??
        content = await get_parsable_html(body) 
        links = await get_links(content)
        product_data = await get_product_data(url, content)
        if start != None:
            end = time.time() - start
            print(f'{i} took {end} seconds')
        # print(body)
        dataset = {
            "links": links,
            "product_data": product_data
        }
        return dataset


async def run(urls, timeout=60, start=None):
    results = []
    for i, url in enumerate(urls):
        results.append(
            asyncio.create_task(scraper(url, i=i, timeout=60, start=start))
        )
    list_of_links = await asyncio.gather(*results)
    return list_of_links

if __name__ == "__main__":
    set_arsenic_log_level()
    start = time.time()
    urls = ['https://www.spoonflower.com/en/shop?on=fabric', 
            'https://www.spoonflower.com/en/fabric/6444170-catching-fireflies-by-thestorysmith']
    name = "link.pkl"
    results = asyncio.run(run(urls, start=start))
    print(results)
    end = time.time() - start
    print(f'total time is {end}')
#     df = store_links_as_df_pickle(results, name=name)
#     print(df.head())


Overwriting async_scrape.py


In [125]:
!python async_scrape.py

Starting ChromeDriver 84.0.4147.30 (48b3e868b4cc0aa7e8149519690b6f6949e110a8-refs/branch-heads/4147@{#310}) on port 54051
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
Starting ChromeDriver 84.0.4147.30 (48b3e868b4cc0aa7e8149519690b6f6949e110a8-refs/branch-heads/4147@{#310}) on port 54052
Only local connections are allowed.
Please see https://chromedriver.chromium.org/security-considerations for suggestions on keeping ChromeDriver safe.
ChromeDriver was started successfully.
1 took 13.698167085647583 seconds
0 took 14.873960971832275 seconds
[{'links': [{'id': '7661255', 'slug': 'just-jellies-jellyfish-by-katerhees', 'path': '/en/fabric/7661255-just-jellies-jellyfish-by-katerhees', 'scraped': 0}, {'id': '5131007', 'slug': 'scandinavian-sweet-hedgehog-illustration-kids-gender-neutral-black-white-by-littlesmilemakers', 'path': '/en/fabric/5131007