In [1]:
# Listing page (cat_id=30) that the cells below scrape for longplay links.
GAMEBOY_LP_URL = (
    'https://longplays.org/infusions/longplays/longplays.php'
    '?cat_id=30'
)

In [2]:
import asyncio
import random
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm.asyncio import tqdm_asyncio
from tqdm.asyncio import tqdm_asyncio
from playwright.async_api import async_playwright

In [3]:
async def scrape_site(url, selector='table'):
    """Load ``url`` in headless Firefox and return the inner HTML of ``selector``.

    Args:
        url: Address of the page to open.
        selector: Selector to wait for; the inner HTML of the first matching
            element is returned.

    Returns:
        The inner HTML string of the matched element.

    Raises:
        Any Playwright error raised while navigating or waiting for the
        selector (for example a timeout) propagates to the caller.
    """
    async with async_playwright() as p:
        browser = await p.firefox.launch()
        try:
            page = await browser.new_page()
            await page.goto(url)
            await page.wait_for_selector(selector)
            return await page.inner_html(selector)
        finally:
            # Close the browser even when navigation or the wait fails.
            await browser.close()

In [4]:
# Fetch the inner HTML of the first <tbody> element on the listing page.
content = await scrape_site(GAMEBOY_LP_URL, selector='tbody')

In [5]:
# Parse the fetched listing fragment with Python's built-in HTML parser.
soup = BeautifulSoup(markup=content, features='html.parser')

In [6]:
# Collect every anchor, then keep only those whose href points at an individual
# longplay page. An <a> without an href attribute makes .get('href') return
# None, so fall back to '' before the substring test instead of raising
# TypeError.
links = soup.find_all('a')
longplays = [
    {'name': link.text, 'url': urljoin(GAMEBOY_LP_URL, link['href'])}
    for link in links
    if 'longplay_id=' in (link.get('href') or '')
]

In [12]:
# Spot check: print the scraped entry for longplay_id=4913, if present.
wanted_url = 'https://longplays.org/infusions/longplays/longplays.php?cat_id=30&longplay_id=4913'
for match in (entry for entry in longplays if entry['url'] == wanted_url):
    print(match)

{'name': 'Waterworld', 'url': 'https://longplays.org/infusions/longplays/longplays.php?cat_id=30&longplay_id=4913'}


In [7]:
async def scrape_longplays(longplays, max_concurrent=3, wait_for_selector='table.tblDetail'):
    """Scrape author and download links from each longplay detail page.

    Args:
        longplays: Sequence of dicts shaped like ``{'name': str, 'url': str}``.
        max_concurrent: Maximum number of pages open at the same time.
        wait_for_selector: Selector that must appear before a page's HTML is
            read.

    Returns:
        ``{'longplays': [...]}`` where each item is
        ``{'name': str, 'authors': [{'username': str, 'url': str}, ...],
        'downloads': [str, ...]}``. Items follow the order of ``longplays``;
        pages that failed are reported on stdout and left out.
    """
    if not longplays:
        # Nothing to scrape: skip launching a browser altogether.
        return {'longplays': []}

    async with async_playwright() as p:
        browser = await p.firefox.launch()
        semaphore = asyncio.Semaphore(max_concurrent)

        async def scrape_longplay(longplay):
            """Return the metadata record for one longplay, or None on failure."""
            async with semaphore:
                # Random pause (held inside the semaphore) to spread out requests.
                await asyncio.sleep(random.uniform(1, 3))
                page = None
                try:
                    # get page content
                    page = await browser.new_page()
                    await page.goto(longplay['url'])
                    await page.wait_for_selector(wait_for_selector)
                    content = await page.content()

                    # extract longplay metadata
                    soup = BeautifulSoup(content, 'html.parser')
                    authors_ = soup.find_all('a', href=lambda x: x and 'author=' in x)
                    download_links_ = soup.find_all('a', href=lambda x: x and 'file_id=' in x)

                    # Resolve relative hrefs against the page they were found
                    # on rather than a notebook-level constant.
                    base_url = longplay['url']
                    authors = [
                        {'username': link.text, 'url': urljoin(base_url, link.get('href'))}
                        for link in authors_
                    ]
                    download_links = [urljoin(base_url, link.get('href')) for link in download_links_]

                    return {
                        'name': longplay['name'],
                        'authors': authors,
                        'downloads': download_links
                    }
                except Exception as e:
                    # Best effort: report the failure and keep scraping the rest.
                    print(f"Error scraping {longplay['url']}: {e}")
                    return None
                finally:
                    # Close the page exactly once, on success and failure alike.
                    if page is not None:
                        await page.close()

        try:
            # gather returns results in the order of its arguments, so the
            # output order is deterministic regardless of completion order.
            records = await tqdm_asyncio.gather(
                *[scrape_longplay(longplay) for longplay in longplays],
                desc="Scraping sites",
                total=len(longplays)
            )
        finally:
            await browser.close()

        return {'longplays': [record for record in records if record is not None]}

In [None]:
# Scrape the detail page of every entry in `longplays`, using the function's
# default concurrency limit and selector.
results = await scrape_longplays(longplays)

In [11]:
# Persist the scraped metadata as JSON indented by two spaces.
with open('longplay_metadata.json', 'w') as out_file:
    out_file.write(json.dumps(results, indent=2))