In [1]:
# Listing page on longplays.org (cat_id=30) that this notebook scrapes; it is
# also the base URL used to resolve relative links found on that page.
GAMEBOY_LP_URL = 'https://longplays.org/infusions/longplays/longplays.php?cat_id=30'

In [2]:
import asyncio
import json
import os
import random
from email.message import Message
from urllib.parse import urljoin
from urllib.parse import urlparse

import aiohttp
import requests
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

In [3]:
async def scrape_site(url, selector='table'):
    """Render `url` in headless Firefox and return the inner HTML of `selector`.

    Args:
        url: Page to load.
        selector: CSS selector to wait for; its inner HTML is returned.

    Returns:
        The inner HTML of the element matched by `selector`, as a string.

    Raises:
        Whatever Playwright raises on navigation failure or when the selector
        does not appear in time. The browser is closed in either case.
    """
    async with async_playwright() as p:
        browser = await p.firefox.launch()
        try:
            page = await browser.new_page()
            await page.goto(url)
            await page.wait_for_selector(selector)
            return await page.inner_html(selector)
        finally:
            # Close the browser even when navigation or the wait fails.
            await browser.close()

In [4]:
# Fetch the rendered listing and keep the inner HTML of the element matching
# `tbody`. Top-level `await` relies on the notebook kernel's running event loop.
content = await scrape_site(GAMEBOY_LP_URL, selector='tbody')

In [5]:
# Parse the scraped listing fragment so its anchors can be queried below.
soup = BeautifulSoup(content, 'html.parser')

In [6]:
# Collect every anchor, then keep only those that point at an individual
# longplay page. Anchors without an href return None from `.get('href')`, so
# fall back to '' to keep the membership test from raising TypeError.
links = soup.find_all('a')
longplays = [
    {'name': link.text, 'url': urljoin(GAMEBOY_LP_URL, link.get('href'))}
    for link in links
    if 'longplay_id=' in (link.get('href') or '')
]

In [12]:
# Spot-check: print the scraped entry for one known longplay URL, if present.
for lp in longplays:
    if lp['url'] != 'https://longplays.org/infusions/longplays/longplays.php?cat_id=30&longplay_id=4913':
        continue
    print(lp)

{'name': 'Waterworld', 'url': 'https://longplays.org/infusions/longplays/longplays.php?cat_id=30&longplay_id=4913'}


In [7]:
def _extract_longplay_metadata(html, base_url):
    """Pull author and download links out of one longplay detail page's HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    author_links = soup.find_all('a', href=lambda href: href and 'author=' in href)
    file_links = soup.find_all('a', href=lambda href: href and 'file_id=' in href)
    return {
        'authors': [
            {'username': link.text, 'url': urljoin(base_url, link.get('href'))}
            for link in author_links
        ],
        'downloads': [urljoin(base_url, link.get('href')) for link in file_links],
    }


async def scrape_longplays(longplays, max_concurrent=3, wait_for_selector='table.tblDetail'):
    """Scrape author and download links for each longplay detail page.

    Args:
        longplays: Iterable of dicts shaped like ``{'name': str, 'url': str}``.
        max_concurrent: Maximum number of pages open at the same time.
        wait_for_selector: CSS selector that must appear before a page's HTML
            is read.

    Returns:
        ``{'longplays': [{'name', 'authors', 'downloads'}, ...]}`` in the same
        order as the input. Pages that fail to scrape are reported on stdout
        and left out of the result.
    """
    async with async_playwright() as p:
        browser = await p.firefox.launch()
        semaphore = asyncio.Semaphore(max_concurrent)

        async def scrape_longplay(longplay):
            async with semaphore:
                # Random delay so requests are not fired in a tight burst.
                await asyncio.sleep(random.uniform(1, 3))
                page = None
                try:
                    page = await browser.new_page()
                    await page.goto(longplay['url'])
                    await page.wait_for_selector(wait_for_selector)
                    content = await page.content()
                    # Relative links are resolved against the page they came from.
                    metadata = _extract_longplay_metadata(content, longplay['url'])
                    return {'name': longplay['name'], **metadata}
                except Exception as e:
                    print(f"Error scraping {longplay['url']}: {e}")
                    return None
                finally:
                    # Single close path for both success and failure.
                    if page is not None:
                        await page.close()

        try:
            # gather() returns results in input order, so the output is
            # deterministic regardless of which page finishes first.
            scraped = await tqdm_asyncio.gather(
                *[scrape_longplay(longplay) for longplay in longplays],
                desc="Scraping sites",
                total=len(longplays)
            )
        finally:
            await browser.close()

        return {'longplays': [record for record in scraped if record is not None]}

In [None]:
# Scrape every detail page found in the listing; `results` is the dict that
# is written to longplay_metadata.json.
results = await scrape_longplays(longplays)

In [11]:
# Persist the scraped metadata as pretty-printed JSON next to the notebook.
serialized_results = json.dumps(results, indent=2)
with open('longplay_metadata.json', 'w') as metadata_file:
    metadata_file.write(serialized_results)

In [None]:
# tenacity detects coroutine functions on its own, so no extra flag is needed
# to retry an `async def`.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=(retry_if_exception_type(aiohttp.ClientError) | retry_if_exception_type(asyncio.TimeoutError)),
)
async def get_actual_download_url(initial_url):
    """Acquire the actual download URL after following redirects.

    Args:
        initial_url: URL that may redirect to the real file location.

    Returns:
        The final URL, as a string, once all redirects have been followed.

    Raises:
        tenacity.RetryError: if all three attempts fail with an aiohttp client
            error (including non-2xx responses) or a timeout.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(
            initial_url,
            allow_redirects=True,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            },
            timeout=aiohttp.ClientTimeout(total=30)
        ) as response:
            response.raise_for_status()
            return str(response.url)

In [None]:
def _download_file_blocking(url, output_path=None):
    """Stream `url` to disk with a progress bar; blocking, meant for a worker thread."""
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()

        if output_path is None:
            output_path = get_filename_from_response(url, response)

        # A missing Content-Length means the size is unknown; pass None so the
        # progress bar does not show a bogus total of 0.
        total_size = int(response.headers.get('content-length', 0)) or None

        try:
            with open(output_path, 'wb') as file, tqdm(
                    desc=output_path,
                    total=total_size,
                    unit='iB',
                    unit_scale=True,
                    unit_divisor=1024,
                    colour='green'
                 ) as progress_bar:
                for chunk in response.iter_content(chunk_size=8192):
                    progress_bar.update(file.write(chunk))
        except BaseException:
            # Remove the partial file, then let the original error propagate
            # so the retry policy can see its real type.
            if os.path.exists(output_path):
                os.remove(output_path)
            raise

    return output_path


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(requests.RequestException),
)
async def download_file(url, output_path=None):
    """Download `url` to `output_path`, retrying transient HTTP failures.

    Args:
        url: Direct download URL.
        output_path: Destination path. When None, the name is derived from the
            response via `get_filename_from_response`.

    Returns:
        The path the file was written to.

    Raises:
        tenacity.RetryError: if all three attempts fail with a
            `requests.RequestException`.
        OSError: if the destination cannot be written (not retried).
    """
    # `requests` is blocking, so run it in the default executor to keep
    # concurrent downloads from stalling the event loop.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _download_file_blocking, url, output_path)

In [2]:
%ls

[0m[01;34mVideoGen[0m/  [01;32mlongplay_metadata.json[0m*


In [None]:
def get_filename_from_response(url, response):
    """Choose a local filename for a downloaded response.

    Args:
        url: URL the response was fetched from.
        response: Object exposing a mapping-like `headers` attribute.

    Returns:
        The filename from `Content-Disposition` when one is given, reduced to
        its final path component. Otherwise the URL's basename with an
        extension taken from `Content-Type`, falling back to the URL's own
        extension. The base is 'downloaded_file' when the URL path has no name.
    """
    content_type = response.headers.get('Content-Type', '').split(';')[0].strip().lower()
    ext_map = {
        # Video formats
        'video/x-matroska': '.mkv',
        'video/mp4': '.mp4',
        'video/webm': '.webm',
        'video/ogg': '.ogv',
        'video/x-msvideo': '.avi',
        'video/quicktime': '.mov',
        'video/x-ms-wmv': '.wmv',
        'video/3gpp': '.3gp',
        'video/x-flv': '.flv',
        # Common non-video formats
        'application/pdf': '.pdf',
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
        'text/csv': '.csv',
        'application/zip': '.zip',
        'text/plain': '.txt',
        'application/json': '.json',
        'audio/mpeg': '.mp3',
        'audio/wav': '.wav',
    }

    # Try Content-Disposition first. Parsing it as a MIME header handles
    # quoting and trailing parameters such as `; size=...`.
    disposition = response.headers.get('Content-Disposition')
    if disposition:
        header = Message()
        header['Content-Disposition'] = disposition
        declared_name = header.get_filename()
        if declared_name:
            # The header is server-controlled: drop any directory components
            # so the file cannot be written outside the working directory.
            safe_name = os.path.basename(declared_name.replace('\\', '/'))
            if safe_name:
                return safe_name

    # Fall back to the URL basename, preferring the content-type extension and
    # keeping the URL's own extension when the content type is unknown.
    base, url_extension = os.path.splitext(os.path.basename(urlparse(url).path))
    extension = ext_map.get(content_type) or url_extension
    return f"{base or 'downloaded_file'}{extension}"

In [None]:
async def download_files(download_urls=None):
    """Resolve redirecting download links and download every file concurrently.

    Args:
        download_urls: Iterable of initial download URLs. When omitted, the
            notebook-level `urls` variable is used; a NameError is raised if
            that variable has not been defined.

    Returns:
        One entry per successfully resolved URL, in order: the value returned
        by `download_file`, or the exception raised for that download.
    """
    if download_urls is None:
        download_urls = urls  # notebook-level variable
    download_urls = list(download_urls)

    # return_exceptions=True keeps one bad link from aborting the whole batch.
    resolved = await asyncio.gather(
        *[get_actual_download_url(initial_url) for initial_url in download_urls],
        return_exceptions=True
    )

    final_urls = []
    for initial_url, outcome in zip(download_urls, resolved):
        if isinstance(outcome, BaseException):
            print(f"Could not resolve {initial_url}: {outcome}")
        else:
            final_urls.append(outcome)

    # No output path is passed, so download_file picks the filename itself.
    return await asyncio.gather(
        *[download_file(final_url) for final_url in final_urls],
        return_exceptions=True
    )