In [3]:
GAMEBOY_LP_URL = 'https://longplays.org/infusions/longplays/longplays.php?cat_id=30'

In [1]:
import asyncio
import aiohttp
import random
import requests
import json
import re
import os
import httpx
from urllib.parse import urlparse, unquote
from pathlib import Path
from tqdm import tqdm
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm.asyncio import tqdm_asyncio
from tqdm.asyncio import tqdm_asyncio
from playwright.async_api import async_playwright
from urllib.parse import urlparse
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

# Scrape for Longplay URL

In [3]:
async def scrape_site(url, selector='table'):
    async with async_playwright() as p:
        browser = await p.firefox.launch()
        page = await browser.new_page()
        await page.goto(url)
        await page.wait_for_selector(selector)
        content = await page.inner_html(selector)
        await browser.close()
        return content

In [4]:
content = await scrape_site(GAMEBOY_LP_URL, selector='tbody')
soup = BeautifulSoup(content, 'html.parser')
links = soup.find_all('a')
longplays = [{'name': link.text, 'url': urljoin(GAMEBOY_LP_URL, link.get('href'))} for link in links if 'longplay_id=' in link.get('href')]

In [4]:
async def scrape_longplays(longplays, max_concurrent=3, wait_for_selector='table.tblDetail'):
    ''' assuming longplays: [{ 'name': '', url: '' }, ...]'''
    async with async_playwright() as p:
        browser = await p.firefox.launch()
        semaphore = asyncio.Semaphore(max_concurrent)
        data = []
        
        async def scrape_longplay(longplay):
            async with semaphore:
                await asyncio.sleep(random.uniform(1, 3))
                page = await browser.new_page()
                try:
                    # get page content
                    await page.goto(longplay['url'])
                    await page.wait_for_selector(wait_for_selector)
                    content = await page.content()
                    await page.close()
                    
                    # extract and store longplay metadata
                    soup = BeautifulSoup(content, 'html.parser')
                    authors_ = soup.find_all('a', href=lambda x: x and 'author=' in x)
                    download_links_ = soup.find_all('a', href=lambda x: x and 'file_id=' in x)
                    file_size_ = soup.find(string=re.compile(r'\d+\.\d+\s*MB'))

                    authors = [{'username': link.text, 'url': urljoin(GAMEBOY_LP_URL, link.get('href'))} for link in authors_]
                    download_links = [urljoin(GAMEBOY_LP_URL, link.get('href')) for link in download_links_]
                    file_size = file_size_.text
                    
                    data.append({
                        'name': longplay['name'],
                        'authors': authors,
                        'downloads': download_links,
                        'file_size': file_size
                    })
                    # return url, BeautifulSoup(content, 'html.parser')
                except Exception as e:
                    print(f"Error scraping {longplay['url']}: {e}")
                    await page.close()
                    return None

        await tqdm_asyncio.gather(
            *[scrape_longplay(longplay) for longplay in longplays],
            desc="Scraping sites",
            total=len(longplays)
        )
        
        await browser.close()
        return { 'longplays': data }

In [19]:
results = await scrape_longplays(longplays)

Scraping sites: 100%|██████████| 489/489 [14:09<00:00,  1.74s/it]


# Get Longplay Download Link

In [3]:
%cd VideoGen/

/teamspace/studios/this_studio/VideoGen


In [4]:
with open('data/longplay_metadata.json', 'r') as f:
    lp_data = json.load(f)

In [11]:
c = 0
for lp in lp_data['longplays']:
    c += len(lp['archive_links'])

In [12]:
c

498

In [62]:
async def get_actual_download_url(initial_url):
    async with async_playwright() as p:
        browser = None
        try:
            browser = await p.firefox.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()
            
            download_url = None
            
            async def handle_download(download):
                nonlocal download_url
                download_url = download.url
                await download.cancel()
            
            page.on('download', handle_download)
            
            try:
                await page.goto(initial_url)
            except Exception as e:
                # Only log error if we didn't get a download URL
                if not download_url:
                    print(f"Error: {e}")
                
            if download_url:
                return download_url
            return page.url
            
        finally:
            if browser:
                await browser.close()

In [78]:
url = await get_actual_download_url("https://longplays.org/infusions/longplays/longplays.php?file_id=11829")

In [79]:
url

'https://ia804503.us.archive.org/10/items/game-boy-longplay-rockman-world-3-jp/Game_Boy_Longplay_-_Rockman_World_3_-_JP.mkv'

In [80]:
for lp in lp_data['longplays']:
    if lp['id'] == 404:
        lp['archive_links'] = [ url ]
        # print(lp['archive_links'])
        

In [81]:
with open('data/longplay_metadata.json', 'w', encoding='utf-8') as file:
    json.dump(lp_data, file, indent=2)

In [None]:
for lp in lp_data['longplays']:
    urls = []
    for idx, dl_url in enumerate(lp['downloads']):
        print(f'getting archive link {idx} for {lp}')
        urls.append(await get_actual_download_url(dl_url))
    lp['archive_links'] = urls

In [24]:
for lp in lp_data['longplays']:
    if len(lp['archive_links']) == 0:
        print(lp['name'])

In [25]:
with open('VideoGen/data/longplay_metadata.json', 'w', encoding='utf-8') as file:
    json.dump(lp_data, file, indent=2)

# Download Files

In [5]:
def verify_file(path: str) -> tuple[bool, str]:
    """
    Verify if a file downloaded correctly
    
    Returns:
        tuple: (is_valid, error_message)
    """
    try:
        # Check if file exists
        if not os.path.exists(path):
            return False, "File does not exist"
            
        # Check if file is empty
        if os.path.getsize(path) == 0:
            return False, "Empty file (0 bytes)"
            
        # Try to read the file to check if it's corrupted
        with open(path, 'rb') as f:
            # Read first and last kb to verify file is readable
            f.read(1024)
            f.seek(-1024, 2)
            f.read()
        
        return True, ""
        
    except Exception as e:
        return False, f"Verification failed: {str(e)}"

In [6]:
async def download_file(url: str, semaphore, idx: int, url_idx: int, download_dir: str = ".") -> tuple[str, bool, str]:
    """
    Download a file and verify its integrity
    
    Returns:
        tuple: (output_path, is_valid, error_message)
    """
    async with semaphore:
        async with httpx.AsyncClient() as client:
            try:
                async with client.stream("GET", url) as response:
                    response.raise_for_status()
                    
                    if 'content-disposition' in response.headers:
                        cd = response.headers['content-disposition']
                        filename = cd.split('filename=')[-1].strip('"')
                    else:
                        filename = unquote(os.path.basename(urlparse(url).path))
                        if not filename:
                            filename = "download"
                    
                    output_path = os.path.join(download_dir, str(idx) + '_' + str(url_idx) + '_' + filename)
                    
                    if os.path.exists(output_path):
                        is_valid, error_msg = await verify_file(output_path)
                        return output_path, is_valid, error_msg
                    
                    with open(output_path, 'wb') as file:
                        async for chunk in response.aiter_bytes():
                            file.write(chunk)
                    
                    # Verify the downloaded file
                    is_valid, error_msg = await verify_file(output_path)
                    return output_path, is_valid, error_msg
                    
            except Exception as e:
                return "", False, f"Download failed: {str(e)}"

In [8]:
async def download_files(urls, download_dir='gameboy_longplays'):
    ''' urls: [..., (longplay id, [..., url, ...]), ... ] '''
    semaphore = asyncio.Semaphore(10)
    total_urls = len(urls)
    failed_downloads = []
    
    pbar = tqdm(total=total_urls, desc="Downloading files")
    
    try:
        async def download_with_progress(url, idx, url_idx):
            result = await download_file(url, semaphore, idx, url_idx, download_dir)
            pbar.update(1)
            return (url, *result)  # Include the URL in the result tuple
        
        tasks = []
        for idx, url_list in urls:
            for url_idx, url in enumerate(url_list):
                task = asyncio.create_task(download_with_progress(url, idx, url_idx))
                tasks.append(task)
        
        results = await asyncio.gather(*tasks, return_exceptions=True)
        
        for result in results:
            if isinstance(result, Exception):
                # Handle any exceptions that occurred during download
                print(f"Error during download: {result}")
                continue
                
            url, path, is_valid, error_msg = result
            if not is_valid:
                failed_downloads.append((url, path, error_msg))
    
    finally:
        pbar.close()
    
    # Report any failed downloads
    if failed_downloads:
        print("\nFailed downloads:")
        for url, path, error in failed_downloads:
            print(f"- URL: {url}")
            print(f"  Path: {path}")
            print(f"  Error: {error}")
            print()
    else:
        print("\nAll files downloaded and verified successfully!")

In [9]:
urls = []
for lp in lp_data['longplays']:
    urls.append((lp['id'], lp['archive_links']))

In [14]:
count = 0
for url in urls:
    count += len(url[1])

In [15]:
count

498

In [16]:
!ls

README.md  config  notebooks	     scripts   videogen
assets	   data    requirements.txt  setup.py


In [22]:
lps = {}
for file in Path('data/gameboy_longplays').glob('*'):
    name = file.name.split('_')
    idx, url_idx = name[0], name[1]
    if idx not in lps:
        lps[idx] = [ url_idx ]
    else:
        lps[idx].append(url_idx)

In [37]:
len([len(urls) for urls in lps.values()])

487

In [23]:
lps_true = {}
for lp in lp_data['longplays']:
    lps_true[lp['id']] = lp['archive_links']

In [32]:
lps_true[190]

['https://ia601200.us.archive.org/32/items/Longplays_ScHlAuChi_December_2023/Game_Boy_Longplay_-_Genesis_-_US_-_UL.mkv']

In [39]:
for (idx, urls) in lps_true.items():
    if str(idx) not in lps:
        print(f'{idx}: 0 / {len(lps_true[idx])}')
    elif len(lps[str(idx)]) != len(lps_true[idx]):
        print(f'{idx}: {len(lps[str(idx)])} / {len(lps_true[idx])}')

360: 0 / 1
376: 0 / 1


In [40]:
urls = [
    (360, ["https://dn720309.ca.archive.org/0/items/game-boy-longplay-last-action-hero-us/Game_Boy_Longplay_-_Last_Action_Hero_-_US.mkv"]),
    (376, ["https://dn720309.ca.archive.org/0/items/Game_Boy_Longplay_-_The_Adventures_of_Tintin_-_Prisoners_of_the_Sun_-_EU/Game_Boy_Longplay_-_The_Adventures_of_Tintin_-_Prisoners_of_the_Sun_-_EU.mkv"])
]

In [41]:
await download_files(urls, download_dir='data/gameboy_longplays')

Downloading files: 100%|██████████| 2/2 [00:04<00:00,  2.38s/it]


Failed downloads:
- URL: https://dn720309.ca.archive.org/0/items/game-boy-longplay-last-action-hero-us/Game_Boy_Longplay_-_Last_Action_Hero_-_US.mkv
  Path: 
  Error: Download failed: object tuple can't be used in 'await' expression

- URL: https://dn720309.ca.archive.org/0/items/Game_Boy_Longplay_-_The_Adventures_of_Tintin_-_Prisoners_of_the_Sun_-_EU/Game_Boy_Longplay_-_The_Adventures_of_Tintin_-_Prisoners_of_the_Sun_-_EU.mkv
  Path: 
  Error: Download failed: object tuple can't be used in 'await' expression






In [59]:
for lp in lp_data['longplays']:
    for link in lp['archive_links']:
        if 'https://' not in link and 'http://' not in link:
            print(f"{lp['name']}: {link}")

The Getaway: about:blank
Kirby's Dream Land 2: about:blank
Super Mario 4 (Unlicensed): about:blank
Rockman World 3: about:blank


In [19]:
async def download_files(url_dict, download_dir='gameboy_longplays'):
    semaphore = asyncio.Semaphore(10)
    total_urls = sum(len(url_list) for url_list in url_dict.values())
    
    with tqdm(total=total_urls, desc="Downloading files") as pbar:
        tasks = []
        for idx, url_list in url_dict.items():
            for url_idx, url in enumerate(url_list):
                task = asyncio.create_task(download_file(url, semaphore, idx, url_idx, download_dir))
                task.add_done_callback(lambda _: pbar.update(1))
                tasks.append(task)
        
        await asyncio.gather(*tasks)

In [50]:
%cd ..

/teamspace/studios/this_studio/VideoGen


In [53]:
semaphore = asyncio.Semaphore(10)
for url_idx, u in enumerate(urls[138]):
    print(u)
    # await download_file(u, semaphore=semaphore, idx=138, url_idx=url_idx, download_dir='data/gameboy_longplays')

about:blank


In [None]:
await download_files(url_dict=urls, download_dir='data/gameboy_longplays')

In [None]:
import os
import hashlib
from pathlib import Path

def check_downloads(directory):
    issues = []
    
    for path in Path(directory).iterdir():
        if not path.is_file():
            continue
            
        # Check if file is empty
        if path.stat().st_size == 0:
            issues.append(f"{path.name}: Empty file (0 bytes)")
            continue
            
        # Try to read the file to check if it's corrupted
        try:
            with open(path, 'rb') as f:
                # Read first and last kb to verify file is readable
                f.read(1024)
                f.seek(-1024, 2)
                f.read()
                
            # Optional: Calculate file hash
            # md5 = hashlib.md5()
            # with open(path, 'rb') as f:
            #     for chunk in iter(lambda: f.read(4096), b''):
            #         md5.update(chunk)
            # print(f"{path.name}: {md5.hexdigest()}")
                
        except Exception as e:
            issues.append(f"{path.name}: Possible corruption - {str(e)}")
    
    return issues

In [54]:
check_downloads('.')

[]

# Convert to MP4

In [6]:
id_names = {}
for lp in lp_data['longplays']:
    id_names[lp['id']] = f"{lp['id']}_{lp['name'].replace(' ','_')}"

In [7]:
print(id_names)

{0: '0_Star_Wars', 1: '1_Donkey_Kong_Land_III', 2: '2_Elevator_Action', 3: '3_Zettai_Muteki_Raijin-oh', 4: '4_Uoozu', 5: '5_Rubble_Saver_II', 6: '6_Rubble_Saver', 7: '7_Game_Boy_Gallery_2', 8: "8_Milon's_Secret_Castle", 9: '9_Game_Boy_Gallery_(JP)', 10: '10_Penguin-kun_Wars_VS.', 11: '11_Oddworld_Adventures', 12: '12_King_of_the_Zoo', 13: '13_Arcade_Classic_No._2:_Centipede_/_Millipede', 14: '14_Arcade_Classic_No._3:_Galaga_/_Galaxian', 15: '15_Arcade_Classic_No._1:_Asteroids_/_Missile_Command', 16: '16_Jimmy_Connors_no_Pro_Tennis_Tour', 17: '17_Yannick_Noah_Tennis', 18: '18_In_Your_Face', 19: '19_TrailBlazers:_Death_Track_(Prototype_/_No_Audio)', 20: '20_Wing_Warriors_(Homebrew)', 21: '21_Bomberman_GB', 22: '22_Atomic_Punk', 23: '23_The_Jetsons:_Robot_Panic', 24: '24_A-Force:_Armour_Force_(Unlicensed)', 25: '25_Arcade_Classics:_Breakout_&_Battlezone', 26: '26_Jimmy_Connors_Tennis', 27: '27_Track_Meet:_Mezase!_Barcelona', 28: '28_Power_Racer', 29: "29_Mario's_Picross", 30: '30_Spartan_

In [21]:
async def convert_mkv_to_mp4(input_path: str,
                            semaphore: asyncio.Semaphore,
                            output_path):
    """
    Async function to convert MKV file to MP4 using FFmpeg.
    
    Args:
        input_path: Path to input MKV file
        output_path: Path for output MP4 file (optional)
        semaphore: Asyncio semaphore for controlling concurrent conversions
    
    Returns:
        bool: True if conversion was successful
    """
    async def run_ffmpeg(stream):
        loop = asyncio.get_event_loop()
        await loop.run_in_executor(None, lambda: ffmpeg.run(stream, overwrite_output=True))

    async with semaphore:
        try:
            input_path = Path(input_path)
            if not input_path.exists():
                raise FileNotFoundError(f"Input file not found: {input_path}")
                
            output_path_final = Path(output_path) if output_path else input_path.with_suffix('.mp4')
            
            try:
                # Try stream copy first (fast)
                stream = ffmpeg.input(str(input_path))
                stream = ffmpeg.output(stream, str(output_path_final),
                                     vcodec='copy',
                                     acodec='copy',
                                     loglevel='error')
                await run_ffmpeg(stream)
                
            except ffmpeg.Error:
                # If stream copy fails, try re-encoding
                stream = ffmpeg.input(str(input_path))
                stream = ffmpeg.output(stream, str(output_path_final),
                                     vcodec='libx264',
                                     acodec='aac',
                                     loglevel='error')
                await run_ffmpeg(stream)
                
        except ffmpeg.Error as e:
            print(f"Conversion failed: {e.stderr.decode()}")

In [10]:
!ls

VideoGen


In [None]:
%cd gameboy_longplays/

In [None]:
!pip install ffmpeg-python

In [None]:
!mkdir -p "longplay_mp4_files" && for f in *.mkv; do ffmpeg -i "$f" -c copy "longplay_mp4_files/${f%.mkv}.mp4"; done

In [None]:
%ls

In [61]:
cp *.mp4 longplay_mp4_files/

In [4]:
from pathlib import Path
import ffmpeg

In [None]:
%cd gameboy_longplays/

In [42]:
ids = {}
for file in Path('.').glob('*'):
    idx = int(file.name.split('_')[0])
    file_idx = int(file.name.split('_')[1])
    if ids.get(idx, None) is not None:
        ids[idx] += 1
    else:
        ids[idx] = 1

In [None]:
errors = []
for lp in lp_data['longplays']:
    if ids[int(lp['id'])] < len(lp['archive_links']):
        error.append(lp['archive_links'][ids[lp['id']]-1])

In [38]:
c

494