# Reddit Image Scraper with Metadata

This notebook scrapes images from a list of Reddit posts, saves them to a dataset directory, and records metadata for each image.

In [None]:
%pip install requests beautifulsoup4 Pillow

In [None]:
import requests
from bs4 import BeautifulSoup
from PIL import Image
import os
from urllib.parse import urlparse
import json
import re

In [None]:
# Reddit post URLs to scrape; this list is passed to scrape_and_save_images.
REDDIT_POST_URLS = [
    "https://www.reddit.com/r/SydneySweeney/comments/1new1eh/sydney_for_the_premiere_of_christy_during_the/"
]

# Directory the downloaded images are written to. The path is relative, so it
# resolves against the current working directory.
OUTPUT_DIR = "../../datasets/reddit/images/original/"
# File that save_metadata appends the collected records to, one JSON object per line.
METADATA_FILE = "../../datasets/reddit/metadata.jsonl"

In [None]:
def scrape_and_save_images(urls, output_dir, timeout=30):
    """Download the main image of each Reddit post and collect its metadata.

    For every URL the post page is fetched and parsed. The image is looked up
    first as an ``<img alt="Post image">`` and, failing that, as the first
    ``<img>`` inside the ``<div data-test-id="post-content">`` container. The
    image bytes are written to ``output_dir`` under the basename of the image
    URL path. Failures are reported with ``print`` and the affected post is
    skipped, so one bad URL does not abort the whole run.

    Args:
        urls: Iterable of Reddit post URLs.
        output_dir: Directory the images are written to; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout a stalled server would block the run forever.

    Returns:
        A list with one metadata dict per saved image, with the keys
        ``source_url``, ``image_filename``, ``subreddit``, ``title``,
        ``event`` and ``tags``.
    """
    # Function-scope import: the file only imports ``urlparse`` at top level.
    from urllib.parse import urljoin

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Loop-invariant, so built once for both the page and the image request.
    headers = {'User-Agent': 'Mozilla/5.0'}
    collected_metadata = []

    for url in urls:
        try:
            print(f"Processing {url}")
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # --- Metadata Extraction ---
            # A page without <title> or a URL without "/r/" should not cost
            # us the image, so fall back to neutral values instead of raising.
            title_tag = soup.find('title')
            post_title = title_tag.get_text() if title_tag else ''
            subreddit_match = re.search(r'/r/([^/]+)', url)
            subreddit = subreddit_match.group(1) if subreddit_match else None

            event_match = re.search(r'(premiere of \w+)', post_title.lower())
            event = event_match.group(1) if event_match else None

            # --- Image Scraping ---
            img_tag = soup.find('img', {'alt': 'Post image'})
            if not img_tag:
                img_container = soup.find('div', {'data-test-id': 'post-content'})
                if img_container:
                    img_tag = img_container.find('img')

            if not (img_tag and img_tag.get('src')):
                print(f"  Could not find a suitable image on {url}")
                continue

            # Resolve relative ("/x.jpg") and protocol-relative ("//host/x.jpg")
            # sources against the post URL; absolute URLs pass through unchanged.
            img_url = urljoin(url, img_tag['src'])
            print(f"  Found image URL: {img_url}")

            # Check the filename before downloading: a path ending in "/" has
            # no basename and would make open() target the directory itself.
            filename = os.path.basename(urlparse(img_url).path)
            if not filename:
                print(f"  Could not derive a filename from {img_url}")
                continue
            filepath = os.path.join(output_dir, filename)

            img_response = requests.get(img_url, headers=headers, timeout=timeout)
            img_response.raise_for_status()

            with open(filepath, 'wb') as f:
                f.write(img_response.content)
            print(f"  Saved image to {filepath}")

            # --- Append Metadata ---
            collected_metadata.append({
                'source_url': url,
                'image_filename': filename,
                'subreddit': subreddit,
                'title': post_title,
                'event': event,
                'tags': ['sydneysweeney'],  # Fixed tag list, not derived from the title
            })

        except requests.exceptions.RequestException as e:
            print(f"  Error fetching {url}: {e}")
        except Exception as e:
            # Best-effort boundary: report and move on to the next post.
            print(f"  An error occurred: {e}")
    return collected_metadata

def save_metadata(metadata_list, filepath):
    """Append metadata records to a JSON Lines file.

    Each item is serialized with ``json.dumps`` and written on its own line.
    The file is opened in append mode, so records from earlier runs are kept.

    Args:
        metadata_list: List of JSON-serializable records.
        filepath: Path of the output file; its parent directory is created
            if it does not exist yet.
    """
    # Create the parent directory so already-collected metadata is not lost
    # to a FileNotFoundError.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'a', encoding='utf-8') as f:
        f.writelines(json.dumps(item) + '\n' for item in metadata_list)
    print(f"{len(metadata_list)} metadata records saved to {filepath}")


# --- Main Execution ---
# Scrape every configured post, then persist metadata only when at least one
# image was actually saved (an empty result leaves the metadata file untouched).
scraped_data = scrape_and_save_images(urls=REDDIT_POST_URLS, output_dir=OUTPUT_DIR)
if scraped_data:
    save_metadata(metadata_list=scraped_data, filepath=METADATA_FILE)