In [None]:
import requests
import time
import json
from datetime import datetime
from tqdm import tqdm
from PIL import Image
import io
from bs4 import BeautifulSoup
import os
import base64
import google.generativeai as genai
from PIL import UnidentifiedImageError
import random
from dotenv import load_dotenv
# Load settings from a .env file into the environment before any os.getenv() call below reads them.
load_dotenv()
def create_session_with_browser_cookies(discourse_url, cookies):
    """Create a requests session pre-loaded with the given browser cookies.

    Args:
        discourse_url: Base URL of the Discourse site. Only its host name is
            used, as the domain the cookies are scoped to.
        cookies: Mapping of cookie name to cookie value.

    Returns:
        A ``requests.Session`` carrying every cookie in ``cookies``.

    Raises:
        SystemExit: If any cookie value is missing or empty.
    """
    from urllib.parse import urlparse

    # Validate first so a misconfigured environment fails before a session is built.
    if not all(cookies.values()):
        print("Error: One or more cookie environment variables are not set.")
        print("Please check your .env file.")
        raise SystemExit(1)

    # Scope cookies to the bare host name: a port, path or trailing slash in
    # the URL must not end up in the cookie domain.
    target = discourse_url if "//" in discourse_url else f"//{discourse_url}"
    domain = urlparse(target).hostname

    session = requests.Session()
    for name, value in cookies.items():
        session.cookies.set(name, value, domain=domain)
    return session

# Hand the Gemini client the API key stored in the GEMINI_API_KEY environment variable.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

def fetch_all_posts_from_topic(topic_id):
    """
    Retrieve every post of one Discourse topic.

    The topic JSON lists all post IDs under ``post_stream.stream`` and includes
    some of the posts under ``post_stream.posts``. Posts that are not included
    there are requested from the topic's ``posts.json`` endpoint in batches of
    20 IDs. Any exception is reported through ``tqdm.write`` and results in an
    empty list.
    """
    batch_size = 20
    try:
        # First request: topic metadata plus the posts that come with it.
        topic_resp = session.get(f"{DISCOURSE_URL}/t/{topic_id}.json", timeout=30)
        topic_resp.raise_for_status()
        topic_data = topic_resp.json()

        stream_ids = topic_data.get("post_stream", {}).get("stream", [])
        initial_posts = topic_data.get("post_stream", {}).get("posts", [])
        seen_ids = {post['id'] for post in initial_posts}

        missing_ids = [post_id for post_id in stream_ids if post_id not in seen_ids]
        collected = list(initial_posts)

        # Follow-up requests: pull the posts still missing, one batch at a time.
        posts_url = f"{DISCOURSE_URL}/t/{topic_id}/posts.json"
        offset = 0
        while offset < len(missing_ids):
            batch = missing_ids[offset:offset + batch_size]
            batch_resp = session.get(posts_url, params={'post_ids[]': batch}, timeout=30)
            batch_resp.raise_for_status()
            batch_posts = batch_resp.json().get('post_stream', {}).get('posts', [])
            collected.extend(batch_posts)

            time.sleep(0.5)  # pause between requests to go easy on the server
            offset += batch_size

        return collected

    except Exception as e:
        tqdm.write(f"Error fetching posts from topic {topic_id}: {e}")
        return []


def get_text_from_image_data(image_data, max_retries=10):
    """Extract embedded text and a short context note from an image via Gemini.

    Args:
        image_data: Raw encoded image bytes in any format Pillow can open.
        max_retries: Maximum number of Gemini API attempts.

    Returns:
        The stripped model response, or ``""`` when OCR is disabled through the
        ``ENABLE_OCR`` environment variable, the payload is under 2000 bytes,
        the image cannot be decoded, or every API attempt fails.
    """
    if os.getenv("ENABLE_OCR", "1") != "1":
        return ""

    # Cheap size gate before any decoding or API work.
    if len(image_data) < 2000:
        print("[Skip] Image too small, likely emoji or icon")
        return ""

    # Decode and convert once, outside the retry loop: decoding the same bytes
    # is deterministic, so retrying a decode failure cannot succeed.
    try:
        Image.open(io.BytesIO(image_data)).verify()
        # verify() leaves the image object unusable, so open a fresh one.
        img = Image.open(io.BytesIO(image_data))
        buffered = io.BytesIO()
        img.save(buffered, format="PNG")
        image_bytes = buffered.getvalue()
    except UnidentifiedImageError:
        print("[Gemini OCR Error] Skipped invalid image (not recognized format)")
        return ""
    except Exception as e:
        print(f"[Gemini OCR Error] Could not decode image: {e}")
        return ""

    prompt = """ You are an OCR and image-context assistant. Return exactly two sentences and nothing else.
1. First sentence: verbatim extract of all text in the image.
2. Second sentence: a concise explanation of the image’s context.
Do not include headings, labels, bullet points, or extra commentary."""

    for attempt in range(1, max_retries + 1):
        try:
            model = genai.GenerativeModel("gemini-2.0-flash")
            response = model.generate_content(
                [
                    {
                        "mime_type": "image/png",
                        "data": image_bytes
                    },
                    prompt
                ]
            )
            text = response.text.strip()
            print(text)
            time.sleep(5)  # Rate limit spacing
            return text

        except Exception as e:
            if "429" in str(e) or "Rate limit" in str(e):
                # Rate-limited: back off for a long, jittered interval.
                wait_time = 90 + random.uniform(0, 15)
                print(f"[Rate Limit] Attempt {attempt}/{max_retries} — Waiting {wait_time:.1f}s before retrying...")
                time.sleep(wait_time)
            else:
                print(f"[Gemini OCR Error] Attempt {attempt}/{max_retries} — Error: {e}")
                time.sleep(5)  # Brief wait for transient errors

    print("[Gemini OCR Error] Max retries exceeded. OCR failed.")
    return ""

# --- SETUP ---
OUTPUT_FILE = "tds_discourse_data.json"
DISCOURSE_URL = "https://discourse.onlinedegree.iitm.ac.in"
COURSE_CATEGORY_SLUG = "courses/tds-kb/34"  # Specific slug for "Tools in Data Science"

# Session cookies are read from environment variables rather than hard-coded here.
DISCOURSE_COOKIE_T = os.getenv("DISCOURSE_COOKIE_T")
DISCOURSE_COOKIE_SESSION = os.getenv("DISCOURSE_COOKIE_SESSION")
browser_cookies = {
    '_t': DISCOURSE_COOKIE_T,
    '_forum_session': DISCOURSE_COOKIE_SESSION,
}

session = create_session_with_browser_cookies(DISCOURSE_URL, browser_cookies)

# Verify login by checking the current session
print("Verifying authentication...")
response = session.get(f"{DISCOURSE_URL}/session/current.json", timeout=20)
try:
    authenticated = response.status_code == 200 and "current_user" in response.json()
except ValueError:
    # A body that is not valid JSON cannot confirm a logged-in user.
    authenticated = False
if not authenticated:
    print("Authentication failed. Please check your cookies in the .env file.")
    # exit() is a helper for interactive shells; raise SystemExit with a
    # non-zero status so the failure is visible to whatever ran the script.
    raise SystemExit(1)
print("Authentication successful.")


# Define time range for posts (start inclusive, end exclusive)
start_date = datetime(2025, 1, 1)
end_date = datetime(2025, 4, 15)

# Get topics from the specified category, one listing page at a time.
# NOTE(review): the recorded run below filled all 15 pages (450 topics), so the
# category may hold more topics than this cap allows — confirm and raise if needed.
MAX_TOPIC_PAGES = 15
topics = []
print(f"Fetching topics from category '{COURSE_CATEGORY_SLUG}'...")
for page in range(MAX_TOPIC_PAGES):
    url = f"{DISCOURSE_URL}/c/{COURSE_CATEGORY_SLUG}.json?page={page}"
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
        # Parse inside the try so a malformed body stops pagination cleanly
        # instead of crashing the script.
        data = resp.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Failed to fetch topics on page {page}: {e}")
        break

    page_topics = data.get('topic_list', {}).get('topics', [])
    if not page_topics:
        print("No more topics found.")
        break

    topics.extend(page_topics)
    print(f"Fetched {len(page_topics)} topics from page {page}. Total: {len(topics)}")
    time.sleep(1)  # Be polite to the server

print(f"\nFetched {len(topics)} total topics.")

# Fetch and process posts within the date range
all_posts = []
print("Collecting posts from topics, processing images to base64, and running OCR...")
BACKUP_INTERVAL = 50  # write backup.json every time this many posts have been collected
for topic in tqdm(topics, desc="Processing topics"):
    topic_id = topic['id']
    posts = fetch_all_posts_from_topic(topic_id)

    for post in posts:
        try:
            # .get() turns a missing timestamp into a TypeError, handled below.
            created_at = datetime.strptime(post.get('created_at'), "%Y-%m-%dT%H:%M:%S.%fZ")
        except (ValueError, TypeError):
            continue  # Skip post if date format is wrong or missing

        if not (start_date <= created_at < end_date):
            continue

        post_content_html = post.get("cooked", "")
        soup = BeautifulSoup(post_content_html, 'html.parser')

        images_base64 = []
        extracted_image_text = ""

        for img_tag in soup.find_all('img'):
            img_url = img_tag.get('src')
            if not img_url:
                continue
            if 'avatar' in img_url or img_url.endswith('.svg'):
                continue
            if img_url.startswith('//'):
                # Protocol-relative URL: it already names its host, so only add a scheme.
                img_url = 'https:' + img_url
            elif img_url.startswith('/'):
                img_url = DISCOURSE_URL + img_url

            try:
                img_response = session.get(img_url, timeout=20)
                img_response.raise_for_status()
                image_data = img_response.content

                # Validate the bytes, then open a fresh image: an image object
                # cannot be used for pixel access after verify(), which caused
                # "'NoneType' object has no attribute 'seek'" on grayscale images.
                Image.open(io.BytesIO(image_data)).verify()
                img = Image.open(io.BytesIO(image_data))

                # Skip images that are too small (likely emojis or icons)
                if img.width < 32 or img.height < 32:
                    tqdm.write(f"  [Skipped] Tiny image ({img.width}x{img.height}) in post {post.get('id')}")
                    continue

                # Skip single-band grayscale images with fewer than 10 distinct pixel values
                if img.getbands() == ('L',) and len(set(img.getdata())) < 10:
                    tqdm.write(f"  [Skipped] Low-info grayscale image in post {post.get('id')}")
                    continue

                # Encode the original bytes to base64
                images_base64.append(base64.b64encode(image_data).decode('utf-8'))

                if len(image_data) < 2000:
                    tqdm.write(f"[Skip OCR] Too small: {len(image_data)} bytes")
                    continue

                # OCR is currently switched off; re-enable by uncommenting:
                # ocr_text = get_text_from_image_data(image_data)
                # if ocr_text:
                #     extracted_image_text += ocr_text + "\n---\n"

            except UnidentifiedImageError:
                tqdm.write(f"  [Image Error] Skipped: not a valid image for post {post.get('id')}")
            except Exception as img_e:
                tqdm.write(f"  [Image Error] for post {post.get('id')}: {img_e}")

        all_posts.append({
            "source": "discourse",
            "id": f"discourse_{post.get('id')}",
            "title": topic.get("title", "Untitled Topic"),
            "content_text": soup.get_text(separator="\n", strip=True),
            "images_base64": images_base64,  # list of base64-encoded image payloads
            "extracted_image_text": extracted_image_text.strip(),
            "url": f"{DISCOURSE_URL}/t/{topic_id}/{post.get('post_number', 1)}",
            "metadata": {
                "username": post.get("username", "unknown"),
                "created_at": created_at.isoformat(),
                "topic_id": topic_id,
                "post_number": post.get("post_number", 0)
            }
        })
        if len(all_posts) % BACKUP_INTERVAL == 0:
            with open("backup.json", "w", encoding="utf-8") as f:
                json.dump(all_posts, f, indent=2, ensure_ascii=False)
            # tqdm.write keeps the message from breaking the progress bar.
            tqdm.write(f"[💾] Backup saved with {len(all_posts)} posts")
    time.sleep(1)  # pause between topics

print(f"\nCollected {len(all_posts)} posts.")

# Save the unified data to a file
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_posts, f, indent=2, ensure_ascii=False)

print(f"Scraping complete. Saved {len(all_posts)} posts to {OUTPUT_FILE}")

Verifying authentication...
Authentication successful.
Fetching topics from category 'courses/tds-kb/34'...
Fetched 30 topics from page 0. Total: 30
Fetched 30 topics from page 1. Total: 60
Fetched 30 topics from page 2. Total: 90
Fetched 30 topics from page 3. Total: 120
Fetched 30 topics from page 4. Total: 150
Fetched 30 topics from page 5. Total: 180
Fetched 30 topics from page 6. Total: 210
Fetched 30 topics from page 7. Total: 240
Fetched 30 topics from page 8. Total: 270
Fetched 30 topics from page 9. Total: 300
Fetched 30 topics from page 10. Total: 330
Fetched 30 topics from page 11. Total: 360
Fetched 30 topics from page 12. Total: 390
Fetched 30 topics from page 13. Total: 420
Fetched 30 topics from page 14. Total: 450

Fetched 450 total topics.
Collecting posts from topics, processing images to base64, and running OCR...


Processing topics:   4%|▎         | 16/450 [00:55<30:33,  4.22s/it]

[💾] Backup saved with 50 posts
[Skip OCR] Too small: 1344 bytes
[💾] Backup saved with 100 posts


Processing topics:   4%|▎         | 16/450 [00:58<30:33,  4.22s/it]

[Skip OCR] Too small: 895 bytes
  [Skipped] Tiny image (690x12) in post 590891


Processing topics:   4%|▎         | 16/450 [00:59<30:33,  4.22s/it]

  [Skipped] Tiny image (690x22) in post 590974


Processing topics:   5%|▌         | 23/450 [01:14<18:27,  2.59s/it]

[💾] Backup saved with 150 posts


Processing topics:  11%|█         | 50/450 [01:57<08:59,  1.35s/it]

[Skip OCR] Too small: 699 bytes


Processing topics:  11%|█         | 50/450 [01:58<08:59,  1.35s/it]

[Skip OCR] Too small: 1437 bytes
[💾] Backup saved with 200 posts
[💾] Backup saved with 250 posts


Processing topics:  12%|█▏        | 56/450 [02:11<13:44,  2.09s/it]

[💾] Backup saved with 300 posts


Processing topics:  12%|█▏        | 56/450 [02:41<13:44,  2.09s/it]

[Skip OCR] Too small: 1604 bytes
[💾] Backup saved with 350 posts


Processing topics:  12%|█▏        | 56/450 [02:43<13:44,  2.09s/it]

[Skip OCR] Too small: 1314 bytes
[Skip OCR] Too small: 1318 bytes


Processing topics:  12%|█▏        | 56/450 [02:43<13:44,  2.09s/it]

[Skip OCR] Too small: 1045 bytes
[Skip OCR] Too small: 1052 bytes
[💾] Backup saved with 400 posts


Processing topics:  12%|█▏        | 56/450 [02:46<13:44,  2.09s/it]

[Skip OCR] Too small: 886 bytes
[💾] Backup saved with 450 posts
[💾] Backup saved with 500 posts
[💾] Backup saved with 550 posts
[💾] Backup saved with 600 posts


Processing topics:  12%|█▏        | 56/450 [02:53<13:44,  2.09s/it]

[Skip OCR] Too small: 1407 bytes


Processing topics:  12%|█▏        | 56/450 [02:53<13:44,  2.09s/it]

[Skip OCR] Too small: 1387 bytes


Processing topics:  12%|█▏        | 56/450 [02:54<13:44,  2.09s/it]

[Skip OCR] Too small: 1387 bytes


Processing topics:  12%|█▏        | 56/450 [02:54<13:44,  2.09s/it]

[Skip OCR] Too small: 1849 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes


Processing topics:  12%|█▏        | 56/450 [02:54<13:44,  2.09s/it]

[Skip OCR] Too small: 1024 bytes


Processing topics:  12%|█▏        | 56/450 [02:55<13:44,  2.09s/it]

[Skip OCR] Too small: 1489 bytes
[💾] Backup saved with 650 posts
[💾] Backup saved with 700 posts
[💾] Backup saved with 750 posts


Processing topics:  12%|█▏        | 56/450 [03:00<13:44,  2.09s/it]

[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes


Processing topics:  12%|█▏        | 56/450 [03:00<13:44,  2.09s/it]

[Skip OCR] Too small: 886 bytes
[Skip OCR] Too small: 886 bytes
[Skip OCR] Too small: 886 bytes


Processing topics:  12%|█▏        | 56/450 [03:01<13:44,  2.09s/it]

[Skip OCR] Too small: 1941 bytes


Processing topics:  12%|█▏        | 56/450 [03:01<13:44,  2.09s/it]

[Skip OCR] Too small: 1941 bytes


Processing topics:  12%|█▏        | 56/450 [03:02<13:44,  2.09s/it]

  [Image Error] for post 619143: 'NoneType' object has no attribute 'seek'
[Skip OCR] Too small: 1941 bytes


Processing topics:  12%|█▏        | 56/450 [03:02<13:44,  2.09s/it]

[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 886 bytes
[Skip OCR] Too small: 886 bytes
[Skip OCR] Too small: 886 bytes
[💾] Backup saved with 800 posts


Processing topics:  15%|█▌        | 68/450 [03:39<10:38,  1.67s/it]

[Skip OCR] Too small: 1449 bytes


Processing topics:  15%|█▌        | 68/450 [03:39<10:38,  1.67s/it]

[Skip OCR] Too small: 1449 bytes
[💾] Backup saved with 850 posts
[💾] Backup saved with 900 posts
[💾] Backup saved with 950 posts


Processing topics:  15%|█▌        | 68/450 [03:46<10:38,  1.67s/it]

[Skip OCR] Too small: 1387 bytes
[💾] Backup saved with 1000 posts
[💾] Backup saved with 1050 posts


Processing topics:  15%|█▌        | 68/450 [03:50<10:38,  1.67s/it]

  [Image Error] for post 615124: 'NoneType' object has no attribute 'seek'


Processing topics:  15%|█▌        | 68/450 [03:52<10:38,  1.67s/it]

  [Image Error] for post 615988: 'NoneType' object has no attribute 'seek'
[💾] Backup saved with 1100 posts


Processing topics:  15%|█▌        | 68/450 [03:53<10:38,  1.67s/it]

[Skip OCR] Too small: 967 bytes


Processing topics:  15%|█▌        | 68/450 [03:54<10:38,  1.67s/it]

[Skip OCR] Too small: 941 bytes


Processing topics:  15%|█▌        | 68/450 [03:55<10:38,  1.67s/it]

[Skip OCR] Too small: 1675 bytes
[💾] Backup saved with 1150 posts


Processing topics:  15%|█▌        | 68/450 [04:00<10:38,  1.67s/it]

  [Image Error] for post 616584: 'NoneType' object has no attribute 'seek'
[💾] Backup saved with 1200 posts


Processing topics:  15%|█▌        | 68/450 [04:01<10:38,  1.67s/it]

[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1387 bytes
[💾] Backup saved with 1250 posts


Processing topics:  16%|█▌        | 72/450 [04:09<37:56,  6.02s/it]

[💾] Backup saved with 1300 posts


Processing topics:  18%|█▊        | 79/450 [04:21<10:59,  1.78s/it]

[Skip OCR] Too small: 1924 bytes
  [Skipped] Tiny image (16x16) in post 618184


Processing topics:  18%|█▊        | 83/450 [04:26<09:18,  1.52s/it]

[💾] Backup saved with 1350 posts


Processing topics:  19%|█▉        | 86/450 [04:34<12:23,  2.04s/it]

[💾] Backup saved with 1400 posts


Processing topics:  20%|█▉        | 88/450 [04:40<11:05,  1.84s/it]

[Skip OCR] Too small: 1304 bytes


Processing topics:  20%|█▉        | 88/450 [04:40<11:05,  1.84s/it]

  [Image Error] Skipped: not a valid image for post 617454


Processing topics:  20%|██        | 92/450 [04:46<10:01,  1.68s/it]

  [Image Error] for post 614549: 'NoneType' object has no attribute 'seek'


Processing topics:  21%|██        | 93/450 [04:47<09:43,  1.63s/it]

[💾] Backup saved with 1450 posts


Processing topics:  21%|██        | 93/450 [05:13<09:43,  1.63s/it]

  [Image Error] for post 587355: 'NoneType' object has no attribute 'seek'
[💾] Backup saved with 1500 posts


Processing topics:  21%|██        | 93/450 [05:15<09:43,  1.63s/it]

[Skip OCR] Too small: 1674 bytes
[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1983 bytes
[Skip OCR] Too small: 1983 bytes


Processing topics:  21%|██        | 93/450 [05:15<09:43,  1.63s/it]

[Skip OCR] Too small: 1983 bytes
[💾] Backup saved with 1550 posts


Processing topics:  21%|██        | 93/450 [05:16<09:43,  1.63s/it]

[Skip OCR] Too small: 1387 bytes
[Skip OCR] Too small: 1437 bytes
[💾] Backup saved with 1600 posts


Processing topics:  21%|██        | 93/450 [05:18<09:43,  1.63s/it]

  [Image Error] for post 594359: 'NoneType' object has no attribute 'seek'
[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1449 bytes
[💾] Backup saved with 1650 posts


Processing topics:  21%|██        | 93/450 [05:20<09:43,  1.63s/it]

  [Skipped] Tiny image (16x16) in post 594846
[💾] Backup saved with 1700 posts


Processing topics:  21%|██        | 93/450 [05:20<09:43,  1.63s/it]

[Skip OCR] Too small: 1674 bytes
[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1449 bytes


Processing topics:  21%|██        | 93/450 [05:21<09:43,  1.63s/it]

[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1437 bytes


Processing topics:  21%|██        | 93/450 [05:21<09:43,  1.63s/it]

[Skip OCR] Too small: 1690 bytes


Processing topics:  21%|██        | 93/450 [05:21<09:43,  1.63s/it]

[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1690 bytes
[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1690 bytes
[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1690 bytes
[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1690 bytes


                                                                   

[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1690 bytes
[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1690 bytes
[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1674 bytes
[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1387 bytes
[Skip OCR] Too small: 1690 bytes


Processing topics:  21%|██        | 93/450 [05:22<09:43,  1.63s/it]

[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1437 bytes
[Skip OCR] Too small: 1690 bytes
[Skip OCR] Too small: 1695 bytes
[Skip OCR] Too small: 1437 bytes
[💾] Backup saved with 1750 posts


Processing topics:  21%|██        | 93/450 [05:26<09:43,  1.63s/it]

[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1449 bytes


Processing topics:  21%|██        | 93/450 [05:26<09:43,  1.63s/it]

[💾] Backup saved with 1800 posts
[Skip OCR] Too small: 1674 bytes
[Skip OCR] Too small: 1387 bytes
[Skip OCR] Too small: 1690 bytes
[Skip OCR] Too small: 1674 bytes


Processing topics:  21%|██        | 93/450 [05:27<09:43,  1.63s/it]

[Skip OCR] Too small: 1387 bytes


Processing topics:  21%|██        | 93/450 [05:27<09:43,  1.63s/it]

[Skip OCR] Too small: 1690 bytes
[Skip OCR] Too small: 1674 bytes
[Skip OCR] Too small: 1387 bytes
[💾] Backup saved with 1850 posts


Processing topics:  21%|██        | 93/450 [05:29<09:43,  1.63s/it]

[Skip OCR] Too small: 1674 bytes
[Skip OCR] Too small: 1387 bytes
[Skip OCR] Too small: 1469 bytes


Processing topics:  21%|██        | 93/450 [05:30<09:43,  1.63s/it]

[💾] Backup saved with 1900 posts
[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1449 bytes
[Skip OCR] Too small: 1387 bytes


Processing topics:  21%|██        | 93/450 [05:30<09:43,  1.63s/it]

[Skip OCR] Too small: 1674 bytes
[💾] Backup saved with 1950 posts
[💾] Backup saved with 2000 posts


Processing topics:  21%|██        | 93/450 [05:32<09:43,  1.63s/it]

  [Skipped] Tiny image (677x28) in post 596606
  [Image Error] for post 596606: 'NoneType' object has no attribute 'seek'
[💾] Backup saved with 2050 posts


Processing topics:  22%|██▏       | 101/450 [05:47<15:13,  2.62s/it]

[💾] Backup saved with 2100 posts


Processing topics:  23%|██▎       | 102/450 [05:50<14:05,  2.43s/it]

[Skip OCR] Too small: 1808 bytes
[Skip OCR] Too small: 1941 bytes


Processing topics:  23%|██▎       | 102/450 [05:50<14:05,  2.43s/it]

[Skip OCR] Too small: 861 bytes
[Skip OCR] Too small: 1941 bytes


Processing topics:  23%|██▎       | 102/450 [05:51<14:05,  2.43s/it]

[Skip OCR] Too small: 1246 bytes
[Skip OCR] Too small: 1808 bytes


Processing topics:  23%|██▎       | 102/450 [05:51<14:05,  2.43s/it]

[Skip OCR] Too small: 1366 bytes


Processing topics:  24%|██▍       | 107/450 [05:59<09:54,  1.73s/it]

  [Image Error] for post 604239: 'NoneType' object has no attribute 'seek'


Processing topics:  24%|██▍       | 108/450 [06:01<11:59,  2.10s/it]

[💾] Backup saved with 2150 posts
[💾] Backup saved with 2200 posts
[💾] Backup saved with 2250 posts


Processing topics:  25%|██▌       | 113/450 [06:16<11:34,  2.06s/it]

[💾] Backup saved with 2300 posts


Processing topics:  25%|██▌       | 113/450 [06:22<11:34,  2.06s/it]

[Skip OCR] Too small: 565 bytes


Processing topics:  25%|██▌       | 113/450 [06:22<11:34,  2.06s/it]

[Skip OCR] Too small: 1675 bytes
[💾] Backup saved with 2350 posts


Processing topics:  26%|██▌       | 116/450 [06:30<14:36,  2.62s/it]

[Skip OCR] Too small: 1388 bytes
[💾] Backup saved with 2400 posts
[💾] Backup saved with 2450 posts


Processing topics:  26%|██▌       | 117/450 [06:34<19:58,  3.60s/it]

[Skip OCR] Too small: 1387 bytes
[Skip OCR] Too small: 1437 bytes


Processing topics:  27%|██▋       | 120/450 [06:38<12:20,  2.24s/it]

[Skip OCR] Too small: 1675 bytes


Processing topics:  28%|██▊       | 124/450 [06:44<09:02,  1.66s/it]

[Skip OCR] Too small: 535 bytes
[Skip OCR] Too small: 535 bytes
[Skip OCR] Too small: 535 bytes
[Skip OCR] Too small: 535 bytes


Processing topics:  28%|██▊       | 125/450 [06:46<09:39,  1.78s/it]

[💾] Backup saved with 2500 posts


Processing topics:  30%|██▉       | 133/450 [06:57<07:33,  1.43s/it]

[💾] Backup saved with 2550 posts


Processing topics:  30%|███       | 135/450 [07:01<08:04,  1.54s/it]

[💾] Backup saved with 2600 posts
[💾] Backup saved with 2650 posts
[💾] Backup saved with 2700 posts
[💾] Backup saved with 2750 posts
[💾] Backup saved with 2800 posts
[💾] Backup saved with 2850 posts
[💾] Backup saved with 2900 posts


Processing topics:  31%|███       | 140/450 [07:41<18:06,  3.51s/it]

[💾] Backup saved with 2950 posts
  [Skipped] Tiny image (16x16) in post 595921
[💾] Backup saved with 3000 posts


Processing topics:  33%|███▎      | 147/450 [07:53<09:06,  1.80s/it]

[💾] Backup saved with 3050 posts


Processing topics:  34%|███▍      | 155/450 [08:06<06:57,  1.41s/it]

[💾] Backup saved with 3100 posts


Processing topics:  35%|███▌      | 159/450 [08:13<07:07,  1.47s/it]

[💾] Backup saved with 3150 posts
[💾] Backup saved with 3200 posts
[💾] Backup saved with 3250 posts


Processing topics:  37%|███▋      | 165/450 [08:41<11:11,  2.36s/it]

[💾] Backup saved with 3300 posts


Processing topics:  40%|███▉      | 178/450 [09:00<06:01,  1.33s/it]

[💾] Backup saved with 3350 posts


Processing topics: 100%|██████████| 450/450 [15:49<00:00,  2.11s/it]



Collected 3380 posts.
Scraping complete. Saved 3380 posts to tds_discourse_data.json
