<h1>Final Code</h1>

In [1]:
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import random

# ✅ ScraperAPI key
# Prefer the SCRAPERAPI_KEY environment variable so the credential does not
# have to live in the notebook; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): this key is stored in plain text here — rotate it and drop the
# fallback once the environment variable is set wherever this runs.
API_KEY = os.environ.get("SCRAPERAPI_KEY") or "fb255ed807c12e5c03651908a491615f"
# Number of worker threads issuing requests concurrently. The work is
# I/O-bound, so the practical ceiling is presumably the ScraperAPI plan's
# concurrency limit rather than the CPU core count — confirm against the plan.
NUM_THREADS = 8
MAX_RETRIES = 7  # Attempts per page before the page is skipped
SAVE_INTERVAL = 10  # Flush buffered rows to disk every 10 completed pages

# ✅ Path to save Excel file
save_path = r"E:\DHP\File_7.xlsx"

# ✅ Shared data storage (processed in batches)
all_data = []  # Rows of [date, tags] waiting to be written by save_to_excel()
lock = Lock()  # Guards all_data

# ✅ User input for page range
# Both bounds are inclusive; int() raises ValueError on non-numeric input.
start_page = int(input("Enter the starting page number: "))
end_page = int(input("Enter the ending page number: "))


def scrape_page(page_number):
    """Fetch one page of Stack Overflow's newest-questions listing via ScraperAPI.

    Args:
        page_number: Value sent as the ``page`` query parameter of the
            listing URL (50 questions per page are requested).

    Returns:
        A list of ``[exact_time, tags]`` rows, one per ``div.s-post-summary``
        found on the page. ``exact_time`` is the ``title`` attribute of the
        question's ``span.relativetime`` (``"N/A"`` when the span or the
        attribute is missing) and ``tags`` is the question's tag names joined
        with ``", "``. An empty list is returned when the response status is
        neither 200 nor 429, or when every attempt fails.
    """
    site = f"https://stackoverflow.com/questions?tab=newest&pagesize=50&page={page_number}"
    # Hand the target URL to requests as a parameter so it gets percent-encoded.
    # Spliced in raw, its own "&pagesize=...&page=..." would be read as extra
    # parameters of the ScraperAPI request instead of part of the target URL.
    params = {"api_key": API_KEY, "url": site}

    delay = 3  # Start with a higher delay to avoid rate limits

    for attempt in range(1, MAX_RETRIES + 1):
        # Sleeping after the final attempt only ties up a worker thread.
        can_retry = attempt < MAX_RETRIES
        try:
            response = requests.get("http://api.scraperapi.com", params=params, timeout=30)
            if response.status_code == 429:  # Rate limited
                if can_retry:
                    wait_time = delay + random.uniform(1, 3)  # Randomized backoff
                    print(f"⚠️ Rate limit hit on page {page_number}, retrying in {wait_time:.2f}s (Attempt {attempt}/{MAX_RETRIES})...")
                    time.sleep(wait_time)
                    delay *= 2  # Exponential backoff
                continue  # Retry request (or fall out of the loop on the last attempt)

            if response.status_code != 200:
                print(f"❌ Failed to fetch page {page_number}: {response.status_code}")
                return []

            soup = BeautifulSoup(response.text, "html.parser")
            questions = soup.find_all("div", class_="s-post-summary")

            page_data = []
            for question in questions:
                time_element = question.find("span", class_="relativetime")
                # .get() avoids a KeyError (which would escape the worker and
                # surface from future.result()) if the span has no title.
                exact_time = time_element.get("title", "N/A") if time_element else "N/A"

                tags = [tag.get_text() for tag in question.find_all("a", class_="post-tag")]
                page_data.append([exact_time, ", ".join(tags)])

            print(f"✅ Scraped page {page_number} successfully!")

            return page_data  # Return extracted data

        except requests.exceptions.RequestException as e:
            print(f"⚠️ Error on page {page_number} (attempt {attempt}/{MAX_RETRIES}): {e}")
            if can_retry:
                time.sleep(delay + random.uniform(1, 3))  # Randomized retry delay
                delay *= 2  # Exponential backoff

    print(f"❌ Skipping page {page_number} after {MAX_RETRIES} failed attempts.")
    return []  # Return empty list if all retries fail


def save_to_excel():
    """Flush the buffered rows in ``all_data`` to the workbook at ``save_path``.

    While holding ``lock``: does nothing if the buffer is empty; otherwise
    builds a ``Date``/``Tags`` frame from the buffer, prepends whatever the
    existing workbook already contains (the whole file is read back and
    rewritten), writes the result, reports the row count and empties the
    buffer.
    """
    with lock:
        if all_data:
            frame = pd.DataFrame(all_data, columns=["Date", "Tags"])

            # Merge with rows from earlier saves, old rows first.
            if os.path.exists(save_path):
                previous = pd.read_excel(save_path)
                frame = pd.concat([previous, frame], ignore_index=True)

            frame.to_excel(save_path, index=False)
            total_rows = len(frame)
            print(f"💾 Data saved to: {save_path} (Total rows: {total_rows})")

            # Buffer is only emptied once the write has gone through.
            all_data.clear()

# ✅ Parallel scraping with ThreadPoolExecutor
# The outer try/finally guarantees that rows buffered since the last periodic
# save are written even when the run is interrupted (e.g. Ctrl+C) or fails.
try:
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        future_to_page = {executor.submit(scrape_page, i): i for i in range(start_page, end_page + 1)}

        try:
            for i, future in enumerate(as_completed(future_to_page), start=1):
                try:
                    result = future.result()
                except Exception as exc:
                    # One page failing unexpectedly should not abort the run.
                    print(f"❌ Unexpected error on page {future_to_page[future]}: {exc}")
                    continue

                with lock:
                    all_data.extend(result)

                # ✅ Save every 10 pages
                if i % SAVE_INTERVAL == 0 and all_data:
                    save_to_excel()
        finally:
            # Leaving the with-block waits for every submitted future; cancel
            # the ones that have not started so an interrupt stops promptly.
            # cancel() is a no-op for futures that are running or finished.
            for pending in future_to_page:
                pending.cancel()
finally:
    # ✅ Final save when all pages are done (or the run was cut short)
    if all_data:
        save_to_excel()

print("🎉 Data scraping completed successfully!")


✅ Scraped page 52900 successfully!
✅ Scraped page 52904 successfully!
✅ Scraped page 52902 successfully!
✅ Scraped page 52905 successfully!
✅ Scraped page 52906 successfully!
✅ Scraped page 52910 successfully!
✅ Scraped page 52908 successfully!
✅ Scraped page 52911 successfully!
✅ Scraped page 52909 successfully!
✅ Scraped page 52907 successfully!
✅ Scraped page 52913 successfully!
✅ Scraped page 52914 successfully!
✅ Scraped page 52915 successfully!
✅ Scraped page 52917 successfully!
✅ Scraped page 52919 successfully!
✅ Scraped page 52920 successfully!
💾 Data saved to: E:\DHP\File_7.xlsx (Total rows: 68967)
✅ Scraped page 52918 successfully!
✅ Scraped page 52922 successfully!
✅ Scraped page 52923 successfully!
✅ Scraped page 52921 successfully!
✅ Scraped page 52916 successfully!
⚠️ Rate limit hit on page 52928, retrying in 4.83s (Attempt 1/7)...
✅ Scraped page 52924 successfully!
✅ Scraped page 52925 successfully!
✅ Scraped page 52912 successfully!
✅ Scraped page 52926 successfully!
✅

KeyboardInterrupt: 