In [24]:
import requests
from lxml import etree
import pandas as pd
import time

🔄 Scraping page 1: https://old.reddit.com
🔄 Scraping page 2: https://old.reddit.com/?count=25&after=t3_1lf1gg9
🔄 Scraping page 3: https://old.reddit.com/?count=50&after=t3_1lf7mus
🔄 Scraping page 4: https://old.reddit.com/?count=75&after=t3_1lf8eyt
🔄 Scraping page 5: https://old.reddit.com/?count=100&after=t3_1lfbfpa
✅ Excel file with real hyperlinks saved successfully.


In [None]:
def conn_web(url):
    """Download ``url`` and return the response body as text.

    A GET request is sent with a desktop-browser User-Agent header and a
    10 second timeout. HTTP error statuses are turned into exceptions by
    ``raise_for_status`` and handled like any other request failure.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8, or ``None`` if the request
        failed. Failures are printed, not raised.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # errors='replace' keeps a single malformed byte from raising
        # UnicodeDecodeError, which the handler below would not catch and
        # which would otherwise abort the whole scraping run.
        return response.content.decode('utf-8', errors='replace')
    except requests.exceptions.RequestException as e:
        print(f"❌ Error connecting to {url}: {e}")
        return None

In [26]:
def xpath_anchoring(content, data_list):
    """Extract post titles/links and the next-page link from a listing page.

    Args:
        content: HTML text of the page.
        data_list: List that found posts are appended to (mutated in place).

    Returns:
        A ``(data_list, next_url)`` tuple. ``data_list`` is the same list
        object that was passed in, extended with one
        ``{"title": ..., "url": ...}`` dict per post. ``next_url`` is the
        href of the "next" button, or ``None`` when there is no such link or
        the page could not be parsed.
    """
    try:
        html = etree.HTML(content)
        # The parser may hand back None instead of raising when the document
        # has no root element (e.g. blank input); treat that as a parse
        # failure rather than crashing on ``None.xpath`` below.
        if html is None:
            raise ValueError("document has no root element")
    except Exception as e:
        print(f"❌ Error parsing HTML: {e}")
        return data_list, None

    posts = html.xpath('//p[@class="title"]/a[@data-event-action="title"]')

    for post in posts:
        text = post.xpath('./text()')
        text = text[0].strip() if text else ""

        href = post.xpath('./@href')
        href = href[0].strip() if href else ""

        # Site-relative links (self posts) are made absolute so they are
        # usable outside the page they were scraped from.
        if href.startswith('/'):
            href = "https://old.reddit.com" + href

        data_list.append({
            "title": text,
            "url": href
        })

    next_url = html.xpath('//span[@class="next-button"]/a/@href')
    next_url = next_url[0] if next_url else None

    return data_list, next_url

In [28]:
def reddit_scraper_loop(start_url, page_num, delay_seconds=2):
    """Scrape up to ``page_num`` listing pages, following "next" links.

    Args:
        start_url: URL of the first page to fetch.
        page_num: Maximum number of fetch attempts. A failed download uses
            up one attempt.
        delay_seconds: Pause between consecutive requests. Defaults to 2.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts gathered from every
        page that was fetched and parsed.
    """
    data_list = []
    current_url = start_url

    for i in range(page_num):
        # Pause before every request except the first. This throttles the
        # retry after a failed download as well, and avoids a pointless wait
        # once the last page has been processed.
        if i > 0:
            time.sleep(delay_seconds)

        print(f"🔄 Scraping page {i+1}: {current_url}")
        content = conn_web(current_url)
        if content is None:
            print(f"⚠️ Skipping page {i+1} due to connection error.")
            # current_url is unchanged, so the next iteration retries it.
            continue

        data_list, current_url = xpath_anchoring(content, data_list)

        if not current_url:
            print("🚧 No next page found. Stopping early.")
            break

    return data_list

In [30]:
def save_to_excel_with_real_hyperlinks(df, filename):
    """Write ``df`` to an Excel file with the URL column as real hyperlinks.

    The sheet is named ``RedditPosts``. Row 0 holds a bold, shaded header;
    the data follows from row 1. Each cell in the ``url`` column becomes a
    clickable link whose display text is the row's title.

    Args:
        df: DataFrame with at least ``title`` and ``url`` columns. Any index
            is accepted; rows are processed in positional order.
        filename: Path of the ``.xlsx`` file to create.

    Raises:
        KeyError: If ``df`` lacks a ``title`` or ``url`` column.
    """
    columns = list(df.columns)
    for required in ("title", "url"):
        if required not in columns:
            raise KeyError(required)
    # Find the sheet column from the frame instead of assuming "url" is
    # second, so the links land in the right place for any column order.
    url_col = columns.index("url")

    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='RedditPosts', index=False, startrow=1, header=False)

        workbook = writer.book
        worksheet = writer.sheets['RedditPosts']
        header_format = workbook.add_format({'bold': True, 'bg_color': '#DDEBF7'})
        link_format = workbook.add_format({'font_color': 'blue', 'underline': 1})

        # Write headers manually
        for col_num, value in enumerate(columns):
            worksheet.write(0, col_num, value, header_format)

        # Write actual hyperlinks. Iterate positionally: label lookups such
        # as df.at[0, ...] break when the frame's index is not 0..n-1
        # (e.g. after filtering or sorting).
        for sheet_row, (title, url) in enumerate(zip(df["title"], df["url"]), start=1):
            if isinstance(url, str) and url.startswith("http"):
                # A non-string title (e.g. NaN) cannot be a display string;
                # passing None lets the URL itself be shown.
                label = title if isinstance(title, str) else None
                worksheet.write_url(sheet_row, url_col, url, link_format, string=label)
            else:
                worksheet.write(sheet_row, url_col, "")


In [32]:
def main(start_url="https://old.reddit.com", page_num=5, output_path="reddit_output.xlsx"):
    """Scrape listing pages and save the collected posts to an Excel file.

    Calling ``main()`` with no arguments scrapes five pages of the old
    Reddit front page into ``reddit_output.xlsx``.

    Args:
        start_url: First page to scrape.
        page_num: Maximum number of pages to request.
        output_path: Destination ``.xlsx`` file.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    scraped_data = reddit_scraper_loop(start_url, page_num)
    df = pd.DataFrame(scraped_data)

    if df.empty:
        print("⚠️ No data scraped.")
        return

    try:
        save_to_excel_with_real_hyperlinks(df, output_path)
        print("✅ Excel file with real hyperlinks saved successfully.")
    except Exception as e:
        # Top-level boundary: report any write failure instead of raising.
        print(f"❌ Error writing Excel: {e}")


In [None]:
# Entry point: call main() only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()