In [2]:
import os
import requests
import time
import json
import random

headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
    "Referer": "https://www.reddit.com/r/Map2Canada/",
}

# === Config Path ===
json_dir = "Canada_JSON"
post_log_file = os.path.join(json_dir, "saved_post_ids.txt")
after_checkpoint_file = os.path.join(json_dir, "after_checkpoint.txt")
page_index_file = os.path.join(json_dir, "page_index.txt")

TOTAL_PAGES = 200000
os.makedirs(json_dir, exist_ok=True)

# === Load saved post_ids ===
saved_post_ids = set()
if os.path.exists(post_log_file):
    with open(post_log_file, "r") as f:
        saved_post_ids = set(line.strip() for line in f)

# === Load pagination parameters ===
after = None
if os.path.exists(after_checkpoint_file):
    with open(after_checkpoint_file, "r") as f:
        after = f.read().strip()

page = 0
if os.path.exists(page_index_file):
    with open(page_index_file, "r") as f:
        page = int(f.read().strip())

print(f"üìå Resuming from page {page} with after = {after}")

# === Start scraping pages ===
for _ in range(page, TOTAL_PAGES):
    url = "https://www.reddit.com/r/Map2Canada/.json"
    if after:
        url += f"?after={after}"

    try:
        res = requests.get(url, headers=headers)
        data = res.json()
    except Exception as e:
        print(f"‚ùå Error fetching page {page}: {e}")
        time.sleep(5)
        continue

    children = data.get("data", {}).get("children", [])
    after = data.get("data", {}).get("after")

    new_posts = []
    for child in children:
        post_id = child["data"].get("id")
        if post_id and post_id not in saved_post_ids:
            new_posts.append(child)
            saved_post_ids.add(post_id)

    if not new_posts:
        print(f"‚ö†Ô∏è Page {page} contains only duplicate posts. Skipping.")
    else:
        json_filename = f"page_{page}.json"
        save_data = {"data": {"children": new_posts}}
        with open(os.path.join(json_dir, json_filename), "w", encoding="utf-8") as f:
            json.dump(save_data, f, ensure_ascii=False, indent=2)

        # Updae log and checkpoint
        with open(post_log_file, "a") as f:
            for post in new_posts:
                f.write(post["data"]["id"] + "\n")

        print(f"‚úÖ Saved {json_filename} with {len(new_posts)} new posts.")

    with open(after_checkpoint_file, "w") as f:
        f.write(after if after else "")

    page += 1
    with open(page_index_file, "w") as f:
        f.write(str(page))

    if not after:
        print("‚úÖ No more pages. Stopping.")
        break

    time.sleep(random.uniform(10, 20))

üìå Resuming from page 0 with after = None
‚úÖ Saved page_0.json with 27 new posts.
‚úÖ Saved page_1.json with 25 new posts.
‚úÖ Saved page_2.json with 9 new posts.
‚úÖ No more pages. Stopping.


In [3]:
import os
import json
import csv
import requests
from datetime import datetime
import time
import random

# === Config Path ===
json_dir = "Canada_JSON"
output_dir = "Canada_CSV"
processed_file = os.path.join(output_dir, "processed_pages.txt")
error_dir = os.path.join(output_dir, "errors")

# === Create error_dir if doesn't exist ===
os.makedirs(error_dir, exist_ok=True)

# === Initialize processed pages set ===
processed_pages = set()
if os.path.exists(processed_file):
    with open(processed_file, "r") as f:
        processed_pages = set(line.strip() for line in f)

# === Headers for HTTP requests ===
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/json",
    "Referer": "https://www.reddit.com/r/Map2Canada/",
}

# === Recursive function: extract all levels of comments ===
def extract_comments(comments_list, post_id, all_comments):
    for c in comments_list:
        if c.get("kind") != "t1":
            continue
        data = c["data"]
        comment = {
            "post_id": post_id,
            "author": data.get("author"),
            "body": data.get("body", "").replace("\n", " "),
            "score": data.get("score"),
            "created_utc": data.get("created_utc"),
            "date": datetime.utcfromtimestamp(data.get("created_utc")).strftime('%Y-%m-%d %H:%M:%S') if data.get("created_utc") else ""
        }
        all_comments.append(comment)
        # Recursive function for child comments
        if data.get("replies") and isinstance(data["replies"], dict):
            replies = data["replies"]["data"]["children"]
            extract_comments(replies, post_id, all_comments)

# === Iterate through all JSON files ===
for filename in os.listdir(json_dir):
    if not filename.endswith(".json") or filename in processed_pages:
        continue

    json_path = os.path.join(json_dir, filename)
    with open(json_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except Exception as e:
            print(f"‚ùå Failed to load {filename}: {e}")
            continue

    comments_data = []
    for child in data.get("data", {}).get("children", []):
        post_id = child["data"].get("id")
        if not post_id:
            continue

        comment_url = f"https://www.reddit.com/comments/{post_id}.json"

        try:
            res = requests.get(comment_url, headers=headers)
            if res.status_code == 429:
                wait = int(res.headers.get("Retry-After", 10))
                print(f"‚è≥ Rate limited on post {post_id}, waiting {wait} seconds...")
                time.sleep(wait)
                continue

            if res.status_code != 200:
                print(f"‚ö†Ô∏è Skipping post {post_id}, status code {res.status_code}")
                continue

            if not res.text.strip():
                print(f"‚ö†Ô∏è Empty response for post {post_id}, possible rate limit or server error")
                continue

            try:
                thread_data = res.json()
            except json.JSONDecodeError:
                error_path = os.path.join(error_dir, f"error_{post_id}.html")
                with open(error_path, "w", encoding="utf-8") as ef:
                    ef.write(res.text)
                print(f"‚ùå Error parsing JSON for post {post_id}, saved raw response to {error_path}")
                continue

            if not isinstance(thread_data, list) or len(thread_data) < 2:
                print(f"‚ö†Ô∏è Invalid structure for post {post_id}, skipping")
                continue

            comments_list = thread_data[1]["data"]["children"]
            extract_comments(comments_list, post_id, comments_data)
            print(f"‚úÖ {filename} - Post {post_id} -> {len(comments_data)} comments collected so far")
        except Exception as e:
            print(f"‚ùå Error fetching comments for post {post_id}: {e}")

        time.sleep(random.uniform(1, 2))  # aviod rate limiting

    # Save to CSV files correspondingly
    csv_filename = filename.replace(".json", ".csv")
    csv_path = os.path.join(output_dir, csv_filename)
    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["post_id", "author", "body", "score", "created_utc", "date"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in comments_data:
            writer.writerow(row)

    # Mark as processed
    with open(processed_file, "a") as f:
        f.write(f"{filename}\n")

    print(f"üìÅ Finished {filename} -> Saved to {csv_filename}")

‚úÖ page_0.json - Post ww1fru -> 10 comments collected so far
‚úÖ page_0.json - Post 13eqdjo -> 13 comments collected so far
‚úÖ page_0.json - Post 1gtvf98 -> 13 comments collected so far
‚úÖ page_0.json - Post 1ghqsc1 -> 13 comments collected so far
‚úÖ page_0.json - Post 1fzmfz7 -> 27 comments collected so far
‚úÖ page_0.json - Post 1fc1v3p -> 34 comments collected so far
‚úÖ page_0.json - Post 1cq6evc -> 39 comments collected so far
‚úÖ page_0.json - Post 1bn7px3 -> 42 comments collected so far
‚úÖ page_0.json - Post 1ad3y02 -> 51 comments collected so far
‚úÖ page_0.json - Post 19e6t1g -> 51 comments collected so far
‚úÖ page_0.json - Post 18v1l8d -> 51 comments collected so far
‚úÖ page_0.json - Post 18s31fe -> 59 comments collected so far
‚úÖ page_0.json - Post 156pfbe -> 61 comments collected so far
‚úÖ page_0.json - Post 13jyaap -> 80 comments collected so far
‚úÖ page_0.json - Post 13glxtc -> 90 comments collected so far
‚úÖ page_0.json - Post 13c85z1 -> 96 comments collected 

In [4]:
import os
import pandas as pd

# Config CSV file directory
csv_dir = "Canada_CSV"

# Collect all CSV file directories
csv_files = [os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith(".csv")]

# Merge all CSV files
all_data = pd.concat([pd.read_csv(f, usecols=["body"]) for f in csv_files], ignore_index=True)

# Summarize
total_rows = len(all_data)
unique_bodies = all_data["body"].drop_duplicates()
unique_rows = len(unique_bodies)
duplicate_rows = total_rows - unique_rows

# Print results
print(f"üìÑ Total comment rows: {total_rows}")
print(f"üîÅ Duplicate bodies: {duplicate_rows}")
print(f"‚úÖ Unique bodies: {unique_rows}")

üìÑ Total comment rows: 325
üîÅ Duplicate bodies: 10
‚úÖ Unique bodies: 315


In [8]:
import os
import pandas as pd

# Config input and output directories
input_dir = "Canada_CSV"
output_file = os.path.join(input_dir, "may_7_merged_comments.csv")

# Merge all CSV files
all_dfs = []
for filename in os.listdir(input_dir):
    if filename.endswith(".csv") and filename != "may_7_merged_comments.csv":
        file_path = os.path.join(input_dir, filename)
        try:
            df = pd.read_csv(file_path)
            all_dfs.append(df)
            print(f"‚úÖ Loaded {filename} with {len(df)} rows.")
        except Exception as e:
            print(f"‚ùå Failed to load {filename}: {e}")

# Merge and drop duplicates
if all_dfs:
    merged_df = pd.concat(all_dfs, ignore_index=True)
    print(f"üìä Total merged rows (before deduplication): {len(merged_df)}")

    # Drop duplicates based on "body" column
    merged_df.drop_duplicates(subset="body", inplace=True)
    print(f"üßπ Rows after deduplication: {len(merged_df)}")

    # Merge all processed CSV files
    merged_df.to_csv(output_file, index=False)
    print(f"üìÅ Merged file saved to: {output_file}")
else:
    print("‚ö†Ô∏è No CSV files found or all failed to load.")

‚úÖ Loaded page_0.csv with 149 rows.
‚úÖ Loaded page_1.csv with 117 rows.
‚úÖ Loaded page_2.csv with 59 rows.
üìä Total merged rows (before deduplication): 325
üßπ Rows after deduplication: 315
üìÅ Merged file saved to: Canada_CSV\may_7_merged_comments.csv


In [9]:
import pandas as pd

# Read the merged CSV file
df = pd.read_csv("Canada_CSV/may_7_merged_comments.csv")

# Ensure the 'date' column is of datetime type
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Drop rows with unparseable dates
df = df.dropna(subset=['date'])

# Extract the year
df['year'] = df['date'].dt.year

# Count frequency by year
year_counts = df['year'].value_counts().sort_index()

print("üìÖ Comment count distribution by year:")
print(year_counts)


üìÖ Comment count distribution by year:
year
2022    162
2023    102
2024     50
2025      1
Name: count, dtype: int64


In [10]:
# Extract the month
df['month'] = df['date'].dt.month

# Count frequency by year and month
year_month_counts = df.groupby(['year', 'month']).size().unstack(fill_value=0)

print("\nüìÖ Comment count distribution by year and month:")
print(year_month_counts)



üìÖ Comment count distribution by year and month:
month  1   2   3   4   5   6   7   8   9   10  11  12
year                                                 
2022    0   0   0   0   0   0   0  22  45  25  66   4
2023    2   0   9  35  34   3   3   2   0   2   4   8
2024    8   1   6   3   5   0   1   5   7  14   0   0
2025    0   1   0   0   0   0   0   0   0   0   0   0


In [11]:
# Prevent truncation in display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', None)

# Parse dates and drop unparseable entries
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

# Extract year, month, and day
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Filter data for the year 2022
df_2025 = df[df['year'] == 2022]

# Group and count comments by year, month, and day
daily_counts = df_2025.groupby(['year', 'month', 'day']).size().reset_index(name='count')

# Create pivot table: year + month as rows, day as columns
pivot_table = daily_counts.pivot_table(
    index=['year', 'month'], columns='day', values='count', fill_value=0
)

# Output the tidy table
print("üìÖ Comment count distribution for each day of every month in 2022:")
print(pivot_table)


üìÖ Comment count distribution for each day of every month in 2022:
day          1    2    3    4    6     7    8     9    10   12   13   14   17   18   21   22    23    24   25   26   27   28   29   31
year month                                                                                                                            
2022 8      0.0  0.0  0.0  0.0  0.0   0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   1.0  13.0  1.0  0.0  4.0  1.0  1.0  1.0
     9      0.0  2.0  2.0  3.0  0.0   0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0  8.0  4.0  6.0  18.0   1.0  0.0  0.0  0.0  0.0  1.0  0.0
     10     0.0  0.0  2.0  0.0  3.0  14.0  0.0   0.0  0.0  0.0  3.0  1.0  0.0  0.0  0.0  0.0   0.0   0.0  0.0  2.0  0.0  0.0  0.0  0.0
     11     4.0  8.0  0.0  3.0  0.0  23.0  6.0  10.0  6.0  0.0  0.0  0.0  0.0  0.0  0.0  6.0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0
     12     0.0  0.0  0.0  0.0  0.0   0.0  0.0   0.0  0.0  2.0  0.0  0.0  1.0  1.0  0.0  0.0   0.0   0.0  0.0  0.0  0.0  