In [1]:
import os
import requests
import time
import json
import random

headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
    "Referer": "https://www.reddit.com/r/Riga/",
}

# === Config Path ===
json_dir = "Riga_JSON"
post_log_file = os.path.join(json_dir, "saved_post_ids.txt")
after_checkpoint_file = os.path.join(json_dir, "after_checkpoint.txt")
page_index_file = os.path.join(json_dir, "page_index.txt")

TOTAL_PAGES = 200000
os.makedirs(json_dir, exist_ok=True)

# === Load saved post_ids ===
saved_post_ids = set()
if os.path.exists(post_log_file):
    with open(post_log_file, "r") as f:
        saved_post_ids = set(line.strip() for line in f)

# === Load pagination parameters ===
after = None
if os.path.exists(after_checkpoint_file):
    with open(after_checkpoint_file, "r") as f:
        after = f.read().strip()

page = 0
if os.path.exists(page_index_file):
    with open(page_index_file, "r") as f:
        page = int(f.read().strip())

print(f"üìå Resuming from page {page} with after = {after}")

# === Start scraping pages ===
for _ in range(page, TOTAL_PAGES):
    url = "https://www.reddit.com/r/Riga/.json"
    if after:
        url += f"?after={after}"

    try:
        res = requests.get(url, headers=headers)
        data = res.json()
    except Exception as e:
        print(f"‚ùå Error fetching page {page}: {e}")
        time.sleep(5)
        continue

    children = data.get("data", {}).get("children", [])
    after = data.get("data", {}).get("after")

    new_posts = []
    for child in children:
        post_id = child["data"].get("id")
        if post_id and post_id not in saved_post_ids:
            new_posts.append(child)
            saved_post_ids.add(post_id)

    if not new_posts:
        print(f"‚ö†Ô∏è Page {page} contains only duplicate posts. Skipping.")
    else:
        json_filename = f"page_{page}.json"
        save_data = {"data": {"children": new_posts}}
        with open(os.path.join(json_dir, json_filename), "w", encoding="utf-8") as f:
            json.dump(save_data, f, ensure_ascii=False, indent=2)

        # Updae log and checkpoint
        with open(post_log_file, "a") as f:
            for post in new_posts:
                f.write(post["data"]["id"] + "\n")

        print(f"‚úÖ Saved {json_filename} with {len(new_posts)} new posts.")

    with open(after_checkpoint_file, "w") as f:
        f.write(after if after else "")

    page += 1
    with open(page_index_file, "w") as f:
        f.write(str(page))

    if not after:
        print("‚úÖ No more pages. Stopping.")
        break

    time.sleep(random.uniform(10, 20))

üìå Resuming from page 0 with after = None
‚úÖ Saved page_0.json with 25 new posts.
‚úÖ Saved page_1.json with 25 new posts.
‚úÖ Saved page_2.json with 25 new posts.
‚úÖ Saved page_3.json with 25 new posts.
‚úÖ Saved page_4.json with 25 new posts.
‚úÖ Saved page_5.json with 25 new posts.
‚úÖ Saved page_6.json with 25 new posts.
‚úÖ Saved page_7.json with 25 new posts.
‚úÖ Saved page_8.json with 25 new posts.
‚úÖ Saved page_9.json with 25 new posts.
‚úÖ Saved page_10.json with 25 new posts.
‚úÖ Saved page_11.json with 25 new posts.
‚úÖ Saved page_12.json with 25 new posts.
‚úÖ Saved page_13.json with 25 new posts.
‚úÖ Saved page_14.json with 25 new posts.
‚úÖ Saved page_15.json with 25 new posts.
‚úÖ Saved page_16.json with 25 new posts.
‚úÖ Saved page_17.json with 25 new posts.
‚úÖ Saved page_18.json with 25 new posts.
‚úÖ Saved page_19.json with 25 new posts.
‚úÖ Saved page_20.json with 25 new posts.
‚úÖ Saved page_21.json with 25 new posts.
‚úÖ Saved page_22.json with 25 new posts.


In [17]:
import os
import json
import csv
import requests
from datetime import datetime
import time
import random

# === Config Path ===
json_dir = "Riga_JSON"
output_dir = "Riga_CSV"
processed_file = os.path.join(output_dir, "processed_pages.txt")
error_dir = os.path.join(output_dir, "errors")

# === Create error_dir if doesn't exist ===
os.makedirs(error_dir, exist_ok=True)

# === Initialize processed pages set ===
processed_pages = set()
if os.path.exists(processed_file):
    with open(processed_file, "r") as f:
        processed_pages = set(line.strip() for line in f)

# === Headers for HTTP requests ===
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/json",
    "Referer": "https://www.reddit.com/r/Riga/",
}

# === Recursive function: extract all levels of comments ===
def extract_comments(comments_list, post_id, all_comments):
    for c in comments_list:
        if c.get("kind") != "t1":
            continue
        data = c["data"]
        comment = {
            "post_id": post_id,
            "author": data.get("author"),
            "body": data.get("body", "").replace("\n", " "),
            "score": data.get("score"),
            "created_utc": data.get("created_utc"),
            "date": datetime.utcfromtimestamp(data.get("created_utc")).strftime('%Y-%m-%d %H:%M:%S') if data.get("created_utc") else ""
        }
        all_comments.append(comment)
        # Recursive function for child comments
        if data.get("replies") and isinstance(data["replies"], dict):
            replies = data["replies"]["data"]["children"]
            extract_comments(replies, post_id, all_comments)

# === Iterate through all JSON files ===
for filename in os.listdir(json_dir):
    if not filename.endswith(".json") or filename in processed_pages:
        continue

    json_path = os.path.join(json_dir, filename)
    with open(json_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except Exception as e:
            print(f"‚ùå Failed to load {filename}: {e}")
            continue

    comments_data = []
    for child in data.get("data", {}).get("children", []):
        post_id = child["data"].get("id")
        if not post_id:
            continue

        comment_url = f"https://www.reddit.com/comments/{post_id}.json"

        try:
            res = requests.get(comment_url, headers=headers)
            if res.status_code == 429:
                wait = int(res.headers.get("Retry-After", 10))
                print(f"‚è≥ Rate limited on post {post_id}, waiting {wait} seconds...")
                time.sleep(wait)
                continue

            if res.status_code != 200:
                print(f"‚ö†Ô∏è Skipping post {post_id}, status code {res.status_code}")
                continue

            if not res.text.strip():
                print(f"‚ö†Ô∏è Empty response for post {post_id}, possible rate limit or server error")
                continue

            try:
                thread_data = res.json()
            except json.JSONDecodeError:
                error_path = os.path.join(error_dir, f"error_{post_id}.html")
                with open(error_path, "w", encoding="utf-8") as ef:
                    ef.write(res.text)
                print(f"‚ùå Error parsing JSON for post {post_id}, saved raw response to {error_path}")
                continue

            if not isinstance(thread_data, list) or len(thread_data) < 2:
                print(f"‚ö†Ô∏è Invalid structure for post {post_id}, skipping")
                continue

            comments_list = thread_data[1]["data"]["children"]
            extract_comments(comments_list, post_id, comments_data)
            print(f"‚úÖ {filename} - Post {post_id} -> {len(comments_data)} comments collected so far")
        except Exception as e:
            print(f"‚ùå Error fetching comments for post {post_id}: {e}")

        time.sleep(random.uniform(1, 2))  # aviod rate limiting

    # Save to CSV files correspondingly
    csv_filename = filename.replace(".json", ".csv")
    csv_path = os.path.join(output_dir, csv_filename)
    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["post_id", "author", "body", "score", "created_utc", "date"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in comments_data:
            writer.writerow(row)

    # Mark as processed
    with open(processed_file, "a") as f:
        f.write(f"{filename}\n")

    print(f"üìÅ Finished {filename} -> Saved to {csv_filename}")

‚úÖ page_7.json - Post 1im7a5f -> 0 comments collected so far
‚úÖ page_7.json - Post 1ilb83v -> 11 comments collected so far
‚úÖ page_7.json - Post 1ikr7zl -> 13 comments collected so far
‚úÖ page_7.json - Post 1ikk5xq -> 37 comments collected so far
‚úÖ page_7.json - Post 1ik4b08 -> 39 comments collected so far
‚úÖ page_7.json - Post 1ijxhi5 -> 43 comments collected so far
‚úÖ page_7.json - Post 1ijt1ep -> 47 comments collected so far
‚úÖ page_7.json - Post 1ijr8dw -> 47 comments collected so far
‚úÖ page_7.json - Post 1ii5x64 -> 53 comments collected so far
‚úÖ page_7.json - Post 1ii2ni1 -> 84 comments collected so far
‚úÖ page_7.json - Post 1ii2u1x -> 117 comments collected so far
‚úÖ page_7.json - Post 1ii6hbk -> 119 comments collected so far
‚úÖ page_7.json - Post 1ihs730 -> 123 comments collected so far
‚úÖ page_7.json - Post 1ihkp4o -> 127 comments collected so far
‚úÖ page_7.json - Post 1ihebro -> 150 comments collected so far
‚úÖ page_7.json - Post 1ihewqh -> 158 comments coll

In [18]:
import os
import pandas as pd

# Config CSV file directory
csv_dir = "Riga_CSV"

# Collect all CSV file directories
csv_files = [os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith(".csv")]

# Merge all CSV files
all_data = pd.concat([pd.read_csv(f, usecols=["body"]) for f in csv_files], ignore_index=True)

# Summarize
total_rows = len(all_data)
unique_bodies = all_data["body"].drop_duplicates()
unique_rows = len(unique_bodies)
duplicate_rows = total_rows - unique_rows

# Print results
print(f"üìÑ Total comment rows: {total_rows}")
print(f"üîÅ Duplicate bodies: {duplicate_rows}")
print(f"‚úÖ Unique bodies: {unique_rows}")

üìÑ Total comment rows: 5638
üîÅ Duplicate bodies: 198
‚úÖ Unique bodies: 5440


In [19]:
import os
import pandas as pd

# Config input and output directories
input_dir = "Riga_CSV"
output_file = os.path.join(input_dir, "may_7_merged_comments.csv")

# Merge all CSV files
all_dfs = []
for filename in os.listdir(input_dir):
    if filename.endswith(".csv") and filename != "may_7_merged_comments.csv":
        file_path = os.path.join(input_dir, filename)
        try:
            df = pd.read_csv(file_path)
            all_dfs.append(df)
            print(f"‚úÖ Loaded {filename} with {len(df)} rows.")
        except Exception as e:
            print(f"‚ùå Failed to load {filename}: {e}")

# Merge and drop duplicates
if all_dfs:
    merged_df = pd.concat(all_dfs, ignore_index=True)
    print(f"üìä Total merged rows (before deduplication): {len(merged_df)}")

    # Drop duplicates based on "body" column
    merged_df.drop_duplicates(subset="body", inplace=True)
    print(f"üßπ Rows after deduplication: {len(merged_df)}")

    # Merge all processed CSV files
    merged_df.to_csv(output_file, index=False)
    print(f"üìÅ Merged file saved to: {output_file}")
else:
    print("‚ö†Ô∏è No CSV files found or all failed to load.")

‚úÖ Loaded page_0.csv with 75 rows.
‚úÖ Loaded page_1.csv with 159 rows.
‚úÖ Loaded page_10.csv with 123 rows.
‚úÖ Loaded page_11.csv with 142 rows.
‚úÖ Loaded page_12.csv with 257 rows.
‚úÖ Loaded page_13.csv with 256 rows.
‚úÖ Loaded page_14.csv with 215 rows.
‚úÖ Loaded page_15.csv with 181 rows.
‚úÖ Loaded page_16.csv with 248 rows.
‚úÖ Loaded page_17.csv with 439 rows.
‚úÖ Loaded page_18.csv with 215 rows.
‚úÖ Loaded page_19.csv with 270 rows.
‚úÖ Loaded page_2.csv with 48 rows.
‚úÖ Loaded page_20.csv with 400 rows.
‚úÖ Loaded page_21.csv with 244 rows.
‚úÖ Loaded page_22.csv with 66 rows.
‚úÖ Loaded page_23.csv with 135 rows.
‚úÖ Loaded page_24.csv with 0 rows.
‚úÖ Loaded page_25.csv with 89 rows.
‚úÖ Loaded page_26.csv with 64 rows.
‚úÖ Loaded page_27.csv with 93 rows.
‚úÖ Loaded page_28.csv with 88 rows.
‚úÖ Loaded page_29.csv with 25 rows.
‚úÖ Loaded page_3.csv with 231 rows.
‚úÖ Loaded page_30.csv with 71 rows.
‚úÖ Loaded page_31.csv with 68 rows.
‚úÖ Loaded page_32.csv with 

  merged_df = pd.concat(all_dfs, ignore_index=True)


In [20]:
import pandas as pd

# Read the merged CSV file
df = pd.read_csv("Riga_CSV/may_7_merged_comments.csv")

# Ensure the 'date' column is of datetime type
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Drop rows with unparseable dates
df = df.dropna(subset=['date'])

# Extract the year
df['year'] = df['date'].dt.year

# Count frequency by year
year_counts = df['year'].value_counts().sort_index()

print("üìÖ Comment count distribution by year:")
print(year_counts)


üìÖ Comment count distribution by year:
year
2022      76
2023     655
2024    3306
2025    1403
Name: count, dtype: int64


In [21]:
# Extract the month
df['month'] = df['date'].dt.month

# Count frequency by year and month
year_month_counts = df.groupby(['year', 'month']).size().unstack(fill_value=0)

print("\nüìÖ Comment count distribution by year and month:")
print(year_month_counts)



üìÖ Comment count distribution by year and month:
month   1    2    3    4    5    6    7    8    9    10   11   12
year                                                             
2022     0    0    0    0    0    0    0    0    0    0   17   59
2023    85   54   28   33   75   27  105   41   55   57   20   75
2024    81   44   53   50   12  163  532  873  348  488  440  222
2025   268  245  415  258  217    0    0    0    0    0    0    0


In [22]:
# Prevent truncation in display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', None)

# Parse dates and drop unparseable entries
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

# Extract year, month, and day
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Filter data for the year 2022
df_2025 = df[df['year'] == 2025]

# Group and count comments by year, month, and day
daily_counts = df_2025.groupby(['year', 'month', 'day']).size().reset_index(name='count')

# Create pivot table: year + month as rows, day as columns
pivot_table = daily_counts.pivot_table(
    index=['year', 'month'], columns='day', values='count', fill_value=0
)

# Output the tidy table
print("üìÖ Comment count distribution for each day of every month in 2025:")
print(pivot_table)


üìÖ Comment count distribution for each day of every month in 2025:
day           1     2     3     4     5     6     7     8     9     10    11    12    13    14    15    16    17    18    19    20    21    22    23    24    25    26   27   28   29    30    31
year month                                                                                                                                                                                       
2025 1       1.0   5.0   0.0  14.0   4.0   5.0   3.0   2.0  15.0   7.0   5.0   4.0   9.0   2.0  18.0  27.0  14.0   5.0   1.0  18.0  21.0  16.0   6.0  16.0   7.0   5.0  8.0  8.0  4.0   8.0  10.0
     2       8.0  13.0  10.0  39.0  46.0   7.0  18.0  40.0  18.0   4.0   3.0   4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   3.0  12.0   3.0   2.0  5.0  9.0  0.0   0.0   0.0
     3       5.0   9.0  22.0   8.0  20.0   5.0   3.0   1.0  17.0  10.0   7.0  34.0   9.0  35.0  22.0  11.0  21.0   7.0   3.0  11.0  10.0   3.0  15.0   6.0 