In [2]:
import requests
import json
import time
import html
import re
import pandas as pd
import csv

<h2>Scrape Reddit Data</h2>

In [23]:
# Reddit search endpoint (JSON flavour) for r/AmItheAsshole; queried by fetch_reddit_posts.
url = "https://www.reddit.com/r/AmItheAsshole/search.json"
# Browser-style User-Agent sent with every request.
# NOTE(review): presumably needed so Reddit does not reject the default client UA - confirm.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"}

def clean_dict(d):
    """Recursively clean every string inside a nested dict/list structure.

    Dicts and lists are rebuilt with their values cleaned; strings are
    HTML-unescaped and stripped of formatting noise; any other value
    (numbers, None, ...) is returned unchanged.

    String cleaning, in order:
      * HTML entities are decoded once with ``html.unescape``.
      * Right single quotation marks (U+2019) become a plain ``'``, whether
        they arrive as the real character or as the literal six-character
        escape sequence backslash-u2019.
      * A blank line (two newlines) becomes one space; a single newline is
        removed.
      * Double quotes and backslashes are removed.
      * Zero-width spaces are removed, both as the real U+200B character and
        as a still-encoded ``&#x200B;`` entity (left over when the input was
        double-escaped as ``&amp;#x200B;``).

    Args:
        d: A dict, list, str, or any other value.

    Returns:
        A cleaned copy with the same shape as ``d``.
    """
    if isinstance(d, dict):
        return {key: clean_dict(value) for key, value in d.items()}
    if isinstance(d, list):
        return [clean_dict(item) for item in d]
    if not isinstance(d, str):
        return d

    text = html.unescape(d)
    # Normalise apostrophes BEFORE backslashes are stripped; otherwise the
    # literal escape sequence can never match.
    text = text.replace("\\u2019", "'").replace("\u2019", "'")
    text = text.replace("\n\n", " ").replace("\n", "")
    text = text.replace("\"", "").replace("\\", "")
    # Drop zero-width spaces in both encoded and decoded form.
    text = text.replace("&#x200B;", "").replace("\u200b", "")
    return text

def fetch_reddit_posts(max_posts=1000, pause_seconds=3.5, timeout=30):
    """Page through the subreddit search results and collect cleaned posts.

    Requests the module-level ``url`` with the module-level ``headers``,
    searching for posts flaired "Not the A-hole", sorted by top over the past
    year, 100 per page. Paging stops when ``max_posts`` posts are collected,
    a page comes back empty, no ``after`` token is returned, or a request
    fails. On any failure the posts collected so far are returned rather
    than lost.

    Args:
        max_posts: Maximum number of posts to return.
        pause_seconds: Delay between consecutive page requests.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        A list of at most ``max_posts`` dicts, each holding the selected post
        fields after being passed through ``clean_dict``.
    """
    # Fields copied from each post, in output order.
    wanted_fields = (
        "selftext",
        "gilded",
        "title",
        "upvote_ratio",
        "ups",
        "link_flair_text",
        "created",
        "num_comments",
        "url",
        "num_crossposts",
    )

    posts = []
    after = None  # Pagination token

    while len(posts) < max_posts:
        params = {
            "q": 'flair:"Not the A-hole"',  # Search for flair
            "restrict_sr": 1,        # Restrict search to this subreddit
            "sort": "top",           # Sort by
            "limit": 100,            # Fetch 100 posts per request (max allowed)
            "after": after,          # Pagination token
            "t" : "year"
        }

        # Without a timeout a stalled connection would block the cell forever.
        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Request failed: {exc}")
            break

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        try:
            payload = response.json()
        except ValueError as exc:
            print(f"Could not decode response as JSON: {exc}")
            break

        listing = payload.get("data") if isinstance(payload, dict) else None
        if not isinstance(listing, dict):
            listing = {}
        children = listing.get("children") or []

        if not children:
            print("No more posts found.")
            break

        # Extract filtered fields
        for post in children:
            post_data = post["data"]
            filtered_post = {field: post_data.get(field) for field in wanted_fields}
            posts.append(clean_dict(filtered_post))

        # Get after token for pagination
        after = listing.get("after")
        if not after:
            print("Reached the last available post.")
            break

        print(f"Fetched {len(posts)} posts so far...")

        # Only pause when another request is actually going to be made.
        if len(posts) < max_posts:
            time.sleep(pause_seconds)

    return posts[:max_posts]

# Download the posts (network call; can take a while because of the per-page pause).
posts = fetch_reddit_posts(max_posts=1000)

# Open with "w", not "a": appending a second JSON array on a re-run would
# leave the file as two concatenated documents, which is not valid JSON.
with open("reddit_data/nta_topyear_2025-2-7.json", "w", encoding="utf-8") as file:
    json.dump(posts, file, indent=4)

print(f"Saved {len(posts)} posts to file")

Fetched 100 posts so far...
Fetched 200 posts so far...
Reached the last available post.
Saved 238 posts to 'file'


<h2>Format Corpus for IMDB Data</h2>

In [79]:
imdb_path = "C:/Users/maddo/CS770_data/project_data/imdb_data"


# read raw data csv file
imdb_raw = pd.read_csv(imdb_path + "/IMDB Dataset.csv")

# Build a fresh two-column frame (label first, then text) instead of assigning
# into a column-reordered slice, which can trigger SettingWithCopyWarning.
df = pd.DataFrame({
    # convert labels to what fasttext expects: "positive" -> "__label__positive"
    "sentiment": "__label__" + imdb_raw["sentiment"].astype(str),
    "review": (
        imdb_raw["review"]
        .fillna("")
        .astype(str)
        # remove spare html left in review contents
        .str.replace("<br /><br />", " ", regex=False)
        .str.replace("\u200b", "", regex=False)
        # each training example must stay on a single line
        .str.replace(r"[\r\n]+", " ", regex=True)
        .str.strip()
    ),
})

# Write "<label> <text>" lines directly. The previous approach went through
# to_csv(sep=" ", escapechar=" ", quoting=csv.QUOTE_NONE) and then un-doubled
# the spaces in a temp file; that also inserted a stray space before every
# double quote in the text, and it relies on delimiter and escapechar being
# the same character (NOTE(review): newer Python csv versions appear to reject
# this - confirm). The temp file is therefore no longer produced.
# fully-cleaned fasttext training corpus stored in "fasttext_imdb_corpus.txt"
with open(imdb_path + "/fasttext_imdb_corpus.txt", "w", encoding="utf-8") as f:
    f.writelines(
        f"{label} {text}\n" for label, text in zip(df["sentiment"], df["review"])
    )

<h2>Format Corpus for Yelp Data</h2>

In [16]:
yelp_path = "C:/Users/maddo/CS770_data/project_data/yelp_data"


# The Yelp csv ships without a header row, so the column names are supplied here.
yelp_columns = ['sentiment', 'review']
df = pd.read_csv(yelp_path + "/yelp.csv", header=None, names=yelp_columns)

# Translate the numeric class ids into fasttext-style labels.
label_map = {1: '__label__negative', 2: '__label__positive'}
df['sentiment'] = df['sentiment'].replace(label_map)

# Drop the literal backslash-n sequences, then turn escaped quotes back into plain quotes.
review_text = df['review'].str.replace("\\n", '', regex=False)
df['review'] = review_text.str.replace('\\"', '"', regex=False)

In [17]:
# Write "<label> <text>" lines directly. The previous approach went through
# to_csv(sep=" ", escapechar=" ", quoting=csv.QUOTE_NONE) and then un-doubled
# the spaces in a temp file; that also inserted a stray space before every
# double quote in the text, and it relies on delimiter and escapechar being
# the same character (NOTE(review): newer Python csv versions appear to reject
# this - confirm). The temp file is therefore no longer produced.
yelp_labels = df["sentiment"].astype(str)
yelp_reviews = (
    df["review"]
    .fillna("")
    .astype(str)
    # each training example must stay on a single line
    .str.replace(r"[\r\n]+", " ", regex=True)
)

# fully-cleaned fasttext training corpus stored in "fasttext_yelp_corpus.txt"
with open(yelp_path + "/fasttext_yelp_corpus.txt", "w", encoding="utf-8") as f:
    f.writelines(
        f"{label} {text}\n" for label, text in zip(yelp_labels, yelp_reviews)
    )