In [20]:
import requests
import json
import time
import html
import re
import pandas as pd
import csv
from sklearn.model_selection import train_test_split

<h2>Scrape Reddit Data</h2>

In [23]:
# JSON search endpoint of the subreddit that is scraped below.
url = "https://www.reddit.com/r/AmItheAsshole/search.json"

# Browser-style User-Agent header sent along with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/132.0.0.0 Safari/537.36"
    )
}

def clean_dict(d):
    """Recursively clean every string inside a (possibly nested) structure.

    Dicts and lists are walked recursively and rebuilt; strings are cleaned;
    any other value (numbers, None, ...) is returned unchanged.

    String cleaning steps:
      * HTML entities are unescaped,
      * a literal ``\\u2019`` escape sequence becomes an apostrophe,
      * blank lines (``"\\n\\n"``) become a single space and remaining
        newlines are removed,
      * double quotes and backslashes are removed,
      * zero-width spaces are removed, both as the literal entity text
        ``&#x200B;`` and as the U+200B character itself.

    Args:
        d: a dict, list, str, or any other value.

    Returns:
        A cleaned copy of ``d`` with the same structure.
    """
    if isinstance(d, dict):
        return {key: clean_dict(value) for key, value in d.items()}
    if isinstance(d, list):
        return [clean_dict(item) for item in d]
    if not isinstance(d, str):
        return d

    text = html.unescape(d)
    # Must happen before backslashes are stripped below, otherwise the
    # escape sequence degrades to the plain text "u2019" and never matches.
    text = text.replace("\\u2019", "'")
    text = text.replace("\n\n", " ").replace("\n", "")
    text = text.replace("\"", "").replace("\\", "")
    # A double-escaped entity ("&amp;#x200B;") is still the text "&#x200B;"
    # after one unescape pass; a directly encoded one is already U+200B.
    return text.replace("&#x200B;", "").replace("\u200b", "")

def fetch_reddit_posts(max_posts=1000, flair="Not the A-hole", sort="top", time_filter="year", timeout=30):
    """Page through the subreddit search endpoint and collect cleaned posts.

    Requests are sent to the module-level ``url`` with the module-level
    ``headers``. Paging stops when ``max_posts`` posts have been collected,
    when a page comes back empty, when the response carries no ``after``
    token, or when a response has a non-200 status (the error is printed and
    whatever was collected so far is returned).

    Args:
        max_posts: upper bound on the number of posts returned.
        flair: link flair text to search for (sent as ``flair:"<flair>"``).
        sort: value of the ``sort`` query parameter.
        time_filter: value of the ``t`` query parameter.
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        A list of at most ``max_posts`` dicts, each holding the selected post
        fields after being passed through ``clean_dict``.

    Raises:
        requests.RequestException: propagated unchanged from ``requests.get``
            (now including ``requests.Timeout`` when a response takes longer
            than ``timeout``).
    """
    # Fields copied from each post's "data" payload; a missing field becomes None.
    wanted_fields = (
        "selftext",
        "gilded",
        "title",
        "upvote_ratio",
        "ups",
        "link_flair_text",
        "created",
        "num_comments",
        "url",
        "num_crossposts",
    )

    posts = []
    after = None  # Pagination token

    while len(posts) < max_posts:
        params = {
            "q": f'flair:"{flair}"',  # Search for flair
            "restrict_sr": 1,         # Restrict search to this subreddit
            "sort": sort,             # Sort by
            "limit": 100,             # Fetch 100 posts per request (max allowed)
            "after": after,           # Pagination token
            "t": time_filter,
        }

        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        data = response.json()
        children = data["data"]["children"]

        if not children:
            print("No more posts found.")
            break

        # Extract filtered fields
        for post in children:
            post_data = post["data"]
            filtered_post = {field: post_data.get(field) for field in wanted_fields}
            posts.append(clean_dict(filtered_post))

        # Get after token for pagination
        after = data["data"].get("after")
        if not after:
            print("Reached the last available post.")
            break

        print(f"Fetched {len(posts)} posts so far...")

        # Pause between requests, but not after the page that completes the quota.
        if len(posts) < max_posts:
            time.sleep(3.5)

    return posts[:max_posts]

# Scrape up to 1000 posts with the default search settings.
posts = fetch_reddit_posts(max_posts=1000)

# Open with "w" rather than "a": appending a second JSON array on a re-run
# would leave a file that json.load can no longer parse.
with open("reddit_data/nta_topyear_2025-2-7.json", "w", encoding="utf-8") as file:
    json.dump(posts, file, indent=4)

print(f"Saved {len(posts)} posts to file")

Fetched 100 posts so far...
Fetched 200 posts so far...
Reached the last available post.
Saved 238 posts to 'file'


<h2>Format Corpus for IMDB Data</h2>

In [79]:
imdb_path = "C:/Users/maddo/CS770_data/project_data/imdb_data"

# Load the raw IMDB csv; the 'sentiment' and 'review' columns are used below.
df = pd.read_csv(f"{imdb_path}/IMDB Dataset.csv")

# fastText expects labels shaped like __label__[label],
# e.g. "positive" -> "__label__positive"
df['sentiment'] = df['sentiment'].map("__label__{}".format)

# Keep only label and text, with the label column first.
df = df[['sentiment', 'review']]

# Drop leftover HTML line breaks and zero-width spaces, then trim the ends.
reviews = df['review']
reviews = reviews.str.replace("<br /><br />", " ")
reviews = reviews.str.replace('\u200b', '')
df['review'] = reviews.str.strip()

In [None]:
# Step 1: dump "label text" rows to a temporary file. Because the separator
# and the escape character are both a space, the spaces inside each review
# come out doubled in this file.
# NOTE(review): the default quote character is presumably escaped the same
# way, which would leave a stray space before each double quote - confirm.
temp_corpus_path = imdb_path + "/fasttext_imdb_corpus_temp.txt"
df.to_csv(
    temp_corpus_path,
    index=False,
    sep=" ",
    header=False,
    escapechar=" ",
    quoting=csv.QUOTE_NONE,
)

# Step 2: read the temporary file back ...
with open(temp_corpus_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# ... and collapse every doubled space into a single one.
cleaned_lines = [line.replace("  ", " ") for line in lines]

# Step 3: the fully cleaned fastText corpus goes to "fasttext_imdb_corpus.txt".
final_corpus_path = imdb_path + "/fasttext_imdb_corpus.txt"
with open(final_corpus_path, "w", encoding="utf-8") as f:
    f.writelines(cleaned_lines)

In [None]:
# Texts are the inputs, fastText labels are the targets.
X, y = df['review'], df['sentiment']

# Roughly a 70 / 20 / 10 split: hold out 30% of the rows first, then give
# 0.333 of that hold-out (about 10% of all rows) to the validation set.
# Both splits are stratified on the label and use a fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_test, X_tune, y_test, y_tune = train_test_split(
    X_temp, y_temp, test_size=0.333, random_state=42, stratify=y_temp
)

# Re-attach each label series to its texts (label column first); the join
# aligns the two on their shared row index.
train = y_train.to_frame().join(X_train)  # training set
test = y_test.to_frame().join(X_test)     # testing set
tune = y_tune.to_frame().join(X_tune)     # validation set

# Parallel lists iterated together when the sets are written to disk.
setnames = ['train', 'test', 'tune']
sets = [train, test, tune]

In [None]:
# Write every split to disk in fastText format. The loop pairs names and
# frames with zip and does not use "set" as a loop variable, so the builtin
# set() stays usable in later cells.
for split_name, split_frame in zip(setnames, sets):
    temp_path = f"{imdb_path}/fasttext_{split_name}_temp.txt"

    # Dump "label text" rows; with a space as both separator and escape
    # character the spaces inside the text come out doubled.
    split_frame.to_csv(temp_path, index=False, sep=" ", header=False, escapechar=" ", quoting=csv.QUOTE_NONE)

    # Read the temporary file back and collapse the doubled spaces.
    with open(temp_path, "r", encoding="utf-8") as f:
        cleaned_lines = [line.replace("  ", " ") for line in f]

    # The validation split gets a ".valid" extension, the others ".txt".
    extension = "valid" if split_name == 'tune' else "txt"
    with open(f"{imdb_path}/fasttext_{split_name}.{extension}", "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)

<h2>Format Corpus for Reddit Data</h2>

In [16]:
reddit_path = 'reddit_data'

# Load the json file containing reddit post data. The encoding is stated
# explicitly so the result does not depend on the platform default, and the
# path is built from reddit_path like the other Reddit files.
with open(f"{reddit_path}/aita_data.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# One row per post (taken from the top-level 'posts' entry), exact duplicates removed.
data = pd.json_normalize(raw_data['posts']).drop_duplicates()

# Keep only the flair and the post body, under friendlier column names.
data = data[['link_flair_text', 'selftext']]
data = data.rename(columns={'link_flair_text': 'verdict', 'selftext': 'content'})

# .str.strip() leaves missing bodies as missing instead of raising on them.
data['content'] = data['content'].str.strip()

# Convert verdicts into either label negative (nta) or label positive (yta).
# NOTE(review): any flair not listed here is left unchanged, i.e. without the
# "__label__" prefix - confirm the data contains only these four flairs.
data['verdict'] = data['verdict'].replace({
    'Asshole': '__label__positive',
    'Asshole POO Mode': '__label__positive',
    'Not the A-hole': '__label__negative',
    'Not the A-hole POO Mode': '__label__negative',
})

In [12]:
# Step 1: dump "label text" rows to a temporary file. Because the separator
# and the escape character are both a space, the spaces inside each post
# come out doubled in this file.
temp_corpus_path = reddit_path + "/fasttext_reddit_corpus_temp.txt"
data.to_csv(
    temp_corpus_path,
    index=False,
    sep=" ",
    header=False,
    escapechar=" ",
    quoting=csv.QUOTE_NONE,
)

# Step 2: read the temporary file back ...
with open(temp_corpus_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# ... and collapse every doubled space into a single one.
cleaned_lines = [line.replace("  ", " ") for line in lines]

# Step 3: the fully cleaned fastText corpus goes to "fasttext_reddit_corpus.txt".
final_corpus_path = reddit_path + "/fasttext_reddit_corpus.txt"
with open(final_corpus_path, "w", encoding="utf-8") as f:
    f.writelines(cleaned_lines)

In [22]:
# Post bodies are the inputs, fastText labels are the targets.
X, y = data['content'], data['verdict']

# Roughly a 70 / 20 / 10 split: hold out 30% of the rows first, then give
# 0.333 of that hold-out (about 10% of all rows) to the validation set.
# Both splits are stratified on the label and use a fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_test, X_tune, y_test, y_tune = train_test_split(
    X_temp, y_temp, test_size=0.333, random_state=42, stratify=y_temp
)

# Re-attach each label series to its texts (label column first); the join
# aligns the two on their shared row index.
train = y_train.to_frame().join(X_train)  # training set
test = y_test.to_frame().join(X_test)     # testing set
tune = y_tune.to_frame().join(X_tune)     # validation set

# Parallel lists iterated together when the sets are written to disk.
setnames = ['train', 'test', 'tune']
sets = [train, test, tune]

In [24]:
# Write every split to disk in fastText format. The loop pairs names and
# frames with zip and does not use "set" as a loop variable, so the builtin
# set() stays usable in later cells.
for split_name, split_frame in zip(setnames, sets):
    temp_path = f"{reddit_path}/fasttext_{split_name}_temp.txt"

    # Dump "label text" rows; with a space as both separator and escape
    # character the spaces inside the text come out doubled.
    split_frame.to_csv(temp_path, index=False, sep=" ", header=False, escapechar=" ", quoting=csv.QUOTE_NONE)

    # Read the temporary file back and collapse the doubled spaces.
    with open(temp_path, "r", encoding="utf-8") as f:
        cleaned_lines = [line.replace("  ", " ") for line in f]

    # The validation split gets a ".valid" extension, the others ".txt".
    extension = "valid" if split_name == 'tune' else "txt"
    with open(f"{reddit_path}/fasttext_{split_name}.{extension}", "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)