In [1]:
import pandas as pd
import os
import re

In [2]:
# Directory holding the filtered per-subreddit CSV exports
folder_path = "reddit_data"


def _read_filtered(stem):
    """Load ``<folder_path>/<stem>.csv`` into a DataFrame."""
    return pd.read_csv(os.path.join(folder_path, stem + ".csv"))


# Comment exports, one frame per subreddit
politics_comments_filtered = _read_filtered("politics_comments_filtered")
democrats_comments_filtered = _read_filtered("democrats_comments_filtered")
spacex_comments_filtered = _read_filtered("spacex_comments_filtered")
teslamotors_comments_filtered = _read_filtered("teslamotors_comments_filtered")
PoliticalDiscussion_comments_filtered = _read_filtered("PoliticalDiscussion_comments_filtered")
Libertarian_comments_filtered = _read_filtered("Libertarian_comments_filtered")
Conservative_comments_filtered = _read_filtered("Conservative_comments_filtered")
technology_comments_filtered = _read_filtered("technology_comments_filtered")
AskThe_Donald_comments_filtered = _read_filtered("AskThe_Donald_comments_filtered")

# Submission exports, one frame per subreddit
technology_submissions_filtered = _read_filtered("technology_submissions_filtered")
teslamotors_submissions_filtered = _read_filtered("teslamotors_submissions_filtered")
spacex_submissions_filtered = _read_filtered("spacex_submissions_filtered")
AskThe_Donald_submissions_filtered = _read_filtered("AskThe_Donald_submissions_filtered")
Conservative_submissions_filtered = _read_filtered("Conservative_submissions_filtered")
democrats_submissions_filtered = _read_filtered("democrats_submissions_filtered")
Libertarian_submissions_filtered = _read_filtered("Libertarian_submissions_filtered")
PoliticalDiscussion_submissions_filtered = _read_filtered("PoliticalDiscussion_submissions_filtered")
politics_submissions_filtered = _read_filtered("politics_submissions_filtered")

In [3]:
# Orientation label assigned to each subreddit in this analysis
subreddit_orientation = {
    **dict.fromkeys(["politics", "democrats", "PoliticalDiscussion"], "Left"),
    **dict.fromkeys(["Libertarian", "Conservative", "AskThe_Donald"], "Right"),
    **dict.fromkeys(["spacex", "technology", "teslamotors"], "Neutral"),
}

# (subreddit name, submissions frame) pairs, in the order they are stacked below
submission_dataframes = [
    ("politics", politics_submissions_filtered),
    ("democrats", democrats_submissions_filtered),
    ("PoliticalDiscussion", PoliticalDiscussion_submissions_filtered),
    ("Libertarian", Libertarian_submissions_filtered),
    ("Conservative", Conservative_submissions_filtered),
    ("AskThe_Donald", AskThe_Donald_submissions_filtered),
    ("spacex", spacex_submissions_filtered),
    ("technology", technology_submissions_filtered),
    ("teslamotors", teslamotors_submissions_filtered),
]

# Tag every frame in place with its subreddit name and orientation label
for subreddit_name, submissions in submission_dataframes:
    submissions["subreddit"] = subreddit_name
    submissions["political_orientation"] = subreddit_orientation[subreddit_name]

# The frames were modified in place, so the tagged frames are the same objects
modified_dfs = [submissions for _, submissions in submission_dataframes]

# Stack all tagged submissions under a fresh 0..n-1 index
combined_submissions_df = pd.concat(modified_dfs, ignore_index=True)

In [4]:
# (subreddit name, comments frame) pairs, in the order they are stacked below
comment_dataframes = [
    ("politics", politics_comments_filtered),
    ("democrats", democrats_comments_filtered),
    ("PoliticalDiscussion", PoliticalDiscussion_comments_filtered),
    ("Libertarian", Libertarian_comments_filtered),
    ("Conservative", Conservative_comments_filtered),
    ("AskThe_Donald", AskThe_Donald_comments_filtered),
    ("spacex", spacex_comments_filtered),
    ("technology", technology_comments_filtered),
    ("teslamotors", teslamotors_comments_filtered),
]

# Tag every frame in place with its subreddit name and orientation label
for subreddit_name, comments in comment_dataframes:
    comments["subreddit"] = subreddit_name
    comments["political_orientation"] = subreddit_orientation[subreddit_name]

# The frames were modified in place, so the tagged frames are the same objects
modified_dfs = [comments for _, comments in comment_dataframes]

# Stack all tagged comments under a fresh 0..n-1 index
combined_comments_df = pd.concat(modified_dfs, ignore_index=True)

In [5]:
# Columns carried forward from each source; everything else is dropped here
comment_columns = ["Date", "Body", "Post ID", "subreddit", "political_orientation"]
submission_columns = ["Date", "Title", "Selftext", "ID", "subreddit", "political_orientation"]

combined_comments_df_selected = combined_comments_df[comment_columns]
combined_submissions_df_selected = combined_submissions_df[submission_columns]

In [6]:
def _to_common_schema(frame, renames, output_columns):
    """Rename ``frame``'s source columns and project it onto ``output_columns``."""
    # Only rename columns that are actually present, so frames that already
    # use the target names pass through unchanged.
    present = {src: dst for src, dst in renames.items() if src in frame.columns}
    # A pre-existing column carrying a rename target's name (e.g. a comment's
    # own "ID" next to "Post ID") would become a duplicate label after the
    # rename; leave it out before renaming.
    shadowed = set(present.values())
    kept = [col for col in frame.columns if col not in shadowed]
    return frame[kept].rename(columns=present)[output_columns]


def combine_dfs(comments, submissions):
    """Stack comments and submissions into one frame with a shared schema.

    Parameters
    ----------
    comments : pd.DataFrame
        Needs "Date", "subreddit", "political_orientation", plus "Post ID"
        (becomes "ID") and "Body" (becomes "Text").
    submissions : pd.DataFrame
        Needs "ID", "Date", "subreddit", "political_orientation", plus
        "Selftext" (becomes "Text").

    Returns
    -------
    pd.DataFrame
        Comment rows followed by submission rows, with columns
        ["ID", "Date", "Text", "subreddit", "political_orientation"] and a
        fresh 0..n-1 index. Any other input column is dropped.

    Raises
    ------
    KeyError
        If an output column cannot be found in an input after renaming.
    """
    output_columns = ["ID", "Date", "Text", "subreddit", "political_orientation"]
    return pd.concat(
        [
            _to_common_schema(comments, {"Post ID": "ID", "Body": "Text"}, output_columns),
            _to_common_schema(submissions, {"Selftext": "Text"}, output_columns),
        ],
        ignore_index=True,
    )

# Merge the selected comment and submission columns into one long frame
combined_df = combine_dfs(
    comments=combined_comments_df_selected,
    submissions=combined_submissions_df_selected,
)

In [7]:
# Remove rows whose text has no analysable content. Every condition is
# evaluated per row, so they are combined into one boolean mask and applied
# once instead of re-filtering the frame seven times.
text = combined_df['Text']

# Boilerplate phrases are matched literally (regex=False); as regex patterns
# their "." would match any character.
bot_notice = 'I am a bot, and this action was performed automatically.'
removal_notice = "Thanks for contributing! Unfortunately your submission has been removed"

keep = (
    text.notna()
    & (text != '[removed]')
    & ~text.str.contains(bot_notice, na=False, regex=False)
    # Texts shorter than 10 characters are discarded
    & (text.str.len() >= 10)
    # Text that is nothing but a single URL
    & ~text.str.match(r'^(?:http|https)://\S+$', na=False)
    # Text that is nothing but a markdown gif embed
    & ~text.str.match(r'^!?\[gif\]\([^)]*\)$', na=False, flags=re.IGNORECASE)
    & ~text.str.contains(removal_notice, na=False, case=False, regex=False)
)

# copy() gives an independent frame, so later column assignments on
# combined_df do not write into a slice of the unfiltered data.
combined_df = combined_df[keep].copy()

In [8]:
# Write the cleaned, combined data to an Excel workbook without the index column
combined_df.to_excel(excel_writer="reddit_data/combined_data.xlsx", index=False)

In [None]:
# Rows to draw per subreddit, and the seed that makes every draw repeatable
SAMPLE_SIZE = 100
SAMPLE_SEED = 42

# Convert Date to datetime. assign() builds a new frame instead of writing a
# column into combined_df, which may be a filtered slice of a larger frame.
if not pd.api.types.is_datetime64_any_dtype(combined_df['Date']):
    combined_df = combined_df.assign(Date=pd.to_datetime(combined_df['Date']))

# Get list of unique subreddits
subreddits = combined_df['subreddit'].unique()

# Per-date samples are collected here and concatenated once at the end;
# growing a DataFrame with concat inside the loops copies it on every step.
sampled_parts = []

# For each subreddit, spread SAMPLE_SIZE rows across its distinct dates
for subreddit in subreddits:
    subreddit_df = combined_df[combined_df['subreddit'] == subreddit]

    n_dates = subreddit_df['Date'].nunique()
    if n_dates == 0:
        # No usable dates for this subreddit: nothing to allocate
        continue

    # NOTE(review): the remainder always goes to the earliest dates, and when
    # a subreddit has more than SAMPLE_SIZE distinct dates only the first
    # SAMPLE_SIZE dates contribute a row — confirm this is intended.
    samples_per_date, remainder = divmod(SAMPLE_SIZE, n_dates)

    # groupby yields dates in ascending order and skips rows whose Date is
    # missing, so those rows never enter the sample.
    for i, (_, date_df) in enumerate(subreddit_df.groupby('Date', sort=True)):
        n_samples = samples_per_date + (1 if i < remainder else 0)

        # A date cannot contribute more rows than it has
        n_samples = min(n_samples, len(date_df))
        if n_samples == 0:
            continue

        sampled_parts.append(date_df.sample(n=n_samples, random_state=SAMPLE_SEED))

# With nothing sampled, fall back to an empty frame that keeps the columns
sample_data = pd.concat(sampled_parts) if sampled_parts else combined_df.iloc[0:0]

# Sampled rows keep their combined_df index labels, which identify them here
combined_data_without_sample = combined_df[~combined_df.index.isin(sample_data.index)]

sample_data.to_excel('reddit_data/sample_data.xlsx', index=False)
combined_data_without_sample.to_excel('reddit_data/combined_data_without_sample.xlsx', index=False)

print(f"Sample data shape: {sample_data.shape}")
print(f"Remaining data shape: {combined_data_without_sample.shape}")
print(f"Original data shape: {combined_df.shape}")

# Header restored so the code matches the output recorded below this cell
print("\nSample distribution by subreddit:")
print(sample_data['subreddit'].value_counts())

Sample data shape: (900, 5)
Remaining data shape: (71459, 5)
Original data shape: (72359, 5)

Sample distribution by subreddit:
subreddit
politics               100
democrats              100
PoliticalDiscussion    100
Libertarian            100
Conservative           100
AskThe_Donald          100
spacex                 100
technology             100
teslamotors            100
Name: count, dtype: int64
