# Preparatory Notebook

The following code was used to prepare the partisan subreddit data for analysis, to identify the Redditors relevant for the lifestyle analysis, and to prepare the respective extraction. 


The basis for applying this is having the subreddit data stored as csv files within one common folder, with each file representing data for one subreddit during one specific month. 


This network is specifically created for the three years 2014, 2018, and 2022 but can be easily adapted to contain more years.

In [None]:
import pandas as pd
import os
import plotly.express as px
import pytz

In [None]:
# Placeholder: replace with the folder that holds the monthly subreddit csv files.
path_to_csvs = "ENTER PATH TO FOLDER"

# Make that folder the working directory so the later os.listdir() and
# pd.read_csv() calls can use bare file names.
os.chdir(path_to_csvs)

The first steps are to create two DataFrames - one containing all submissions and one containing all comments regardless of the specific year.

In [None]:
# os.listdir() returns entries in arbitrary order, which would make the row
# order of the concatenated DataFrames differ between runs and machines.
# List the working directory once and sort the names for reproducibility.
folder_entries = sorted(os.listdir())

# Files are assigned to a group purely by a substring of their file name.
submission_csvs = [entry for entry in folder_entries if "submission" in entry]
comment_csvs = [entry for entry in folder_entries if "comment" in entry]

In [None]:
# Load every csv file into its own DataFrame; the frames are combined later.
submission_dfs = [pd.read_csv(csv_name) for csv_name in submission_csvs]
comment_dfs = [pd.read_csv(csv_name) for csv_name in comment_csvs]

In [None]:
# Stack the per-file frames into one frame per content type.
# ignore_index=True discards the per-file row labels and renumbers from 0.
submission_df = pd.concat(submission_dfs, ignore_index=True)
comment_df = pd.concat(comment_dfs, ignore_index=True)

A quick inspection to check whether everything worked:

In [None]:
# Bare expression: shows the combined submissions frame as the cell output.
submission_df

In [None]:
# Bare expression: shows the combined comments frame as the cell output.
comment_df

Necessary corrections within the strings, since "\r" is interpreted as a carriage return:

In [None]:
def remove_r(some_string):
    """Replace every carriage return in a string with the two characters "/r".

    Values that are not strings are returned unchanged. pandas represents
    empty csv cells as float NaN, and a membership test such as
    ``"\\r" in nan`` would raise ``TypeError`` when this function is applied
    to a whole column.

    Args:
        some_string: The value to clean, typically one cell of a text column.

    Returns:
        The cleaned string, or the original object if it is not a ``str``.
    """
    if not isinstance(some_string, str):
        return some_string
    # str.replace is a no-op when "\r" is absent, so no pre-check is needed.
    return some_string.replace("\r", "/r")

In [None]:
# Clean the comment text element-wise: remove_r swaps each carriage return
# in a body for the literal "/r".
comment_df["body"] = comment_df["body"].apply(remove_r)

Converting from UNIX time and adjusting the timezone

In [None]:
# All timestamps are expressed in US Eastern time.
timezone = pytz.timezone("America/New_York")

# Submissions and comments receive exactly the same treatment.
for frame in (submission_df, comment_df):
    # "created_utc" holds UNIX seconds; parse them into naive datetimes first.
    naive_stamps = pd.to_datetime(frame["created_utc"], unit="s")

    # Mark the parsed values as UTC, then shift them to the target timezone.
    frame["created"] = naive_stamps.dt.tz_localize("UTC").dt.tz_convert(timezone)

    # Truncate to midnight so every row carries its local posting day.
    frame["posting_day"] = frame["created"].dt.floor("D")

Breaking it all down to year specific dfs

In [None]:
# Extract the calendar year of the (timezone-adjusted) posting day once per
# frame and reuse it for every yearly selection.
submission_years = submission_df["posting_day"].dt.year
comment_years = comment_df["posting_day"].dt.year

subs_df_2014 = submission_df.loc[submission_years == 2014]
subs_df_2018 = submission_df.loc[submission_years == 2018]
subs_df_2022 = submission_df.loc[submission_years == 2022]

comms_df_2014 = comment_df.loc[comment_years == 2014]
comms_df_2018 = comment_df.loc[comment_years == 2018]
comms_df_2022 = comment_df.loc[comment_years == 2022]

Checking for missing days:

In [None]:
def _unique_days_with_flags(frame):
    """Return the distinct posting days of ``frame`` and a matching list of 1s."""
    distinct_days = list(set(frame["posting_day"].to_list()))
    presence_flags = [1] * len(distinct_days)
    return distinct_days, presence_flags


# One (days, flags) pair per content type and year.
sub_dates_2014, exist_list_subs_2014 = _unique_days_with_flags(subs_df_2014)
sub_dates_2018, exist_list_subs_2018 = _unique_days_with_flags(subs_df_2018)
sub_dates_2022, exist_list_subs_2022 = _unique_days_with_flags(subs_df_2022)

com_dates_2014, exist_list_coms_2014 = _unique_days_with_flags(comms_df_2014)
com_dates_2018, exist_list_coms_2018 = _unique_days_with_flags(comms_df_2018)
com_dates_2022, exist_list_coms_2022 = _unique_days_with_flags(comms_df_2022)

In [None]:
def _presence_frame(days, flags):
    """Pair each day with its flag in a two-column frame sorted by day."""
    rows = list(zip(days, flags))
    unsorted_frame = pd.DataFrame(rows, columns=["day", "count"])
    return unsorted_frame.sort_values(by="day")


# "count" is always 1 here: it only marks that a day has at least one post.
sub_date_df_2014 = _presence_frame(sub_dates_2014, exist_list_subs_2014)
sub_date_df_2018 = _presence_frame(sub_dates_2018, exist_list_subs_2018)
sub_date_df_2022 = _presence_frame(sub_dates_2022, exist_list_subs_2022)

com_date_df_2014 = _presence_frame(com_dates_2014, exist_list_coms_2014)
com_date_df_2018 = _presence_frame(com_dates_2018, exist_list_coms_2018)
com_date_df_2022 = _presence_frame(com_dates_2022, exist_list_coms_2022)


In [None]:
# The title summarises the covered range: first day, last day, number of days.
chart_title = f"First = {min(sub_dates_2014)}, Last = {max(sub_dates_2014)}, Total  = {len(sub_dates_2014)}"
# One bar per day with a submission in 2014; missing days show up as gaps.
px.bar(sub_date_df_2014, x="day", y="count", title=chart_title)

In [None]:
# The title summarises the covered range: first day, last day, number of days.
chart_title = f"First = {min(sub_dates_2018)}, Last = {max(sub_dates_2018)}, Total  = {len(sub_dates_2018)}"
# One bar per day with a submission in 2018; missing days show up as gaps.
px.bar(sub_date_df_2018, x="day", y="count", title=chart_title)

In [None]:
# The title summarises the covered range: first day, last day, number of days.
chart_title = f"First = {min(sub_dates_2022)}, Last = {max(sub_dates_2022)}, Total  = {len(sub_dates_2022)}"
# One bar per day with a submission in 2022; missing days show up as gaps.
px.bar(sub_date_df_2022, x="day", y="count", title=chart_title)

In [None]:
# The title summarises the covered range: first day, last day, number of days.
chart_title = f"First = {min(com_dates_2014)}, Last = {max(com_dates_2014)}, Total  = {len(com_dates_2014)}"
# One bar per day with a comment in 2014; missing days show up as gaps.
px.bar(com_date_df_2014, x="day", y="count", title=chart_title)

In [None]:
# The title summarises the covered range: first day, last day, number of days.
chart_title = f"First = {min(com_dates_2018)}, Last = {max(com_dates_2018)}, Total  = {len(com_dates_2018)}"
# One bar per day with a comment in 2018; missing days show up as gaps.
px.bar(com_date_df_2018, x="day", y="count", title=chart_title)

In [None]:
# The title summarises the covered range: first day, last day, number of days.
chart_title = f"First = {min(com_dates_2022)}, Last = {max(com_dates_2022)}, Total  = {len(com_dates_2022)}"
# One bar per day with a comment in 2022; missing days show up as gaps.
px.bar(com_date_df_2022, x="day", y="count", title=chart_title)

In [None]:
# "created" and "posting_day" were only needed for the yearly split and the
# missing-day check; remove both in one step before the data is stored.
helper_columns = ["created", "posting_day"]

subs_df_2014 = subs_df_2014.drop(columns=helper_columns)
comms_df_2014 = comms_df_2014.drop(columns=helper_columns)

subs_df_2018 = subs_df_2018.drop(columns=helper_columns)
comms_df_2018 = comms_df_2018.drop(columns=helper_columns)

subs_df_2022 = subs_df_2022.drop(columns=helper_columns)
comms_df_2022 = comms_df_2022.drop(columns=helper_columns)

Storing the partisan subreddit data:

In [None]:
# Write each yearly frame to csv without the row index.
# NOTE: all six calls use the same placeholder string. Give every call its
# own target path, otherwise each write overwrites the previous file.
subs_df_2014.to_csv("PATH FOR FILE TO BE STORED", index=False)
comms_df_2014.to_csv("PATH FOR FILE TO BE STORED", index=False)

subs_df_2018.to_csv("PATH FOR FILE TO BE STORED", index=False)
comms_df_2018.to_csv("PATH FOR FILE TO BE STORED", index=False)

subs_df_2022.to_csv("PATH FOR FILE TO BE STORED", index=False)
comms_df_2022.to_csv("PATH FOR FILE TO BE STORED", index=False)

Preparing network files:

In [None]:
# Minimum number of contributions (submissions plus comments) an author needs
# in one subreddit during one year to be kept for the network; the later
# filter compares author counts with ">= relevancy_threshold".
relevancy_threshold = 5

In [None]:
def _from_subreddit(frame, subreddit_name):
    """Return the rows of ``frame`` whose "subreddit" equals ``subreddit_name``."""
    return frame[frame["subreddit"] == subreddit_name]


# Submissions per party subreddit and year (the names are matched exactly).
dem_subs_2014 = _from_subreddit(subs_df_2014, "democrats")
dem_subs_2018 = _from_subreddit(subs_df_2018, "democrats")
dem_subs_2022 = _from_subreddit(subs_df_2022, "democrats")

rep_subs_2014 = _from_subreddit(subs_df_2014, "Republican")
rep_subs_2018 = _from_subreddit(subs_df_2018, "Republican")
rep_subs_2022 = _from_subreddit(subs_df_2022, "Republican")

# Comments per party subreddit and year.
dem_comms_2014 = _from_subreddit(comms_df_2014, "democrats")
dem_comms_2018 = _from_subreddit(comms_df_2018, "democrats")
dem_comms_2022 = _from_subreddit(comms_df_2022, "democrats")

rep_comms_2014 = _from_subreddit(comms_df_2014, "Republican")
rep_comms_2018 = _from_subreddit(comms_df_2018, "Republican")
rep_comms_2022 = _from_subreddit(comms_df_2022, "Republican")

Breaking it down to individual authors and preparing the txt files for the lifestyle networks

In [None]:
def _author_activity(submissions, comments):
    """Count contributions per author and list the sufficiently active ones.

    Returns a pair: the per-author counts over submissions and comments
    combined, and the list of authors whose count reaches
    ``relevancy_threshold``.
    """
    all_authors = pd.concat([submissions["author"], comments["author"]], ignore_index=True)
    contribution_counts = all_authors.value_counts()
    is_relevant = contribution_counts >= relevancy_threshold
    return contribution_counts, contribution_counts[is_relevant].index.to_list()


dem_redditors_2014, network_dem_redditors_2014 = _author_activity(dem_subs_2014, dem_comms_2014)
dem_redditors_2018, network_dem_redditors_2018 = _author_activity(dem_subs_2018, dem_comms_2018)
dem_redditors_2022, network_dem_redditors_2022 = _author_activity(dem_subs_2022, dem_comms_2022)

rep_redditors_2014, network_rep_redditors_2014 = _author_activity(rep_subs_2014, rep_comms_2014)
rep_redditors_2018, network_rep_redditors_2018 = _author_activity(rep_subs_2018, rep_comms_2018)
rep_redditors_2022, network_rep_redditors_2022 = _author_activity(rep_subs_2022, rep_comms_2022)

In [None]:
network_redditor_lists = [
    network_dem_redditors_2014,
    network_dem_redditors_2018,
    network_dem_redditors_2022,
    network_rep_redditors_2014,
    network_rep_redditors_2018,
    network_rep_redditors_2022,
]

# Names that are excluded from every list.
remove_users = ["[deleted]", "AutoModerator"]

# The lists are modified in place so the individual variables stay in sync.
for author_names in network_redditor_lists:
    for unwanted in remove_users:
        try:
            author_names.remove(unwanted)
        except ValueError:
            # The name is not in this list; nothing to remove.
            pass
    print(len(author_names))

In [None]:
# Merge the Democrat and Republican lists per year; the set removes authors
# who are active in both subreddits. Sorting gives a deterministic order,
# because the iteration order of a set of strings depends on the per-process
# hash seed and would otherwise differ between runs in the exported files.
network_redditors_2014 = sorted(set(network_dem_redditors_2014 + network_rep_redditors_2014))
network_redditors_2018 = sorted(set(network_dem_redditors_2018 + network_rep_redditors_2018))
network_redditors_2022 = sorted(set(network_dem_redditors_2022 + network_rep_redditors_2022))

In [None]:
# Report how many distinct Redditors remain for each year.
for yearly_redditors in (network_redditors_2014, network_redditors_2018, network_redditors_2022):
    print(len(yearly_redditors))

Storing the txt files that serve as a basis for applying the lifestyle posting extraction script

In [None]:
# Placeholder: replace with the folder the txt files should be written to.
txt_folder_path = "PATH TO FOLDER"

txt_file_names = ["redditors_2014", "redditors_2018", "redditors_2022"]

redditors_to_store = [network_redditors_2014, network_redditors_2018, network_redditors_2022]

# Pair every file name with its list instead of indexing with a fixed range,
# so adding another year only requires extending the two lists above.
for file_stem, redditors in zip(txt_file_names, redditors_to_store):
    # os.path.join inserts the separator when the folder path has no trailing
    # slash; plain concatenation would glue folder and file name together.
    target_path = os.path.join(txt_folder_path, file_stem + ".txt")
    with open(target_path, "w", encoding="utf-8") as txt_file:
        # One Redditor name per line.
        txt_file.writelines(redditor + "\n" for redditor in redditors)