# Analysis Code Template

The code in this notebook was used to perform all analytical steps. It was designed so that only a few parameters need to be specified at the beginning; each yearly dataset is then analysed accordingly.

In [None]:
import pandas as pd
import os
import pytz
import numpy as np
import plotly
import plotly.express as px
import re
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
from collections import OrderedDict
from sknetwork.clustering import Louvain, get_modularity
from sknetwork.data import from_edge_list
from umap import UMAP
import json
import plotly.graph_objects as go
import matplotlib

Parameters to specify:

In [None]:
# Year of the data set to analyse, as a string; it is appended to the input file names below.
this_specific_year = "YEAR TO ANALYZE"

# NOTE(review): the four thresholds below are not used in the part of the notebook
# shown here — presumably they are consumed by later cells; confirm their meaning there.
min_responses_in_subred = 4

lifestyle_threshold = 10

subreddit_engagement_threshold = 10

neutral_zone_marker = 0.2

In [None]:
# Election day (ISO date string) for every year this template supports.
election_dates_by_year = {
    "2014": "2014-11-04",
    "2018": "2018-11-06",
    "2022": "2022-11-08",
}

# Fail right away with a clear message instead of leaving `election_date`
# undefined, which would otherwise only surface later as a NameError.
if this_specific_year not in election_dates_by_year:
    raise ValueError(
        "this_specific_year must be one of "
        + ", ".join(sorted(election_dates_by_year))
        + "; got " + repr(this_specific_year)
    )

election_date = election_dates_by_year[this_specific_year]

The naming logic for the submission and comment files was "all_demrep_submissions_YEAR.csv" and "all_demrep_comments_YEAR.csv", while the lifestyle subreddit files (network files) were named "clean_nt_file_YEAR.csv".

In [None]:
# File-name prefixes: the year and ".csv" are appended when the files are read.
sub_path = "PATH_TO_SUBMISSION_FOLDER/all_demrep_submissions_"
comm_path = "PATH_TO_COMMENT_FOLDER/all_demrep_comments_"

# NOTE(review): the year is appended directly to this placeholder — presumably it has to
# end with the network file prefix (e.g. ".../clean_nt_file_"); confirm.
network_file_path = "PATH_TO_LIFESTYLE_FOLDER" + this_specific_year + ".csv"

# Text files that are read line by line to build the list of accounts to exclude.
# Entries from `path_to_bots` have their first three characters cut off when read.
path_to_bots = "PATH TO TXT FILE CONTAINING BOTS"
path_to_found_bots = "PATH TO TXT FILE CONTAINING FOUND BOTS"
path_to_more_trolls_and_bots= "PATH TO TXT FILE CONTAINING ADDITIONAL TROLLS AND BOTS"

In [None]:
# The year and "/summary_df.csv" are concatenated directly onto `save_csvs` when the
# summary is written, so the value must end with a path separator (or file prefix).
save_csvs = "PATH TO FOLDER IN WHICH TO SAVE RESULTING CSV FILES"
# NOTE(review): not used in the part of the notebook shown here — presumably used by later plotting cells.
save_plots = "PATH TO FOLDER IN WHICH TO SAVE RESULTING PLOTS"

## Exploratory Analysis

In [None]:
# Number of years between two consecutive data sets (used for the comparison with earlier years).
file_year_intervalls = 4

# Row labels shared by every column of the summary table.
row_names = ["Democrats", "Republican", "Total", "Deleted/Removed", "Excluded", "Additional info"]

# Load this year's submissions and comments.
this_years_subs = pd.read_csv(f"{sub_path}{this_specific_year}.csv")
this_years_comms = pd.read_csv(f"{comm_path}{this_specific_year}.csv")

# The summary table starts with one column, named after the year, holding the row labels.
summary_df = pd.DataFrame({this_specific_year: row_names})

In [None]:
# Turn the epoch-second timestamps of both frames into datetimes and
# rename the column from "created_utc" to "created".
for posts in (this_years_subs, this_years_comms):
    posts["created_utc"] = pd.to_datetime(posts["created_utc"], unit='s')
    posts.rename(columns={"created_utc": "created"}, inplace=True)

In [None]:
timezone = pytz.timezone("America/New_York")

# Mark the naive timestamps as UTC, then express them in the New York time zone.
for posts in (this_years_subs, this_years_comms):
    posts["created"] = posts["created"].dt.tz_localize("UTC").dt.tz_convert(timezone)

In [None]:
# Display the submissions frame as a visual sanity check.
this_years_subs

In [None]:
# Display the comments frame as a visual sanity check.
this_years_comms

In [None]:
# Split submissions and comments by subreddit. `.copy()` turns every subset into an
# independent frame: the subsets are modified later (rows dropped in place, columns
# added), which on a plain slice of the parent frame triggers SettingWithCopyWarning.
this_years_dem_subs = this_years_subs[this_years_subs["subreddit"] == "democrats"].copy()
this_years_rep_subs = this_years_subs[this_years_subs["subreddit"] == "Republican"].copy()
this_years_dem_comms = this_years_comms[this_years_comms["subreddit"] == "democrats"].copy()
this_years_rep_comms = this_years_comms[this_years_comms["subreddit"] == "Republican"].copy()

### Counting Users

In [None]:
# Accounts that are always excluded.
remove_users = ["[deleted]", "AutoModerator", "election_info_bot"]

unwanted_user_sources = [path_to_bots, path_to_found_bots, path_to_more_trolls_and_bots]

# Start from the fixed names and add one entry per line of every source file.
unwanted_users = list(remove_users)

for source_path in unwanted_user_sources:
    # Only entries of the main bot file carry a three-character prefix that is cut off.
    prefix_length = 3 if source_path == path_to_bots else 0
    with open(source_path, "r") as handle:
        unwanted_users.extend(entry.strip()[prefix_length:] for entry in handle)

# De-duplicated troll/bot names; "[deleted]" is left out of this list.
troll_bot_users = [user for user in set(unwanted_users) if user != "[deleted]"]

In [None]:
def _split_off_unwanted(redditors, unwanted):
    """Separate unwanted accounts from a list of unique author names.

    Returns a tuple ``(kept, removed)``: ``kept`` holds the authors that are not
    unwanted (original order preserved) and ``removed`` the unwanted accounts that
    were actually present, each once, in the order they appear in ``unwanted``.
    Set lookups replace the repeated list scans and ``list.remove`` calls.
    """
    present = set(redditors)
    removed = []
    for user in unwanted:
        if user in present:
            present.remove(user)  # guards against duplicates in `unwanted`
            removed.append(user)
    removed_lookup = set(removed)
    kept = [redditor for redditor in redditors if redditor not in removed_lookup]
    return kept, removed


# Unique authors (submissions and comments combined) per subreddit.
this_years_dem_redditors = list(set(pd.concat([this_years_dem_subs["author"], this_years_dem_comms["author"]], ignore_index=True).to_list()))
this_years_rep_redditors = list(set(pd.concat([this_years_rep_subs["author"], this_years_rep_comms["author"]], ignore_index=True).to_list()))

this_years_dem_redditors, removed_dems = _split_off_unwanted(this_years_dem_redditors, unwanted_users)
this_years_rep_redditors, removed_reps = _split_off_unwanted(this_years_rep_redditors, unwanted_users)

this_years_redditor_lists = [this_years_dem_redditors, this_years_rep_redditors]

# Authors active in at least one of the two subreddits.
this_years_total_redditors = list(set(this_years_rep_redditors + this_years_dem_redditors))


In the next step, "[deleted]" is left out of the count of excluded users, because it is a placeholder shared by many accounts rather than one single user.

In [None]:
# "[deleted]" is a placeholder, not one single user, so it is not counted as an excluded
# account. Filtering it explicitly (instead of always subtracting 1) keeps the number
# correct when "[deleted]" did not occur among a subreddit's removed users.
excluded_dem_count = sum(1 for user in removed_dems if user != "[deleted]")
excluded_rep_count = sum(1 for user in removed_reps if user != "[deleted]")

excluded_redditors = "Dems: " + str(excluded_dem_count) + "; Reps: " + str(excluded_rep_count)

In [None]:
# Summary column "Redditors": one entry per summary row.
this_years_user_column = [
    len(this_years_dem_redditors),
    len(this_years_rep_redditors),
    len(this_years_total_redditors),
    "Not clear as only one indicator",
    excluded_redditors,
    "-",
]

In [None]:
def _activity_without_unwanted(author_columns):
    """Count postings per author over the given author columns, leaving out unwanted users."""
    counts = pd.concat(author_columns, ignore_index=True).value_counts()
    return counts[~counts.index.isin(unwanted_users)]


# Postings (submissions + comments) per author; "frequent" means at least 5 postings.
freq_dem_redditors = _activity_without_unwanted([this_years_dem_subs["author"], this_years_dem_comms["author"]])
freq_dem_redditors_ser = freq_dem_redditors[freq_dem_redditors >= 5]

freq_rep_redditors = _activity_without_unwanted([this_years_rep_subs["author"], this_years_rep_comms["author"]])
freq_rep_redditors_ser = freq_rep_redditors[freq_rep_redditors >= 5]


In [None]:
# Summary column "Frequent Redditors"; the total counts every frequent author once.
frequ_redditors = [
    len(freq_dem_redditors_ser),
    len(freq_rep_redditors_ser),
    len(set(freq_dem_redditors_ser.index) | set(freq_rep_redditors_ser.index)),
    "-",
    "-",
    "-",
]

"Even more frequent redditors" are called "more frequent redditors" in the thesis.

In [None]:
# Authors active in both subreddits, over all authors and over the frequent ones.
common_redditors = set(this_years_dem_redditors) & set(this_years_rep_redditors)
common_frequ_redditors = set(freq_dem_redditors_ser.index) & set(freq_rep_redditors_ser.index)

# Stricter activity level: at least 10 postings.
even_more_freq_dem_redditors = freq_dem_redditors_ser.loc[lambda counts: counts >= 10]
even_more_freq_rep_redditors = freq_rep_redditors_ser.loc[lambda counts: counts >= 10]

common_more_frequ_redditors = set(even_more_freq_dem_redditors.index) & set(even_more_freq_rep_redditors.index)

In [None]:
# Summary column "More frequent Redditors"; the total counts every such author once.
more_frequ_redditors = [
    len(even_more_freq_dem_redditors),
    len(even_more_freq_rep_redditors),
    len(set(even_more_freq_dem_redditors.index) | set(even_more_freq_rep_redditors.index)),
    "-",
    "-",
    "-",
]

### Submissions

Submissions with the title "[ Removed by Reddit ]" are not removed, as they have comments following them.
The same goes for "[deleted by user]".

In [None]:
def _title_count(submissions, title):
    """Number of submissions in `submissions` whose title equals `title` exactly."""
    return (submissions["title"] == title).sum()


# Placeholder titles left behind by Reddit removals and user deletions.
dem_removed_by_redd_subs = _title_count(this_years_dem_subs, "[ Removed by Reddit ]")
rep_removed_by_redd_subs = _title_count(this_years_rep_subs, "[ Removed by Reddit ]")
dem_del_by_user_subs = _title_count(this_years_dem_subs, "[deleted by user]")
rep_del_by_user_subs = _title_count(this_years_rep_subs, "[deleted by user]")

this_years_deleted_subs = f"Dems: {dem_removed_by_redd_subs + dem_del_by_user_subs}; Reps: {rep_removed_by_redd_subs + rep_del_by_user_subs}"

In [None]:
# Titles used for two or more submissions, per subreddit. The two placeholder titles are
# dropped with errors="ignore": a placeholder that occurs only once is already gone after
# the `>= 2` filter, and dropping it unconditionally would raise a KeyError.
placeholder_titles = ["[deleted by user]", "[ Removed by Reddit ]"]

this_years_multiple_dem_subs = this_years_dem_subs["title"].value_counts()
this_years_multiple_dem_subs = this_years_multiple_dem_subs[this_years_multiple_dem_subs>=2]
this_years_multiple_dem_subs = this_years_multiple_dem_subs.drop(placeholder_titles, errors="ignore")

this_years_multiple_rep_subs = this_years_rep_subs["title"].value_counts()
this_years_multiple_rep_subs = this_years_multiple_rep_subs[this_years_multiple_rep_subs>=2]
this_years_multiple_rep_subs = this_years_multiple_rep_subs.drop(placeholder_titles, errors="ignore")

In [None]:
# Submissions not authored by a known troll or bot. `.copy()` makes the results
# independent frames, so the column added to them later does not raise a
# SettingWithCopyWarning.
ty_dem_subs_without_troll_bots = this_years_dem_subs[~this_years_dem_subs["author"].isin(troll_bot_users)].copy()
ty_rep_subs_without_troll_bots = this_years_rep_subs[~this_years_rep_subs["author"].isin(troll_bot_users)].copy()

In [None]:
# Number of submissions that were dropped because a troll/bot authored them.
dem_trollbot_sub_count = len(this_years_dem_subs) - len(ty_dem_subs_without_troll_bots)
rep_trollbot_sub_count = len(this_years_rep_subs) - len(ty_rep_subs_without_troll_bots)

subs_info = (
    f"Multiple submission names: Dems: {len(this_years_multiple_dem_subs)}"
    f"; Reps: {len(this_years_multiple_rep_subs)}"
    f"; Submissions made by trollbots Dems: {dem_trollbot_sub_count}"
    f" Reps: {rep_trollbot_sub_count}"
)

In [None]:
# Unique submission ids per subreddit after removing troll/bot submissions.
dem_clean_sub_count = ty_dem_subs_without_troll_bots["id"].nunique()
rep_clean_sub_count = ty_rep_subs_without_troll_bots["id"].nunique()

# Summary column "Submissions".
this_years_submission_column = [
    dem_clean_sub_count,
    rep_clean_sub_count,
    dem_clean_sub_count + rep_clean_sub_count,
    this_years_deleted_subs,
    "NOTE: Deleted Subs are still counted, as the comments are left in",
    subs_info,
]

In [None]:
# From here on the submission frames refer to the versions without troll/bot submissions.
this_years_dem_subs = ty_dem_subs_without_troll_bots
this_years_rep_subs = ty_rep_subs_without_troll_bots

### Comments

In [None]:
# Comment bodies that mark a deleted or removed comment.
del_rem_comments = ["[deleted]", "[removed]"]

this_years_commentator_dfs = [this_years_dem_comms, this_years_rep_comms]


def _purge_comments(commentator_df):
    """Drop placeholder comments and troll/bot comments from `commentator_df` in place.

    Returns a tuple ``(deleted, removed, excluded, excluded_authors)``:
    the number of comments whose body is "[deleted]", the number whose body is
    "[removed]", the number of remaining comments written by accounts listed in
    `troll_bot_users`, and the names of those accounts that actually commented.
    Counts are taken once per frame instead of once per user or placeholder.
    """
    body_counts = commentator_df["body"].value_counts()
    deleted = body_counts.get("[deleted]", 0)
    removed = body_counts.get("[removed]", 0)
    commentator_df.drop(commentator_df[commentator_df["body"].isin(del_rem_comments)].index, inplace=True)

    # Troll/bot comments are counted after the placeholder comments are gone.
    author_counts = commentator_df["author"].value_counts()
    excluded_authors = [user for user in troll_bot_users if user in author_counts.index]
    excluded = sum(author_counts[user] for user in excluded_authors)
    commentator_df.drop(commentator_df[commentator_df["author"].isin(excluded_authors)].index, inplace=True)

    return deleted, removed, excluded, excluded_authors


deleted__dem_comms, removed__dem_comms, excluded_dem_comms, excluded_dem_authors = _purge_comments(this_years_dem_comms)
deleted__rep_comms, removed__rep_comms, excluded_rep_comms, excluded_rep_authors = _purge_comments(this_years_rep_comms)




In [None]:
# Text for the "Deleted/Removed" row of the comment column.
this_years_del_rem_comms = (
    f"Deleted: Dems: {deleted__dem_comms}; Reps: {deleted__rep_comms}"
    f" - Removed: Dems: {removed__dem_comms}; Reps: {removed__rep_comms}"
)

In [None]:
# Text for the "Excluded" row of the comment column (comments by trolls/bots).
this_years_excl_comms = f"Dems: {excluded_dem_comms}; Reps: {excluded_rep_comms}"

In [None]:
# Names of the excluded commentators, comma separated per subreddit.
excluded_dem_names = ", ".join(excluded_dem_authors)
excluded_rep_names = ", ".join(excluded_rep_authors)

comms_info = f"Excluded commentators: Dems: {excluded_dem_names}; Reps: {excluded_rep_names}"

In [None]:
# Unique comment ids per subreddit after the clean-up.
dem_comment_count = this_years_dem_comms["id"].nunique()
rep_comment_count = this_years_rep_comms["id"].nunique()

# Summary column "Comments".
this_years_comment_column = [
    dem_comment_count,
    rep_comment_count,
    dem_comment_count + rep_comment_count,
    this_years_del_rem_comms,
    this_years_excl_comms,
    comms_info,
]

In [None]:
# Unique submission / comment ids per subreddit.
dem_sub_ids = this_years_dem_subs["id"].nunique()
rep_sub_ids = this_years_rep_subs["id"].nunique()
dem_comm_ids = this_years_dem_comms["id"].nunique()
rep_comm_ids = this_years_rep_comms["id"].nunique()

# Relative sizes of the two subreddits, in both directions.
ratio_info = (
    f"Dems had {dem_sub_ids / rep_sub_ids} as many submissions as Reps and "
    f"{dem_comm_ids / rep_comm_ids} as many comments. While Reps had "
    f"{rep_sub_ids / dem_sub_ids} as many submissions as Dems and "
    f"{rep_comm_ids / dem_comm_ids} as many comments"
)

In [None]:
# Unique submission / comment ids per subreddit.
dem_submission_ids = this_years_dem_subs["id"].nunique()
rep_submission_ids = this_years_rep_subs["id"].nunique()
dem_comment_ids = this_years_dem_comms["id"].nunique()
rep_comment_ids = this_years_rep_comms["id"].nunique()

# Summary column "Comments/Subs": comments per submission.
comm_sub_ratio = [
    dem_comment_ids / dem_submission_ids,
    rep_comment_ids / rep_submission_ids,
    (dem_comment_ids + rep_comment_ids) / (dem_submission_ids + rep_submission_ids),
    "-",
    "-",
    ratio_info,
]

### News Sources

In [None]:
def domain_finder(some_text):
    """Extract the domains of all links found in a piece of text.

    A link starts with "http://", "https://" or "www." and its host part ends at
    the first "/", whitespace, comma, "?", "#", bracket, angle bracket or quote.
    The scheme and one leading "www." are removed from the host; trailing
    punctuation (".", ":", ";", "!", "*") is stripped.

    Parameters
    ----------
    some_text : object
        Text to search. Anything that is not a ``str`` (e.g. NaN) yields ``None``.

    Returns
    -------
    str or None
        The domain if exactly one link was found, the domains joined by ","
        (in order of appearance, duplicates kept) if several were found, and
        ``None`` if there is no link.
    """
    if not isinstance(some_text, str):
        return None

    # Characters that end a markdown link or a sentence are excluded from the host,
    # so "[text](https://example.com)" yields "example.com" and not "example.com)".
    link_pattern = r'(?:https?://|www\.)[^/\s,?#()\[\]<>"\']+'
    link_list = []
    for any_link in re.finditer(link_pattern, some_text):
        # Anchored at the start: only the prefix is removed, never a "www." that is
        # part of the host name itself (e.g. "awww.net").
        domain = re.sub(r'^(?:https?://)?(?:www\.)?', '', any_link.group())
        domain = domain.rstrip(".:;!*")
        if domain:
            link_list.append(domain)

    if link_list:
        return ",".join(link_list)
    return None

In [None]:
# Store the domains linked in each submission's selftext ...
this_years_dem_subs["selftext_domains"] = this_years_dem_subs["selftext"].apply(domain_finder)
this_years_rep_subs["selftext_domains"] = this_years_rep_subs["selftext"].apply(domain_finder)

# ... and the domains linked in each comment body.
this_years_dem_comms["domain"] = this_years_dem_comms["body"].apply(domain_finder)
this_years_rep_comms["domain"] = this_years_rep_comms["body"].apply(domain_finder)

In [None]:
# Per-submission results of domain_finder as plain lists (one entry per submission).
dem_sub_selte_domains_unpacked = this_years_dem_subs["selftext_domains"].to_list()
rep_sub_selte_domains_unpacked = this_years_rep_subs["selftext_domains"].to_list()

In [None]:
# Per-comment results of domain_finder as plain lists (one entry per comment).
dem_com_domains_unpacked = this_years_dem_comms["domain"].to_list()
rep_com_domains_unpacked = this_years_rep_comms["domain"].to_list()

In [None]:
def _unique_domains_per_post(packed_domains):
    """Flatten comma-joined domain strings into one list.

    Non-string entries are skipped; within a single entry every domain is kept once.
    """
    return [
        domain
        for entry in packed_domains
        if isinstance(entry, str)
        for domain in set(entry.split(","))
    ]


dem_sub_selftext_domains = _unique_domains_per_post(dem_sub_selte_domains_unpacked)
rep_sub_selftext_domains = _unique_domains_per_post(rep_sub_selte_domains_unpacked)


In [None]:
# Flatten the comma-joined domain strings of the comments: non-string entries are
# skipped and, within one comment, every domain is kept only once.
dem_comm_domains = [
    domain
    for entry in dem_com_domains_unpacked
    if isinstance(entry, str)
    for domain in set(entry.split(","))
]

rep_comm_domains = [
    domain
    for entry in rep_com_domains_unpacked
    if isinstance(entry, str)
    for domain in set(entry.split(","))
]


The same domain posted multiple times within one comment is counted only once. Multiple links within one comment are all included if they refer to different domains.

In the sub domains there is a NaN float and a "None" string

In both cases the author has been deleted but the title remained

In [None]:
# Distinct domains over submissions, submission selftexts and comments per subreddit.
# The "None" string, empty values and float entries (NaN) are filtered out.
dem_domain_pool = set(this_years_dem_subs["domain"].to_list() + dem_sub_selftext_domains + dem_comm_domains)
nr_dem_domains = [
    domain
    for domain in dem_domain_pool
    if domain != "None" and domain and not isinstance(domain, float)
]

rep_domain_pool = set(this_years_rep_subs["domain"].to_list() + rep_sub_selftext_domains + rep_comm_domains)
nr_rep_domains = [
    domain
    for domain in rep_domain_pool
    if domain != "None" and domain and not isinstance(domain, float)
]

In [None]:
# Summary column "altogether_domains"; the total counts every domain once.
aggregate_domains = [
    len(nr_dem_domains),
    len(nr_rep_domains),
    len(set(nr_dem_domains) | set(nr_rep_domains)),
    "-",
    "-",
    "-",
]

In [None]:
# Domains the submissions link to, as plain lists.
dem_link_domains = this_years_dem_subs["domain"].to_list()
rep_link_domains = this_years_rep_subs["domain"].to_list()

# Summary column "submission_domains": number of distinct values.
submission_domains = [
    len(set(dem_link_domains)),
    len(set(rep_link_domains)),
    len(set(dem_link_domains + rep_link_domains)),
    "-",
    "-",
    "-",
]

In [None]:
# Submission link domains together with the domains found in the selftexts.
dem_sub_domains_with_selftext = this_years_dem_subs["domain"].to_list() + dem_sub_selftext_domains
rep_sub_domains_with_selftext = this_years_rep_subs["domain"].to_list() + rep_sub_selftext_domains

# Summary column "all_submission_domains_incl_selftext": number of distinct values.
submission_domains_incl_selftext = [
    len(set(dem_sub_domains_with_selftext)),
    len(set(rep_sub_domains_with_selftext)),
    len(set(dem_sub_domains_with_selftext + rep_sub_domains_with_selftext)),
    "-",
    "-",
    "-",
]

In [None]:
# Summary column "found_submission_selftext_domains": distinct domains found in selftexts.
found_submission_selftext_domains = [
    len(set(dem_sub_selftext_domains)),
    len(set(rep_sub_selftext_domains)),
    len(set(dem_sub_selftext_domains) | set(rep_sub_selftext_domains)),
    "-",
    "-",
    "-",
]

In [None]:
# Summary column "found_comment_domains": distinct domains found in comments.
found_comment_domains = [
    len(set(dem_comm_domains)),
    len(set(rep_comm_domains)),
    len(set(dem_comm_domains) | set(rep_comm_domains)),
    "-",
    "-",
    "-",
]

In [None]:
# Occurrences of each comment domain; "frequent" means at least 5 occurrences.
dem_com_domain_df = pd.DataFrame(dem_comm_domains, columns=["domain"])
dem_comment_domains = dem_com_domain_df.value_counts()
frequent_dem_comment_domains = dem_comment_domains.loc[lambda counts: counts >= 5].index.to_list()

rep_com_domain_df = pd.DataFrame(rep_comm_domains, columns=["domain"])
rep_comment_domains = rep_com_domain_df.value_counts()
frequent_rep_comment_domains = rep_comment_domains.loc[lambda counts: counts >= 5].index.to_list()

In [None]:
# Submission link domains that occur at least 5 times, per subreddit.
frequent_dem_sub_domains = (
    this_years_dem_subs["domain"]
    .value_counts()
    .loc[lambda counts: counts >= 5]
    .index.to_list()
)

frequent_rep_sub_domains = (
    this_years_rep_subs["domain"]
    .value_counts()
    .loc[lambda counts: counts >= 5]
    .index.to_list()
)

In [None]:
# Occurrences of each selftext domain; "frequent" means at least 5 occurrences.
dem_sub_selftext_domain_df = pd.DataFrame(dem_sub_selftext_domains, columns=["domain"])
dem_submission_found_selftext_domains = dem_sub_selftext_domain_df.value_counts()
frequent_dem_submission_found_selftext_domains = (
    dem_submission_found_selftext_domains.loc[lambda counts: counts >= 5].index.to_list()
)

rep_sub_selftext_domain_df = pd.DataFrame(rep_sub_selftext_domains, columns=["domain"])
rep_submission_found_selftext_domains = rep_sub_selftext_domain_df.value_counts()
frequent_rep_submission_found_selftext_domains = (
    rep_submission_found_selftext_domains.loc[lambda counts: counts >= 5].index.to_list()
)

In [None]:
# Occurrences of each domain over submission links and selftexts combined.
dem_agg_submission_domains = pd.Series(
    this_years_dem_subs["domain"].to_list() + dem_sub_selftext_domains, name="domain"
).value_counts()
rep_agg_submission_domains = pd.Series(
    this_years_rep_subs["domain"].to_list() + rep_sub_selftext_domains, name="domain"
).value_counts()

# Only the domains with at least 5 occurrences.
dem_frequent_agg_submission_domains = dem_agg_submission_domains.loc[lambda counts: counts >= 5]
rep_frequent_agg_submission_domains = rep_agg_submission_domains.loc[lambda counts: counts >= 5]

In [None]:
# Stack submission, selftext and comment domains and count the occurrences of each
# domain; "frequent" means at least 5 occurrences.
all_dem_domain_columns = [this_years_dem_subs["domain"], dem_sub_selftext_domain_df["domain"], dem_com_domain_df["domain"]]
all_dem_domain_df = pd.concat(all_dem_domain_columns, axis=0, ignore_index=True).to_frame()
dem_agg_domain_count = all_dem_domain_df["domain"].value_counts()
frequent_dem_agg_domains = dem_agg_domain_count.loc[lambda counts: counts >= 5].index.to_list()

all_rep_domain_columns = [this_years_rep_subs["domain"], rep_sub_selftext_domain_df["domain"], rep_com_domain_df["domain"]]
all_rep_domain_df = pd.concat(all_rep_domain_columns, axis=0, ignore_index=True).to_frame()
rep_agg_domain_count = all_rep_domain_df["domain"].value_counts()
frequent_rep_agg_domains = rep_agg_domain_count.loc[lambda counts: counts >= 5].index.to_list()

In [None]:
# Summary column "frequ alltog domains".
frequent_agg_domains = [
    len(frequent_dem_agg_domains),
    len(frequent_rep_agg_domains),
    len(set(frequent_dem_agg_domains) | set(frequent_rep_agg_domains)),
    "-",
    "-",
    "-",
]

In [None]:
# Summary column "freq sub domains".
frequent_sub_domains = [
    len(set(frequent_dem_sub_domains)),
    len(set(frequent_rep_sub_domains)),
    len(set(frequent_dem_sub_domains) | set(frequent_rep_sub_domains)),
    "-",
    "-",
    "-",
]

In [None]:
# Summary column "frequent_found_submission_domains".
frequent_found_sub_domains = [
    len(set(frequent_dem_submission_found_selftext_domains)),
    len(set(frequent_rep_submission_found_selftext_domains)),
    len(set(frequent_dem_submission_found_selftext_domains) | set(frequent_rep_submission_found_selftext_domains)),
    "-",
    "-",
    "-",
]

In [None]:
# Summary column "frequent_aggregated_submission_domains".
frequent_agg_submission_domains = [
    len(dem_frequent_agg_submission_domains),
    len(rep_frequent_agg_submission_domains),
    len(set(dem_frequent_agg_submission_domains.index) | set(rep_frequent_agg_submission_domains.index)),
    "-",
    "-",
    "-",
]

In [None]:
# Summary column "freq comm domains".
frequent_comm_domains = [
    len(set(frequent_dem_comment_domains)),
    len(set(frequent_rep_comment_domains)),
    len(set(frequent_dem_comment_domains) | set(frequent_rep_comment_domains)),
    "-",
    "-",
    "-",
]

In [None]:
# Domains that occur for the Democrats but not for the Republicans.
dist_dem_agg_doms = list(set(nr_dem_domains) - set(nr_rep_domains))

# Occurrences of every Democrat domain (submissions, selftexts, comments) ...
dem_doms_list = this_years_dem_subs["domain"].to_list() + dem_sub_selftext_domains + dem_comm_domains
dist_dem_ag_dom_df = pd.DataFrame(dem_doms_list, columns=["domain"])
# value_counts is called on the one-column DataFrame here, not on the column itself.
dist_dem_domain_agg_freq = dist_dem_ag_dom_df.value_counts()
# ... restricted to the Democrat-only domains, most frequent first.
dist_dem_domain_agg_freq = dist_dem_domain_agg_freq.loc[dist_dem_agg_doms].sort_values(ascending=False)

# Same for domains that occur for the Republicans but not for the Democrats.
dist_rep_agg_doms = list(set(nr_rep_domains) - set(nr_dem_domains))

rep_doms_list = this_years_rep_subs["domain"].to_list() + rep_sub_selftext_domains + rep_comm_domains
dist_rep_ag_dom_df = pd.DataFrame(rep_doms_list, columns=["domain"])
dist_rep_domain_agg_freq = dist_rep_ag_dom_df.value_counts()
dist_rep_domain_agg_freq = dist_rep_domain_agg_freq.loc[dist_rep_agg_doms].sort_values(ascending=False)



In [None]:
# Summary column "distinct agg domains": domains seen in only one of the subreddits.
dist_agg_doms = [
    len(dist_dem_domain_agg_freq),
    len(dist_rep_domain_agg_freq),
    "-",
    "-",
    "-",
    "-",
]

In [None]:
# Frequent domains of one subreddit that are not among the other subreddit's frequent domains.
dist_dem_freq_doms = set(frequent_dem_agg_domains) - set(frequent_rep_agg_domains)
dist_rep_freq_doms = set(frequent_rep_agg_domains) - set(frequent_dem_agg_domains)

In [None]:
# Occurrence counts of the frequent domains unique to each subreddit, most frequent first.
# `.loc` needs a list here: pandas no longer accepts a set as an indexer.
distinct_dem_domains_above_threshold = dem_agg_domain_count.loc[list(dist_dem_freq_doms)].sort_values(ascending=False)

distinct_rep_domains_above_threshold = rep_agg_domain_count.loc[list(dist_rep_freq_doms)].sort_values(ascending=False)

In [None]:
# Summary column "distinct freq agg domains".
dist_freq_doms = [
    len(dist_dem_freq_doms),
    len(dist_rep_freq_doms),
    "-",
    "-",
    "-",
    "-",
]

In [None]:
# All submission domains (links plus selftexts) per subreddit ...
dem_submission_domain_set = set(this_years_dem_subs["domain"].to_list() + dem_sub_selftext_domains)
rep_submission_domain_set = set(this_years_rep_subs["domain"].to_list() + rep_sub_selftext_domains)

# ... and those that occur in only one of the two subreddits.
dist_dem_sub_doms = dem_submission_domain_set - rep_submission_domain_set
dist_rep_sub_doms = rep_submission_domain_set - dem_submission_domain_set

In [None]:
# Occurrence counts of the submission domains unique to each subreddit, most frequent first.
# The distinct sets are built from the raw domain column and can therefore contain a NaN,
# which value_counts() leaves out; looking up such a label would raise a KeyError, so only
# labels that are present in the counts are requested.
dem_countable_dist_doms = [domain for domain in dist_dem_sub_doms if domain in dem_agg_submission_domains.index]
dist_dem_domain_sub_freq = dem_agg_submission_domains.loc[dem_countable_dist_doms].sort_values(ascending=False)

rep_countable_dist_doms = [domain for domain in dist_rep_sub_doms if domain in rep_agg_submission_domains.index]
dist_rep_domain_sub_freq = rep_agg_submission_domains.loc[rep_countable_dist_doms].sort_values(ascending=False)

In [None]:
# Summary column "distinct subm domains".
distinct_submission_domains = [
    len(dist_dem_sub_doms),
    len(dist_rep_sub_doms),
    "-",
    "-",
    "-",
    "-",
]

In [None]:
# Frequent submission domains per subreddit as sets ...
dem_frequent_sub_domain_set = set(dem_frequent_agg_submission_domains.index.to_list())
rep_frequent_sub_domain_set = set(rep_frequent_agg_submission_domains.index.to_list())

# ... and those that are frequent in only one of the two subreddits.
dist_frequ_dem_sub_domains = dem_frequent_sub_domain_set - rep_frequent_sub_domain_set
dist_frequ_rep_sub_domains = rep_frequent_sub_domain_set - dem_frequent_sub_domain_set

# Summary column "distinct freq subm domains".
distinct_frequent_submission_domains = [
    len(dist_frequ_dem_sub_domains),
    len(dist_frequ_rep_sub_domains),
    "-",
    "-",
    "-",
    "-",
]

In [None]:
# Occurrence counts of the frequent submission domains unique to each subreddit, most frequent first.
dist_frequ_dem_domain_sub_freq = dem_agg_submission_domains.loc[list(dist_frequ_dem_sub_domains)].sort_values(ascending=False)

dist_frequ_rep_domain_sub_freq = rep_agg_submission_domains.loc[list(dist_frequ_rep_sub_domains)].sort_values(ascending=False)

### Further Redditor Specificities 

In [None]:
# Postings (unique submissions + unique comments) per subreddit.
dem_posting_total = this_years_dem_subs["id"].nunique() + this_years_dem_comms["id"].nunique()
rep_posting_total = this_years_rep_subs["id"].nunique() + this_years_rep_comms["id"].nunique()

# Average number of postings per redditor.
dem_posting_redditor_ratio = dem_posting_total / len(this_years_dem_redditors)
rep_posting_redditor_ratio = rep_posting_total / len(this_years_rep_redditors)
agg_posting_redditor_ratio = (dem_posting_total + rep_posting_total) / len(
    set(this_years_dem_redditors + this_years_rep_redditors)
)

# Summary column "Posting/Redditor Ratio".
post_redditor_ratio = [
    dem_posting_redditor_ratio,
    rep_posting_redditor_ratio,
    agg_posting_redditor_ratio,
    "-",
    "-",
    "-",
]

In [None]:
# Submissions per author, without unwanted users; "frequent" creators made at least 5.
dem_sub_creators = this_years_dem_subs["author"].value_counts()
dem_sub_creators = dem_sub_creators[~dem_sub_creators.index.isin(unwanted_users)]
freq_dem_sub_creators = dem_sub_creators.loc[lambda counts: counts >= 5]

rep_sub_creators = this_years_rep_subs["author"].value_counts()
rep_sub_creators = rep_sub_creators[~rep_sub_creators.index.isin(unwanted_users)]
freq_rep_sub_creators = rep_sub_creators.loc[lambda counts: counts >= 5]

In [None]:
# Summary columns "Sub creators" and "frequ Sub creators".
sub_creators = [
    len(dem_sub_creators),
    len(rep_sub_creators),
    "-",
    "-",
    "-",
    "-",
]
freq_sub_creators = [
    len(freq_dem_sub_creators),
    len(freq_rep_sub_creators),
    "-",
    "-",
    "-",
    "-",
]

### Controversy

In [None]:
# Number of comments flagged as controversial (controversiality == 1) per subreddit.
# Counting the matches directly yields 0 when a subreddit has no such comment, whereas
# looking up the label 1 in value_counts() would raise a KeyError in that case.
dem_controversial_comms = (this_years_dem_comms["controversiality"] == 1).sum()
rep_controversial_comms = (this_years_rep_comms["controversiality"] == 1).sum()

# Summary column "controversial comments".
controversy_list = [
    dem_controversial_comms,
    rep_controversial_comms,
    dem_controversial_comms + rep_controversial_comms,
    "-",
    "-",
    "-",
]

### Threads

In [None]:
def _subs_with_more_comments_than(submissions, minimum):
    """Number of submissions whose `num_comments` is greater than `minimum`."""
    return len(submissions[submissions["num_comments"] > minimum])


# Summary columns for submissions with more than 4 and more than 9 comments.
submission_with_comments = [
    _subs_with_more_comments_than(this_years_dem_subs, 4),
    _subs_with_more_comments_than(this_years_rep_subs, 4),
    _subs_with_more_comments_than(this_years_dem_subs, 4) + _subs_with_more_comments_than(this_years_rep_subs, 4),
    "-",
    "-",
    "-",
]
submission_with_more_comments = [
    _subs_with_more_comments_than(this_years_dem_subs, 9),
    _subs_with_more_comments_than(this_years_rep_subs, 9),
    _subs_with_more_comments_than(this_years_dem_subs, 9) + _subs_with_more_comments_than(this_years_rep_subs, 9),
    "-",
    "-",
    "-",
]

In [None]:
# Distinct comment authors per submission (link_id), largest first.
count_pop_dems = this_years_dem_comms[["link_id", "author"]]
populated_dem_submissions = (
    count_pop_dems.groupby("link_id")["author"].nunique().sort_values(ascending=False)
)
# More than 4 / more than 9 distinct commentators.
slightly_populated_dem_submissions = populated_dem_submissions.loc[lambda authors: authors > 4]
very_populated_dem_submissions = populated_dem_submissions.loc[lambda authors: authors > 9]

count_pop_reps = this_years_rep_comms[["link_id", "author"]]
populated_rep_submissions = (
    count_pop_reps.groupby("link_id")["author"].nunique().sort_values(ascending=False)
)
slightly_populated_rep_submissions = populated_rep_submissions.loc[lambda authors: authors > 4]
very_populated_rep_submissions = populated_rep_submissions.loc[lambda authors: authors > 9]



In [None]:
# Summary columns "sligthly populated subs" and "very populated subs".
slightly_populated_subs = [
    len(slightly_populated_dem_submissions),
    len(slightly_populated_rep_submissions),
    len(slightly_populated_dem_submissions) + len(slightly_populated_rep_submissions),
    "-",
    "-",
    "-",
]
very_populated_subs = [
    len(very_populated_dem_submissions),
    len(very_populated_rep_submissions),
    len(very_populated_dem_submissions) + len(very_populated_rep_submissions),
    "-",
    "-",
    "-",
]

### DataFrame Containing Results

In [None]:
# Column name -> values (one per summary row), in the order the columns are added.
summary_columns = {
    "Redditors": this_years_user_column,
    "Frequent Redditors": frequ_redditors,
    "More frequent Redditors": more_frequ_redditors,
    "Submissions": this_years_submission_column,
    "Comments": this_years_comment_column,
    "Comments/Subs": comm_sub_ratio,
    "Posting/Redditor Ratio": post_redditor_ratio,
    "altogether_domains": aggregate_domains,
    "submission_domains": submission_domains,
    "found_submission_selftext_domains": found_submission_selftext_domains,
    "all_submission_domains_incl_selftext": submission_domains_incl_selftext,
    "found_comment_domains": found_comment_domains,
    "frequ alltog domains": frequent_agg_domains,
    "freq sub domains": frequent_sub_domains,
    "frequent_found_submission_domains": frequent_found_sub_domains,
    "frequent_aggregated_submission_domains": frequent_agg_submission_domains,
    "freq comm domains": frequent_comm_domains,
    "distinct agg domains": dist_agg_doms,
    "distinct freq agg domains": dist_freq_doms,
    "distinct subm domains": distinct_submission_domains,
    "distinct freq subm domains": distinct_frequent_submission_domains,
    "Sub creators": sub_creators,
    "frequ Sub creators": freq_sub_creators,
    "controversial comments": controversy_list,
    "submissions with comments >4": submission_with_comments,
    "submissions with comments >9": submission_with_more_comments,
    "sligthly populated subs": slightly_populated_subs,
    "very populated subs": very_populated_subs,
}

for column_name, column_values in summary_columns.items():
    summary_df[column_name] = column_values


In [None]:
# Display the summary with the row labels as index. The result is not assigned,
# so `summary_df` itself keeps its default index.
summary_df.set_index(this_specific_year)

In [None]:
summary_csv_path = save_csvs + this_specific_year + "/summary_df.csv"

# The per-year output folder may not exist yet; create it so that writing does not fail.
os.makedirs(os.path.dirname(summary_csv_path), exist_ok=True)

summary_df.to_csv(summary_csv_path)


### Comparison to previous years if applicable

In [None]:
# Compare this year's authors with those of the previous data set
# (file_year_intervalls years earlier), if its submission file exists.
if os.path.isfile(sub_path + str(int(this_specific_year)-file_year_intervalls) +".csv"):

    last_time_subs = pd.read_csv(sub_path + str(int(this_specific_year)-file_year_intervalls) +".csv")
    last_time_comms = pd.read_csv(comm_path + str(int(this_specific_year)-file_year_intervalls) +".csv")

    # Leave out unwanted users from the earlier data as well.
    last_time_subs = last_time_subs[~last_time_subs["author"].isin(unwanted_users)]
    last_time_comms = last_time_comms[~last_time_comms["author"].isin(unwanted_users)]

    last_time_dem_subs = last_time_subs[last_time_subs["subreddit"] == "democrats"]
    last_time_rep_subs = last_time_subs[last_time_subs["subreddit"] == "Republican"]
    last_time_dem_comms = last_time_comms[last_time_comms["subreddit"] == "democrats"]
    last_time_rep_comms = last_time_comms[last_time_comms["subreddit"] == "Republican"]

    # `discard` instead of `remove`: "[deleted]" is not guaranteed to be among the
    # authors, and `remove` would raise a KeyError when it is missing.
    this_years_dem_authors = set(this_years_dem_subs["author"].to_list() + this_years_dem_comms["author"].to_list())
    this_years_dem_authors.discard("[deleted]")
    this_years_rep_authors = set(this_years_rep_subs["author"].to_list() + this_years_rep_comms["author"].to_list())
    this_years_rep_authors.discard("[deleted]")

    this_years_dem_authors = this_years_dem_authors - (set(unwanted_users))
    this_years_rep_authors = this_years_rep_authors - (set(unwanted_users))


    last_time_dem_authors = set(last_time_dem_subs["author"].to_list() + last_time_dem_comms["author"].to_list())
    last_time_rep_authors = set(last_time_rep_subs["author"].to_list() + last_time_rep_comms["author"].to_list())

    # Authors that are new this year vs. authors that were already active last time.
    new_dems = list(this_years_dem_authors - last_time_dem_authors)
    new_reps = list(this_years_rep_authors - last_time_rep_authors)

    old_dems = last_time_dem_authors & this_years_dem_authors
    old_reps = last_time_rep_authors & this_years_rep_authors

    # Same split for the authors with 10 or more postings this year.
    ty_freq_dem_authors = set(even_more_freq_dem_redditors.index.to_list())
    ty_freq_rep_authors = set(even_more_freq_rep_redditors.index.to_list())

    freq_new_dems = ty_freq_dem_authors - last_time_dem_authors
    freq_new_reps = ty_freq_rep_authors - last_time_rep_authors

    freq_old_dems = ty_freq_dem_authors & last_time_dem_authors
    freq_old_reps = ty_freq_rep_authors & last_time_rep_authors


    # Authors active in exactly one subreddit, last time and this year.
    only_old_dems = last_time_dem_authors - last_time_rep_authors
    only_old_reps = last_time_rep_authors - last_time_dem_authors

    only_new_dems = this_years_dem_authors - this_years_rep_authors
    only_new_reps = this_years_rep_authors - this_years_dem_authors

    # Authors exclusive to one subreddit last time and exclusive to the other one now.
    switched_to_dem = only_new_dems & only_old_reps
    switched_to_rep = only_new_reps & only_old_dems


    print(f"In {this_specific_year} the democats subreddit saw {len(new_dems)} new users and the Republican subreddit {len(new_reps)} as compared to {int(this_specific_year)-file_year_intervalls}")
    print(f"In {this_specific_year} the democats subreddit saw {len(old_dems)} and the Republican subreddit {len(old_reps)} remaining users from the year {int(this_specific_year)-file_year_intervalls}")
    print("")
    print(f"Considering (very) frequent posters (10 or more postings in this year) the democrat subreddit saw {len(freq_new_dems)} new users, the Republicans {len(freq_new_reps)} new users, while {len(freq_old_dems)} democrats and {len(freq_old_reps)} Republicans remained")
    print("")
    print(f"{len(switched_to_dem)} authors changed from Republicans to Democrats and {len(switched_to_rep)} changed from democrats to Republican")
    print("")
    print("")

else:
    # No earlier data set available: nothing to compare against.
    new_dems = []
    new_reps = []

    

# Same comparison as for the previous interval, but against the data set from
# two intervals earlier (2 * file_year_intervalls years back), if that file exists.
if os.path.isfile(sub_path + str(int(this_specific_year)-2*file_year_intervalls) +".csv"):

    second_last_time_subs = pd.read_csv(sub_path + str(int(this_specific_year)-2*file_year_intervalls) +".csv")
    second_last_time_comms = pd.read_csv(comm_path + str(int(this_specific_year)-2*file_year_intervalls) +".csv")

    # Drop bots/trolls from the earlier data set as well
    second_last_time_subs = second_last_time_subs[~second_last_time_subs["author"].isin(unwanted_users)]
    second_last_time_comms = second_last_time_comms[~second_last_time_comms["author"].isin(unwanted_users)]    

    second_last_time_dem_subs = second_last_time_subs[second_last_time_subs["subreddit"] == "democrats"]
    second_last_time_rep_subs = second_last_time_subs[second_last_time_subs["subreddit"] == "Republican"]
    second_last_time_dem_comms = second_last_time_comms[second_last_time_comms["subreddit"] == "democrats"]
    second_last_time_rep_comms = second_last_time_comms[second_last_time_comms["subreddit"] == "Republican"]

    # Rebuilt here so this branch also works when the previous-interval branch did not run.
    # discard() instead of remove(): remove() raises KeyError when no "[deleted]"
    # author happens to be present, which would abort the whole cell.
    this_years_dem_authors = set(this_years_dem_subs["author"].to_list() + this_years_dem_comms["author"].to_list())
    this_years_dem_authors.discard("[deleted]")
    this_years_rep_authors = set(this_years_rep_subs["author"].to_list() + this_years_rep_comms["author"].to_list())
    this_years_rep_authors.discard("[deleted]")

    this_years_dem_authors = this_years_dem_authors - (set(unwanted_users))
    this_years_rep_authors = this_years_rep_authors - (set(unwanted_users))



    second_last_time_dem_authors = set(second_last_time_dem_subs["author"].to_list() + second_last_time_dem_comms["author"].to_list())
    second_last_time_rep_authors = set(second_last_time_rep_subs["author"].to_list() + second_last_time_rep_comms["author"].to_list())

    # Authors seen this year but not two intervals ago ...
    eight_year_new_dems = list(this_years_dem_authors - second_last_time_dem_authors)
    eight_year_new_reps = list(this_years_rep_authors - second_last_time_rep_authors)

    # ... and authors present in both data sets
    eight_year_int_old_dems = second_last_time_dem_authors & this_years_dem_authors
    eight_year_int_old_reps = second_last_time_rep_authors & this_years_rep_authors

    # Same split restricted to this year's frequent posters
    ty_freq_dem_authors = set(even_more_freq_dem_redditors.index.to_list())
    ty_freq_rep_authors = set(even_more_freq_rep_redditors.index.to_list())

    freq_ey_new_dems = ty_freq_dem_authors - second_last_time_dem_authors
    freq_ey_new_reps = ty_freq_rep_authors - second_last_time_rep_authors

    freq_ey_old_dems = ty_freq_dem_authors & second_last_time_dem_authors
    freq_ey_old_reps = ty_freq_rep_authors & second_last_time_rep_authors


    # Authors active in exactly one of the two subreddits, per period
    ey_only_old_dems = second_last_time_dem_authors - second_last_time_rep_authors
    ey_only_old_reps = second_last_time_rep_authors - second_last_time_dem_authors

    only_new_dems = this_years_dem_authors - this_years_rep_authors
    only_new_reps = this_years_rep_authors - this_years_dem_authors

    # "Switchers": exclusively in one subreddit before, exclusively in the other now
    ey_switched_to_dem = only_new_dems & ey_only_old_reps
    ey_switched_to_rep = only_new_reps & ey_only_old_dems



    print(f"In {this_specific_year} the democats subreddit saw {len(eight_year_new_dems)} new users and the Republican subreddit {len(eight_year_new_reps)} as compared to {int(this_specific_year)-2*file_year_intervalls}")
    print(f"In {this_specific_year} the democats subreddit saw {len(eight_year_int_old_dems)} and the Republican subreddit {len(eight_year_int_old_reps)} remaining users from the year {int(this_specific_year)-2*file_year_intervalls}")
    print("")
    print(f"Considering (very) frequent posters (10 or more postings in this year) the democrat subreddit saw {len(freq_ey_new_dems)} new users, the Republicans {len(freq_ey_new_reps)} new users, while {len(freq_ey_old_dems)} democrats and {len(freq_ey_old_reps)} Republicans remained")
    print("")
    print(f"{len(ey_switched_to_dem)} authors changed from Republicans to Democrats and {len(ey_switched_to_rep)} changed from democrats to Republican")
else:
    # No data set from two intervals ago: only the "new user" lists are defined, as empty lists
    eight_year_new_dems = []
    eight_year_new_reps = []

### Redditors in both partisan subreddits

In [None]:
# Number of postings (submissions + comments) per author, once per subreddit.
# Every selection below is taken from these two count series.
all_dem_author_post_counts = pd.concat(
    [this_years_dem_subs["author"], this_years_dem_comms["author"]], ignore_index=True
).value_counts()
all_rep_author_post_counts = pd.concat(
    [this_years_rep_subs["author"], this_years_rep_comms["author"]], ignore_index=True
).value_counts()

# Redditors present in both subreddits
com_redditors_in_dems = all_dem_author_post_counts.loc[list(common_redditors)].sort_values(ascending=False)
com_redditors_in_reps = all_rep_author_post_counts.loc[list(common_redditors)].sort_values(ascending=False)

# Frequent common redditors
com_freq_redditors_in_dems = all_dem_author_post_counts.loc[list(common_frequ_redditors)].sort_values(ascending=False)
com_freq_redditors_in_reps = all_rep_author_post_counts.loc[list(common_frequ_redditors)].sort_values(ascending=False)

# Even more frequent common redditors
com_more_freq_redditors_in_dems = all_dem_author_post_counts.loc[list(common_more_frequ_redditors)].sort_values(ascending=False)
com_more_freq_redditors_in_reps = all_rep_author_post_counts.loc[list(common_more_frequ_redditors)].sort_values(ascending=False)


In [None]:
# Report the sizes of the three "common redditor" collections.
# NOTE(review): the thresholds 5 and 10 are hard-coded in the message text; confirm they match
# how common_frequ_redditors and common_more_frequ_redditors were built.
print(f"There are {len(common_redditors)} redditors altogether present in both subredddits. Incresing the number of minimum posts in both subreddit to 5 returns {len(common_frequ_redditors)} redditors in both subreddits. Increasing the number to 10 returns {len(common_more_frequ_redditors)} redditors")

### Further particular observations

In [None]:
# First 25 entries of the Democrat posting counts of the most frequent common redditors
# (sorted in descending order where the series is built).
print("In the democrat subreddit the ones with minimum 10 postings in each subreddit are:")
print(com_more_freq_redditors_in_dems[:25])

In [None]:
# Same selection, counted in the Republican subreddit
print("In the Republican subreddit the ones with minimum 10 postings in each subreddit are:")
print(com_more_freq_redditors_in_reps[:25])

In [None]:
# First 25 entries of freq_dem_redditors (presumably ordered by activity — confirm where it is built)
print("Most active authors in Democrats")
print(freq_dem_redditors[:25])

In [None]:
# First 25 entries of freq_rep_redditors (presumably ordered by activity — confirm where it is built)
print("Most active authors in Republicans")
print(freq_rep_redditors[:25])

In [None]:
# First 25 entries of the Democrat submission-creator ranking
print("25 Top submission creators for Democrats")
print(dem_sub_creators[:25])

In [None]:
# First 25 entries of the Republican submission-creator ranking
print("25 Top submission creators for Republicans")
print(rep_sub_creators[:25])

In [None]:
print("Democrat Submission Domains without selftext")
print(this_years_dem_subs["domain"].value_counts()[:25])
this_years_dem_subs["domain"].value_counts().to_csv("/Users/luka/Documents/Masterarbeit_CSS/Data/csvs_for_analysis/" + this_specific_year + "/DemocratSubmissionDomains.csv")

In [None]:
print("Democrat Submission Domains including selftext")
print(dem_agg_submission_domains[:25])
dem_agg_submission_domains.to_csv("/Users/luka/Documents/Masterarbeit_CSS/Data/csvs_for_analysis/" + this_specific_year + "/DemocratAggregatedSubmissionDomains.csv")

In [None]:
print("Republican Submission Domains including selftext")
print(rep_agg_submission_domains[:25])
rep_agg_submission_domains.to_csv("/Users/luka/Documents/Masterarbeit_CSS/Data/csvs_for_analysis/" + this_specific_year + "/RepublicanAggregatedSubmissionDomains.csv")

In [None]:
dem_domain_df = pd.DataFrame(dem_comm_domains, columns=["domain"])
democrat_comment_domains = dem_domain_df.value_counts()
print("Democrat Comment Domains")
print(democrat_comment_domains[:25])
democrat_comment_domains.to_csv("/Users/luka/Documents/Masterarbeit_CSS/Data/csvs_for_analysis/" + this_specific_year + "/DemocratCommentDomains.csv")

In [None]:
rep_domain_df = pd.DataFrame(rep_comm_domains, columns=["domain"])
republican_comment_domains = rep_domain_df.value_counts()
print("Republican Comment Domains")
print(republican_comment_domains[:25])
republican_comment_domains.to_csv("/Users/luka/Documents/Masterarbeit_CSS/Data/csvs_for_analysis/" + this_specific_year + "/RepublicanCommentDomains.csv")

In [None]:
# Top 25 of the combined (submission + comment) Democrat domain counts
print("Toatal of Democrat aggregated submission domains and comment domains")
print(dem_agg_domain_count[:25])

In [None]:
# Only the domain names of the top 25, one per line
for i in dem_agg_domain_count[:25].index.to_list():
    print(i)

In [None]:
# Only the counts of the top 25, as a plain list
dem_agg_domain_count[:25].to_list()

In [None]:
# Top 25 of the combined (submission + comment) Republican domain counts
print("Toatal of Republican aggregated submission domains and comment domains")
print(rep_agg_domain_count[:25])

In [None]:
# Only the domain names of the top 25, one per line
for i in rep_agg_domain_count[:25].index.to_list():
    print(i)

In [None]:
# Only the counts of the top 25, as a plain list
rep_agg_domain_count[:25].to_list()

In [None]:
# The remaining cells print the first 25 entries of domain series built earlier in the notebook.
print("Aggregate distinct Democrat sub + com domains")
print(dist_dem_domain_agg_freq[:25])

In [None]:
print("Aggregate distinct Republican sub + com domains")
print(dist_rep_domain_agg_freq[:25])

In [None]:
# NOTE(review): the "< 5" in the message is hard-coded text; confirm it matches the threshold used upstream.
print("Aggregate distinct Democrat sub + com domains, neglecting (rep) domains < 5")
print(distinct_dem_domains_above_threshold[:25])

In [None]:
# NOTE(review): the "< 5" in the message is hard-coded text; confirm it matches the threshold used upstream.
print("Aggregate distinct Republican sub + com domains, neglecting (rep) domains < 5")
print(distinct_rep_domains_above_threshold[:25])

In [None]:
print("Distinct Dem aggregated Sub Domains")
print(dist_dem_domain_sub_freq[:25])

In [None]:
print("Distinct Rep aggregated Sub Domains")
print(dist_rep_domain_sub_freq[:25])

In [None]:
print("Distinct frequent Democrat submission domains")
print(dist_frequ_dem_domain_sub_freq[:25])

In [None]:
print("Distinct frequent Republican submission domains")
print(dist_frequ_rep_domain_sub_freq[:25])

In [None]:
# The 25 Democrat submissions with the highest num_comments, shown as a table
print("Most comments per Democrat submission")
this_years_dem_subs.sort_values(by="num_comments", ascending=False)[["id", "title", "created", "num_comments", "author"]].head(25)

In [None]:
# Titles of the same 25 submissions as a plain list
this_years_dem_subs.sort_values(by="num_comments", ascending=False)[["id", "title", "created", "num_comments", "author"]].head(25)["title"].to_list()

In [None]:
# The 25 Republican submissions with the highest num_comments, shown as a table
print("Most comments per Republican submission")
this_years_rep_subs.sort_values(by="num_comments", ascending=False)[["id", "title", "created", "num_comments", "author"]].head(25)

In [None]:
# Titles of the same 25 submissions as a plain list
this_years_rep_subs.sort_values(by="num_comments", ascending=False)[["id", "title", "created", "num_comments", "author"]].head(25)["title"].to_list()

In [None]:
# Index labels of the 25 first "very populated" submissions with their first three
# characters cut off (presumably the "t3_" link prefix — confirm against the index).
dem_find_pop_subs = [link_id[3:] for link_id in very_populated_dem_submissions[:25].index.to_list()]

rep_find_pop_subs = [link_id[3:] for link_id in very_populated_rep_submissions[:25].index.to_list()]

In [None]:
most_individual_redditors_dems = this_years_dem_subs[this_years_dem_subs["id"].isin(dem_find_pop_subs)]
most_individual_redditors_dems = most_individual_redditors_dems.set_index("id").loc[dem_find_pop_subs].reset_index()
most_individual_redditors_dems[["id", "title", "created", "num_comments", "author"]]

In [None]:
most_individual_redditors_dems["title"].to_list()

In [None]:
very_populated_dem_submissions[:25]

In [None]:
most_individual_redditors_reps = this_years_rep_subs[this_years_rep_subs["id"].isin(rep_find_pop_subs)]
for i in rep_find_pop_subs:
    if i not in most_individual_redditors_reps["id"].to_list():
        rep_find_pop_subs.remove(i)
most_individual_redditors_reps = most_individual_redditors_reps.set_index("id").loc[rep_find_pop_subs].reset_index()
most_individual_redditors_reps[["id", "title", "created", "num_comments", "author"]]

In [None]:
most_individual_redditors_reps["title"].to_list()

In [None]:
very_populated_rep_submissions[:25]

### Charts 

In [None]:
# Daily posting activity in the Democrat subreddit: stack submissions and comments,
# then count rows per day.
posting_df_democ = pd.concat([this_years_dem_subs[["created", "author"]], this_years_dem_comms[["created", "author"]]], ignore_index=True)

# Each posting is assigned to 12:00 of its calendar day (day floor + 12 hours)
posting_df_democ["day"] = posting_df_democ["created"].dt.floor("D") + pd.Timedelta(12, unit="h")


# count() gives the number of non-null values per column; "created" is kept as the posting count
dem_activities = posting_df_democ.groupby("day").count()
dem_activities = dem_activities[["created"]]

fig = px.line(dem_activities, x=dem_activities.index, y="created", color_discrete_sequence = ["blue"], title= "Democrat Reddit Activity " + this_specific_year, labels = {"day":"Date", "created":"Number of postings"})
# Dashed green line marks the election date of the analysed year
fig.add_vline(election_date, line_color = "green", line_dash = "dash")
fig.show()

# Save the figure as interactive HTML and as a static PNG
plotly.offline.plot(fig, filename= save_plots + this_specific_year + "/posting_activity_dem.html")
fig.write_image(save_plots + this_specific_year + "/posting_activity_dem.png")

In [None]:
# Same chart for the Republican subreddit. The extra columns ("title", "subreddit", "body")
# do not influence the per-day count of "created" taken below.
posting_df_repu = pd.concat([this_years_rep_subs[["created", "author", "title", "subreddit"]], this_years_rep_comms[["created", "author", "body"]]], ignore_index=True)

# Each posting is assigned to 12:00 of its calendar day (day floor + 12 hours)
posting_df_repu["day"] = posting_df_repu["created"].dt.floor("D") + pd.Timedelta(12, unit="h")


rep_activities = posting_df_repu.groupby("day").count()
rep_activities = rep_activities[["created"]]

fig = px.line(rep_activities, x=rep_activities.index, y="created", color_discrete_sequence = ["red"], title= "Republican Reddit Activity " + this_specific_year, labels = {"day":"Date", "created":"Number of postings"})
fig.add_vline(election_date, line_color = "green", line_dash = "dash")
fig.show()

plotly.offline.plot(fig, filename= save_plots + this_specific_year + "/posting_activity_rep.html")
fig.write_image(save_plots + this_specific_year + "/posting_activity_rep.png")

In [None]:
# Combined chart: tag each daily series with its party and stack them so that
# plotly draws one line per "Party" value.
dem_activities["Party"] = "Democrats"
rep_activities["Party"] = "Republicans"
both_parties_df = pd.concat([dem_activities, rep_activities])

fig = px.line(both_parties_df, x=both_parties_df.index, y="created", color = "Party", title= "Posting Activity for both partisan subreddits " + this_specific_year, labels = {"day":"Date", "created":"Number of postings"})
fig.add_vline(election_date, line_color = "green", line_dash = "dash")
fig.show()

plotly.offline.plot(fig, filename= save_plots + this_specific_year + "/posting_activity_both.html")
fig.write_image(save_plots + this_specific_year + "/posting_activity_both.png")

In [None]:
dem_author_develop = posting_df_democ.groupby("day")["author"].apply(set).reset_index()

dem_daily_author_growth = set()

for i in range(len(dem_author_develop)):
    dem_daily_author_growth = dem_daily_author_growth.union(dem_author_develop.loc[i, "author"])
    dem_author_develop.loc[i, "authors of the day"] = len(dem_author_develop.loc[i, "author"])
    dem_author_develop.loc[i, "author growth"] = len(dem_daily_author_growth)

fig = px.line(dem_author_develop, x=dem_author_develop["day"], y="authors of the day", color_discrete_sequence = ["blue"], title= "Democrat Redditors per day " + this_specific_year, labels = {"day":"Date", "authors of the day":"Number of active Redditors"})
fig.add_vline(election_date, line_color = "green", line_dash = "dash")
fig.show()

plotly.offline.plot(fig, filename= save_plots + this_specific_year + "/daily_democrat_authors.html")
fig.write_image(save_plots + this_specific_year + "/daily_democrat_authors.png")

In [None]:
fig = px.line(dem_author_develop, x=dem_author_develop["day"], y="author growth", color_discrete_sequence = ["blue"], title= "Cumulative Democrat Authors " + this_specific_year, labels = {"day":"Date", "created":"Number of postings"})
fig.add_vline(election_date, line_color = "green", line_dash = "dash")
fig.show()
plotly.offline.plot(fig, filename= save_plots + this_specific_year + "/cumulative_democrat_authors.html")
fig.write_image(save_plots + this_specific_year + "/cumulative_democrat_authors.png")

In [None]:
rep_author_develop = posting_df_repu.groupby("day")["author"].apply(set).reset_index()

rep_daily_author_growth = set()

for i in range(len(rep_author_develop)):
    rep_daily_author_growth = rep_daily_author_growth.union(rep_author_develop.loc[i, "author"])
    rep_author_develop.loc[i, "authors of the day"] = len(rep_author_develop.loc[i, "author"])
    rep_author_develop.loc[i, "author growth"] = len(rep_daily_author_growth)

fig = px.line(rep_author_develop, x=rep_author_develop["day"], y="authors of the day", color_discrete_sequence = ["red"], title= "Republican Redditors per day " + this_specific_year, labels = {"day":"Date", "authors of the day":"Number of active Redditors"})
fig.add_vline(election_date, line_color = "green", line_dash = "dash")
fig.show()

plotly.offline.plot(fig, filename= save_plots + this_specific_year + "/daily_republican_authors.html")
fig.write_image(save_plots + this_specific_year + "/daily_republican_authors.png")


In [None]:
fig = px.line(rep_author_develop, x=rep_author_develop["day"], y="author growth", color_discrete_sequence = ["red"], title= "Cumulative Republican Authors " + this_specific_year, labels = {"day":"Date", "created":"Number of postings"})
fig.add_vline(election_date, line_color = "green", line_dash = "dash")
fig.show()
plotly.offline.plot(fig, filename= save_plots + this_specific_year + "/cumulative_republican_authors.html")
fig.write_image(save_plots + this_specific_year + "/cumulative_republican_authors.png")

In [None]:
dem_author_develop["Party"] = "Democrats"
rep_author_develop["Party"] = "Republicans"
both_develop_df = pd.concat([dem_author_develop, rep_author_develop])

fig = px.line(both_develop_df, x=both_develop_df["day"], y="authors of the day", color = "Party", title= "Redditors per day for both partisan subreddits " + this_specific_year, labels = {"day":"Date", "authors of the day":"Number of active Redditors"})
fig.add_vline(election_date, line_color = "green", line_dash = "dash")
fig.show()

plotly.offline.plot(fig, filename= save_plots + this_specific_year + "/daily_both_authors.html")
fig.write_image(save_plots + this_specific_year + "/daily_both_authors.png")

In [None]:
fig = px.line(both_develop_df, x=both_develop_df["day"], y="author growth", color = "Party", title= "Cumulative Authors in both partisan subreddits " + this_specific_year, labels = {"day":"Date", "created":"Number of postings"})
fig.add_vline(election_date, line_color = "green", line_dash = "dash")
fig.show()
plotly.offline.plot(fig, filename= save_plots + this_specific_year + "/cumulative_both_authors.html")
fig.write_image(save_plots + this_specific_year + "/cumulative_both_authors.png")

## Topics

In [None]:
# Placeholder titles; rows whose title equals one of these are excluded from the topic corpus below
unwanted_titles = ["[ Removed by Reddit ]", "[deleted by user]"]

In [None]:
def look_up(name, dictionary):
    """Return ``dictionary[name]``.

    Raises ``KeyError`` if ``name`` is not a key of ``dictionary``.
    """
    return dictionary[name]

In [None]:
def id_maker(parent):
    """Strip the first three characters from a string id.

    Used on the ``link_id`` column, where the first three characters are presumably
    a type prefix such as ``t3_`` (confirm against the data).

    Returns the shortened string, or ``np.nan`` for any non-string input
    (for example a missing value).
    """
    if not isinstance(parent, str):
        return np.nan
    return parent[3:]

### Democratic Topics

In [None]:
this_years_dem_comms["comm_id"] = this_years_dem_comms["id"]

In [None]:
combine_dem_subs = this_years_dem_subs[["id", "title", "selftext", "created"]]

dem_combiner_df = this_years_dem_comms[["link_id", "body", "created", "comm_id"]]

dem_combiner_df = pd.concat([combine_dem_subs, dem_combiner_df])

dem_combiner_df["to_this_sub"] = dem_combiner_df["link_id"].apply(id_maker)

dem_combiner_df = dem_combiner_df.sort_values(by="created")

dem_combiner_df["day"] = dem_combiner_df["created"].dt.floor("D") + pd.Timedelta(12, unit="h")

dem_combiner_df = dem_combiner_df[~dem_combiner_df["title"].isin(unwanted_titles)]

In [None]:
# Group all texts by thread. Three parallel dicts, all keyed by submission id:
#   dem_thread_dict -> texts of the thread (submission first, then comments)
#   dem_id_dict     -> ids of those texts, in the same order
#   dem_time_dict   -> posting day of those texts, in the same order
dem_thread_dict = {}
dem_id_dict = {}
dem_time_dict = {}

# Rows are visited in the chronological order established by the earlier sort on "created"
for index, row in dem_combiner_df.iterrows():
    # Only submission rows carry a string "id"; comment rows have it missing after the concat
    if isinstance(row["id"], str):
        thread_list = [row["title"]]
        id_list = [row["id"]]
        # Self posts: append the selftext to the title to form one document
        if isinstance(row["selftext"], str):
            thread_list = [row["title"] + " " + row["selftext"]]
        date_list = [row["day"]]
        # A submission row (re)starts the thread entry for its id
        dem_thread_dict[row["id"]] = thread_list
        dem_id_dict[row["id"]] = id_list
        dem_time_dict[row["id"]] = date_list
    else:
        # Comment row: attach it to the thread of its parent submission
        if row["to_this_sub"] in dem_thread_dict:
            dem_thread_dict[row["to_this_sub"]].append(row["body"])
            dem_id_dict[row["to_this_sub"]].append(row["comm_id"])
            dem_time_dict[row["to_this_sub"]].append(row["day"])
        else:
            # Parent submission not seen (e.g. not in the data or filtered out):
            # the comment opens a thread entry of its own under the parent id
            thread_list = [row["body"]]
            id_list = [row["comm_id"]]
            date_list = [row["day"]]
            dem_thread_dict[row["to_this_sub"]] = thread_list
            dem_id_dict[row["to_this_sub"]] = id_list
            dem_time_dict[row["to_this_sub"]] = date_list

In [None]:
# Flat, parallel lists for the topic model: texts, their ids and their posting days,
# plus the keys of all threads that were kept.
dem_documents, dem_ids, dem_times, dem_topic_threads = [], [], [], []

for thread_key, thread_texts in dem_thread_dict.items():
    # Skip threads with too few entries (the entry count includes the submission itself)
    if len(thread_texts) <= min_responses_in_subred:
        continue
    dem_documents += thread_texts
    dem_ids += dem_id_dict[thread_key]
    dem_times += dem_time_dict[thread_key]
    dem_topic_threads.append(thread_key)

In [None]:
# Embed every document once with the "all-mpnet-base-v2" sentence transformer
sentence_model = SentenceTransformer("all-mpnet-base-v2")
dem_embeddings = sentence_model.encode(dem_documents, show_progress_bar=True)

# Topic words are built from 1- to 3-grams with English stop words removed
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
# Fixed random_state so the UMAP reduction is repeatable between runs
umap_model = UMAP(random_state=16)

# Fit the Democrat topic model on the documents, reusing the precomputed embeddings
dem_model = BERTopic(language="english", umap_model=umap_model, vectorizer_model=vectorizer_model, calculate_probabilities=False, verbose=True)
topics, probs = dem_model.fit_transform(dem_documents, dem_embeddings)

In [None]:
# Overview table of all topics found by the Democrat model
dem_model.get_topic_info()

In [None]:
# Total of the "Count" column over all topics
sum(dem_model.get_topic_info()["Count"])

In [None]:
# First 11 rows of the topic overview
dem_model.get_topic_info()[:11]

In [None]:
# Store the fitted model in this year's output folder
dem_model.save(save_csvs + this_specific_year + "/dem_model")

In [None]:
# Number of threads that entered the topic model
len(dem_topic_threads)

In [None]:
# Number of documents (submissions + comments) that entered the topic model
len(dem_documents)

In [None]:
# Per-document information, including the assigned topic ("Topic" column, used below)
dem_topic_df = dem_model.get_document_info(dem_documents)

In [None]:
dem_topic_dict = {}

dem_topic_list =  dem_topic_df["Topic"].to_list()

for i in range(len(dem_topic_list)):
    dem_topic_dict[dem_ids[i]] = dem_topic_list[i]

In [None]:
with open(save_csvs + this_specific_year + "/dem_topic_dict.txt", "w") as dem_dict:
    json.dump(dem_topic_dict, dem_dict)

In [None]:
# Inspect topic 0 of the Democrat model
dem_model.get_topic(0)

In [None]:
# NOTE(review): this loads the Republican model from disk inside the Democrat section, but the
# notebook only writes that file further down (rep_model.save). On a fresh run for a new year
# the file will not exist yet — confirm whether this and the two following cells are leftovers
# from an earlier session.
rep_model = BERTopic.load(save_csvs + this_specific_year + "/rep_model")

In [None]:
rep_model.get_topic_info()

In [None]:
sum(rep_model.get_topic_info()["Count"])

In [None]:
# Topic frequencies over time, computed from the posting day of every document (nr_bins=52)
dem_topics_over_time = dem_model.topics_over_time(dem_documents, dem_times, nr_bins=52)

dem_topics_over_time.to_csv(save_csvs + this_specific_year + "/dem_topics_over_time.csv")

In [None]:
# NOTE(review): disabled leftover. BERTopic.load is used elsewhere in this notebook for saved
# models; the file named here is the CSV written above, so a CSV reader would presumably be
# needed instead — confirm before re-enabling.
#dem_topics_over_time = BERTopic.load(save_csvs + this_specific_year + "/dem_topics_over_time.csv")

In [None]:
# Each visualisation below is written as interactive HTML and static PNG, then displayed.
dem_tot_fig = dem_model.visualize_topics_over_time(dem_topics_over_time, top_n_topics=10, title="Democrat Topics Over Time " + this_specific_year)

dem_tot_fig.write_html(save_plots + this_specific_year + "/dem_topics_over_time.html")
dem_tot_fig.write_image(save_plots + this_specific_year + "/dem_topics_over_time.png")

dem_tot_fig

In [None]:
# Intertopic distance map
dem_top_viz = dem_model.visualize_topics(title = "Democrat Intertopic Distance Map " + this_specific_year)

dem_top_viz.write_html(save_plots + this_specific_year + "/dem_topics_distance.html")
dem_top_viz.write_image(save_plots + this_specific_year + "/dem_topics_distance.png")

dem_top_viz

In [None]:
# Topic similarity heatmap
dem_heatmap = dem_model.visualize_heatmap(title = "Democrat Similarity Matrix " + this_specific_year )

dem_heatmap.write_html(save_plots + this_specific_year + "/dem_topics_heatmap.html")
dem_heatmap.write_image(save_plots + this_specific_year + "/dem_topics_heatmap.png")

dem_heatmap

In [None]:
# Document-level plot, reusing the embeddings computed for fitting
dem_doc_viz = dem_model.visualize_documents(dem_documents, embeddings=dem_embeddings, hide_annotations=True, title = "Democrat Documents and Topics " + this_specific_year)

dem_doc_viz.write_html(save_plots + this_specific_year + "/dem_doc_viz.html")
dem_doc_viz.write_image(save_plots + this_specific_year + "/dem_doc_viz.png")

dem_doc_viz

In [None]:
# Bar charts of the 8 top words per topic
dem_barcharts = dem_model.visualize_barchart(n_words = 8, title= "Democrat Topic Word Scores " + this_specific_year)

dem_barcharts.write_html(save_plots + this_specific_year + "/dem_barcharts.html")
dem_barcharts.write_image(save_plots + this_specific_year + "/dem_barcharts.png")

dem_barcharts

In [None]:
# look_up_topic reads this module-level name; point it at the Democrat id -> topic mapping
party_topic_dictionary = dem_topic_dict

In [None]:
def look_up_topic(name):
    """Return the topic stored for ``name`` in ``party_topic_dictionary``.

    ``party_topic_dictionary`` is a module-level name that is looked up at call time.
    Ids without an entry get -1.
    """
    return party_topic_dictionary.get(name, -1)

In [None]:
# Attach the assigned topic to every Democrat submission and comment; look_up_topic returns -1
# for ids that have no entry in party_topic_dictionary.
this_years_dem_subs["topic"] = this_years_dem_subs["id"].apply(look_up_topic)
this_years_dem_comms["topic"] = this_years_dem_comms["id"].apply(look_up_topic)

In [None]:
# Save the topic-annotated Democrat frames to this year's output folder
this_years_dem_subs.to_csv(save_csvs + this_specific_year + "/dem_subs.csv")
this_years_dem_comms.to_csv(save_csvs + this_specific_year + "/dem_comms.csv")

### Republican Topics

In [None]:
this_years_rep_comms["comm_id"] = this_years_rep_comms["id"]

In [None]:
combine_rep_subs = this_years_rep_subs[["id", "title", "selftext", "created"]]

rep_combiner_df = this_years_rep_comms[["link_id", "body", "created", "comm_id"]]

rep_combiner_df = pd.concat([combine_rep_subs, rep_combiner_df])

rep_combiner_df["to_this_sub"] = rep_combiner_df["link_id"].apply(id_maker)

rep_combiner_df = rep_combiner_df.sort_values(by="created")

rep_combiner_df["day"] = rep_combiner_df["created"].dt.floor("D") + pd.Timedelta(12, unit="h")

rep_combiner_df = rep_combiner_df[~rep_combiner_df["title"].isin(unwanted_titles)]

In [None]:
# Group all texts by thread. Three parallel dicts, all keyed by submission id:
#   rep_thread_dict -> texts of the thread (submission first, then comments)
#   rep_id_dict     -> ids of those texts, in the same order
#   rep_time_dict   -> posting day of those texts, in the same order
rep_thread_dict = {}
rep_id_dict = {}
rep_time_dict = {}

# Rows are visited in the chronological order established by the earlier sort on "created"
for index, row in rep_combiner_df.iterrows():
    # Only submission rows carry a string "id"; comment rows have it missing after the concat
    if isinstance(row["id"], str):
        thread_list = [row["title"]]
        id_list = [row["id"]]
        # Self posts: append the selftext to the title to form one document
        if isinstance(row["selftext"], str):
            thread_list = [row["title"] + " " + row["selftext"]]
        date_list = [row["day"]]
        # A submission row (re)starts the thread entry for its id
        rep_thread_dict[row["id"]] = thread_list
        rep_id_dict[row["id"]] = id_list
        rep_time_dict[row["id"]] = date_list
    else:
        # Comment row: attach it to the thread of its parent submission
        if row["to_this_sub"] in rep_thread_dict:
            rep_thread_dict[row["to_this_sub"]].append(row["body"])
            rep_id_dict[row["to_this_sub"]].append(row["comm_id"])
            rep_time_dict[row["to_this_sub"]].append(row["day"])
        else:
            # Parent submission not seen (e.g. not in the data or filtered out):
            # the comment opens a thread entry of its own under the parent id
            thread_list = [row["body"]]
            id_list = [row["comm_id"]]
            date_list = [row["day"]]
            rep_thread_dict[row["to_this_sub"]] = thread_list
            rep_id_dict[row["to_this_sub"]] = id_list
            rep_time_dict[row["to_this_sub"]] = date_list

In [None]:
# Flat, parallel lists for the topic model: texts, their ids and their posting days,
# plus the keys of all threads that were kept.
rep_documents, rep_ids, rep_times, rep_topic_threads = [], [], [], []

for thread_key, thread_texts in rep_thread_dict.items():
    # Skip threads with too few entries (the entry count includes the submission itself)
    if len(thread_texts) <= min_responses_in_subred:
        continue
    rep_documents += thread_texts
    rep_ids += rep_id_dict[thread_key]
    rep_times += rep_time_dict[thread_key]
    rep_topic_threads.append(thread_key)

In [None]:
# Embed every document once with the "all-mpnet-base-v2" sentence transformer
sentence_model = SentenceTransformer("all-mpnet-base-v2")
rep_embeddings = sentence_model.encode(rep_documents, show_progress_bar=True)

# Topic words are built from 1- to 3-grams with English stop words removed
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
# Fixed random_state so the UMAP reduction is repeatable between runs
umap_model = UMAP(random_state=16)

# Fit the Republican topic model on the documents, reusing the precomputed embeddings.
# Note that this rebinds topics and probs from the Democrat fit.
rep_model = BERTopic(language="english", umap_model=umap_model, vectorizer_model=vectorizer_model, calculate_probabilities=False, verbose=True)
topics, probs = rep_model.fit_transform(rep_documents, rep_embeddings)

In [None]:
# Overview table of all topics found by the Republican model
rep_model.get_topic_info()

In [None]:
# First 11 rows of the topic overview
rep_model.get_topic_info()[:11]

In [None]:
# Store the fitted model in this year's output folder
rep_model.save(save_csvs + this_specific_year + "/rep_model")

In [None]:
# Number of threads that entered the topic model
len(rep_topic_threads)

In [None]:
# Number of documents (submissions + comments) that entered the topic model
len(rep_documents)

In [None]:
# Per-document information, including the assigned topic ("Topic" column, used below)
rep_topic_df = rep_model.get_document_info(rep_documents)

In [None]:
rep_topic_dict = {}

rep_topic_list =  rep_topic_df["Topic"].to_list()

for i in range(len(rep_topic_list)):
    rep_topic_dict[rep_ids[i]] = rep_topic_list[i]

In [None]:
with open(save_csvs + this_specific_year + "/rep_topic_dict.txt", "w") as rep_dict:
    json.dump(rep_topic_dict, rep_dict)

In [None]:
# Inspect topic 0 of the Republican model
rep_model.get_topic(0)

In [None]:
# Topic frequencies over time, computed from the posting day of every document (nr_bins=52)
rep_topics_over_time = rep_model.topics_over_time(rep_documents, rep_times, nr_bins=52)

rep_topics_over_time.to_csv(save_csvs + this_specific_year + "/rep_topics_over_time.csv")

In [None]:
# Each visualisation below is written as interactive HTML and static PNG, then displayed.
rep_tot_fig = rep_model.visualize_topics_over_time(rep_topics_over_time, top_n_topics=10, title="Republican Topics Over Time " + this_specific_year)

rep_tot_fig.write_html(save_plots + this_specific_year + "/rep_topics_over_time.html")
rep_tot_fig.write_image(save_plots + this_specific_year + "/rep_topics_over_time.png")

rep_tot_fig

In [None]:
# Intertopic distance map
rep_top_viz = rep_model.visualize_topics(title = "Republican Intertopic Distance Map " + this_specific_year)

rep_top_viz.write_html(save_plots + this_specific_year + "/rep_topics_distance.html")
rep_top_viz.write_image(save_plots + this_specific_year + "/rep_topics_distance.png")

rep_top_viz

In [None]:
# Topic similarity heatmap
rep_heatmap = rep_model.visualize_heatmap(title = "Republican Similarity Matrix " + this_specific_year )

rep_heatmap.write_html(save_plots + this_specific_year + "/rep_topics_heatmap.html")
rep_heatmap.write_image(save_plots + this_specific_year + "/rep_topics_heatmap.png")

rep_heatmap

In [None]:
# Document-level plot, reusing the embeddings computed for fitting
rep_doc_viz = rep_model.visualize_documents(rep_documents, embeddings=rep_embeddings, hide_annotations=True, title = "Republican Documents and Topics " + this_specific_year)

rep_doc_viz.write_html(save_plots + this_specific_year + "/rep_doc_viz.html")
rep_doc_viz.write_image(save_plots + this_specific_year + "/rep_doc_viz.png")

rep_doc_viz

In [None]:
# Bar charts of the 8 top words per topic
rep_barcharts = rep_model.visualize_barchart(n_words = 8, title= "Republican Topic Word Scores " + this_specific_year)

rep_barcharts.write_html(save_plots + this_specific_year + "/rep_barcharts.html")
rep_barcharts.write_image(save_plots + this_specific_year + "/rep_barcharts.png")

rep_barcharts

In [None]:
# look_up_topic reads this module-level name; switch it to the Republican id -> topic mapping
party_topic_dictionary = rep_topic_dict

In [None]:
# Attach the assigned topic to every Republican submission and comment; look_up_topic returns -1
# for ids that have no entry in party_topic_dictionary.
this_years_rep_subs["topic"] = this_years_rep_subs["id"].apply(look_up_topic)
this_years_rep_comms["topic"] = this_years_rep_comms["id"].apply(look_up_topic)

In [None]:
# Save the topic-annotated Republican frames to this year's output folder
this_years_rep_subs.to_csv(save_csvs + this_specific_year + "/rep_subs.csv")
this_years_rep_comms.to_csv(save_csvs + this_specific_year + "/rep_comms.csv")

### Comparison 

In [None]:
# Pairwise cosine similarity between the topic embeddings of the two fitted models
# (first argument: Democrat topics, second argument: Republican topics)
sim_matrix = cosine_similarity(dem_model.topic_embeddings_, rep_model.topic_embeddings_)

## Lifestyles

In [None]:
# Load this year's lifestyle (network) file
this_years_network_df = pd.read_csv(network_file_path)

# Number of rows in the file
print(len(this_years_network_df))

In [None]:
# Number of distinct authors and distinct subreddits
print(this_years_network_df["author"].nunique())
print(this_years_network_df["subreddit"].nunique())

In [None]:
# The 25 subreddits with the most rows
print(this_years_network_df["subreddit"].value_counts().head(25))

In [None]:
# Number of rows per (author, subreddit) pair; after reset_index the count sits in a column named 0
this_years_engagement_network = this_years_network_df.groupby(["author", "subreddit"]).size().reset_index()

# Keep only pairs with at least lifestyle_threshold rows.
# NOTE(review): the parameter cell also defines subreddit_engagement_threshold, and lifestyle_threshold
# is reused further down as the minimum number of redditors per subreddit — confirm which parameter
# belongs to which filter.
reduced_by_engagement_this_year = this_years_engagement_network[this_years_engagement_network[0]>=lifestyle_threshold]

print(reduced_by_engagement_this_year.head(25))

In [None]:
# Inner merge: keeps only rows whose (author, subreddit) pair passed the filter and adds the count column 0
this_years_final_network_df = this_years_network_df.merge(reduced_by_engagement_this_year, on =["author", "subreddit"], how="inner")

In [None]:
mytwo = ["democrats", "Republican"]
len(set(this_years_final_network_df[this_years_final_network_df["subreddit"].isin(mytwo)]["author"].to_list()))

In [None]:
print(this_years_final_network_df["subreddit"].nunique())
print(this_years_final_network_df["author"].nunique())

In [None]:
this_years_lifestyle_subredds = this_years_final_network_df["subreddit"].to_list()
this_years_lifestyle_authors = this_years_final_network_df["author"].to_list()

In [None]:
# Gather the distinct authors of every subreddit from the two parallel lists,
# then reduce each entry to its number of unique authors.
authors_by_subreddit = {}
for subreddit_name, author_name in zip(this_years_lifestyle_subredds, this_years_lifestyle_authors):
    authors_by_subreddit.setdefault(subreddit_name, set()).add(author_name)

print(len(authors_by_subreddit))

this_years_authors_in_lifestyl_subred_dict = {
    subreddit_name: len(author_set)
    for subreddit_name, author_set in authors_by_subreddit.items()
}

In [None]:
# Subreddits whose number of unique authors reaches lifestyle_threshold.
this_years_network_subreds = [
    subred
    for subred, n_redditors in this_years_authors_in_lifestyl_subred_dict.items()
    if n_redditors >= lifestyle_threshold
]

In [None]:
this_years_final_network_df = this_years_final_network_df[this_years_final_network_df["subreddit"].isin(this_years_network_subreds)]

### NOTE

One particular correction was necessary in one of the yearly datasets: one username was found to be corrupted because its leading "0" had been dropped.

As this user shall remain anonymous only the method is shown. 

In [None]:
# Restore the dropped leading "0" of one username.
# The replacement is assigned back explicitly: calling replace(..., inplace=True)
# on a column selected from a filtered frame is chained assignment, which pandas
# does not guarantee to write back (and which is a no-op under copy-on-write).
this_years_final_network_df = this_years_final_network_df.assign(
    author=this_years_final_network_df["author"].replace("USERNAME", "0USERNAME")
)

print(this_years_final_network_df)

In [None]:
print(this_years_final_network_df["subreddit"].nunique())
print(this_years_final_network_df["author"].nunique())

In [None]:
# Redditors that remain in the filtered lifestyle network.
this_years_final_redditors = list(set(this_years_final_network_df["author"].to_list()))

my_two = ["democrats", "Republican"]

# Submissions and comments of those redditors in the two political subreddits,
# reduced to the (author, subreddit) columns.
subs_mask = this_years_subs["author"].isin(this_years_final_redditors) & this_years_subs["subreddit"].isin(my_two)
this_years_rep_dem_subs = this_years_subs.loc[subs_mask, ["author", "subreddit"]]

comms_mask = this_years_comms["author"].isin(this_years_final_redditors) & this_years_comms["subreddit"].isin(my_two)
this_years_rep_dem_comms = this_years_comms.loc[comms_mask, ["author", "subreddit"]]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
this_years_all_rep_dem_posts = pd.concat([this_years_rep_dem_subs, this_years_rep_dem_comms], ignore_index=True)

# Number of posts per (author, subreddit); the counts end up in column 0.
this_years_ratio_base_df = this_years_all_rep_dem_posts.groupby(["author", "subreddit"]).size().reset_index()

In [None]:
# Authors that appear in more than one row of the ratio base table,
# i.e. that posted in both political subreddits.
rows_per_author = this_years_ratio_base_df["author"].value_counts()
this_years_double_redditors = rows_per_author[rows_per_author > 1]
len(this_years_double_redditors)

### Political score for redditors

In [None]:
this_years_rep_dem_ratio_dict = {}

# Build one (author, subreddit) -> post count lookup instead of re-filtering
# the whole frame for every redditor.
post_counts = {
    (author, subreddit): n_posts
    for author, subreddit, n_posts in zip(
        this_years_ratio_base_df["author"],
        this_years_ratio_base_df["subreddit"],
        this_years_ratio_base_df[0],
    )
}

for redditor in this_years_final_redditors:
    rep_score = post_counts.get((redditor, "Republican"), 0)
    dem_score = post_counts.get((redditor, "democrats"), 0)
    # Share of Republican posts rescaled from [0, 1] to [-1, 1]:
    # -1 = only r/democrats, +1 = only r/Republican.
    # NOTE: a redditor without any post in either subreddit raises
    # ZeroDivisionError here, exactly as the previous implementation did.
    rep_dem_ratio = ((rep_score/(rep_score+dem_score))*2)-1

    this_years_rep_dem_ratio_dict[redditor]=rep_dem_ratio

In [None]:
# Mean political score over all scored redditors.
this_years_total_score = sum(this_years_rep_dem_ratio_dict.values())

this_years_average_score = this_years_total_score/len(this_years_rep_dem_ratio_dict)

print(f"This years average score is {this_years_average_score}")



In [None]:
rep_dem_ratio_list = list(this_years_rep_dem_ratio_dict.values())


In [None]:
plt.hist(rep_dem_ratio_list, bins =20, color="teal", edgecolor = "black")
plt.xlabel("Score")
plt.ylabel("Frequency")
plt.title("Political Score Distribution " + this_specific_year)
plt.savefig(save_plots + this_specific_year + "/political_score_distribution.png")
plt.show()

In [None]:
np.histogram(rep_dem_ratio_list, bins = 20, )

In [None]:
only_lifestyles_df = this_years_final_network_df[~this_years_final_network_df["subreddit"].isin(my_two)]

In [None]:
print(only_lifestyles_df["author"].nunique())
print(only_lifestyles_df["author"].nunique()/this_years_final_network_df["author"].nunique())

In [None]:
len(set(this_years_double_redditors.index.to_list()) & set(only_lifestyles_df["author"].to_list()))

In [None]:
# Authors with at least one row outside the two political subreddits,
# together with their political scores (same order).
ls_enaged_redditors = only_lifestyles_df["author"].unique()

ls_engaged_rep_dem_ratio_list = [
    this_years_rep_dem_ratio_dict[author_name] for author_name in ls_enaged_redditors
]

In [None]:
plt.hist(ls_engaged_rep_dem_ratio_list, bins =20, color="teal", edgecolor = "black")
plt.xlabel("Score")
plt.ylabel("Frequency")
plt.title("Political Score Distribution for LS engaged Authors " + this_specific_year)
plt.savefig(save_plots + this_specific_year + "/political_score_distribution_LS_engaged.png")
plt.show()

In [None]:
np.histogram(ls_engaged_rep_dem_ratio_list, bins =20)

In [None]:
# Mean political score of the lifestyle-engaged authors.
this_years_total_only_ls_score = sum(ls_engaged_rep_dem_ratio_list)

this_years_average_only_ls_score = this_years_total_only_ls_score/len(ls_engaged_rep_dem_ratio_list)

print(f"This years average score for authors engaged in any ls subreddits is {this_years_average_only_ls_score}")

In [None]:
ls_enaged_redditors = list(ls_enaged_redditors)
# Set copy of the list so the membership tests in the loop are O(1).
ls_engaged_lookup = set(ls_enaged_redditors)

dems = 0
ls_dems = 0
reps = 0
ls_reps = 0

# Classify with the configurable neutral_zone_marker and strict comparisons,
# the same rule that builds the scored Democrat/Republican redditor lists,
# so the printed counts agree with those lists.
for key, value in this_years_rep_dem_ratio_dict.items():
    if value < -neutral_zone_marker:
        dems +=1
        if key in ls_engaged_lookup:
            ls_dems +=1
    elif value > neutral_zone_marker:
        reps +=1
        if key in ls_engaged_lookup:
            ls_reps +=1

print(f"All together found {dems} Democrats and {ls_dems} lifestyle engaged Democrats and {reps} Republicans and {ls_reps} lifestyle engaged Republicans")

### Creating the Graphs

In [None]:
G = nx.Graph()

In [None]:
this_years_network_edge_list = []

In [None]:
# Add one redditor node, one subreddit node and the connecting edge per row.
# Edges to non-political subreddits are additionally collected as tuples.
for redditor, subreddit in zip(this_years_final_network_df["author"], this_years_final_network_df["subreddit"]):
    G.add_node(redditor, node_type="redditor")
    G.add_node(subreddit, node_type="subreddit")
    G.add_edge(redditor, subreddit)
    if subreddit not in ("Republican", "democrats"):
        this_years_network_edge_list.append((redditor,subreddit))

In [None]:
print(G.number_of_nodes())
print(G.number_of_edges())
print(nx.is_connected(G))
print(len(set(this_years_network_edge_list)))

In [None]:
# Every endpoint (redditor and subreddit) of a non-political edge.
this_years_all_non_pol_nodes = [
    endpoint
    for edge in this_years_network_edge_list
    for endpoint in (edge[0], edge[1])
]
# Graph nodes that never occur in a non-political edge, minus the two
# political subreddit nodes themselves.
this_years_non_lifestyle_redditors = set(G.nodes()).difference(this_years_all_non_pol_nodes)
for political_subreddit in ("democrats", "Republican"):
    this_years_non_lifestyle_redditors.remove(political_subreddit)
len(this_years_non_lifestyle_redditors)

In [None]:
# Derive the two node sets from the node_type attribute set at graph creation.
# bipartite.sets(G) without explicit top nodes raises AmbiguousSolution when G
# is disconnected and otherwise returns an arbitrary two-colouring, so it cannot
# be relied on to tell subreddits from redditors.
top_nodes = {node for node, attrs in G.nodes(data=True) if attrs["node_type"] == "subreddit"}
bottom_nodes = set(G) - top_nodes

In [None]:
top_nodes = {n for n, d in G.nodes(data=True) if d["node_type"] == "subreddit"}
bottom_nodes = set(G) - top_nodes

In [None]:
A = bipartite.projected_graph(G, top_nodes)

In [None]:
print(A)

In [None]:
graph1 = from_edge_list(this_years_network_edge_list, bipartite=True)

### Communities

In [None]:
biadjacency = graph1.biadjacency
names = graph1.names
names_col = graph1.names_col
names_row = graph1.names_row

In [None]:
louvain = Louvain()
louvain.fit(biadjacency)
labels_row = louvain.labels_row_
labels_col = louvain.labels_col_

In [None]:
get_modularity(biadjacency, labels_row, labels_col)

In [None]:
communities = {}
what_in_communes = {}
who_in_communes = {}

# Node name -> community label, rows first and then columns of the biadjacency.
communities.update(zip(names_row, labels_row))
communities.update(zip(names_col, labels_col))

# Fixed labels for the two political subreddits.
communities["Republican"] = 120
communities["democrats"] = 160

# Community label -> names of its column nodes / its row nodes.
for label, col_name in zip(labels_col, names_col):
    what_in_communes.setdefault(label, []).append(col_name)

for label, row_name in zip(labels_row, names_row):
    who_in_communes.setdefault(label, []).append(row_name)

print(len(what_in_communes))
print(what_in_communes)
print()
print(who_in_communes)

In [None]:
# Cast every community label to a built-in int before the dictionaries are
# written out with json.dump.
for node_name in list(communities):
    communities[node_name] = int(communities[node_name])

what_in_communities = {int(label): members for label, members in what_in_communes.items()}

who_in_communities = {int(label): members for label, members in who_in_communes.items()}

In [None]:
with open(save_csvs + this_specific_year + "/communities_dict.txt", "w") as comun_dict:
    json.dump(communities, comun_dict)

with open(save_csvs + this_specific_year + "/what_communities_dict.txt", "w") as what_comun_dict:
    json.dump(what_in_communities, what_comun_dict)

with open(save_csvs + this_specific_year + "/who_communities_dict.txt", "w") as who_comun_dict:
    json.dump(who_in_communities, who_comun_dict)

In [None]:
# Iterate over the labels that actually exist instead of range(len(...)):
# the labels need not be contiguous, and a label may be missing from the
# redditor-side dictionary.
for community_id in sorted(what_in_communities):
    community_members = who_in_communities.get(community_id, [])
    community_lifestyles = what_in_communities[community_id]
    print(f"Community {community_id} has {len(community_members)} members and covers {len(community_lifestyles)} lifestyles")
    print(f"The lifestyles are {community_lifestyles}")

In [None]:
nx.average_clustering(A)

In [None]:
nx.number_connected_components(G)

In [None]:
nx.is_connected(A)

In [None]:
ordered_communities = OrderedDict(sorted(what_in_communities.items(), key = lambda x : len(x[1]))).keys()

In [None]:
degree_dict = dict(G.degree())
betweenness_dict = nx.betweenness_centrality(G)

### Partisan Redditors

In [None]:
this_years_scored_dem_redditors = []
this_years_scored_rep_redditors = []

# Scores strictly outside the neutral zone decide the camp; everything
# inside [-neutral_zone_marker, neutral_zone_marker] is left unassigned.
dem_upper_bound = 0 - neutral_zone_marker
rep_lower_bound = 0 + neutral_zone_marker

for redditor, dem_rep_score in this_years_rep_dem_ratio_dict.items():
    if dem_rep_score < dem_upper_bound:
        camp = this_years_scored_dem_redditors
    elif dem_rep_score > rep_lower_bound:
        camp = this_years_scored_rep_redditors
    else:
        continue
    camp.append(redditor)

print(len(this_years_scored_dem_redditors))
print(len(this_years_scored_rep_redditors))

In [None]:
with open(save_csvs + this_specific_year + "/scored_dems.txt", "w") as political_scored_dems:
    json.dump(this_years_scored_dem_redditors, political_scored_dems)

with open(save_csvs + this_specific_year + "/scored_repss.txt", "w") as political_scored_reps:
    json.dump(this_years_scored_rep_redditors, political_scored_reps)

In [None]:
this_years_final_network_df[this_years_final_network_df["author"].isin(this_years_scored_dem_redditors)]["subreddit"].value_counts().head(25)

In [None]:
this_years_final_network_df[this_years_final_network_df["author"].isin(this_years_scored_rep_redditors)]["subreddit"].value_counts().head(25)

### Calculating the political appeal of subreddits

In [None]:
freq_dem_subreds = {}
freq_rep_subreds = {}

# Group once to get author -> set of subreddits, instead of filtering the
# whole frame again for every single author.
subreddits_by_author = (
    this_years_final_network_df.groupby("author")["subreddit"].apply(set).to_dict()
)

# For each camp, count in how many of its redditors' subreddit sets every
# subreddit occurs (one count per redditor and subreddit).
for scored_redditors, freq_subreds in (
    (this_years_scored_dem_redditors, freq_dem_subreds),
    (this_years_scored_rep_redditors, freq_rep_subreds),
):
    for author in scored_redditors:
        for subred in subreddits_by_author.get(author, ()):
            freq_subreds[subred] = freq_subreds.get(subred, 0) + 1

In [None]:
sorted(freq_dem_subreds.items(), key=lambda x: x[1], reverse=True)[:25]

In [None]:
sorted(freq_rep_subreds.items(), key=lambda x: x[1], reverse=True)[:25]

In [None]:
# Per camp: redditor count of each subreddit divided by the camp's redditor
# count in its own political subreddit. Subreddits seen in only one camp get
# a ratio of 0 in the other camp.
rep_ratio_dict = {
    subred: n_reps/freq_rep_subreds["Republican"]
    for subred, n_reps in freq_rep_subreds.items()
}
dem_ratio_dict = dict.fromkeys(rep_ratio_dict, 0)

for subred, n_dems in freq_dem_subreds.items():
    dem_ratio_dict[subred] = n_dems/freq_dem_subreds["democrats"]
    rep_ratio_dict.setdefault(subred, 0)


In [None]:
sorted(dem_ratio_dict.items(), key=lambda x: x[1], reverse=True)[:25]

In [None]:
sorted(rep_ratio_dict.items(), key=lambda x: x[1], reverse=True)[:25]

In [None]:
for node in A.nodes():
    if node not in rep_ratio_dict.keys():
        print(node)

In [None]:
# Republican share of the two normalized ratios, rescaled from [0, 1] to [-1, 1].
ls_normalized_dem_rep_score_dict = {}
for subred, rep_share in rep_ratio_dict.items():
    dem_share = dem_ratio_dict[subred]
    dem_rep_ratio = rep_share/(rep_share+dem_share)
    ls_normalized_dem_rep_score_dict[subred] = (dem_rep_ratio*2)-1


### Creating the lifestyle map

Here, some adaptations have to be made purely for better visualization. The numbers of the recurring communities are therefore listed below to show how this was done for 2022.


The code still works without these adjustments.

#### Numbers of Common community definitions

Science and Technology : 0 in all 3 years

Arms and Weapons: 6 in 2014, 2 in 2018, 2 in 2022

Science Fiction, Comics Fantasy: 2 in 2014, 1 in 2018, 1 in 2022


In [None]:
ordered_communities = list(ordered_communities)

In [None]:
def move_element_to_end(some_list, some_element):
    """Move the first occurrence of ``some_element`` to the end of ``some_list``.

    The list is modified in place and also returned. If the element is not in
    the list, the list is returned unchanged, so a year-specific adjustment
    list may name community ids that do not exist in the current year.
    """
    if some_element in some_list:
        some_list.remove(some_element)
        some_list.append(some_element)
    return some_list

This list has to be adjusted for every year.

In [None]:
adjust_list = [6, 4, 10, 1, 2, 0]

In [None]:
for i in adjust_list:
    move_element_to_end(ordered_communities, i)

In [None]:
ls_normalized_pos = {}
distance_factor = 2/len(A.nodes())
top_vertical_coordinate = 1
bottom_vertical_coordinate = -1

# x = normalized political score of the subreddit. Communities alternate
# between being stacked downwards from the top (y = 1) and upwards from the
# bottom (y = -1), one step of distance_factor per subreddit.
for slot, comune in enumerate(ordered_communities):
    stack_from_top = slot % 2 == 0
    for subred in what_in_communities[comune]:
        horizontal = ls_normalized_dem_rep_score_dict[subred]
        if stack_from_top:
            ls_normalized_pos[subred] = np.array([horizontal, top_vertical_coordinate])
            top_vertical_coordinate -= distance_factor
        else:
            ls_normalized_pos[subred] = np.array([horizontal, bottom_vertical_coordinate])
            bottom_vertical_coordinate += distance_factor
community_count = len(ordered_communities)

# The two political subreddits are pinned to the horizontal extremes.
ls_normalized_pos["Republican"] = np.array([1,0])
ls_normalized_pos["democrats"] = np.array([-1,0])

The map is created in different sizes

In [None]:
plt.figure(figsize=(48, 30))

# Colour per community label: labels 0-33 take the palette entries in order,
# 120 and 160 are the labels of the two political subreddits.
community_palette = [
    "lime", "cyan", "magenta", "mediumpurple", "olive", "yellow",
    "plum", "khaki", "salmon", "lightsteelblue", "saddlebrown",
    "tan", "black", "darkgreen", "lightgreen", "sienna",
    "teal", "forestgreen", "rosybrown", "rebeccapurple", "lavender",
    "chocolate", "slategrey", "green", "wheat",
    "aquamarine", "pink", "deepskyblue", "peru", "indigo", "deeppink",
    "limegreen", "yellowgreen", "tomato",
]
color_dict = dict(enumerate(community_palette))
color_dict.update({120: "red", 160: "blue"})

# Node size grows with the node's degree in the bipartite graph.
projected_nodes = list(A.nodes())
size = [10 + 4*degree_dict[node] for node in projected_nodes]
color = [color_dict[communities[node]] for node in projected_nodes]

node_spec = {"node_size": size, "node_color": color}
edge_spec = {"width": .5, "alpha": .03, "edge_color": "black"}

nx.draw_networkx_nodes(A, ls_normalized_pos, **node_spec)
nx.draw_networkx_edges(A, ls_normalized_pos, **edge_spec)
nx.draw_networkx_labels(A, ls_normalized_pos, font_size=16)

# Centre line, the neutral-zone borders and faint guides at +/- 1/3.
plt.axvline(c="black", alpha = 1)
for x_position, line_colour, line_alpha in (
    (-neutral_zone_marker, "blue", 1),
    (neutral_zone_marker, "red", 1),
    (-1/3, "navy", 0.1),
    (1/3, "firebrick", 0.1),
):
    plt.axvline(x_position, c=line_colour, alpha=line_alpha, linestyle="--")

plt.title(this_specific_year, fontsize=40)
plt.savefig(save_plots + this_specific_year + "/LS_map_small_normalized.png")
plt.show()


In [None]:
plt.figure(figsize=(80, 50))

# Colour per community label: labels 0-33 take the palette entries in order,
# 120 and 160 are the labels of the two political subreddits.
community_palette = [
    "lime", "cyan", "magenta", "mediumpurple", "olive", "yellow",
    "plum", "khaki", "salmon", "lightsteelblue", "saddlebrown",
    "tan", "black", "darkgreen", "lightgreen", "sienna",
    "teal", "forestgreen", "rosybrown", "rebeccapurple", "lavender",
    "chocolate", "slategrey", "green", "wheat",
    "aquamarine", "pink", "deepskyblue", "peru", "indigo", "deeppink",
    "limegreen", "yellowgreen", "tomato",
]
color_dict = dict(enumerate(community_palette))
color_dict.update({120: "red", 160: "blue"})

# Node size grows with the node's degree in the bipartite graph.
projected_nodes = list(A.nodes())
size = [10 + 2*degree_dict[node] for node in projected_nodes]
color = [color_dict[communities[node]] for node in projected_nodes]

node_spec = {"node_size": size, "node_color": color}
edge_spec = {"width": .5, "alpha": .01, "edge_color": "black"}

nx.draw_networkx_nodes(A, ls_normalized_pos, **node_spec)
nx.draw_networkx_edges(A, ls_normalized_pos, **edge_spec)
nx.draw_networkx_labels(A, ls_normalized_pos, font_size=32)

# Centre line, the neutral-zone borders and faint guides at +/- 1/3.
plt.axvline(c="black", alpha = 1)
for x_position, line_colour, line_alpha in (
    (-neutral_zone_marker, "blue", 1),
    (neutral_zone_marker, "red", 1),
    (-1/3, "navy", 0.1),
    (1/3, "firebrick", 0.1),
):
    plt.axvline(x_position, c=line_colour, alpha=line_alpha, linestyle="--")

plt.title(this_specific_year, fontsize=80)
plt.savefig(save_plots + this_specific_year + "/LS_map_large_normalized.png")
plt.show()


In [None]:
plt.figure(figsize=(160, 100))

# Colour per community label: labels 0-33 take the palette entries in order,
# 120 and 160 are the labels of the two political subreddits.
community_palette = [
    "lime", "cyan", "magenta", "mediumpurple", "olive", "yellow",
    "plum", "khaki", "salmon", "lightsteelblue", "saddlebrown",
    "tan", "black", "darkgreen", "lightgreen", "sienna",
    "teal", "forestgreen", "rosybrown", "rebeccapurple", "lavender",
    "chocolate", "slategrey", "green", "wheat",
    "aquamarine", "pink", "deepskyblue", "peru", "indigo", "deeppink",
    "limegreen", "yellowgreen", "tomato",
]
color_dict = dict(enumerate(community_palette))
color_dict.update({120: "red", 160: "blue"})

# Node size grows with the node's degree in the bipartite graph.
projected_nodes = list(A.nodes())
size = [10 + 4*degree_dict[node] for node in projected_nodes]
color = [color_dict[communities[node]] for node in projected_nodes]

node_spec = {"node_size": size, "node_color": color}
edge_spec = {"width": .5, "alpha": .01, "edge_color": "black"}

nx.draw_networkx_nodes(A, ls_normalized_pos, **node_spec)
nx.draw_networkx_edges(A, ls_normalized_pos, **edge_spec)
nx.draw_networkx_labels(A, ls_normalized_pos, font_size=32)

# Centre line, the neutral-zone borders and faint guides at +/- 1/3.
plt.axvline(c="black", alpha = 1)
for x_position, line_colour, line_alpha in (
    (-neutral_zone_marker, "blue", 1),
    (neutral_zone_marker, "red", 1),
    (-1/3, "navy", 0.1),
    (1/3, "firebrick", 0.1),
):
    plt.axvline(x_position, c=line_colour, alpha=line_alpha, linestyle="--")

plt.title(this_specific_year, fontsize=80)
plt.savefig(save_plots + this_specific_year + "/LS_map_very_large_normalized.png")
plt.show()

### Additional observations

In [None]:
# Subreddits that occur in either camp's frequency table.
considerd_subreds = set(list(freq_rep_subreds.keys()) + list(freq_dem_subreds.keys()))

# Scored Republican plus scored Democrat redditors per subreddit
# (a missing entry counts as 0).
considered_subred_author_dict = {
    subredd: freq_rep_subreds.get(subredd, 0) + freq_dem_subreds.get(subredd, 0)
    for subredd in considerd_subreds
}

In [None]:
this_years_normalized_democratic_subreddits = []
this_years_normalized_republican_subreddits = []
this_years_normalized_all_subreddits = []

# Every subreddit goes into the overall list; those strictly outside the
# neutral zone additionally go into the Democratic or Republican list.
for subred, dem_rep_score in ls_normalized_dem_rep_score_dict.items():
    this_years_normalized_all_subreddits.append([subred, dem_rep_score])
    if dem_rep_score < -neutral_zone_marker:
        this_years_normalized_democratic_subreddits.append([subred, dem_rep_score])
    if dem_rep_score > neutral_zone_marker:
        this_years_normalized_republican_subreddits.append([subred, dem_rep_score])

score_columns = ["subreddit", "score"]

# Democratic table: most negative score first; Republican table: most
# positive score first; overall table: alphabetical by subreddit.
this_years_high_normalized_dem_subred_df = pd.DataFrame(
    this_years_normalized_democratic_subreddits, columns = score_columns
).sort_values(by = ["score"], ascending = True, ignore_index = True)
this_years_high_normalized_rep_subred_df = pd.DataFrame(
    this_years_normalized_republican_subreddits, columns = score_columns
).sort_values(by = ["score"], ascending = False, ignore_index = True)
this_years_normalized_all_subreddits = pd.DataFrame(
    this_years_normalized_all_subreddits, columns = score_columns
).sort_values(by = ["subreddit"], ascending = True, ignore_index = True)

# Attach the redditor counts to all three tables; subreddits missing from a
# lookup get NaN from Series.map.
author_count_lookups = {
    "n_authors": considered_subred_author_dict,
    "n_dem_authors": freq_dem_subreds,
    "n_rep_authors": freq_rep_subreds,
}
for column_name, lookup in author_count_lookups.items():
    for score_table in (
        this_years_high_normalized_dem_subred_df,
        this_years_high_normalized_rep_subred_df,
        this_years_normalized_all_subreddits,
    ):
        score_table[column_name] = score_table["subreddit"].map(lookup)


print(len(this_years_normalized_democratic_subreddits))
print(this_years_high_normalized_dem_subred_df["subreddit"].to_list())
print(this_years_high_normalized_dem_subred_df["score"].to_list())
print(len(this_years_normalized_republican_subreddits))
print(this_years_high_normalized_rep_subred_df["subreddit"].to_list())
print(this_years_high_normalized_rep_subred_df["score"].to_list())
print(len(this_years_normalized_all_subreddits))

In [None]:
this_years_high_normalized_dem_subred_df.round({"score":4}).head(25)

In [None]:
this_years_high_normalized_dem_subred_df[this_years_high_normalized_dem_subred_df["subreddit"]=="cats"]


In [None]:
this_years_high_normalized_rep_subred_df.round({"score":4}).head(25)

In [None]:
this_years_normalized_all_subreddits = this_years_normalized_all_subreddits.round({"score":4})

this_years_normalized_all_subreddits = this_years_normalized_all_subreddits.sort_values(by="score", ascending=False)

In [None]:
this_years_normalized_all_subreddits.to_csv(save_csvs + this_specific_year + "/subreddit_scores_and_n_members.csv")
this_years_normalized_all_subreddits

In [None]:
community_score_dict = {}

# Mean normalized score of the subreddits in each community. Iterating over
# the existing labels (instead of range(len(...))) avoids a KeyError when the
# labels are not the contiguous integers 0..n-1.
for community_id in sorted(what_in_communities):
    subreddit_scores = [
        ls_normalized_dem_rep_score_dict[s] for s in what_in_communities[community_id]
    ]
    community_score_dict[community_id] = sum(subreddit_scores)/len(subreddit_scores)

community_score_dict

In [None]:
for i in community_score_dict.values():
    print(round(i, 4))

In [None]:
average_score_calc_df = this_years_normalized_all_subreddits[~this_years_normalized_all_subreddits["subreddit"].isin(my_two)]
average_score_calc_df["score"].sum()/len(average_score_calc_df)

### Additional checking code

In [None]:
# check_subred = ""

# print(len(this_years_final_network_df[this_years_final_network_df["subreddit"]==check_subred]))
# print(this_years_final_network_df[this_years_final_network_df["subreddit"]==check_subred]["author"].nunique())
# print(ls_normalized_dem_rep_score_dict[check_subred])
# if check_subred in freq_dem_subreds.keys():
#    print(freq_dem_subreds[check_subred])
# else:
#    print("No democrat redditors")
# if check_subred in freq_rep_subreds.keys():
#    print(freq_rep_subreds[check_subred])
# else:
#    print("No republican redditors")
# this_years_final_network_df[this_years_final_network_df["subreddit"]==check_subred]["author"].value_counts()

In [None]:
# check_author = ""


# print(len(this_years_final_network_df[this_years_final_network_df["author"]==check_author]))
# print(this_years_final_network_df[this_years_final_network_df["author"]==check_author]["subreddit"].nunique())
# this_years_final_network_df[this_years_final_network_df["author"]==check_author]["subreddit"].value_counts()

## Intra-Party Communication

The first code cell in this chapter can be used to pick up the analysis at this point, without first running the lengthy topic modelling and the lifestyle analysis.

In [None]:
# this_years_dem_subs = pd.read_csv(save_csvs + this_specific_year + "/dem_subs.csv")
# this_years_dem_comms = pd.read_csv(save_csvs + this_specific_year + "/dem_comms.csv")

# this_years_rep_subs = pd.read_csv(save_csvs + this_specific_year + "/rep_subs.csv")
# this_years_rep_comms = pd.read_csv(save_csvs + this_specific_year + "/rep_comms.csv")

# with open(save_csvs + this_specific_year + "/dem_topic_dict.txt", "r") as dem_dict:
#    dem_topic_dict = json.load(dem_dict)

# with open(save_csvs + this_specific_year + "/rep_topic_dict.txt", "r") as rep_dict:
#    rep_topic_dict = json.load(rep_dict)

# with open(save_csvs + this_specific_year + "/communities_dict.txt", "r") as co_dict:
#    communities = json.load(co_dict)

# what_in_communities = {}
# with open(save_csvs + this_specific_year + "/what_communities_dict.txt", "r") as what_co_dict:
#    what_in_com = json.load(what_co_dict)
# for key,value in what_in_com.items():
#     what_in_communities[int(key)] = value

# who_in_communities = {}
# with open(save_csvs + this_specific_year + "/who_communities_dict.txt", "r") as who_co_dict:
#    who_in_com = json.load(who_co_dict)
# for key,value in who_in_com.items():
#     who_in_communities[int(key)] = value

# def look_up(name, dictionary):
#    return dictionary[name]

# dem_model = BERTopic.load(save_csvs + this_specific_year + "/dem_model")

# rep_model = BERTopic.load(save_csvs + this_specific_year + "/rep_model")



# remove_users = ["[deleted]", "AutoModerator", "election_info_bot"]

# unwanted_user_sources = [path_to_bots, path_to_found_bots, path_to_more_trolls_and_bots]

# unwanted_users = remove_users.copy()

# for bot_list in unwanted_user_sources:
#     with open(bot_list, "r") as txt:
#         for line in txt:
#             unwanted_users.append(line.strip()[3:])


# troll_bot_users = list(set(unwanted_users))
# troll_bot_users.remove("[deleted]")

# with open(save_csvs + this_specific_year + "/scored_dems.txt", "r") as political_scored_dems:
#     this_years_scored_dem_redditors = json.load(political_scored_dems)

# with open(save_csvs + this_specific_year + "/scored_repss.txt", "r") as political_scored_reps:
#     this_years_scored_rep_redditors = json.load(political_scored_reps)

### r/democrats

Note: All analyses are first performed for Democrats and then, following the same procedure, for Republicans.

In [None]:
# All authors of submissions and comments in one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
this_years_dem_posts = pd.concat([this_years_dem_subs[["author"]], this_years_dem_comms[["author"]]])
dem_authors_reducer = this_years_dem_posts["author"].value_counts()
# Authors below the engagement threshold, plus the "[deleted]" placeholder.
dem_authors_gone = dem_authors_reducer[dem_authors_reducer < subreddit_engagement_threshold].index.tolist()
dem_authors_gone.append("[deleted]")

In [None]:
len(this_years_scored_dem_redditors)

In [None]:
print(this_years_dem_posts["author"].nunique())
print(len(dem_authors_gone))
print(this_years_dem_posts["author"].nunique() - len(dem_authors_gone))

In [None]:
# id -> author lookups for submissions and for comments
# (for a repeated id the last row wins).
dem_sub_author_dict = dict(zip(this_years_dem_subs["id"], this_years_dem_subs["author"]))

dem_comm_author_dict = dict(zip(this_years_dem_comms["id"], this_years_dem_comms["author"]))

In [None]:
this_years_dem_comms = this_years_dem_comms.sort_values(by="created")

Note: In the next step, multiple answers from one author to the same other author within one thread are removed.

In [None]:
dem_tuple_straigth_list = []
dem_topic_tuple_list = []
dem_tuple_double_check_list = []
# Set twin of dem_tuple_double_check_list: O(1) duplicate checks instead of
# scanning the growing list for every comment.
seen_reply_keys = set()
tuples = []
topic_tuple = []
for index,row in this_years_dem_comms.iterrows():
    # parent_id carries a three-character prefix in front of the parent's id.
    parent_key = row["parent_id"][3:]
    # The parent is looked up among the comments first, then the submissions;
    # comments whose parent is in neither lookup are skipped.
    if parent_key in dem_comm_author_dict:
        parent_author = dem_comm_author_dict[parent_key]
    elif parent_key in dem_sub_author_dict:
        parent_author = dem_sub_author_dict[parent_key]
    else:
        continue

    # Only the first reply of an author to the same parent author within one
    # thread (link_id) is kept.
    check_tup = (row["link_id"], row["author"], parent_author)
    if check_tup in seen_reply_keys:
        continue
    seen_reply_keys.add(check_tup)
    dem_tuple_double_check_list.append(check_tup)

    reply_author = dem_comm_author_dict[row["id"]]
    tuples = [(reply_author, parent_author)]
    # Topic of the parent post, or -1 when the parent has no topic entry.
    topic_tuple = [(reply_author, parent_author, dem_topic_dict.get(parent_key, -1))]
    dem_tuple_straigth_list.extend(tuples)
    dem_topic_tuple_list.extend(topic_tuple)

In [None]:
# One pass with set lookups replaces seven list comprehensions that each
# scanned plain lists. An edge is kept when both authors are wanted users,
# sufficiently engaged, scored as Democrats, and not the same person.
excluded_authors = set(unwanted_users) | set(dem_authors_gone)
scored_dem_authors = set(this_years_scored_dem_redditors)

dem_tuple_straigth_list = [
    pair
    for pair in dem_tuple_straigth_list
    if pair[0] != pair[1]
    and pair[0] not in excluded_authors
    and pair[1] not in excluded_authors
    and pair[0] in scored_dem_authors
    and pair[1] in scored_dem_authors
]

# Join each pair into one string so value_counts() can count how often every
# directed (replier, parent author) pair occurs.
dem_straight_for_df_list = [", ".join(list(i)) for i in dem_tuple_straigth_list]
dem_straight_df = pd.DataFrame(dem_straight_for_df_list)

# (replier, parent author, number of replies)
dem_straight_weight_tuple_list = []
for joined_pair, n_replies in dem_straight_df[0].value_counts().items():
    comp_tup = tuple(joined_pair.split(", ") + [n_replies])
    dem_straight_weight_tuple_list.append(comp_tup)

In [None]:
# Same filter as for the plain reply edges, done in one pass with set lookups
# instead of seven passes over plain lists.
excluded_authors = set(unwanted_users) | set(dem_authors_gone)
scored_dem_authors = set(this_years_scored_dem_redditors)

dem_topic_tuple_list = [
    triple
    for triple in dem_topic_tuple_list
    if triple[0] != triple[1]
    and triple[0] not in excluded_authors
    and triple[1] not in excluded_authors
    and triple[0] in scored_dem_authors
    and triple[1] in scored_dem_authors
]

In [None]:
# Fold the directed weights into undirected ones: when the reversed pair is
# already stored, add to it; otherwise store the pair as encountered.
dem_undirected_dict = {}
for edge in dem_straight_weight_tuple_list:
    source, target, weight = edge[0], edge[1], edge[2]
    mirrored = (target, source)
    if mirrored in dem_undirected_dict:
        dem_undirected_dict[mirrored] += weight
    else:
        dem_undirected_dict[(source, target)] = weight

In [None]:
# Flatten {(a, b): weight} into a list of (a, b, weight) tuples.
dem_undirected_weighted_tuples_list = [
    (pair[0], pair[1], weight) for pair, weight in dem_undirected_dict.items()
]

In [None]:
def create_smi(indegree, betweennes):
    """Return the Euclidean norm of (indegree, betweennes).

    Used below to fill the "SMI" column from a node's in-degree and
    betweenness centrality.
    """
    squared_sum = indegree**2 + betweennes**2
    return np.sqrt(squared_sum)

In [None]:
def cent_rat(in_deg, postings):
    """Return in-degree per posting (in_deg divided by postings)."""
    ratio = in_deg / postings
    return ratio

In [None]:
# Number of postings (comments + submissions) per author.
# Series.items() is used instead of Series.iteritems(), which was removed in
# pandas 2.0; the other cells of this notebook already use .items().
dem_comm_posting_dict = {}
for auth, posts in this_years_dem_comms["author"].value_counts().items():
    dem_comm_posting_dict[auth] = posts
for auth, posts in this_years_dem_subs["author"].value_counts().items():
    # Authors who only submitted (never commented) start from zero.
    dem_comm_posting_dict[auth] = dem_comm_posting_dict.get(auth, 0) + posts

In [None]:
# D: directed graph, filled below with one weighted edge per ordered pair of redditors.
D = nx.DiGraph()
# M: undirected graph, filled below from the merged (undirected) pair weights.
M = nx.Graph()

In [None]:
# Fill both graphs from their (node, node, weight) tuple lists:
# D gets the directed pairs, M the merged undirected ones.
for graph, weighted_edges in (
    (D, dem_straight_weight_tuple_list),
    (M, dem_undirected_weighted_tuples_list),
):
    for edge in weighted_edges:
        graph.add_edge(edge[0], edge[1], weight=edge[2])

In [None]:
# Number of redditors (nodes) in the directed network.
print(D.number_of_nodes())

In [None]:
# Number of distinct directed redditor pairs (edges) in the network.
print(D.number_of_edges())

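L is a multigraph that keeps one edge per entry of the pair list (parallel edges are not merged). It is only used for calculations, not for plotting.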

In [None]:
# Directed multigraph: parallel edges between the same two redditors are kept separately.
L = nx.MultiDiGraph()

In [None]:
# One edge per entry of the (unaggregated) pair list, so repeated pairs become parallel edges.
L.add_edges_from(dem_tuple_straigth_list)

In [None]:
# Community id 160 collects the scored Democrat redditors that have no community yet.
#
# The cell is written so that it can be re-run safely: redditors that a previous
# run of this cell already labelled 160 are collected again. A plain
# "not in communities" test would skip them on a second run and leave
# `who_in_communities[160]` and `only_dem_authors` empty.
# NOTE(review): assumes 160 is assigned nowhere else in the notebook — confirm.
only_dem_authors = []
who_in_communities[160] = []
already_collected = set()  # guards against duplicates in this_years_scored_dem_redditors
for i in this_years_scored_dem_redditors:
    if i in already_collected:
        continue
    if communities.get(i, 160) == 160:
        communities[i] = 160
        who_in_communities[160].append(i)
        only_dem_authors.append(i)
        already_collected.add(i)

In [None]:
# --- Per-node measures -------------------------------------------------------
# D has one edge per ordered redditor pair; L has one edge per entry of the pair
# list, so the "multi" degrees count every single edge.
dem_degree_dict = dict(D.degree())
dem_in_degree_dict = dict(D.in_degree())
dem_multi_degree_dict = dict(L.degree())
dem_multi_in_degree_dict = dict(L.in_degree())
dem_betweenness_dict = nx.betweenness_centrality(D)
dem_tot_indegree = sum(dem_in_degree_dict.values())
dem_multi_tot_indegree = sum(dem_multi_in_degree_dict.values())
dem_eigenvector_dict = nx.eigenvector_centrality(D)
dem_pagerank_dict = nx.pagerank(D)
dem_closeness_dict = nx.closeness_centrality(D)

# --- One row per node, one column per measure (column order matters for the CSV/tables) ---
dem_node_df = pd.DataFrame(list(D.nodes), columns = ["redditor"])

for column_name, lookup_table in (
    ("degree", dem_degree_dict),
    ("in_degree", dem_in_degree_dict),
    ("multi_degree", dem_multi_degree_dict),
    ("multi_in_degree", dem_multi_in_degree_dict),
    ("betweenness", dem_betweenness_dict),
):
    dem_node_df[column_name] = dem_node_df["redditor"].apply(
        lambda x, table=lookup_table: look_up(x, table)
    )

dem_node_df["SMI"] = dem_node_df.apply(
    lambda row: create_smi(row["in_degree"], row["betweenness"]), axis=1
)
dem_node_df["nr_postings"] = dem_node_df["redditor"].apply(
    lambda x: look_up(x, dem_comm_posting_dict)
)
dem_node_df["centrality_ratio"] = dem_node_df.apply(
    lambda row: cent_rat(row["in_degree"], row["nr_postings"]), axis=1
)

for column_name, lookup_table in (
    ("Eigenvector", dem_eigenvector_dict),
    ("Pagerank", dem_pagerank_dict),
    ("Closeness", dem_closeness_dict),
):
    dem_node_df[column_name] = dem_node_df["redditor"].apply(
        lambda x, table=lookup_table: look_up(x, table)
    )

dem_node_df["community"] = dem_node_df["redditor"].map(communities)

### Visualizations of the network

In [None]:
# Node positions are computed once on the undirected graph M (fixed seed, so the
# layout is reproducible) and reused by every network plot below.
dem_net_pos = nx.spring_layout(M, iterations=2000, weight="weight", seed= 16)

In [None]:
plt.figure(figsize=(16, 10))

# Community id -> node colour; 160 marks Democrat redditors outside every lifestyle community.
color_dict = {
    0: "lime", 1: "cyan", 2: "magenta", 3: "mediumpurple", 4: "olive",
    5: "yellow", 6: "plum", 7: "khaki", 8: "salmon", 9: "lightsteelblue",
    10: "saddlebrown", 11: "tan", 12: "black", 13: "darkgreen", 14: "lightgreen",
    15: "sienna", 16: "teal", 17: "forestgreen", 18: "rosybrown", 19: "rebeccapurple",
    20: "lavender", 21: "chocolate", 22: "slategrey", 23: "green", 24: "wheat",
    25: "aquamarine", 26: "pink", 27: "deepskyblue", 28: "peru", 29: "indigo",
    30: "deeppink", 31: "limegreen", 32: "yellowgreen", 33: "tomato",
    120: "red", 160: "blue",
}

pos = dem_net_pos

# Node area grows with the node's share of all multigraph in-degree.
size = [
    10 + 1000*(dem_multi_in_degree_dict[redditor]/dem_multi_tot_indegree)
    for redditor in D.nodes()
]
color = [color_dict[communities[redditor]] for redditor in D.nodes()]

node_spec = dict(node_color=color, node_size=size)
edge_spec = dict(width=.5, alpha=.05, edge_color="black")

nx.draw_networkx_nodes(D, pos, **node_spec)
nx.draw_networkx_edges(D, pos, **edge_spec)

plt.title("Democrat Communication network " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Democrat_Communication_network.png")

plt.show()

For better visualization, "poorly connected" nodes (fewer than three connections in the undirected network) are left out of the following plots.

In [None]:
# Nodes with fewer than three neighbours in the undirected graph M count as poorly connected.
poorly_connected_dems = [node for node, degree in M.degree() if degree < 3]

# D1 is D without those nodes; D itself stays untouched.
D1 = D.copy()
for node in poorly_connected_dems:
    D1.remove_node(node)

In [None]:
plt.figure(figsize=(16, 10))

# Community id -> node colour; 160 marks Democrat redditors outside every lifestyle community.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}


# Same layout as the full network, so both pictures are directly comparable.
pos = dem_net_pos

# Node area grows with the node's share of all multigraph in-degree.
size = [10 + 1000*(dem_multi_in_degree_dict[node]/dem_multi_tot_indegree) for node in D1.nodes()]
color = [color_dict[communities[node]] for node in D1.nodes()]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .05, "edge_color": "black"}

nx.draw_networkx_nodes(D1, pos, **node_spec)

nx.draw_networkx_edges(D1, pos, **edge_spec)

# A space separates "zoomed in" from the year (it used to read e.g. "zoomed in2018").
plt.title("Democrat Communication network zoomed in " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Democrat_zoomed_Communication_network.png")

plt.show()

In [None]:
plt.figure(figsize=(48, 30))

# Community id -> node colour; 160 marks Democrat redditors outside every lifestyle community.
color_dict = {
    0: "lime", 1: "cyan", 2: "magenta", 3: "mediumpurple", 4: "olive",
    5: "yellow", 6: "plum", 7: "khaki", 8: "salmon", 9: "lightsteelblue",
    10: "saddlebrown", 11: "tan", 12: "black", 13: "darkgreen", 14: "lightgreen",
    15: "sienna", 16: "teal", 17: "forestgreen", 18: "rosybrown", 19: "rebeccapurple",
    20: "lavender", 21: "chocolate", 22: "slategrey", 23: "green", 24: "wheat",
    25: "aquamarine", 26: "pink", 27: "deepskyblue", 28: "peru", 29: "indigo",
    30: "deeppink", 31: "limegreen", 32: "yellowgreen", 33: "tomato",
    120: "red", 160: "blue",
}

pos = dem_net_pos

# Node area grows with the node's share of all multigraph in-degree.
size = [
    10 + 1000*(dem_multi_in_degree_dict[redditor]/dem_multi_tot_indegree)
    for redditor in D.nodes()
]
color = [color_dict[communities[redditor]] for redditor in D.nodes()]

node_spec = dict(node_color=color, node_size=size)
edge_spec = dict(width=.5, alpha=.05, edge_color="black")

nx.draw_networkx_nodes(D, pos, **node_spec)
nx.draw_networkx_edges(D, pos, **edge_spec)

plt.title("Democrat Communication network " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/large_Democrat_Communication_network.png")

plt.show()

In [None]:
plt.figure(figsize=(48, 30))

# Community id -> node colour; 160 marks Democrat redditors outside every lifestyle community.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}


# Same layout as the full network, so both pictures are directly comparable.
pos = dem_net_pos

# Node area grows with the node's share of all multigraph in-degree.
size = [10 + 1000*(dem_multi_in_degree_dict[node]/dem_multi_tot_indegree) for node in D1.nodes()]
color = [color_dict[communities[node]] for node in D1.nodes()]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .05, "edge_color": "black"}

nx.draw_networkx_nodes(D1, pos, **node_spec)

nx.draw_networkx_edges(D1, pos, **edge_spec)

# A space separates "zoomed in" from the year (it used to read e.g. "zoomed in2018").
plt.title("Democrat Communication network zoomed in " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/large_Democrat_zoomed_Communication_network.png")

plt.show()

In [None]:
plt.figure(figsize=(160, 100))

# Community id -> node colour; 160 marks Democrat redditors outside every lifestyle community.
color_dict = {
    0: "lime", 1: "cyan", 2: "magenta", 3: "mediumpurple", 4: "olive",
    5: "yellow", 6: "plum", 7: "khaki", 8: "salmon", 9: "lightsteelblue",
    10: "saddlebrown", 11: "tan", 12: "black", 13: "darkgreen", 14: "lightgreen",
    15: "sienna", 16: "teal", 17: "forestgreen", 18: "rosybrown", 19: "rebeccapurple",
    20: "lavender", 21: "chocolate", 22: "slategrey", 23: "green", 24: "wheat",
    25: "aquamarine", 26: "pink", 27: "deepskyblue", 28: "peru", 29: "indigo",
    30: "deeppink", 31: "limegreen", 32: "yellowgreen", 33: "tomato",
    120: "red", 160: "blue",
}

pos = dem_net_pos

# Node area grows with the node's share of all multigraph in-degree.
size = [
    10 + 1000*(dem_multi_in_degree_dict[redditor]/dem_multi_tot_indegree)
    for redditor in D.nodes()
]
color = [color_dict[communities[redditor]] for redditor in D.nodes()]

node_spec = dict(node_color=color, node_size=size)
edge_spec = dict(width=.5, alpha=.05, edge_color="black")

nx.draw_networkx_nodes(D, pos, **node_spec)
nx.draw_networkx_edges(D, pos, **edge_spec)

plt.title("Democrat Communication network " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/very_large_Democrat_Communication_network.png")

plt.show()

In [None]:
plt.figure(figsize=(160, 100))

# Community id -> node colour; 160 marks Democrat redditors outside every lifestyle community.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}


# Same layout as the full network, so both pictures are directly comparable.
pos = dem_net_pos

# Node area grows with the node's share of all multigraph in-degree.
size = [10 + 1000*(dem_multi_in_degree_dict[node]/dem_multi_tot_indegree) for node in D1.nodes()]
color = [color_dict[communities[node]] for node in D1.nodes()]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .05, "edge_color": "black"}

nx.draw_networkx_nodes(D1, pos, **node_spec)

nx.draw_networkx_edges(D1, pos, **edge_spec)

# A space separates "zoomed in" from the year (it used to read e.g. "zoomed in2018").
plt.title("Democrat Communication network zoomed in " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/very_large_Democrat_zoomed_Communication_network.png")

plt.show()

### Understanding the communication structure on an individual level

In [None]:
# Total degree (in + out) of every node in the directed graph.
D.degree()

In [None]:
# In-degree of every node in the directed graph.
D.in_degree()

In [None]:
# Distribution of in-degrees in D, one histogram bin per possible value.
dem_in_degrees = [in_degree for _, in_degree in D.in_degree()]

hist_style = dict(color="blue", edgecolor="blue")
plt.hist(dem_in_degrees, bins=max(dem_in_degrees), **hist_style)
plt.xlabel("Indegree")
plt.ylabel("# Redditors")
plt.title("Democrat Indegree values " + this_specific_year)
plt.savefig(save_plots + this_specific_year + "/democrat_indegrees.png")
plt.show()

In [None]:
# Distribution of the multigraph in-degree (every single incoming edge counts).
dem_responses_to = dem_node_df["multi_in_degree"].to_list()

hist_style = dict(color="blue", edgecolor="blue")
plt.hist(dem_responses_to, bins=max(dem_responses_to), **hist_style)
plt.xlabel("Generated replies")
plt.ylabel("# Redditors")
plt.title("Democrat replies received " + this_specific_year)
plt.savefig(save_plots + this_specific_year + "/democrat_replies.png")
plt.show()

In [None]:
# Count how many of the most-replied-to authors are needed to exceed 80% of all replies.
dem_sorted_com_received_list = sorted(dem_responses_to, reverse=True)
dem_eighty_percent_comms = 0.8*sum(dem_responses_to)
dem_eighty_comm_received_sum = 0
dem_eighty_author_count = 0
# Iterating over the list (instead of indexing inside a `while`) cannot run past
# its end: with no replies at all the threshold is 0 and is never exceeded, which
# used to raise an IndexError. In every other case the result is unchanged.
for received in dem_sorted_com_received_list:
    if dem_eighty_comm_received_sum > dem_eighty_percent_comms:
        break
    dem_eighty_comm_received_sum += received
    dem_eighty_author_count += 1

In [None]:
# Share of all nodes in D that the authors counted above represent.
print(f"Democrats: {100*(dem_eighty_author_count/len(D.nodes()))} percent of authors triggered 80% of the responses")

In [None]:
# Sum of all in-degrees in D.
dem_tot_indegree

In [None]:
# 25 redditors with the highest degree.
dem_node_df.sort_values(by= "degree", ascending=False).head(25)

In [None]:
# 25 redditors with the highest in-degree.
dem_node_df.sort_values(by= "in_degree", ascending=False).head(25)

In [None]:
# 25 redditors with the highest multigraph degree.
dem_node_df.sort_values(by= "multi_degree", ascending=False).head(25)

In [None]:
# 25 redditors with the highest multigraph in-degree.
dem_node_df.sort_values(by= "multi_in_degree", ascending=False).head(25)

In [None]:
# 25 redditors with the highest betweenness centrality.
dem_node_df.sort_values(by= "betweenness", ascending=False).head(25)

In [None]:
# 25 redditors with the highest SMI (norm of in-degree and betweenness, see create_smi).
dem_node_df.sort_values(by= "SMI", ascending=False).head(25)

In [None]:
# 25 redditors with the most postings.
dem_node_df.sort_values(by= "nr_postings", ascending=False).head(25)

In [None]:
# 25 redditors with the highest in-degree per posting.
dem_node_df.sort_values(by= "centrality_ratio", ascending=False).head(25)

In [None]:
# 25 redditors with the highest eigenvector centrality.
dem_node_df.sort_values(by= "Eigenvector", ascending=False).head(25)

In [None]:
# 25 redditors with the highest PageRank.
dem_node_df.sort_values(by= "Pagerank", ascending=False).head(25)

In [None]:
# 25 redditors with the highest closeness centrality.
dem_node_df.sort_values(by= "Closeness", ascending=False).head(25)

### Community specificities:

#### New authors

Note: the variables needed here (`new_dems` and `eight_year_new_dems`) are not loaded from a file; they have to be created in the first code section.

In [None]:
# Group the Democrat authors that are new since the previous election cycle by community.
if len(new_dems) >0:
    new_dem_com_dict = {}
    for i in new_dems:
        # Only authors that have a community and are scored this year are counted.
        if i in communities and i in this_years_scored_dem_redditors:
            new_dem_com_dict.setdefault(communities[i], []).append(i)

    for key,value in new_dem_com_dict.items():
        print(f"Community {key} has {len(value)} Democrat members that are new since {int(this_specific_year) - 4}")

    # Jupyter only auto-displays the last *top-level* expression of a cell; inside
    # this `if` block the table has to be shown explicitly.
    display(dem_node_df[dem_node_df["redditor"].isin(new_dems)].sort_values(by= "in_degree", ascending=False).head(25))

In [None]:
# Group the Democrat authors that are new compared to eight years ago by community.
if len(eight_year_new_dems) > 0:
    ey_new_dem_com_dict = {}
    for i in eight_year_new_dems:
        # Only authors that have a community and are scored this year are counted.
        if i in communities and i in this_years_scored_dem_redditors:
            ey_new_dem_com_dict.setdefault(communities[i], []).append(i)

    for key,value in ey_new_dem_com_dict.items():
        print(f"Community {key} has {len(value)} Democrat members that are new since {int(this_specific_year) - 8}")

    # Jupyter only auto-displays the last *top-level* expression of a cell; inside
    # this `if` block the table has to be shown explicitly.
    display(dem_node_df[dem_node_df["redditor"].isin(eight_year_new_dems)].sort_values(by= "in_degree", ascending=False).head(25))

#### Authors and communication

In [None]:
# Result tables: one column per community, filled inside the loop below.
media_per_dem_communities = pd.DataFrame()
topics_per_dem_communities = pd.DataFrame()
top_8_topics_per_dem_communities = pd.DataFrame()
dem_in_degree_in_communities = pd.DataFrame()
dem_responses_in_communities = pd.DataFrame()
# community id -> scored Democrat authors with at least one submission or comment
dems_in_communities = {}

for key, value in who_in_communities.items():
    # --- Comments and submissions written by this community's scored Democrat members ---
    community_com_df = this_years_dem_comms[this_years_dem_comms["author"].isin(value)]

    community_com_df = community_com_df[community_com_df["author"].isin(this_years_scored_dem_redditors)]

    # A comment's "domain" value may hold several comma-separated domains;
    # each domain is counted once per comment. Non-string values are skipped.
    community_domain_unpacked = community_com_df["domain"].to_list()
    comunity_com_domains = []
    for i in community_domain_unpacked:
        if isinstance(i, str):
            if "," in i:
                multis = i.split(",")
                multis = list(set(multis))
                comunity_com_domains.extend(multis)
            else:
                comunity_com_domains.append(i)
                
    community_sub_df = this_years_dem_subs[this_years_dem_subs["author"].isin(value)]

    community_sub_df = community_sub_df[community_sub_df["author"].isin(this_years_scored_dem_redditors)]

    # Same unpacking for the domains found in submission self-texts.
    community_selftext_domains = []
    com_seltex_dom_unpa = community_sub_df["selftext_domains"].to_list()
    for i in com_seltex_dom_unpa:
        if isinstance(i, str):
            if "," in i:
                multis = i.split(",")
                multis = list(set(multis))
                community_selftext_domains.extend(multis)
            else:
                community_selftext_domains.append(i)

    # --- Top 25 linked domains (submission domain + self-text domains + comment domains) ---
    agg_community_domains = community_sub_df["domain"].to_list() + community_selftext_domains + comunity_com_domains
    agg_community_df = pd.DataFrame(agg_community_domains, columns=["domain"])
    comunity_domains = agg_community_df.value_counts()[:25].index.tolist()
    community_domain_freq = agg_community_df.value_counts()[:25].tolist()

    
    # Every column must have exactly 25 rows, so shorter lists are padded with "-".
    domain_list_for_comunity_df = []
    for i in range(len(comunity_domains)):
        domain_list_for_comunity_df.append((comunity_domains[i], community_domain_freq[i]))
    community_length_checker = 25 - len(comunity_domains)
    if community_length_checker > 0:
        for i in range(community_length_checker):
            domain_list_for_comunity_df.append("-")
    media_per_dem_communities["Community " + str(key) + " Total domain links: " + str(len(agg_community_domains))] = domain_list_for_comunity_df


    # --- Top 25 topics over submissions and comments ---
    agg_community_topics = community_sub_df["topic"].to_list() + community_com_df["topic"].to_list()
    agg_topic_df = pd.DataFrame(agg_community_topics, columns=["topic"])
    comunity_topics = agg_topic_df.value_counts()[:25].index.tolist()
    community_topic_freq = agg_topic_df.value_counts()[:25].tolist()

    topic_list_for_topic_df = []
    for i in range(len(comunity_topics)):
        topic_list_for_topic_df.append((comunity_topics[i], community_topic_freq[i]))
    community_length_checker = 25 - len(comunity_topics)
    if community_length_checker > 0:
        for i in range(community_length_checker):
            topic_list_for_topic_df.append("-")
    topics_per_dem_communities["Community " + str(key) + " Total (topical) posts: " + str(len(agg_community_topics))] = topic_list_for_topic_df

    # --- Ranking restricted to topics 0-7, padded to 8 rows ---
    top_8_df = agg_topic_df[agg_topic_df["topic"].isin(range(0,8))]
    community_top_8_tops = top_8_df.value_counts().index.tolist()
    community_top_8_tops_freq = top_8_df.value_counts().tolist()

    topic_list_for_top8_topic_df = []
    for i in range(len(community_top_8_tops)):
        topic_list_for_top8_topic_df.append((community_top_8_tops[i], community_top_8_tops_freq[i]))
    community_top_length_checker = 8 - len(community_top_8_tops)
    if community_top_length_checker > 0:
        for i in range(community_top_length_checker):
            topic_list_for_top8_topic_df.append("-")
    top_8_topics_per_dem_communities["Community " + str(key)] = topic_list_for_top8_topic_df


    # --- Intra-community networks: copies of D and L reduced to this community's members ---
    C = D.copy()
    Z = L.copy()
    out_of_community = set(D.nodes()) - set(value)
    for i in out_of_community:
        C.remove_node(i)
        Z.remove_node(i)
    
    # Top 25 members by in-degree inside the community; each entry is
    # (redditor, in-degree, share of the community's total in-degree).
    community_in_degree = dict(C.in_degree())
    tot_community_indegree = sum(community_in_degree.values())
    community_in_degree = sorted(community_in_degree.items(), key=lambda x:x[1], reverse=True)
    twentifive_opinion_leaders = []
    if len(community_in_degree) >= 25:
        for i in community_in_degree[:25]:
            # A total of 0 would make the share undefined; it is reported as 0 instead.
            if tot_community_indegree == 0:
                community_ol = (i[0], i[1], 0)
                twentifive_opinion_leaders.append(community_ol)
            else:
                community_ol = (i[0], i[1], i[1]/tot_community_indegree)
                twentifive_opinion_leaders.append(community_ol)
    else:
        for i in community_in_degree:
            if tot_community_indegree == 0:
                community_ol = (i[0], i[1], 0)
                twentifive_opinion_leaders.append(community_ol)
            else:
                community_ol = (i[0], i[1], i[1]/tot_community_indegree)
                twentifive_opinion_leaders.append(community_ol)
        # Fewer than 25 members: pad the column with "-".
        community_length_checker = 25 - len(community_in_degree)
        if community_length_checker > 0:
            for i in range(community_length_checker):
                twentifive_opinion_leaders.append("-")

    dem_in_degree_in_communities["Community " + str(key) + " Total intra-community Indegree: " + str(tot_community_indegree)] = twentifive_opinion_leaders
    
    # Same ranking on the multigraph copy Z, where parallel edges are counted individually.
    community_response = dict(Z.in_degree())
    tot_community_response = sum(community_response.values())
    community_response = sorted(community_response.items(), key=lambda x:x[1], reverse=True)
    twentifive_response_generators = []
    if len(community_response) >= 25:
        for i in community_response[:25]:
            if tot_community_response == 0:
                community_ol = (i[0], i[1], 0)
                twentifive_response_generators.append(community_ol)
            else:
                community_ol = (i[0], i[1], i[1]/tot_community_response)
                twentifive_response_generators.append(community_ol)
    else:
        for i in community_response:
            if tot_community_response == 0:
                community_ol = (i[0], i[1], 0)
                twentifive_response_generators.append(community_ol)
            else:
                community_ol = (i[0], i[1], i[1]/tot_community_response)
                twentifive_response_generators.append(community_ol)
        community_length_checker = 25 - len(community_response)
        if community_length_checker > 0:
            for i in range(community_length_checker):
                twentifive_response_generators.append("-")

    dem_responses_in_communities["Community " + str(key) + " Total intra-community responses: " + str(tot_community_response)] = twentifive_response_generators

    # Members are the distinct authors of the filtered submissions and comments.
    print(f"Community {key} has {len(set(community_sub_df['author'].tolist() + community_com_df['author'].tolist()))} democrat members and created {len(community_sub_df)} submissions and {len(community_com_df)} comments")
    dems_in_communities[key] = list(set(community_sub_df['author'].tolist() + community_com_df['author'].tolist()))

print(f"{len(only_dem_authors)} democrat authors considered are not part of a lifestyle community")    

# Persist the per-community tables.
# NOTE(review): top_8_topics_per_dem_communities is not written to disk here — confirm this is intended.
media_per_dem_communities.to_csv(save_csvs + this_specific_year + "/media_per_dem_communities.csv")
dem_in_degree_in_communities.to_csv(save_csvs + this_specific_year + "/indegree_dem_communities.csv")
dem_responses_in_communities.to_csv(save_csvs + this_specific_year + "/responses_dem_communities.csv")
topics_per_dem_communities.to_csv(save_csvs + this_specific_year + "/topics_per_dem_communities.csv")

In [None]:
# Top 25 linked domains per community, as (domain, count) entries.
media_per_dem_communities

In [None]:
# Top 25 members per community by intra-community in-degree.
dem_in_degree_in_communities

In [None]:
# Top 25 members per community by intra-community multigraph in-degree.
dem_responses_in_communities

In [None]:
# Top 25 topics per community, as (topic, count) entries.
topics_per_dem_communities

In [None]:
# Ranking of topics 0-7 per community.
top_8_topics_per_dem_communities

### Visualizations of communication patterns

In [None]:
# Suffix the first redditor of every tuple with "_01" so that the two ends of an
# edge become different nodes, even when the same redditor appears on both sides
# of the network.
dem_changed_straight_list = [
    (entry[0] + "_01", entry[1], entry[2]) for entry in dem_topic_tuple_list
]

In [None]:
# O: multigraph that receives every topic-labelled edge.
O = nx.MultiDiGraph()
# O1: multigraph that receives only the edges whose topic is in 0-7.
O1 = nx.MultiDiGraph()

In [None]:
# Every edge goes into O; edges about one of the topics 0-7 additionally go into O1.
for edge in dem_changed_straight_list:
    first_node, second_node, edge_topic = edge[0], edge[1], edge[2]
    O.add_edge(first_node, second_node, topic = edge_topic)
    if 0 <= edge_topic < 8:
        O1.add_edge(first_node, second_node, topic=edge_topic)

In [None]:
# (u, v, key) -> topic, for the full graph and for the topic-0-7 graph.
dem_edge_top_dict = nx.get_edge_attributes(O, "topic")
dem_reduced_edge_topic_dict = nx.get_edge_attributes(O1, "topic")

In [None]:
# Communities ordered by number of Democrat members, largest first ...
communities_by_size = sorted(
    dems_in_communities.items(), key=lambda item: len(item[1]), reverse=True
)
ordered_dem_communities_dict = OrderedDict(communities_by_size)
# ... except community 160, which is always moved to the front.
ordered_dem_communities_dict.move_to_end(160, last=False)
ordered_dem_communities = ordered_dem_communities_dict.keys()

In [None]:
# Community ids in plotting order.
ordered_dem_communities

In [None]:
# Two-column layout: the "_01" copy of a redditor sits at x = -1, the plain node
# at x = 1, both at the same height. Redditors are stacked top-down, community by
# community.
dem_topic_pos = {}
distance_factor = 2/(len(O.nodes())+5)
top_vertical_coordinate = 1
for comune in ordered_dem_communities:
    if comune == 160:
        # Extra vertical gap of five rows before community 160.
        top_vertical_coordinate -= distance_factor*5
    for redditor in dems_in_communities[comune]:
        mirrored_redditor = redditor + "_01"
        # A redditor gets a row as soon as either of its two copies is in O.
        if redditor in O.nodes() or mirrored_redditor in O.nodes():
            dem_topic_pos[redditor] = np.array([1, top_vertical_coordinate])
            dem_topic_pos[mirrored_redditor] = np.array([-1, top_vertical_coordinate])
            top_vertical_coordinate -= distance_factor

In [None]:
# Community lookup that also knows the "_01" copies of every redditor.
extended_communities = {}
for redditor, community in communities.items():
    extended_communities.update({redditor: community, redditor + "_01": community})

In [None]:
# Multigraph in-degree lookup that also knows the "_01" copies of every redditor.
dem_extended_multi_indegree_dict = {}
for redditor, in_degree in dem_multi_in_degree_dict.items():
    dem_extended_multi_indegree_dict.update(
        {redditor: in_degree, redditor + "_01": in_degree}
    )

In [None]:
# Nodes without a position cannot be drawn: list them, then drop them from both graphs.
dem_extended_nodes_gone = [node for node in O.nodes() if node not in dem_topic_pos]
for node in dem_extended_nodes_gone:
    print(node)

for node in dem_extended_nodes_gone:
    O.remove_node(node)
    if node in O1.nodes():
        O1.remove_node(node)

In [None]:
# Every positioned node must exist in the graphs, even if it has no edges.
for node in [name for name in dem_topic_pos if name not in O.nodes()]:
    O.add_node(node)
    O1.add_node(node)

In [None]:
# Number of edges left in the two-column graph.
len(O.edges())

In [None]:
plt.figure(figsize=(16, 10))

# Community id -> node colour; 160 marks Democrat redditors outside every lifestyle community.
color_dict = {
    0: "lime", 1: "cyan", 2: "magenta", 3: "mediumpurple", 4: "olive",
    5: "yellow", 6: "plum", 7: "khaki", 8: "salmon", 9: "lightsteelblue",
    10: "saddlebrown", 11: "tan", 12: "black", 13: "darkgreen", 14: "lightgreen",
    15: "sienna", 16: "teal", 17: "forestgreen", 18: "rosybrown", 19: "rebeccapurple",
    20: "lavender", 21: "chocolate", 22: "slategrey", 23: "green", 24: "wheat",
    25: "aquamarine", 26: "pink", 27: "deepskyblue", 28: "peru", 29: "indigo",
    30: "deeppink", 31: "limegreen", 32: "yellowgreen", 33: "tomato",
    120: "red", 160: "blue",
}
# Topic id -> edge colour; topic -1 and every topic from 10 upwards are grey.
colors_dict = {
    -1: "grey", 0: "mintcream", 1: "honeydew", 2: "aquamarine", 3: "pink",
    4: "deepskyblue", 5: "peru", 6: "indigo", 7: "deeppink",
    8: "limegreen", 9: "yellowgreen",
}
for topic_id in range(10, len(dem_model.get_topic_info())):
    colors_dict[topic_id] = "grey"

# Node area grows with the node's share of all multigraph in-degree.
size = [
    10 + 1000*(dem_extended_multi_indegree_dict[redditor]/dem_multi_tot_indegree)
    for redditor in O.nodes()
]
color = [color_dict[extended_communities[redditor]] for redditor in O.nodes()]
colors = [colors_dict[dem_edge_top_dict[edge_key]] for edge_key in O.edges(keys=True)]

node_spec = dict(node_color=color, node_size=size)
edge_spec = dict(width=.5, alpha=.1, edge_color=colors)

nx.draw_networkx_nodes(O, dem_topic_pos, **node_spec)
nx.draw_networkx_edges(O, dem_topic_pos, **edge_spec)

plt.title("Democrat Comunities Communication " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Democrat_Communities_communication.png")

plt.show()

In [None]:
plt.figure(figsize=(48, 30))

# Community id -> node colour; 160 marks Democrat redditors outside every lifestyle community.
color_dict = {
    0: "lime", 1: "cyan", 2: "magenta", 3: "mediumpurple", 4: "olive",
    5: "yellow", 6: "plum", 7: "khaki", 8: "salmon", 9: "lightsteelblue",
    10: "saddlebrown", 11: "tan", 12: "black", 13: "darkgreen", 14: "lightgreen",
    15: "sienna", 16: "teal", 17: "forestgreen", 18: "rosybrown", 19: "rebeccapurple",
    20: "lavender", 21: "chocolate", 22: "slategrey", 23: "green", 24: "wheat",
    25: "aquamarine", 26: "pink", 27: "deepskyblue", 28: "peru", 29: "indigo",
    30: "deeppink", 31: "limegreen", 32: "yellowgreen", 33: "tomato",
    120: "red", 160: "blue",
}
# Topic id -> edge colour; topic -1 and every topic from 10 upwards are grey.
colors_dict = {
    -1: "grey", 0: "mintcream", 1: "honeydew", 2: "aquamarine", 3: "pink",
    4: "deepskyblue", 5: "peru", 6: "indigo", 7: "deeppink",
    8: "limegreen", 9: "yellowgreen",
}
for topic_id in range(10, len(dem_model.get_topic_info())):
    colors_dict[topic_id] = "grey"

# Node area grows with the node's share of all multigraph in-degree.
size = [
    10 + 1000*(dem_extended_multi_indegree_dict[redditor]/dem_multi_tot_indegree)
    for redditor in O.nodes()
]
color = [color_dict[extended_communities[redditor]] for redditor in O.nodes()]
colors = [colors_dict[dem_edge_top_dict[edge_key]] for edge_key in O.edges(keys=True)]

node_spec = dict(node_color=color, node_size=size)
edge_spec = dict(width=.5, alpha=.2, edge_color=colors)

nx.draw_networkx_nodes(O, dem_topic_pos, **node_spec)
nx.draw_networkx_edges(O, dem_topic_pos, **edge_spec)

plt.title("Democrat Comunities Communication " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/large_Democrat_Communities_communication.png")

plt.show()

In [None]:
plt.figure(figsize=(160, 100))

# Community id -> node colour; 160 marks Democrat redditors outside every lifestyle community.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; only topics 0-7 get a colour of their own, all others are grey.
dem_colors_dict = {-1: "grey", 0:"magenta", 1:"yellow", 
              2:"darkorange", 3: "blue", 4:"tan", 5:"sienna", 6:"rebeccapurple", 7:"lime",
              8:"grey", 9:"grey"}

for i in range(10, len(dem_model.get_topic_info())):
    dem_colors_dict[i] = "grey"



# Larger base size and scaling than in the smaller figures, to stay visible at 160x100 inches.
size = [100 + 50000*(dem_extended_multi_indegree_dict[node]/dem_multi_tot_indegree) for node in O.nodes()]
color = [color_dict[extended_communities[node]] for node in O.nodes()]
colors = [dem_colors_dict[dem_edge_top_dict[edge]] for edge in O.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .2, "edge_color": colors}

nx.draw_networkx_nodes(O, dem_topic_pos, **node_spec)

nx.draw_networkx_edges(O, dem_topic_pos, **edge_spec)

plt.title("Democrat Comunities Communication " + this_specific_year, fontsize=10)
# Saved under its own "very_large_" name (as for the network plots); the previous
# file name was identical to the one of the 48x30 figure and overwrote that plot.
plt.savefig(save_plots + this_specific_year + "/very_large_Democrat_Communities_communication.png")

plt.show()

In [None]:
# Community id -> colour, re-declared here so that the Sankey cells below can be
# run without the plotting cells above. 160 marks Democrat redditors outside
# every lifestyle community.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}

In [None]:
# Translate every edge of O into the pair (community of first node, community of second node).
dem_auth_ad_tuple_list = []
dem_comun_count_list = []

for edge in O.edges:
    author_community = extended_communities[edge[0]]
    adressee_community = extended_communities[edge[1]]
    dem_auth_ad_tuple_list.append((str(author_community), str(adressee_community)))
    dem_comun_count_list.extend([author_community, adressee_community])

# Count identical community pairs via their "a, b" string encoding.
dem_auth_ad_for_df = [", ".join(pair) for pair in dem_auth_ad_tuple_list]

dem_count_auth_ad_df = pd.DataFrame(dem_auth_ad_for_df)

# Number of distinct communities that occur on any edge; target node ids are
# shifted by this amount so that they do not collide with the source ids.
nr_of_communities = len(set(dem_comun_count_list))

dem_auth_ad_weight_tup_list = []
for pair_label, pair_count in dem_count_auth_ad_df[0].value_counts().items():
    author_label, adressee_label = pair_label.split(", ")
    dem_auth_ad_weight_tup_list.append(
        (
            int(author_label),
            int(adressee_label) + nr_of_communities,
            int(pair_count),
            int(adressee_label),
        )
    )

dem_sank_prep_df = pd.DataFrame(
    dem_auth_ad_weight_tup_list, columns=["author", "adressee", "weight", "sorter"]
)
dem_sank_prep_df["color"] = dem_sank_prep_df["author"].map(color_dict)

dem_sank_prep_df = dem_sank_prep_df.sort_values(by=["author","sorter"], ascending=False)

# Community 160 is renumbered to the last slot on each side (n-1 on the left, 2n-1 on the right).
dem_sank_prep_df = dem_sank_prep_df.replace(
    {
        "author": {160: nr_of_communities-1},
        "adressee": {160 + nr_of_communities: 2*nr_of_communities-1},
    }
)

In [None]:
# One row per (author community, addressee community) pair with its edge count.
dem_sank_prep_df

In [None]:
# Link definitions for the Sankey diagram, taken row by row from the prepared table.
source = dem_sank_prep_df["author"].to_list()
target = dem_sank_prep_df["adressee"].to_list()
value = dem_sank_prep_df["weight"].to_list()
color = dem_sank_prep_df["color"].to_list()
# Convert the named colours to "rgba(r, g, b, a)" strings with a fixed alpha of 0.6.
colors = [matplotlib.colors.to_rgba(i) for i in color]
colors = ["rgba"+str((i[0],i[1],i[2],0.6)) for i in colors]
# Node colours: color_dict[0 .. n-2] plus "blue" for the last node, repeated for the second side.
# NOTE(review): this assumes the source ids are exactly 0 .. n-1 — confirm.
node_colors = [color_dict[i] for i in range(len(set(source))-1)]
node_colors.append("blue")
node_colors = node_colors*2
total_height = sum(dem_sank_prep_df["weight"])


left_y = 0.001
right_y = 0.001

# Cumulative vertical offsets of the nodes on each side.
# They are only referenced by the commented-out x/y arguments of `node` below.
left_y_list = []
for i in range(len(set(source))):
    left_y_list.append(left_y)
    left_y += (sum(dem_sank_prep_df[dem_sank_prep_df["author"]==i]["weight"])/total_height)#*(0.999-0.001)

right_y_list = []
for i in range(len(set(target))):
    right_y_list.append(right_y)
    right_y += (sum(dem_sank_prep_df[dem_sank_prep_df["adressee"]==i+len(set(source))]["weight"])/total_height)#*(0.999-0.001)


# Node labels: "0" .. "n-2" followed by 160, used for both sides.
left_labels = [str(i) for i in range(len(set(source))-1)]
left_labels.append(160)
right_labels = left_labels
labels = right_labels + left_labels


link = dict(arrowlen=15, source=source, target=target, value=value, color=colors)
node = dict(label = labels, pad=0, thickness=30, color=node_colors)#, x = [0.001]*len(set(source))+[0.9999]*len(set(target)), y = left_y_list+right_y_list)

data = go.Sankey(link=link, node=node, arrangement="snap")

fig = go.Figure(data)

fig.update_layout(hovermode="x", autosize=False, width=1600, height=1000)

fig.show()


# NOTE(review): static image export usually needs an extra plotly export backend (e.g. kaleido) — confirm it is installed.
fig.write_image(save_plots + this_specific_year + "/Democrat_Sankey.png")

In [None]:
# Democrat network with nodes coloured by community and edges coloured by
# topic, drawn at 16x10 inches and saved to disk.
plt.figure(figsize=(16, 10))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; topic -1 is grey, topics 0-9 get their own colour.
colors_dict = {-1: "grey", 0:"mintcream", 1:"honeydew", 
              2:"aquamarine", 3: "pink", 4:"deepskyblue", 5:"peru", 6:"indigo", 7:"red",
              8:"limegreen", 9:"yellowgreen"}

# Every topic id from 10 up to the number of rows in the topic-info table is grey as well.
for i in range(10, len(dem_model.get_topic_info())):
    colors_dict[i] = "grey"



# Node area: 10 plus 1000 times the node's share of dem_multi_tot_indegree.
size = [10 + 1000*(dem_extended_multi_indegree_dict[node]/dem_multi_tot_indegree) for node in O.nodes()]
color = [color_dict[extended_communities[node]] for node in O.nodes()]
# One colour per (u, v, key) edge of O1, looked up via the edge's topic.
colors = [colors_dict[dem_reduced_edge_topic_dict[edge]] for edge in O1.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .5, "edge_color": colors}

# Nodes are taken from O, edges from O1; both use the same layout dem_topic_pos.
nx.draw_networkx_nodes(O, dem_topic_pos, **node_spec)

nx.draw_networkx_edges(O1, dem_topic_pos, **edge_spec)

# NOTE(review): the title string has no space before the year, so it renders as "...topics<YEAR>".
plt.title("Democrat Comunities Communication Top 10 topics" + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Democrat_Communities_top_communication.png")

plt.show()

In [None]:
# Same topic-coloured Democrat network as the 16x10 figure, drawn at 48x30 inches.
plt.figure(figsize=(48, 30))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; topic -1 is grey, topics 0-9 get their own colour.
colors_dict = {-1: "grey", 0:"mintcream", 1:"honeydew", 
              2:"aquamarine", 3: "pink", 4:"deepskyblue", 5:"peru", 6:"indigo", 7:"red",
              8:"limegreen", 9:"yellowgreen"}

# Every topic id from 10 up to the number of rows in the topic-info table is grey as well.
for i in range(10, len(dem_model.get_topic_info())):
    colors_dict[i] = "grey"



# Node area: 10 plus 1000 times the node's share of dem_multi_tot_indegree.
size = [10 + 1000*(dem_extended_multi_indegree_dict[node]/dem_multi_tot_indegree) for node in O.nodes()]
color = [color_dict[extended_communities[node]] for node in O.nodes()]
# One colour per (u, v, key) edge of O1, looked up via the edge's topic.
colors = [colors_dict[dem_reduced_edge_topic_dict[edge]] for edge in O1.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .5, "edge_color": colors}

# Nodes are taken from O, edges from O1; both use the same layout dem_topic_pos.
nx.draw_networkx_nodes(O, dem_topic_pos, **node_spec)

nx.draw_networkx_edges(O1, dem_topic_pos, **edge_spec)

plt.title("Democrat Comunities Communication Top 10 " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Democrat_Communities_top_communication_large.png")

plt.show()

In [None]:
# Topic-coloured Democrat network drawn at 160x100 inches, with its own
# topic palette (dem_colors_dict) and larger node sizes than the smaller
# figures.
plt.figure(figsize=(160, 100))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; only topics 0-7 get a distinct colour here,
# topics -1, 8 and 9 are grey.
dem_colors_dict = {-1: "grey", 0:"magenta", 1:"yellow", 
              2:"darkorange", 3: "blue", 4:"tan", 5:"sienna", 6:"rebeccapurple", 7:"lime",
              8:"grey", 9:"grey"}

# Every topic id from 10 up to the number of rows in the topic-info table is grey as well.
for i in range(10, len(dem_model.get_topic_info())):
    dem_colors_dict[i] = "grey"



# Node area: 100 plus 50000 times the node's share of dem_multi_tot_indegree.
size = [100 + 50000*(dem_extended_multi_indegree_dict[node]/dem_multi_tot_indegree) for node in O.nodes()]
color = [color_dict[extended_communities[node]] for node in O.nodes()]
# One colour per (u, v, key) edge of O1, looked up via the edge's topic.
colors = [dem_colors_dict[dem_reduced_edge_topic_dict[edge]] for edge in O1.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .5, "edge_color": colors}

# Nodes are taken from O, edges from O1; both use the same layout dem_topic_pos.
nx.draw_networkx_nodes(O, dem_topic_pos, **node_spec)

nx.draw_networkx_edges(O1, dem_topic_pos, **edge_spec)

plt.title("Democrat Comunities Communication Top 10 " + this_specific_year, fontsize=10)
# Saved under its own "very_large" name: this cell previously reused the file
# name of the 48x30 figure and overwrote it.
plt.savefig(save_plots + this_specific_year + "/Democrat_Communities_top_communication_very_large.png")

plt.show()

In [None]:
# Edge -> value of its "weight" attribute, for every edge of D that has one.
dem_community_edge_weight_dict = nx.get_edge_attributes(D, "weight")

In [None]:
# One entry per community in dems_in_communities, each starting with a running "Total" of 0.
dem_community_communication_dict = {
    "Community " + str(comnty): {"Total": 0}
    for comnty in dems_in_communities.keys()
}

# Add every edge weight to the entry of the community of the edge's second
# node, filed under the community of the edge's first node.
for edge, weight in dem_community_edge_weight_dict.items():
    receiving_label = "Community " + str(communities[edge[1]])
    if receiving_label not in dem_community_communication_dict:
        # Second node's community is not tracked here: the edge is ignored.
        continue
    sending_label = "Community " + str(communities[edge[0]])
    received = dem_community_communication_dict[receiving_label]
    received[sending_label] = received.get(sending_label, 0) + weight
    received["Total"] += weight

In [None]:
# Order the outer dict by its "Community <id>" labels. The labels are strings,
# so the order is lexicographic ("Community 10" sorts before "Community 2").
dem_community_communication_dict = dict(sorted(dem_community_communication_dict.items()))

# Sort the inner dict of "Community 0" the same way, then give it an explicit 0
# for every label "Community 0" .. "Community n-2" it has no entry for
# (n = number of outer keys).
# NOTE(review): this assumes a "Community 0" entry exists and that community
# ids run contiguously from 0 - confirm for each year.
# NOTE(review): presumably "Community 0" is padded because it becomes the first
# column of the DataFrame built from this dict and so defines its rows - confirm.
dem_community_communication_dict["Community 0"] = dict(sorted(dem_community_communication_dict["Community 0"].items()))
for i in range(len(dem_community_communication_dict.keys())-1):
    if "Community " + str(i) not in dem_community_communication_dict["Community 0"].keys():
        dem_community_communication_dict["Community 0"]["Community " + str(i)] = 0

In [None]:
# Build the community-to-community reply table one column at a time: each outer
# "Community <id>" key becomes a column filled from its inner dict.
# NOTE(review): assigning a dict as a column relies on pandas aligning the dict
# keys with the frame's index (taken from the first column) - confirm with the
# pandas version in use.
dem_community_communication_df = pd.DataFrame()
for key in dem_community_communication_dict.keys():
    dem_community_communication_df[str(key)] = dem_community_communication_dict[key]
# Extra column holding the row-wise sum over all community columns.
dem_community_communication_df["Total"] = dem_community_communication_df.sum(axis=1)
dem_community_communication_df.to_csv(save_csvs + this_specific_year + "/dem_community_communication.csv")

These DataFrames quantify community to community responses:

In [None]:
# Reading aid for the table displayed below: a column is the community whose
# members were replied to, a row is the community the replies came from.
print("Columns are getting responses from rows")
dem_community_communication_df


In [None]:
# Skeleton: for every community in ordered_dem_communities a "Total" counter
# plus one empty topic counter per community id that occurs in `communities`.
dem_community_topic_exchange_dict = {}
for comnty in ordered_dem_communities:
    dem_community_topic_exchange_dict["Community " + str(comnty)] = {}
for comnty in ordered_dem_communities:
    skeleton = dem_community_topic_exchange_dict["Community " + str(comnty)]
    for value in set(communities.values()):
        skeleton["Total"] = 0
        skeleton["Community " + str(value)] = {}


# Tally each (author, adressee, topic) tuple under
# [adressee community][author community][topic].
for tup in dem_topic_tuple_list:
    if tup[0] not in communities:
        # Authors without a community are not counted.
        continue
    auth_com = communities[tup[0]]
    # Adressees without a community are filed under community 160.
    adresee_com = communities.get(tup[1], 160)
    topic_reacted_to = tup[2]

    adressee_entry = dem_community_topic_exchange_dict.get("Community " + str(adresee_com))
    if adressee_entry is None:
        continue
    topic_counts = adressee_entry.get("Community " + str(auth_com))
    if topic_counts is None:
        continue
    adressee_entry["Total"] += 1
    topic_counts[topic_reacted_to] = topic_counts.get(topic_reacted_to, 0) + 1


# Same structure, but every topic counter is ordered by descending count and
# the "Total" value is wrapped in a one-entry dict.
dem_well_ordered_community_communication = {
    key: {} for key in dem_community_topic_exchange_dict
}

for key, per_author in dem_community_topic_exchange_dict.items():
    for subkey, content in per_author.items():
        if subkey == "Total":
            new_order = {"Total": content}
        else:
            ranked = sorted(content.items(), key=lambda item: item[1], reverse=True)
            new_order = dict(ranked)
        dem_well_ordered_community_communication[key][subkey] = new_order

In [None]:
# Tabulate the nested dict (outer keys become columns, inner keys become rows;
# each cell holds a dict of topic counts) and export it as CSV.
dem_well_ordered_community_communication_df = pd.DataFrame(dem_well_ordered_community_communication)
dem_well_ordered_community_communication_df.to_csv(save_csvs + this_specific_year + "/dem_community_topic_communication.csv")

In [None]:
dem_well_ordered_community_communication_df

In [None]:
# Copy of the ordered community/topic counts that keeps only topic ids 0-7.
# The filter is on the topic id itself, not on how often the topic occurs, so
# the non-numeric "Total" entries end up as empty dicts.
dem_top_8_topic_communit_communic = {
    adressee_label: {
        author_label: {
            topic: count
            for topic, count in topic_counts.items()
            if topic in range(0, 8)
        }
        for author_label, topic_counts in per_author.items()
    }
    for adressee_label, per_author in dem_well_ordered_community_communication.items()
}

In [None]:
# Tabulate the topic-0-to-7 counts: outer keys become columns, inner keys become rows.
dem_top_8_top_exchange_df = pd.DataFrame(dem_top_8_topic_communit_communic)

In [None]:
dem_top_8_top_exchange_df

In [None]:
# Export the topic-0-to-7 exchange table to this year's CSV folder.
dem_top_8_top_exchange_df.to_csv(save_csvs + this_specific_year + "/dem_top8_community_topic_communication.csv")

### Reduction of communities in Sankey diagram:

Community numbers need to be specified

In [None]:
# Node ids (author side) to keep in the reduced diagram.
dem_specific_sank_communities = [0,1,2, 28] ## These numbers are the ones used for 2022
# Matching node ids on the adressee side: shifted by the number of distinct author ids.
n_dem_author_nodes = len(set(dem_sank_prep_df["author"].tolist()))
dem_specific_to_communities = [com + n_dem_author_nodes for com in dem_specific_sank_communities]

# Keep only links whose author AND adressee are among the selected nodes.
keep_author = dem_sank_prep_df["author"].isin(dem_specific_sank_communities)
keep_adressee = dem_sank_prep_df["adressee"].isin(dem_specific_to_communities)
dem_specific_sank_df = dem_sank_prep_df[keep_author & keep_adressee]


source = dem_specific_sank_df["author"].to_list()
target = dem_specific_sank_df["adressee"].to_list()
value = dem_specific_sank_df["weight"].to_list()
color = dem_specific_sank_df["color"].to_list()
# Link colours as "rgba(r, g, b, 0.6)" strings.
colors = [
    "rgba" + str(matplotlib.colors.to_rgba(named_color)[:3] + (0.6,))
    for named_color in color
]
# Node colours cover all nodes of the full table, not only the selected ones,
# because the node ids are unchanged; the last node id is "blue".
node_colors = ([color_dict[com] for com in range(n_dem_author_nodes - 1)] + ["blue"]) * 2


# `labels` is reused from the cell that draws the full Sankey diagram.
link = {"arrowlen": 15, "source": source, "target": target, "value": value, "color": colors}
node = {"label": labels, "pad": 0, "thickness": 30, "color": node_colors}

data = go.Sankey(link=link, node=node, arrangement="snap")


fig = go.Figure(data)
fig.update_layout(hovermode="x", autosize=False, width=1600, height=1000)
fig.show()



### Optional specific investigations

In [None]:
# interested_topic = [4]

# dem_specific_topic_communit_communic = {}

# for key,value in dem_well_ordered_community_communication.items():

#     dem_specific_topic_communit_communic[key]={}
#     if key == "Total":
#         dem_specific_topic_communit_communic["Total"] = 0
#     for subkey,subvalue in value.items():
#         dem_specific_topic_communit_communic[key][subkey]={}
#         for subsubkey,subsubvalue in subvalue.items():
#             if subsubkey in interested_topic:

#                 dem_specific_topic_communit_communic[key][subkey][subsubkey] = subsubvalue

# dem_specific_top_exchange_df = pd.DataFrame(dem_specific_topic_communit_communic)

# dem_specific_top_exchange_df

In [None]:
# dem_redditor_of_interest = ""


# print(f"{dem_redditor_of_interest} is part of community {communities[dem_redditor_of_interest]}")

# dem_roi_subs = this_years_dem_subs[this_years_dem_subs["author"]==dem_redditor_of_interest] 
# dem_roi_comms = this_years_dem_comms[this_years_dem_comms["author"]==dem_redditor_of_interest]
# all_dem_roi_post_tops = dem_roi_subs["topic"].append(dem_roi_comms["topic"]).value_counts()

# print("This redditors most posted topics were:")
# print(all_dem_roi_post_tops.head(10))

# dem_roi_react_redditors = []

# for edge in L.edges():
#     if edge[1] == dem_redditor_of_interest:
#         dem_roi_react_redditors.append(edge[0])

# dem_roi_comunity_reacts = [communities[i] for i in dem_roi_react_redditors]

# print("This redditor received reactions from these communities:")

# print(pd.DataFrame(dem_roi_comunity_reacts).value_counts())

# ind_dem_roi_react_reds = set(dem_roi_react_redditors)

# ind_dem_roi_comu_react = [communities[i] for i in ind_dem_roi_react_reds]

# print("On the individual redditor level, Redditors from these communities reacted:")

# print(pd.DataFrame(ind_dem_roi_comu_react).value_counts())


In [None]:
# topic_of_interest = 0


# dem_roi_top_react_reds  = []
# for i in dem_topic_tuple_list:
#     if i[1] == dem_redditor_of_interest:
#         if i[2] == topic_of_interest:
#             dem_roi_top_react_reds.append(i[0])

# dem_roi_topi_comunity_reacts = [communities[i] for i in dem_roi_top_react_reds]

# print(f"Posting about topic {topic_of_interest} triggered responses from these communities:")

# print(pd.DataFrame(dem_roi_topi_comunity_reacts).value_counts())

# ind_dem_roi_topi_react_reds = set(dem_roi_top_react_reds)

# ind_dem_roi_topi_comu_react = [communities[i] for i in ind_dem_roi_topi_react_reds]

# print("On the individual redditor level, Redditors from these communities reacted to this topic:")

# print(pd.DataFrame(ind_dem_roi_topi_comu_react).value_counts())

In [None]:
# community_of_interest = 0

# dem_roi_com_react_tops  = []
# for i in dem_topic_tuple_list:
#     if i[1] == dem_redditor_of_interest:
#         if communities[i[0]] == community_of_interest:
#             dem_roi_com_react_tops.append(i[2])

# print(f"Postings by {dem_redditor_of_interest} receiving reactions from community {community_of_interest} where about these topics:")

# print(pd.DataFrame(dem_roi_com_react_tops).value_counts())



### r/Republican

In [None]:
# All r/Republican postings of the year (submissions followed by comments),
# reduced to their author column.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
this_years_rep_posts = pd.concat([this_years_rep_subs[["author"]], this_years_rep_comms[["author"]]])
# Postings per author.
rep_authors_reducer = this_years_rep_posts["author"].value_counts()
# Authors with fewer postings than subreddit_engagement_threshold are excluded
# from the analysis, as is the "[deleted]" placeholder author.
rep_authors_gone = rep_authors_reducer[rep_authors_reducer < subreddit_engagement_threshold].index.tolist()
rep_authors_gone.append("[deleted]")

In [None]:
len(this_years_scored_rep_redditors)

In [None]:
# Sanity figures: distinct authors, size of the exclusion list, and distinct
# authors that remain after the exclusion.
rep_all_authors = this_years_rep_posts["author"]
print(rep_all_authors.nunique())
print(len(rep_authors_gone))
# Counted directly instead of as "nunique - len(rep_authors_gone)": the
# exclusion list always has "[deleted]" appended, which made the difference
# off by one whenever "[deleted]" was already listed or did not occur at all.
print(rep_all_authors[~rep_all_authors.isin(rep_authors_gone)].nunique())

In [None]:
# Lookup tables from a posting's id to its author, one for submissions and
# one for comments. If an id occurs more than once, the last row wins.
rep_sub_author_dict = dict(
    zip(this_years_rep_subs["id"], this_years_rep_subs["author"])
)

rep_comm_author_dict = dict(
    zip(this_years_rep_comms["id"], this_years_rep_comms["author"])
)

In [None]:
# Order the comments by their "created" value so that the tuple-building loop
# in the next cell sees them in that order and keeps the first reply of each
# (thread, author, adressee) combination.
this_years_rep_comms = this_years_rep_comms.sort_values(by="created")

Note: In the next step, multiple replies from one author to the same other author within one thread are reduced to a single reply.

In [None]:
# Build the reply edges of r/Republican.
#   rep_tuple_straigth_list     : (replying author, author replied to)
#   rep_topic_tuple_list        : the same pair plus the topic of the posting
#                                 replied to (-1 if it has no topic entry)
#   rep_tuple_double_check_list : (link_id, replying author, author replied to)
#                                 combinations already counted
# Only the first reply of an author to the same other author under the same
# link_id is kept.
rep_tuple_straigth_list = []
rep_topic_tuple_list = []
rep_tuple_double_check_list = []
tuples = []
topic_tuple = []
# Set mirror of rep_tuple_double_check_list: constant-time duplicate checks
# instead of scanning the whole list for every comment.
rep_seen_check_tuples = set()

for index, row in this_years_rep_comms.iterrows():
    # The first three characters of parent_id are cut off to obtain the bare id
    # (presumably the Reddit type prefix such as "t1_"/"t3_" - confirm).
    parent_key = row["parent_id"][3:]
    # Comments are looked up first, submissions second.
    if parent_key in rep_comm_author_dict:
        parent_author = rep_comm_author_dict[parent_key]
    elif parent_key in rep_sub_author_dict:
        parent_author = rep_sub_author_dict[parent_key]
    else:
        # The posting replied to is not part of this year's data: no edge.
        continue

    check_tup = (row["link_id"], row["author"], parent_author)
    if check_tup in rep_seen_check_tuples:
        continue
    rep_seen_check_tuples.add(check_tup)

    reply_author = rep_comm_author_dict[row["id"]]
    tuples = [(reply_author, parent_author)]
    topic_tuple = [(reply_author, parent_author, rep_topic_dict.get(parent_key, -1))]
    rep_tuple_straigth_list.extend(tuples)
    rep_topic_tuple_list.extend(topic_tuple)
    rep_tuple_double_check_list.append(check_tup)

In [None]:
# Keep a reply edge only if, for BOTH of its authors: the author is not an
# unwanted user, was not dropped for low engagement and is a scored redditor.
# Replies to oneself are dropped as well.
# All conditions are checked in a single pass, and the (potentially long)
# rep_authors_gone list is turned into a set for constant-time lookups.
rep_authors_gone_lookup = set(rep_authors_gone)
rep_tuple_straigth_list = [
    pair for pair in rep_tuple_straigth_list
    if pair[0] != pair[1]
    and pair[0] not in unwanted_users
    and pair[1] not in unwanted_users
    and pair[0] not in rep_authors_gone_lookup
    and pair[1] not in rep_authors_gone_lookup
    and pair[0] in this_years_scored_rep_redditors
    and pair[1] in this_years_scored_rep_redditors
]

# Count identical (author, adressee) pairs by joining each pair into one
# "author, adressee" label and letting value_counts() tally the labels.
rep_straight_for_df_list = [", ".join(pair) for pair in rep_tuple_straigth_list]
rep_straight_df = pd.DataFrame(rep_straight_for_df_list)

# (author, adressee, number of replies), most frequent pair first.
rep_straight_weight_tuple_list = []
for index,row in rep_straight_df[0].value_counts().items():
    comp_tup = tuple(index.split(", ") + [row])
    rep_straight_weight_tuple_list.append(comp_tup)

In [None]:
# Apply the same author filter to the (author, adressee, topic) tuples: both
# authors must not be unwanted users, must not have been dropped for low
# engagement and must be scored redditors; replies to oneself are dropped.
# All conditions are checked in a single pass, with rep_authors_gone turned
# into a set for constant-time lookups.
rep_authors_gone_lookup = set(rep_authors_gone)
rep_topic_tuple_list = [
    tup for tup in rep_topic_tuple_list
    if tup[0] != tup[1]
    and tup[0] not in unwanted_users
    and tup[1] not in unwanted_users
    and tup[0] not in rep_authors_gone_lookup
    and tup[1] not in rep_authors_gone_lookup
    and tup[0] in this_years_scored_rep_redditors
    and tup[1] in this_years_scored_rep_redditors
]

In [None]:
# Merge the two directions of every author pair into one undirected weight:
# if the reversed pair is already stored, the weight is added to it, otherwise
# the pair is stored as it comes.
rep_undirected_dict = {}
for edge in rep_straight_weight_tuple_list:
    as_given = (edge[0], edge[1])
    reversed_pair = (edge[1], edge[0])
    if reversed_pair in rep_undirected_dict:
        rep_undirected_dict[reversed_pair] = rep_undirected_dict[reversed_pair] + edge[2]
    else:
        rep_undirected_dict[as_given] = edge[2]

In [None]:
# Flatten the undirected weights into (node, node, weight) tuples.
rep_undirected_weighted_tuples_list = [
    (pair[0], pair[1], weight) for pair, weight in rep_undirected_dict.items()
]

In [None]:
# Number of postings (comments plus submissions) per author.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0;
# the rest of the notebook already uses .items().
rep_comm_posting_dict = {}
for rep_author_counts in (
    this_years_rep_comms["author"].value_counts(),
    this_years_rep_subs["author"].value_counts(),
):
    for auth, posts in rep_author_counts.items():
        rep_comm_posting_dict[auth] = rep_comm_posting_dict.get(auth, 0) + posts

In [None]:
# R: directed reply graph; P: its undirected counterpart.
R = nx.DiGraph()
P = nx.Graph()

In [None]:
# Load the (node, node, weight) tuples; the third element becomes the "weight"
# attribute of the edge.
R.add_weighted_edges_from(rep_straight_weight_tuple_list, weight="weight")

P.add_weighted_edges_from(rep_undirected_weighted_tuples_list, weight="weight")

In [None]:
print(R.number_of_nodes())

In [None]:
print(R.number_of_edges())

N is a directed multigraph (one edge per reply) and is only used for calculations, not for plotting.

In [None]:
# Directed multigraph: parallel edges between the same two redditors are kept.
N = nx.MultiDiGraph()

In [None]:
# One edge per kept reply tuple, so repeated author pairs become parallel edges.
N.add_edges_from(rep_tuple_straigth_list)

In [None]:
# Scored Republican redditors without a community are put into the artificial
# community 120.
# The test also accepts authors that already carry community 120, so that
# re-running this cell rebuilds the same lists instead of leaving them empty
# (after the first run every such author is already in `communities`).
# NOTE(review): assumes no regular community uses the id 120 - confirm.
only_rep_authors = []
who_in_communities[120] = []
rep_only_seen = set()  # guards against listing a repeated author twice
for i in this_years_scored_rep_redditors:
    if i in rep_only_seen:
        continue
    rep_only_seen.add(i)
    if communities.get(i, 120) == 120:
        communities[i] = 120
        who_in_communities[120].append(i)
        only_rep_authors.append(i)

In [None]:
# Per-redditor network measures for r/Republican, collected in rep_node_df.
# R is the simple directed graph (one edge per author pair), N the multigraph
# (one edge per kept reply), so the "multi" measures count individual replies.
rep_degree_dict = dict(R.degree())
rep_in_degree_dict = dict(R.in_degree())
rep_multi_degree_dict = dict(N.degree())
rep_multi_in_degree_dict = dict(N.in_degree())
# The centrality measures below are all computed on R with the library's default arguments.
rep_betweenness_dict = nx.betweenness_centrality(R)
# Totals over all nodes; used as denominators for the node sizes in the plots.
rep_tot_indegree = sum(rep_in_degree_dict.values())
rep_multi_tot_indegree = sum(rep_multi_in_degree_dict.values())
rep_eigenvector_dict = nx.eigenvector_centrality(R)
rep_pagerank_dict = nx.pagerank(R)
rep_closeness_dict = nx.closeness_centrality(R)
# One row per node of R; each measure is attached through look_up
# (defined elsewhere; presumably a dict lookup with a fallback - confirm).
rep_node_df = pd.DataFrame(list(R.nodes), columns = ["redditor"])
rep_node_df["degree"] = rep_node_df["redditor"].apply(lambda x: look_up(x, rep_degree_dict))
rep_node_df["in_degree"] = rep_node_df["redditor"].apply(lambda x: look_up(x, rep_in_degree_dict))
rep_node_df["multi_degree"] = rep_node_df["redditor"].apply(lambda x: look_up(x, rep_multi_degree_dict))
rep_node_df["multi_in_degree"] = rep_node_df["redditor"].apply(lambda x: look_up(x, rep_multi_in_degree_dict))
rep_node_df["betweenness"] = rep_node_df["redditor"].apply(lambda x: look_up(x, rep_betweenness_dict))
# create_smi and cent_rat are defined elsewhere; they combine in-degree with
# betweenness and with the number of postings, respectively.
rep_node_df["SMI"] = rep_node_df.apply(lambda row: create_smi(row["in_degree"], row["betweenness"]), axis=1)
rep_node_df["nr_postings"] = rep_node_df["redditor"].apply(lambda x: look_up(x, rep_comm_posting_dict))
rep_node_df["centrality_ratio"] = rep_node_df.apply(lambda row: cent_rat(row["in_degree"], row["nr_postings"]), axis=1)
rep_node_df["Eigenvector"] = rep_node_df["redditor"].apply(lambda x: look_up(x, rep_eigenvector_dict))
rep_node_df["Pagerank"] = rep_node_df["redditor"].apply(lambda x: look_up(x, rep_pagerank_dict))
rep_node_df["Closeness"] = rep_node_df["redditor"].apply(lambda x: look_up(x, rep_closeness_dict))
# Redditors missing from `communities` get NaN here.
rep_node_df["community"] = rep_node_df["redditor"].map(communities)

### Visualizations of the network

In [None]:
# Node positions for all Republican network plots: spring layout of the
# undirected graph P using the edge weights; the fixed seed makes it reproducible.
rep_net_pos = nx.spring_layout(P, iterations=2000, weight="weight", seed= 16)

In [None]:
# Full Republican reply network (graph R), nodes coloured by community,
# drawn at 16x10 inches and saved to disk.
plt.figure(figsize=(16, 10))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}


pos = rep_net_pos

# Node area: 10 plus 1000 times the node's share of all replies received.
size = [10 + 1000*(rep_multi_in_degree_dict[node]/rep_multi_tot_indegree) for node in R.nodes()]
color = [color_dict[communities[node]] for node in R.nodes()]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .05, "edge_color": "black"}

nx.draw_networkx_nodes(R, pos, **node_spec)

nx.draw_networkx_edges(R, pos, **edge_spec)

plt.title("Republican Communication network " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Republican_Communication_network.png")

plt.show()

For better visualization, "poorly connected" nodes (those with fewer than three neighbours in the undirected graph P) are removed from the depiction.

In [None]:
# R1 is a copy of R without the redditors that have fewer than three
# neighbours in the undirected graph P.
R1 = R.copy()
poorly_connected_reps = [
    redditor for redditor in P.nodes() if not P.degree(redditor) >= 3
]
for redditor in poorly_connected_reps:
    R1.remove_node(redditor)

In [None]:
# Reduced Republican network (graph R1, poorly connected nodes removed),
# nodes coloured by community, drawn at 16x10 inches and saved to disk.
plt.figure(figsize=(16, 10))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}


# Positions come from the layout of the full graph, so the remaining nodes stay where they were.
pos = rep_net_pos

# Node area: 10 plus 1000 times the node's share of all replies received
# (the share is still taken over the full network, not over R1).
size = [10 + 1000*(rep_multi_in_degree_dict[node]/rep_multi_tot_indegree) for node in R1.nodes()]
color = [color_dict[communities[node]] for node in R1.nodes()]


node_spec = {"node_color": color, "node_size": size}

# alpha 0 makes the edges fully transparent in this figure.
# NOTE(review): the larger zoomed figures use alpha .05 - confirm that hiding the edges here is intended.
edge_spec = {"width": .5, "alpha": .0, "edge_color": "black"}

nx.draw_networkx_nodes(R1, pos, **node_spec)

nx.draw_networkx_edges(R1, pos, **edge_spec)

# NOTE(review): the title string has no space before the year ("...zoomed in<YEAR>").
plt.title("Republican Communication network zoomed in" + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Republican_zoomed_Communication_network.png")

plt.show()

In [None]:
# Full Republican reply network (graph R), nodes coloured by community,
# drawn at 48x30 inches and saved to disk.
plt.figure(figsize=(48, 30))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}


pos = rep_net_pos

# Node area: 10 plus 1000 times the node's share of all replies received.
size = [10 + 1000*(rep_multi_in_degree_dict[node]/rep_multi_tot_indegree) for node in R.nodes()]
color = [color_dict[communities[node]] for node in R.nodes()]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .05, "edge_color": "black"}

nx.draw_networkx_nodes(R, pos, **node_spec)

nx.draw_networkx_edges(R, pos, **edge_spec)

plt.title("Republican Communication network " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/large_Republican_Communication_network.png")

plt.show()

In [None]:
# Reduced Republican network (graph R1, poorly connected nodes removed),
# nodes coloured by community, drawn at 48x30 inches and saved to disk.
plt.figure(figsize=(48, 30))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}


# Positions come from the layout of the full graph, so the remaining nodes stay where they were.
pos = rep_net_pos

# Node area: 10 plus 1000 times the node's share of all replies received
# (the share is still taken over the full network, not over R1).
size = [10 + 1000*(rep_multi_in_degree_dict[node]/rep_multi_tot_indegree) for node in R1.nodes()]
color = [color_dict[communities[node]] for node in R1.nodes()]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .05, "edge_color": "black"}

nx.draw_networkx_nodes(R1, pos, **node_spec)

nx.draw_networkx_edges(R1, pos, **edge_spec)

# NOTE(review): the title string has no space before the year ("...zoomed in<YEAR>").
plt.title("Republican Communication network zoomed in" + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/large_Republican_zoomed_Communication_network.png")

plt.show()

In [None]:
# Full Republican reply network (graph R), nodes coloured by community,
# drawn at 160x100 inches and saved to disk.
plt.figure(figsize=(160, 100))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}


pos = rep_net_pos

# Node area: 10 plus 1000 times the node's share of all replies received.
# NOTE(review): the 160x100 Democrat figure uses 100 + 50000 * share - confirm
# that the smaller sizes here are intended.
size = [10 + 1000*(rep_multi_in_degree_dict[node]/rep_multi_tot_indegree) for node in R.nodes()]
color = [color_dict[communities[node]] for node in R.nodes()]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .05, "edge_color": "black"}

nx.draw_networkx_nodes(R, pos, **node_spec)

nx.draw_networkx_edges(R, pos, **edge_spec)

plt.title("Republican Communication network " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/very_large_Republican_Communication_network.png")

plt.show()

In [None]:
# Reduced Republican network (graph R1, poorly connected nodes removed),
# nodes coloured by community, drawn at 160x100 inches and saved to disk.
plt.figure(figsize=(160, 100))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}


# Positions come from the layout of the full graph, so the remaining nodes stay where they were.
pos = rep_net_pos

# Node area: 10 plus 1000 times the node's share of all replies received
# (the share is still taken over the full network, not over R1).
size = [10 + 1000*(rep_multi_in_degree_dict[node]/rep_multi_tot_indegree) for node in R1.nodes()]
color = [color_dict[communities[node]] for node in R1.nodes()]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .05, "edge_color": "black"}

nx.draw_networkx_nodes(R1, pos, **node_spec)

nx.draw_networkx_edges(R1, pos, **edge_spec)

# NOTE(review): the title string has no space before the year ("...zoomed in<YEAR>").
plt.title("Republican Communication network zoomed in" + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/very_large_Republican_zoomed_Communication_network.png")

plt.show()

### Understanding the communication structure on an individual level

In [None]:
R.degree()

In [None]:
R.in_degree()

In [None]:
# In-degree of every node of R (number of distinct redditors who replied to it).
rep_in_degrees = [in_degree for _redditor, in_degree in R.in_degree()]

# Histogram with as many bins as the largest in-degree.
plt.hist(
    rep_in_degrees,
    bins=max(rep_in_degrees),
    color="blue",
    edgecolor="blue",
)
plt.xlabel("Indegree")
plt.ylabel("# Redditors")
plt.title("Republican Indegree values " + this_specific_year)
plt.savefig(save_plots + this_specific_year + "/republican_indegrees.png")
plt.show()

In [None]:
# Replies received per redditor (in-degree in the multigraph).
rep_responses_to = rep_node_df["multi_in_degree"].tolist()

# Histogram with as many bins as the largest number of replies received.
plt.hist(
    rep_responses_to,
    bins=max(rep_responses_to),
    color="blue",
    edgecolor="blue",
)
plt.xlabel("Generated replies")
plt.ylabel("# Redditors")
plt.title("Republican replies received " + this_specific_year)
plt.savefig(save_plots + this_specific_year + "/republican_replies.png")
plt.show()

In [None]:
# Count how many of the most-replied-to authors are needed to account for
# 80 % of all replies in rep_responses_to.
rep_sorted_com_received_list = sorted(rep_responses_to, reverse=True)
rep_eighty_percent_comms = 0.8*sum(rep_responses_to)
rep_eighty_comm_received_sum = 0
rep_eighty_author_count = 0
# Stop as soon as the running sum reaches the 80 % mark: with `<=` an author
# too many was counted whenever the sum hit the mark exactly. The bounds check
# prevents running past the end of the list when every value is 0.
while (rep_eighty_author_count < len(rep_sorted_com_received_list)
       and rep_eighty_comm_received_sum < rep_eighty_percent_comms):
    rep_eighty_comm_received_sum += rep_sorted_com_received_list[rep_eighty_author_count]
    rep_eighty_author_count +=1


In [None]:
# Express the author count from the previous cell as a percentage of all
# nodes in R.
print(f"Republicans: {100*(rep_eighty_author_count/len(R.nodes()))} percent of authors triggered 80% of the responses")

In [None]:
# Display rep_tot_indegree (defined outside this section).
rep_tot_indegree

In [None]:
# Display rep_multi_tot_indegree, the denominator used for node sizes in the
# network plots.
rep_multi_tot_indegree

In [None]:
# First 25 rows of rep_node_df ranked by descending "degree".
rep_node_df.sort_values(by= "degree", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "in_degree".
rep_node_df.sort_values(by= "in_degree", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "multi_degree".
rep_node_df.sort_values(by= "multi_degree", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "multi_in_degree".
rep_node_df.sort_values(by= "multi_in_degree", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "betweenness".
rep_node_df.sort_values(by= "betweenness", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "SMI".
rep_node_df.sort_values(by= "SMI", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "nr_postings".
rep_node_df.sort_values(by= "nr_postings", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "centrality_ratio".
rep_node_df.sort_values(by= "centrality_ratio", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "Eigenvector".
rep_node_df.sort_values(by= "Eigenvector", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "Pagerank".
rep_node_df.sort_values(by= "Pagerank", ascending=False).head(25)

In [None]:
# First 25 rows of rep_node_df ranked by descending "Closeness".
rep_node_df.sort_values(by= "Closeness", ascending=False).head(25)

### Community specificities:

#### New authors

Note: the variables needed here are not loaded from file; they have to be created in the first code section of this notebook.

In [None]:
if len(new_reps) >0:
    # Group the new Republican authors by community, keeping only authors
    # that have a community assignment and are among this year's scored
    # Republican redditors.
    new_rep_com_dict = {}
    for i in new_reps:
        if i in communities.keys() and i in this_years_scored_rep_redditors:
            new_rep_com_dict.setdefault(communities[i], []).append(i)

    for key,value in new_rep_com_dict.items():
        print(f"Community {key} has {len(value)} Republican members that are new since {int(this_specific_year) - 4}")


    # A bare expression nested inside an `if` is not echoed by Jupyter, so the
    # table has to be displayed explicitly (`display` is provided by the
    # IPython kernel).
    display(rep_node_df[rep_node_df["redditor"].isin(new_reps)].sort_values(by= "in_degree", ascending=False).head(25))

In [None]:
if len(eight_year_new_reps) > 0:
    # Group the authors that are new compared to eight years ago by
    # community, keeping only authors that have a community assignment and
    # are among this year's scored Republican redditors.
    ey_new_rep_com_dict = {}
    for i in eight_year_new_reps:
        if i in communities.keys() and i in this_years_scored_rep_redditors:
            ey_new_rep_com_dict.setdefault(communities[i], []).append(i)

    for key,value in ey_new_rep_com_dict.items():
        print(f"Community {key} has {len(value)} Republican members that are new since {int(this_specific_year) - 8}")

    # A bare expression nested inside an `if` is not echoed by Jupyter, so the
    # table has to be displayed explicitly (`display` is provided by the
    # IPython kernel).
    display(rep_node_df[rep_node_df["redditor"].isin(eight_year_new_reps)].sort_values(by= "in_degree", ascending=False).head(25))

#### Authors and communication

In [None]:
def _flatten_domain_cells(domain_cells):
    """Flatten a column of domain cells into one list of domains.

    Non-string cells are skipped. A cell containing commas is split on ","
    and de-duplicated, so a domain counts at most once per cell; any other
    string is taken as a single domain.
    """
    flat_domains = []
    for cell in domain_cells:
        if isinstance(cell, str):
            if "," in cell:
                flat_domains.extend(list(set(cell.split(","))))
            else:
                flat_domains.append(cell)
    return flat_domains


def _ranked_with_padding(counts, n_rows):
    """Turn a value_counts() result into (label, frequency) pairs, padded with "-" to n_rows entries."""
    ranked = list(zip(counts.index.tolist(), counts.tolist()))
    ranked.extend(["-"] * (n_rows - len(ranked)))
    return ranked


def _top_in_degree_with_share(graph, n_rows=25):
    """Return (total in-degree, top n_rows nodes) for ``graph``.

    Each node entry is (node, in-degree, share of the total in-degree); the
    share is 0 when the total is 0. The list is padded with "-" to n_rows.
    """
    in_degree = dict(graph.in_degree())
    total_in_degree = sum(in_degree.values())
    ranked_nodes = sorted(in_degree.items(), key=lambda x:x[1], reverse=True)
    leaders = [
        (node, degree, 0 if total_in_degree == 0 else degree/total_in_degree)
        for node, degree in ranked_nodes[:n_rows]
    ]
    leaders.extend(["-"] * (n_rows - len(leaders)))
    return total_in_degree, leaders


# One column per community in each of these tables.
media_per_rep_communities = pd.DataFrame()
topics_per_rep_communities = pd.DataFrame()
top_8_topics_per_rep_communities = pd.DataFrame()
rep_in_degree_in_communities = pd.DataFrame()
rep_responses_in_communities = pd.DataFrame()
reps_in_communities = {}

for key, value in who_in_communities.items():
    # Comments and submissions written by members of this community that are
    # also among this year's scored Republican redditors.
    community_com_df = this_years_rep_comms[this_years_rep_comms["author"].isin(value)]
    community_com_df = community_com_df[community_com_df["author"].isin(this_years_scored_rep_redditors)]

    community_sub_df = this_years_rep_subs[this_years_rep_subs["author"].isin(value)]
    community_sub_df = community_sub_df[community_sub_df["author"].isin(this_years_scored_rep_redditors)]

    # --- 25 most linked domains -------------------------------------------
    comunity_com_domains = _flatten_domain_cells(community_com_df["domain"].to_list())
    community_selftext_domains = _flatten_domain_cells(community_sub_df["selftext_domains"].to_list())

    agg_community_domains = community_sub_df["domain"].to_list() + community_selftext_domains + comunity_com_domains
    agg_community_df = pd.DataFrame(agg_community_domains, columns=["domain"])
    media_per_rep_communities["Community " + str(key) + " Total domain links: " + str(len(agg_community_domains))] = _ranked_with_padding(agg_community_df.value_counts()[:25], 25)

    # --- 25 most frequent topics ------------------------------------------
    agg_community_topics = community_sub_df["topic"].to_list() + community_com_df["topic"].to_list()
    agg_topic_df = pd.DataFrame(agg_community_topics, columns=["topic"])
    topics_per_rep_communities["Community " + str(key) + " Total (topical) posts: " + str(len(agg_community_topics))] = _ranked_with_padding(agg_topic_df.value_counts()[:25], 25)

    # --- frequencies restricted to topics 0-7 -----------------------------
    top_8_df = agg_topic_df[agg_topic_df["topic"].isin(range(0,8))]
    top_8_topics_per_rep_communities["Community " + str(key)] = _ranked_with_padding(top_8_df.value_counts(), 8)

    # --- intra-community in-degree (graph R) and responses (graph N) ------
    # Copies of R and N with every node of R outside this community removed.
    C = R.copy()
    Z = N.copy()
    out_of_community = set(R.nodes()) - set(value)
    for i in out_of_community:
        C.remove_node(i)
        Z.remove_node(i)

    tot_community_indegree, twentifive_opinion_leaders = _top_in_degree_with_share(C)
    rep_in_degree_in_communities["Community " + str(key) + " Total intra-community Indegree: " + str(tot_community_indegree)] = twentifive_opinion_leaders

    tot_community_response, twentifive_response_generators = _top_in_degree_with_share(Z)
    rep_responses_in_communities["Community " + str(key) + " Total intra-community responses: " + str(tot_community_response)] = twentifive_response_generators

    # Members = authors with at least one submission or comment in the
    # filtered frames above.
    community_members = list(set(community_sub_df['author'].tolist() + community_com_df['author'].tolist()))
    print(f"Community {key} has {len(community_members)} republican members and created {len(community_sub_df)} submissions and {len(community_com_df)} comments")
    reps_in_communities[key] = community_members

print(f"{len(only_rep_authors)} republican authors considered are not part of a lifestyle community") 


media_per_rep_communities.to_csv(save_csvs + this_specific_year + "/media_per_rep_communities.csv")
rep_in_degree_in_communities.to_csv(save_csvs + this_specific_year + "/indegree_rep_communities.csv")
rep_responses_in_communities.to_csv(save_csvs + this_specific_year + "/responses_rep_communities.csv")
topics_per_rep_communities.to_csv(save_csvs + this_specific_year + "/topics_per_rep_communities.csv")

In [None]:
# Per community: (domain, frequency) pairs of the 25 most linked domains.
media_per_rep_communities

In [None]:
# Per community: (node, in-degree, share of community total) for the 25
# nodes with the highest intra-community in-degree.
rep_in_degree_in_communities

In [None]:
# Per community: (node, responses, share of community total) for the 25
# nodes with the most intra-community responses.
rep_responses_in_communities

In [None]:
# Per community: (topic, frequency) pairs of the 25 most frequent topics.
topics_per_rep_communities

In [None]:
# Per community: (topic, frequency) pairs restricted to topics 0-7.
top_8_topics_per_rep_communities

### Visualizations of communication patterns

In [None]:
# Copy of rep_topic_tuple_list in which the first element of every tuple
# carries the suffix "_01", so that the same redditor gets two distinct node
# names (plain and suffixed) in the two-column layout built below.
rep_changed_straight_list = [
    (topic_tuple[0] + "_01", topic_tuple[1], topic_tuple[2])
    for topic_tuple in rep_topic_tuple_list
]

In [None]:
# U will hold every topic-labelled edge, U1 only the edges of topics 0-7.
U, U1 = nx.MultiDiGraph(), nx.MultiDiGraph()

In [None]:
# Add one topic-labelled edge per tuple to U; edges whose topic lies in 0-7
# are mirrored into U1.
for edge_tuple in rep_changed_straight_list:
    edge_start, edge_end, edge_topic = edge_tuple[0], edge_tuple[1], edge_tuple[2]
    U.add_edge(edge_start, edge_end, topic=edge_topic)
    if 0 <= edge_topic < 8:
        U1.add_edge(edge_start, edge_end, topic=edge_topic)

In [None]:
# Edge -> topic lookups for the full graph (U) and the topic 0-7 graph (U1).
# The former `rep_edge_top_dict = {}` initialisation was dead code: it was
# overwritten on the very next line.
rep_edge_top_dict = nx.get_edge_attributes(U, "topic")
rep_reduced_edge_topic_dict = nx.get_edge_attributes(U1, "topic")

In [None]:
# Communities ordered from most to fewest Republican members ...
rep_communities_by_size = sorted(
    reps_in_communities.items(),
    key=lambda community_item: len(community_item[1]),
    reverse=True,
)
ordered_rep_communities_dict = OrderedDict(rep_communities_by_size)
# ... except that community 120 is always moved to the front.
ordered_rep_communities_dict.move_to_end(120, last=False)
ordered_rep_communities = ordered_rep_communities_dict.keys()

In [None]:
# Display the community order (community 120 first, then by member count).
ordered_rep_communities

In [None]:
# Two-column layout: every redditor gets a node at x=1 and a "_01" twin at
# x=-1 on the same height. Redditors are stacked top-down, community by
# community, in the order of ordered_rep_communities.
rep_topic_pos = {}
# Vertical step between consecutive redditors; the "+5" leaves room for the
# 5-step offset applied for community 120 below.
distance_factor = 2/(len(U.nodes())+5)
top_vertical_coordinate = 1
for comune in ordered_rep_communities:
    if comune == 120:
        top_vertical_coordinate -= distance_factor*5
    for redditor in reps_in_communities[comune]:
        # A redditor is placed if either of its two node names occurs in U.
        # (The original if/elif pair had identical bodies.)
        if redditor in U.nodes() or redditor + "_01" in U.nodes():
            rep_topic_pos[redditor] = np.array([1, top_vertical_coordinate])
            rep_topic_pos[redditor + "_01"] = np.array([-1, top_vertical_coordinate])
            top_vertical_coordinate -= distance_factor

In [None]:
# Community lookup that also covers the "_01" twin node of every redditor.
extended_communities = {
    node_name: community_id
    for redditor, community_id in communities.items()
    for node_name in (redditor, redditor + "_01")
}

In [None]:
# Multi in-degree lookup that also covers the "_01" twin node of every
# redditor.
rep_extended_multi_indegree_dict = {
    node_name: multi_in_degree
    for redditor, multi_in_degree in rep_multi_in_degree_dict.items()
    for node_name in (redditor, redditor + "_01")
}

In [None]:
# Nodes of U that received no layout position are listed and then dropped
# from both graphs.
rep_extended_nodes_gone = [node for node in U.nodes() if node not in rep_topic_pos]
for unplaced_node in rep_extended_nodes_gone:
    print(unplaced_node)

# remove_nodes_from silently skips nodes a graph does not contain, which
# covers the nodes that exist in U but not in U1.
U.remove_nodes_from(rep_extended_nodes_gone)
U1.remove_nodes_from(rep_extended_nodes_gone)

In [None]:
# Every positioned node that U lacks is added to both U and U1 as an
# isolated node.
rep_positioned_but_missing = [node for node in rep_topic_pos if node not in U.nodes()]
U.add_nodes_from(rep_positioned_but_missing)
U1.add_nodes_from(rep_positioned_but_missing)

In [None]:
# Number of edges left in U after the node clean-up above.
len(U.edges())

In [None]:
# Standard-size (16 x 10 inch) plot of U in the two-column layout
# rep_topic_pos: nodes coloured by community, edges coloured by topic.
plt.figure(figsize=(16, 10))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; topic -1 and (via the loop below) every topic
# from 10 upwards are grey.
colors_dict = {-1: "grey", 0:"mintcream", 1:"honeydew", 
              2:"aquamarine", 3: "pink", 4:"deepskyblue", 5:"peru", 6:"indigo", 7:"deeppink",
              8:"limegreen", 9:"yellowgreen"}

for i in range(10, len(rep_model.get_topic_info())):
    colors_dict[i] = "grey"



# Node size: a base of 10 plus 1000 times the node's share of
# rep_multi_tot_indegree.
size = [10 + 1000*(rep_extended_multi_indegree_dict[node]/rep_multi_tot_indegree) for node in U.nodes()]
color = [color_dict[extended_communities[node]] for node in U.nodes()]
# keys=True yields (u, v, key) triples, matching the keys of rep_edge_top_dict.
colors = [colors_dict[rep_edge_top_dict[edge]] for edge in U.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .1, "edge_color": colors}

nx.draw_networkx_nodes(U, rep_topic_pos, **node_spec)

nx.draw_networkx_edges(U, rep_topic_pos, **edge_spec)

plt.title("Republican Comunities Communication " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Republican_Communities_communication.png")

plt.show()

In [None]:
# Large (48 x 30 inch) version of the community communication plot of U;
# apart from figure size, edge alpha and output file it matches the
# 16 x 10 inch version.
plt.figure(figsize=(48, 30))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; topic -1 and (via the loop below) every topic
# from 10 upwards are grey.
colors_dict = {-1: "grey", 0:"mintcream", 1:"honeydew", 
              2:"aquamarine", 3: "pink", 4:"deepskyblue", 5:"peru", 6:"indigo", 7:"deeppink",
              8:"limegreen", 9:"yellowgreen"}

for i in range(10, len(rep_model.get_topic_info())):
    colors_dict[i] = "grey"



# Node size: a base of 10 plus 1000 times the node's share of
# rep_multi_tot_indegree.
size = [10 + 1000*(rep_extended_multi_indegree_dict[node]/rep_multi_tot_indegree) for node in U.nodes()]
color = [color_dict[extended_communities[node]] for node in U.nodes()]
# keys=True yields (u, v, key) triples, matching the keys of rep_edge_top_dict.
colors = [colors_dict[rep_edge_top_dict[edge]] for edge in U.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .2, "edge_color": colors}

nx.draw_networkx_nodes(U, rep_topic_pos, **node_spec)

nx.draw_networkx_edges(U, rep_topic_pos, **edge_spec)

plt.title("Republican Comunities Communication " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/large_Republican_Communities_communication.png")

plt.show()

In [None]:
# Very large (160 x 100 inch) version of the community communication plot
# of U, with bigger nodes and a higher-contrast topic palette.
plt.figure(figsize=(160, 100))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; topics -1, 1, 8, 9 and (via the loop below) every
# topic from 10 upwards are grey.
rep_colors_dict = {-1: "grey", 0:"green", 1:"grey", 
              2:"darkorange", 3: "magenta", 4:"blue", 5:"red", 6:"chocolate", 7:"mediumpurple",
              8:"grey", 9:"grey"}

for i in range(10, len(rep_model.get_topic_info())):
    rep_colors_dict[i] = "grey"



# Node size: a base of 100 plus 50000 times the node's share of
# rep_multi_tot_indegree.
size = [100 + 50000*(rep_extended_multi_indegree_dict[node]/rep_multi_tot_indegree) for node in U.nodes()]
color = [color_dict[extended_communities[node]] for node in U.nodes()]
# keys=True yields (u, v, key) triples, matching the keys of rep_edge_top_dict.
colors = [rep_colors_dict[rep_edge_top_dict[edge]] for edge in U.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .2, "edge_color": colors}

nx.draw_networkx_nodes(U, rep_topic_pos, **node_spec)

nx.draw_networkx_edges(U, rep_topic_pos, **edge_spec)

plt.title("Republican Comunities Communication " + this_specific_year, fontsize=10)
# Own file name ("very_large_" prefix, as used for the network plots):
# previously this figure was written to the same path as the 48 x 30 inch
# figure and overwrote it.
plt.savefig(save_plots + this_specific_year + "/very_large_Republican_Communities_communication.png")

plt.show()

In [None]:
# Community id -> colour, as used for the Sankey diagram below.
# NOTE(review): community 25 is "brown" here but "aquamarine" in the other
# color_dict definitions of this section - confirm which one is intended.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"brown", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}

In [None]:
# Build the link table for the community-to-community Sankey diagram:
# one row per (author community, adressee community) pair with the number of
# edges of U between them.
rep_auth_ad_tuple_list = []
rep_comun_count_list = []

# Per edge of U: the community pair (as strings) and both community ids.
for i in U.edges:
    rep_auth_ad_tuple_list.append((str(extended_communities[i[0]]), str(extended_communities[i[1]])))
    rep_comun_count_list.append(extended_communities[i[0]])
    rep_comun_count_list.append(extended_communities[i[1]])

# "author, adressee" strings so that value_counts() can count identical pairs.
rep_auth_ad_for_df = [", ".join(list(i)) for i in rep_auth_ad_tuple_list]

rep_count_auth_ad_df = pd.DataFrame(rep_auth_ad_for_df)

# Number of distinct communities touched by any edge. Computed once here
# instead of once per row and again in the replace() call below.
rep_sank_community_count = len(set(rep_comun_count_list))

rep_auth_ad_weight_tup_list = []
for index,row in rep_count_auth_ad_df[0].value_counts().items():
    auth_ad_wei_tup = tuple(index.split(", ") + [row])
    # (author id, adressee id shifted by the community count, edge count,
    #  unshifted adressee id used only for sorting)
    auth_ad_wei_tup = (int(auth_ad_wei_tup[0]), int(auth_ad_wei_tup[1]) + rep_sank_community_count, int(auth_ad_wei_tup[2]), int(auth_ad_wei_tup[1]))
    rep_auth_ad_weight_tup_list.append(auth_ad_wei_tup)

rep_sank_prep_df = pd.DataFrame(rep_auth_ad_weight_tup_list, columns=["author", "adressee", "weight", "sorter"])
# Link colour follows the author's community (mapped before 120 is renumbered).
rep_sank_prep_df["color"] = rep_sank_prep_df["author"].map(color_dict)

rep_sank_prep_df = rep_sank_prep_df.sort_values(by=["author","sorter"], ascending=False)

# Renumber community 120 to the last index on each side.
# NOTE(review): this presumably assumes the remaining community ids are
# contiguous from 0 - confirm.
rep_sank_prep_df = rep_sank_prep_df.replace({"author":{120:rep_sank_community_count-1}, "adressee":{120 + rep_sank_community_count:2*rep_sank_community_count-1}})

In [None]:
# Inspect the Sankey link table (author, adressee, weight, sorter, color).
rep_sank_prep_df

In [None]:
# Sankey diagram of community-to-community communication, built from
# rep_sank_prep_df: links run from author nodes to the (shifted) adressee nodes.
source = rep_sank_prep_df["author"].to_list()
target = rep_sank_prep_df["adressee"].to_list()
value = rep_sank_prep_df["weight"].to_list()
color = rep_sank_prep_df["color"].to_list()
# Convert the named link colours to "rgba(r, g, b, a)" strings with alpha 0.6.
colors = [matplotlib.colors.to_rgba(i) for i in color]
colors = ["rgba"+str((i[0],i[1],i[2],0.6)) for i in colors]
# Node colours: color_dict entries 0..n-2 plus "red" for the last node,
# repeated once for the second set of nodes.
node_colors = [color_dict[i] for i in range(len(set(source))-1)]
node_colors.append("red")
node_colors = node_colors*2
total_height = sum(rep_sank_prep_df["weight"])


# Cumulative vertical offsets per node. They are only referenced by the
# commented-out x/y arguments of `node` below.
left_y = 0.001
right_y = 0.001

left_y_list = []
for i in range(len(set(source))):
    left_y_list.append(left_y)
    left_y += sum(rep_sank_prep_df[rep_sank_prep_df["author"]==i]["weight"])/total_height

right_y_list = []
for i in range(len(set(target))):
    right_y_list.append(right_y)
    right_y += sum(rep_sank_prep_df[rep_sank_prep_df["adressee"]==i+len(set(source))]["weight"])/total_height


# Node labels: "0".."n-2" plus 120 for the last node, once per node set.
# `labels` is reused by the reduced Sankey cell further down.
left_labels = [str(i) for i in range(len(set(source))-1)]
left_labels.append(120)
right_labels = left_labels
labels = right_labels + left_labels


link = dict(arrowlen=15, source=source, target=target, value=value, color=colors)
node = dict(label = labels, pad=0, thickness=30, color=node_colors)#, x = [0.001]*len(set(source))+[0.9999]*len(set(target)), y = left_y_list+right_y_list)

data = go.Sankey(link=link, node=node, arrangement="snap")

fig = go.Figure(data)

fig.update_layout(hovermode="x", autosize=False, width=1600, height=1000)

fig.show()


fig.write_image(save_plots + this_specific_year + "/Republican_Sankey.png")

In [None]:
# Display the community order again (community 120 first, then by member count).
ordered_rep_communities

In [None]:
# Standard-size (16 x 10 inch) plot restricted to the edges of U1
# (topics 0-7); the nodes are still those of U.
plt.figure(figsize=(16, 10))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; topic -1 and (via the loop below) every topic
# from 10 upwards are grey.
colors_dict = {-1: "grey", 0:"mintcream", 1:"honeydew", 
              2:"aquamarine", 3: "pink", 4:"deepskyblue", 5:"peru", 6:"indigo", 7:"red",
              8:"limegreen", 9:"yellowgreen"}

for i in range(10, len(rep_model.get_topic_info())):
    colors_dict[i] = "grey"



# Node size: a base of 10 plus 1000 times the node's share of
# rep_multi_tot_indegree.
size = [10 + 1000*(rep_extended_multi_indegree_dict[node]/rep_multi_tot_indegree) for node in U.nodes()]
color = [color_dict[extended_communities[node]] for node in U.nodes()]
# keys=True yields (u, v, key) triples, matching the keys of
# rep_reduced_edge_topic_dict.
colors = [colors_dict[rep_reduced_edge_topic_dict[edge]] for edge in U1.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .5, "edge_color": colors}

nx.draw_networkx_nodes(U, rep_topic_pos, **node_spec)

nx.draw_networkx_edges(U1, rep_topic_pos, **edge_spec)

# U1 only contains edges of topics 0-7, i.e. eight topics; the title used to
# say "Top 10".
plt.title("Republican Comunities Communication Top 8 " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Republican_Communities_top_communication.png")

plt.show()

In [None]:
# Large (48 x 30 inch) plot restricted to the edges of U1 (topics 0-7);
# the nodes are still those of U.
plt.figure(figsize=(48, 30))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; topic -1 and (via the loop below) every topic
# from 10 upwards are grey.
colors_dict = {-1: "grey", 0:"mintcream", 1:"honeydew", 
              2:"aquamarine", 3: "pink", 4:"deepskyblue", 5:"peru", 6:"indigo", 7:"red",
              8:"limegreen", 9:"yellowgreen"}

for i in range(10, len(rep_model.get_topic_info())):
    colors_dict[i] = "grey"



# Node size: a base of 10 plus 1000 times the node's share of
# rep_multi_tot_indegree.
size = [10 + 1000*(rep_extended_multi_indegree_dict[node]/rep_multi_tot_indegree) for node in U.nodes()]
color = [color_dict[extended_communities[node]] for node in U.nodes()]
# keys=True yields (u, v, key) triples, matching the keys of
# rep_reduced_edge_topic_dict.
colors = [colors_dict[rep_reduced_edge_topic_dict[edge]] for edge in U1.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .5, "edge_color": colors}

nx.draw_networkx_nodes(U, rep_topic_pos, **node_spec)

nx.draw_networkx_edges(U1, rep_topic_pos, **edge_spec)

# U1 only contains edges of topics 0-7, i.e. eight topics; the title used to
# say "Top 10".
plt.title("Republican Comunities Communication Top 8 " + this_specific_year, fontsize=10)
plt.savefig(save_plots + this_specific_year + "/Republican_Communities_top_communication_large.png")

plt.show()

In [None]:
# Very large (160 x 100 inch) plot restricted to the edges of U1
# (topics 0-7), with bigger nodes and a higher-contrast topic palette.
plt.figure(figsize=(160, 100))

# Community id -> node colour.
color_dict = {0:"lime", 1:"cyan", 2:"magenta", 3:"mediumpurple", 4:"olive", 5:"yellow", 
              6: "plum", 7:"khaki", 8:"salmon", 9:"lightsteelblue", 10: "saddlebrown", 
              11:"tan", 12:"black", 13:"darkgreen", 14:"lightgreen", 15:"sienna", 
              16: "teal", 17:"forestgreen", 18:"rosybrown", 19:"rebeccapurple",20:"lavender", 
              21:"chocolate", 22:"slategrey", 23:"green", 24:"wheat", 
              25:"aquamarine", 26: "pink", 27:"deepskyblue", 28:"peru", 29:"indigo", 30:"deeppink",
              31:"limegreen", 32:"yellowgreen", 33:"tomato",  
              120:"red", 160:"blue"}
# Topic id -> edge colour; topics -1, 1, 8, 9 and (via the loop below) every
# topic from 10 upwards are grey.
rep_colors_dict = {-1: "grey", 0:"green", 1:"grey", 
              2:"darkorange", 3: "magenta", 4:"blue", 5:"red", 6:"chocolate", 7:"mediumpurple",
              8:"grey", 9:"grey"}

for i in range(10, len(rep_model.get_topic_info())):
    rep_colors_dict[i] = "grey"



# Node size: a base of 100 plus 50000 times the node's share of
# rep_multi_tot_indegree.
size = [100 + 50000*(rep_extended_multi_indegree_dict[node]/rep_multi_tot_indegree) for node in U.nodes()]
color = [color_dict[extended_communities[node]] for node in U.nodes()]
# keys=True yields (u, v, key) triples, matching the keys of
# rep_reduced_edge_topic_dict.
colors = [rep_colors_dict[rep_reduced_edge_topic_dict[edge]] for edge in U1.edges(keys=True)]


node_spec = {"node_color": color, "node_size": size}

edge_spec = {"width": .5, "alpha": .5, "edge_color": colors}

nx.draw_networkx_nodes(U, rep_topic_pos, **node_spec)

nx.draw_networkx_edges(U1, rep_topic_pos, **edge_spec)

# U1 only contains edges of topics 0-7, i.e. eight topics; the title used to
# say "Top 10".
plt.title("Republican Comunities Communication Top 8 " + this_specific_year, fontsize=10)
# Own file name: previously this figure was written to the same path as the
# 48 x 30 inch figure and overwrote it.
plt.savefig(save_plots + this_specific_year + "/Republican_Communities_top_communication_very_large.png")

plt.show()

In [None]:
# Edge -> "weight" attribute lookup for graph R.
rep_community_edge_weight_dict = nx.get_edge_attributes(R, "weight")

In [None]:
# Per community with Republican members: summed edge weight arriving from
# each community, plus a running "Total".
rep_community_communication_dict = {
    "Community " + str(comnty): {"Total": 0} for comnty in reps_in_communities.keys()
}

for edge, edge_weight in rep_community_edge_weight_dict.items():
    # The community of the edge's end node selects the outer entry ...
    end_label = "Community " + str(communities[edge[1]])
    if end_label not in rep_community_communication_dict:
        continue
    # ... and the community of its start node the inner entry.
    start_label = "Community " + str(communities[edge[0]])
    end_row = rep_community_communication_dict[end_label]
    end_row[start_label] = end_row.get(start_label, 0) + edge_weight
    end_row["Total"] += edge_weight

In [None]:
# Sort the communities by label and sort the entries of "Community 0".
rep_community_communication_dict = dict(sorted(rep_community_communication_dict.items()))

rep_community_communication_dict["Community 0"] = dict(sorted(rep_community_communication_dict["Community 0"].items()))
# Original padding: make sure "Community 0" has an entry for the first
# len - 2 community numbers.
for i in range(len(rep_community_communication_dict.keys())-2):
    if "Community " + str(i) not in rep_community_communication_dict["Community 0"].keys():
        rep_community_communication_dict["Community 0"]["Community " + str(i)] = 0

# The next cell builds the DataFrame column by column starting with
# "Community 0", so (as the padding above implies) that entry presumably has
# to contain every row label. The range above skips the highest-numbered
# communities and every responding community that is not itself a key of the
# dict, so pad those as well.
rep_community_zero_row = rep_community_communication_dict["Community 0"]
for community_label, responders in rep_community_communication_dict.items():
    rep_community_zero_row.setdefault(community_label, 0)
    # list(): `responders` is rep_community_zero_row itself for "Community 0".
    for responder_label in list(responders):
        rep_community_zero_row.setdefault(responder_label, 0)

In [None]:
# Assemble the community-to-community response table, one column per key of
# rep_community_communication_dict (in the dict's sorted order).
rep_community_communication_df = pd.DataFrame()
for key in rep_community_communication_dict.keys():
    # NOTE(review): a dict is assigned as the column value; presumably pandas
    # aligns its keys with the frame's index - confirm for the pandas
    # version in use.
    rep_community_communication_df[str(key)] = rep_community_communication_dict[key]
# Row-wise sum over all columns added above.
rep_community_communication_df["Total"] = rep_community_communication_df.sum(axis=1)
rep_community_communication_df.to_csv(save_csvs + this_specific_year + "/rep_community_communication.csv")

These DataFrames quantify community to community responses:

In [None]:
# Reading aid for the table: the column community received the responses,
# the row community wrote them.
print("Columns are getting responses from rows")
rep_community_communication_df

In [None]:
# Nested counter: adressee community -> author community -> topic -> number
# of tuples in rep_topic_tuple_list, plus a "Total" per adressee community.
rep_community_topic_exchange_dict = {
    "Community " + str(comnty): {} for comnty in ordered_rep_communities
}
for comnty in ordered_rep_communities:
    exchange_row = rep_community_topic_exchange_dict["Community " + str(comnty)]
    for community_id in set(communities.values()):
        exchange_row["Total"] = 0
        exchange_row["Community " + str(community_id)] = {}


for tup in rep_topic_tuple_list:
    # Tuples whose author has no community assignment are ignored.
    if tup[0] not in communities:
        continue
    auth_com = communities[tup[0]]
    # Adressees without a community assignment are booked under 160.
    adresee_com = communities.get(tup[1], 160)
    topic_reacted_to = tup[2]

    adressee_row = rep_community_topic_exchange_dict.get("Community " + str(adresee_com))
    if adressee_row is None:
        continue
    topic_counts = adressee_row.get("Community " + str(auth_com))
    if topic_counts is None:
        continue
    adressee_row["Total"] += 1
    topic_counts[topic_reacted_to] = topic_counts.get(topic_reacted_to, 0) + 1


# Same structure, but every topic counter is ordered by descending count;
# the "Total" value is wrapped as {"Total": count}.
rep_well_ordered_community_communication = {
    key: {} for key in rep_community_topic_exchange_dict
}

for key, exchange_row in rep_community_topic_exchange_dict.items():
    for subkey, entry in exchange_row.items():
        if subkey == "Total":
            new_order = {"Total": entry}
        else:
            new_order = dict(sorted(entry.items(), key=lambda x: x[1], reverse=True))
        rep_well_ordered_community_communication[key][subkey] = new_order



In [None]:
# One column per adressee community; each cell holds the topic counter dict
# for one author community (or the wrapped total). Saved as CSV.
rep_well_ordered_community_communication_df = pd.DataFrame(rep_well_ordered_community_communication)
rep_well_ordered_community_communication_df.to_csv(save_csvs + this_specific_year + "/rep_community_topic_communication.csv")

In [None]:
# Display the community-by-community topic exchange table.
rep_well_ordered_community_communication_df

In [None]:
# Copy of the ordered topic exchange structure that keeps only topics 0-7.
# The "Total" entry has no such key and therefore ends up as an empty dict.
rep_top_8_topic_communit_communic = {
    key: {
        subkey: {
            topic: topic_count
            for topic, topic_count in subvalue.items()
            if topic in range(0,8)
        }
        for subkey, subvalue in value.items()
    }
    for key, value in rep_well_ordered_community_communication.items()
}

In [None]:
# Tabular form of the topic 0-7 exchange counts (one column per outer key).
rep_top_8_top_exchange_df = pd.DataFrame(rep_top_8_topic_communit_communic)

In [None]:
# Display the topic 0-7 exchange table.
rep_top_8_top_exchange_df

In [None]:
# Persist the topic 0-7 exchange table next to the other CSV results.
rep_top_8_top_exchange_df.to_csv(save_csvs + this_specific_year + "/rep_top8_community_topic_communication.csv")

### Reduction of communities in Sankey diagram:

Community numbers need to be specified

In [None]:
# Reduced Sankey diagram: only links whose author AND adressee community are
# in rep_specific_sank_communities are drawn.
rep_specific_sank_communities = [0,1, 2, 28] ## These numbers are the ones used for 2022
# Adressee nodes are offset by the number of distinct author ids.
# NOTE(review): this presumably mirrors the offset applied when
# rep_sank_prep_df was built - confirm both counts agree.
rep_specific_to_communities = [i + len(set(rep_sank_prep_df["author"].tolist())) for i in rep_specific_sank_communities]

rep_specific_sank_df = rep_sank_prep_df[rep_sank_prep_df["author"].isin(rep_specific_sank_communities)]
rep_specific_sank_df = rep_specific_sank_df[rep_specific_sank_df["adressee"].isin(rep_specific_to_communities)]


source = rep_specific_sank_df["author"].to_list()
target = rep_specific_sank_df["adressee"].to_list()
value = rep_specific_sank_df["weight"].to_list()
color = rep_specific_sank_df["color"].to_list()
# Convert the named link colours to "rgba(r, g, b, a)" strings with alpha 0.6.
colors = [matplotlib.colors.to_rgba(i) for i in color]
colors = ["rgba"+str((i[0],i[1],i[2],0.6)) for i in colors]
# Node colours cover ALL nodes of the full Sankey (not just the selected
# ones). color_dict is whichever definition was executed last.
node_colors = [color_dict[i] for i in range(len(set(rep_sank_prep_df["author"].tolist()))-1)]
node_colors.append("red")
node_colors = node_colors*2


link = dict(arrowlen=15, source=source, target=target, value=value, color=colors)
# `labels` is not defined in this cell; it is taken from the full Sankey
# cell above, which therefore has to be run first.
node = dict(label = labels, pad=0, thickness=30, color=node_colors)#, x = [0.001]*len(set(source))+[0.9999]*len(set(target)), y = left_y_list+right_y_list)

data = go.Sankey(link=link, node=node, arrangement="snap")


fig = go.Figure(data)

fig.update_layout(hovermode="x", autosize=False, width=1600, height=1000)

fig.show()



### Optional specific investigations

In [None]:
## Optional (disabled) cell: uncomment the lines below to run it.
## Builds a nested dict from rep_well_ordered_community_communication that keeps only
## the innermost entries whose key is listed in `interested_topic`, then turns it into
## a DataFrame and displays it.
## NOTE(review): for the key "Total" the entry is overwritten with 0 before the inner
## loop assigns into it -- confirm that the "Total" value has no items to iterate over.
#interested_topic = [4]

# rep_specific_topic_communit_communic = {}

# for key,value in rep_well_ordered_community_communication.items():

#     rep_specific_topic_communit_communic[key]={}
#     if key == "Total":
#         rep_specific_topic_communit_communic["Total"] = 0
#     for subkey,subvalue in value.items():
#         rep_specific_topic_communit_communic[key][subkey]={}
#         for subsubkey,subsubvalue in subvalue.items():
#             if subsubkey in interested_topic:

#                 rep_specific_topic_communit_communic[key][subkey][subsubkey] = subsubvalue

# rep_specific_top_exchange_df = pd.DataFrame(rep_specific_topic_communit_communic)

# rep_specific_top_exchange_df

In [None]:
## Optional (disabled) cell: set `rep_redditor_of_interest` and uncomment to run it.
## Prints the community of that redditor, the 10 most frequent values of the "topic"
## column over their rows in this_years_rep_subs and this_years_rep_comms, and the
## communities of the nodes at edge[0] of every edge in L whose edge[1] is this
## redditor -- first counted per edge, then once per distinct redditor.
## NOTE(review): Series.append was removed in pandas 2.0; with a current pandas use
## pd.concat([rep_roi_subs["topic"], rep_roi_comms["topic"]]) instead.
# rep_redditor_of_interest = ""


# print(f"{rep_redditor_of_interest} is part of community {communities[rep_redditor_of_interest]}")

# rep_roi_subs = this_years_rep_subs[this_years_rep_subs["author"]==rep_redditor_of_interest] 
# rep_roi_comms = this_years_rep_comms[this_years_rep_comms["author"]==rep_redditor_of_interest]
# all_rep_roi_post_tops = rep_roi_subs["topic"].append(rep_roi_comms["topic"]).value_counts()

# print("This redditors most posted topics were:")
# print(all_rep_roi_post_tops.head(10))

# rep_roi_react_redditors = []

# for edge in L.edges():
#     if edge[1] == rep_redditor_of_interest:
#         rep_roi_react_redditors.append(edge[0])

# rep_roi_comunity_reacts = [communities[i] for i in rep_roi_react_redditors]

# print("This redditor received reactions from these communities:")

# print(pd.DataFrame(rep_roi_comunity_reacts).value_counts())

# ind_rep_roi_react_reds = set(rep_roi_react_redditors)

# ind_rep_roi_comu_react = [communities[i] for i in ind_rep_roi_react_reds]

# print("On the individual redditor level, Redditors from these communities reacted:")

# print(pd.DataFrame(ind_rep_roi_comu_react).value_counts())


In [None]:
## Optional (disabled) cell: relies on `rep_redditor_of_interest` from the cell above.
## Collects i[0] of every tuple in rep_topic_tuple_list with
## i[1] == rep_redditor_of_interest and i[2] == topic_of_interest, then prints the
## community counts of those entries -- per tuple first, then once per distinct redditor.
# topic_of_interest = -1


# rep_roi_top_react_reds  = []
# for i in rep_topic_tuple_list:
#     if i[1] == rep_redditor_of_interest:
#         if i[2] == topic_of_interest:
#             rep_roi_top_react_reds.append(i[0])

# rep_roi_topi_comunity_reacts = [communities[i] for i in rep_roi_top_react_reds]

# print(f"Posting about topic {topic_of_interest} triggered responses from these communities:")

# print(pd.DataFrame(rep_roi_topi_comunity_reacts).value_counts())

# ind_rep_roi_topi_react_reds = set(rep_roi_top_react_reds)

# ind_rep_roi_topi_comu_react = [communities[i] for i in ind_rep_roi_topi_react_reds]

# print("On the individual redditor level, Redditors from these communities reacted to this topic:")

# print(pd.DataFrame(ind_rep_roi_topi_comu_react).value_counts())

In [None]:
## Optional (disabled) cell: relies on `rep_redditor_of_interest` from an earlier cell.
## Collects i[2] of every tuple in rep_topic_tuple_list with
## i[1] == rep_redditor_of_interest and communities[i[0]] == community_of_interest,
## then prints how often each collected value occurs.
# community_of_interest = 0

# rep_roi_com_react_tops  = []
# for i in rep_topic_tuple_list:
#     if i[1] == rep_redditor_of_interest:
#         if communities[i[0]] == community_of_interest:
#             rep_roi_com_react_tops.append(i[2])

# print(f"Postings by {rep_redditor_of_interest} receiving reactions from community {community_of_interest} where about these topics:")

# print(pd.DataFrame(rep_roi_com_react_tops).value_counts())

