In [1]:
# journalist_names = ["Linling_Wei", "Muyi_Xiao", "Alice_Su", "Marianna Spring", "Jiayang Fan"]
# journalist_user_ids = ["499603787", "1616608316", "24709718", "613832082", "418780384"]
# file_names = ["Lingling_Wei_context_5000.csv", "muyixiao_494.csv", "aliceysu_convs_all.csv", "mariannaspring_convs.csv", "JiayangFan_convs.csv"]

In [2]:
journalist_names = ["Alice_Su", "Lingling_Wei", "Marianna_Spring", "Mei Fong", "Muyi_Xiao", "Sagarika Ghose", "Bainjal", "Carole Cadwalldr", "Ghada Oueiss", "Jiayang Fan", "Nighat Dad", "Rana Ayyub", "Sally Kohn"]
journalist_user_ids = ["24709718", "499603787", "613832082", "14742251", "1616608316", "56312411", "89732309", "722242009", "2782521229", "418780384", "111011528", "268676434", "18978610"]
file_names = ["aliceysu_convs.csv", "Lingling_wei_convs.csv", "mariannaspring_convs.csv", "meifongwriter_conv_labels.csv", "muyixiao_conv_labels.csv", "sagarikaghose_convs.csv", "bainjal_conv_labeled.csv","carolecadwalla_conv_labeled.csv", "ghadaoueiss_conv_labeled.csv", "JiayangFan_conv_labels.csv", "nighatdad_conv_labeled.csv", "RanaAyuub_conv_labeled.csv", "sallykohn_conv_labeled.csv"]

In [3]:
# journalist_names = []
# journalist_user_ids = []
# file_names = []
# for i in range(len(journalist_names_raw)):
#     journalist_names.append([journalist_names_raw[i]])
#     journalist_user_ids.append([journalist_user_ids_raw[i]])
#     file_names.append([file_names_raw[i]])

In [4]:
import pandas as pd
import pymc as pm
import numpy as np
import arviz as az
import pytensor.tensor as pt
from pytensor import shared


import matplotlib.pyplot as plt
import random
from collections import Counter
import seaborn as sns
from pyvis.network import Network
import pyvis
from scipy.special import softmax as s
from scipy.stats import chisquare

In [5]:
import csv

In [6]:
class Conversation_Tree():
    def __init__(self, root, edge_list, tweet_type_list):
        self.root = root
        self.edge_list = edge_list
        self.tweet_type_list = tweet_type_list

class Node():
    def __init__(self, data=None):
        self.data = data
        self.children = []

class Output_Row():
    def __init__(self, tweet_id, tweet_type, parent_tweet_id, parent_tweet_type, child_tweet_ids, child_tweet_type, hop_length):
        self.tweet_id = tweet_id
        self.tweet_type = tweet_type
        self.parent_tweet_id = parent_tweet_id
        self.parent_tweet_type = parent_tweet_type
        self.child_tweet_ids = child_tweet_ids
        self.child_tweet_type = child_tweet_type
        self.hop_to_root = hop_length

def create_tree(edges):
    node_keys = set(key for keys in edges for key in keys)
    nodes = { key: Node(key) for key in node_keys }

    for parent, child in edges:
        nodes[parent].children.append(nodes[child])
        node_keys.remove(child)

    dangling_and_root_tweets = []
    for root_key in node_keys:
        dangling_and_root_tweets.append(nodes[root_key])
    return dangling_and_root_tweets, nodes

In [7]:
class Pymc_Model:
    def __init__(self, file_name, journalist_name, journalist_user_id, journalist_index):
        self.file_name = file_name
        self.journalist_name = journalist_name
        self.journalist_user_id = journalist_user_id

        self.conversation = []

        self.attacker_tweets = []
        self.bystander_tweets = []
        self.supporter_tweets = []

        self.attacker_users = []
        self.bystander_users = []
        self.supporter_users = []

        self.outputs = []
        self.prior_frequencies_sorted = []

        self.total_attacker_tweets = 0
        self.total_bystander_tweets = 0
        self.total_supporter_tweets = 0
        self.total_journalist_tweets = 0

        self.child_type_outcomes = []

        self.journalist_index = journalist_index
    
    def obtain_conversation_df(self):
        df = pd.read_csv(self.file_name)
        aggregated_labels = df.groupby('user_id')['labels'].agg(list).reset_index()
        df1 = aggregated_labels[aggregated_labels['labels'].apply(lambda x: 0 in x)]
        df2 = aggregated_labels[(~aggregated_labels['labels'].apply(lambda x: 0 in x)) & (aggregated_labels['labels'].apply(lambda x: 2 in x))]
        df3 = aggregated_labels[(~aggregated_labels['labels'].apply(lambda x: 0 in x)) & (~aggregated_labels['labels'].apply(lambda x: 2 in x))]

        df['Group'] = df['user_id'].apply(lambda x: 0 if x in df1['user_id'].values else (2 if x in df2['user_id'].values else 1))
        self.df_clean = df
        print(len(df))
        print(f"Number of rows in df_clean for journalist {self.journalist_name}: {len(df)}")
        conversation_ids = set(df['conversation_id'].unique())
        print(f"Number of unique conversations in this set of data: {len(conversation_ids)}")
        print(f"Number of tweets authored by journalist {self.journalist_name}:{len(df[df['user_id'] == int(self.journalist_user_id)])}")
        tweets_by_conversation = df.groupby('conversation_id').agg(list).to_dict()
        self.conversation_data = {}
        for i, key1 in enumerate(tweets_by_conversation.keys()):
            for j, key2 in enumerate(tweets_by_conversation[key1].keys()):
                if key2 not in self.conversation_data:
                    self.conversation_data[key2] = {}
                    self.conversation_data[key2][key1] = tweets_by_conversation[key1][key2]
                else:
                    self.conversation_data[key2][key1] = tweets_by_conversation[key1][key2]

    def obtain_specific_conversation(self):
        for key in self.conversation_data.keys():
            root_temp = key
            edges_temp = []
            tweet_type_temp = []
            for j in range(len(self.conversation_data[key]['tweet_id'])):
                if self.conversation_data[key]['user_id'][j] == int(self.journalist_user_id):
                    tweet_type_temp.append(3)
                else:
                    tweet_type_temp.append(self.conversation_data[key]['Group'][j])
                edges_temp.append((self.conversation_data[key]['reference_id'][j], self.conversation_data[key]['tweet_id'][j]))
                
            
            self.conversation.append(Conversation_Tree(root_temp, edges_temp, tweet_type_temp))
        
    def setting_up_conversation_trees(self):
        self.conversation_tree_roots = []
        self.tree_node_map = {}
        for j in range(len(self.conversation)):
            conversation = self.conversation[j]  
            if len(conversation.edge_list) == 0:
                self.conversation_tree_roots.append(Node(conversation.root))
                dict_temp = {conversation.root : Node(conversation.root)}
                self.tree_node_map.update(dict_temp)
            else:
                root_node, node_map = create_tree(conversation.edge_list) 
                self.tree_node_map.update(node_map)
                for node in root_node:
                    if node.data == conversation.root and len(conversation.edge_list) > 0:
                        self.conversation_tree_roots.append(node)
        

    def obtain_sets_of_tweet_types(self):
        self.attacker_tweets.append(self.df_clean[(self.df_clean['labels'] == 0) & (self.df_clean['user_id'] != int(self.journalist_user_id))]['tweet_id'].to_list())
        self.bystander_tweets.append(self.df_clean[(self.df_clean['labels'] == 1) & (self.df_clean['user_id'] != int(self.journalist_user_id))]['tweet_id'].to_list())
        self.supporter_tweets.append(self.df_clean[(self.df_clean['labels'] == 2) & (self.df_clean['user_id'] != int(self.journalist_user_id))]['tweet_id'].to_list())
        

    def obtain_sets_of_user_types(self):
        self.attacker_users = self.df_clean[(self.df_clean['Group'] == 0) & (self.df_clean['user_id'] != int(self.journalist_user_id))]['tweet_id'].to_list()
        self.bystander_users = self.df_clean[(self.df_clean['Group'] == 1) & (self.df_clean['user_id'] != int(self.journalist_user_id))]['tweet_id'].to_list()
        self.supporter_users = self.df_clean[(self.df_clean['Group'] == 2) & (self.df_clean['user_id'] != int(self.journalist_user_id))]['tweet_id'].to_list()
        
    def recursive_tree_traversal(self, root_node, depth, max_depth, i):
        if len(root_node.children) == 0:
            root_node.depth = depth
            if root_node.data in self.attacker_users:
                self.total_attacker_tweets += 1
                root_node.type = 0
            elif root_node.data in self.bystander_users:
                self.total_bystander_tweets += 1
                root_node.type = 1
            elif root_node.data in self.supporter_users:
                self.total_supporter_tweets += 1
                root_node.type = 2
            else:
                root_node.type = 3
                self.total_journalist_tweets += 1
            return max(depth, max_depth)
        else:
            max_depth_temp = max_depth
            for child in root_node.children:
                max_depth_temp = max(max_depth_temp, self.recursive_tree_traversal(child, depth+1, max_depth, i))
            root_node.depth = depth
            if root_node.data in self.attacker_users:
                self.total_attacker_tweets += 1
                root_node.type = 0
            elif root_node.data in self.bystander_users:
                self.total_bystander_tweets += 1
                root_node.type = 1
            elif root_node.data in self.supporter_users:
                self.total_supporter_tweets += 1
                root_node.type = 2
            else:
                root_node.type = 3
                self.total_journalist_tweets += 1
            return max_depth_temp
        
    def assigning_depth_and_type_to_nodes(self):
        master_depth = 0
        for tree_root in self.conversation_tree_roots:
            master_depth = max(self.recursive_tree_traversal(tree_root,1, 0, i), master_depth)
        self.master_depth = master_depth

    def update_tree_node_map(self):
        for j in range(len(self.conversation)):
            try:    
                self.tree_node_map[self.conversation[j].root].parent_tweet_id = None
                self.tree_node_map[self.conversation[j].root].parent_tweet_type = None
                child_tweet_ids_temp = []
                child_tweet_types_temp = []
                for kid in self.tree_node_map[self.conversation[j].root].children:
                    child_tweet_ids_temp.append(kid.data)
                    child_tweet_types_temp.append(kid.type)
                if len(child_tweet_ids_temp) == 0:
                    self.tree_node_map[self.conversation[j].root].child_tweet_ids = [0]
                    self.tree_node_map[self.conversation[j].root].child_tweet_types = [4]
                else:
                    self.tree_node_map[self.conversation[j].root].child_tweet_ids = child_tweet_ids_temp
                    self.tree_node_map[self.conversation[j].root].child_tweet_types = child_tweet_types_temp
            except:
                pass

            for k,edge in enumerate(self.conversation[j].edge_list):
                parent, child = edge
                try:
                    self.tree_node_map[child].parent_tweet_type = self.tree_node_map[parent].type
                    self.tree_node_map[child].parent_tweet_id = self.tree_node_map[parent].data
                except:                     
                    pass
                child_tweet_ids_temp = []
                child_tweet_types_temp = []
                try:
                    for m, kid in enumerate(self.tree_node_map[child].children):
                        child_tweet_types_temp.append(kid.type)
                        child_tweet_ids_temp.append(kid.data)
                except:                     
                    pass
                if len(child_tweet_ids_temp) == 0:
                    self.tree_node_map[child].child_tweet_ids = [0]
                    self.tree_node_map[child].child_tweet_types = [4]
                else:
                    self.tree_node_map[child].child_tweet_ids = child_tweet_ids_temp
                    self.tree_node_map[child].child_tweet_types = child_tweet_types_temp

    def converting_tree_node_map(self):
        self.outputs = []
        error1 = 0
        error2 = 0
        for key in self.tree_node_map:
            try:
                dummy = self.tree_node_map[key].type
            except:
                error1 += 1
                continue
            try:
                dummy = [self.tree_node_map[key].data, self.tree_node_map[key].type, self.tree_node_map[key].parent_tweet_id, self.tree_node_map[key].parent_tweet_type, self.tree_node_map[key].child_tweet_ids, self.tree_node_map[key].child_tweet_types]
                try: 
                    self.outputs.append(Output_Row(self.tree_node_map[key].data, self.tree_node_map[key].type, self.tree_node_map[key].parent_tweet_id, self.tree_node_map[key].parent_tweet_type, self.tree_node_map[key].child_tweet_ids, self.tree_node_map[key].child_tweet_types, self.tree_node_map[key].depth-1))
                except:
                    self.outputs.append(Output_Row(self.tree_node_map[key].data, self.tree_node_map[key].type, self.tree_node_map[key].parent_tweet_id, self.tree_node_map[key].parent_tweet_type, self.tree_node_map[key].child_tweet_ids, self.tree_node_map[key].child_tweet_types, 0))
            except:
                error2 += 1   
                pass
    
    def obtain_model_input_output_data(self):
        self.child_type_input = {"parent_tweet_type" : [], "tweet_type" : [], "hop_to_root" : []}
        self.child_type_output = []
        #prior_frequencies = {}
        for j, output in enumerate(self.outputs):
            if output.parent_tweet_type is None:
                continue
            for child_type in output.child_tweet_type:
                if child_type == 4:
                    break
                # if (output.parent_tweet_type, output.tweet_type, output.hop_to_root, child_type) in prior_frequencies:
                #     prior_frequencies[(output.parent_tweet_type, output.tweet_type, output.hop_to_root, child_type)] += 1
                # else:
                #     prior_frequencies[(output.parent_tweet_type, output.tweet_type, output.hop_to_root, child_type)] = 1
                self.child_type_input["parent_tweet_type"].append(output.parent_tweet_type)
                self.child_type_input["tweet_type"].append(output.tweet_type)
                self.child_type_input["hop_to_root"].append(output.hop_to_root)
                self.child_type_output.append(child_type)
            # self.prior_frequencies_sorted.append(sorted(prior_frequencies.items()))

    def convert_data_to_csv(self, number):
        array1 = self.child_type_input["parent_tweet_type"]
        array2 = self.child_type_input["tweet_type"]
        array3 = self.child_type_input["hop_to_root"]
        array4 = self.child_type_output
        array5 = [self.journalist_name] * len(array1)
        rows = zip(array1, array2, array3, array4, array5)
        if number == 0:
            with open('output.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(['Grandparent_tweet_label', 'Parent_tweet_label', 'Depth', 'Child_label(output)'])
                writer.writerows(rows)
        else:
            with open('output.csv', mode='a', newline='') as file:
                writer = csv.writer(file)
                writer.writerows(rows)


    
    def bayesian_modelling(self):
        N = len(self.child_type_output)
        mean = 0
        sigma = 1
        nu = 2
        max_depth = self.master_depth
        batch_size = 4000
        if len(self.df_clean) < 15000:
            with pm.Model() as ChildTypeModelSample:

                InterceptAttackerProbability = pm.StudentT('InterceptAttackerProbability',nu=nu, mu = mean, sigma = sigma)
                GrandParentTweetTypeAttackerProbability = pm.StudentT('GrandParentTweetTypeAttackerProbability',nu=nu, mu = mean, sigma = sigma, shape=4)
                ParentTweetTypeAttackerProbability = pm.StudentT('ParentTweetTypeAttackerProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                HopToRootAttackerProbability = pm.StudentT('HopToRootAttackerProbability',nu=nu, mu = mean, sigma = sigma,shape=max_depth)
                
                InterceptBystanderProbability = pm.StudentT('InterceptBystanderProbability',nu=nu, mu = mean, sigma = sigma)
                GrandParentTweetTypeBystanderProbability = pm.StudentT('GrandParentTweetTypeBystanderProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                ParentTweetTypeBystanderProbability = pm.StudentT('ParentTweetTypeBystanderProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                HopToRootBystanderProbability = pm.StudentT('HopToRootBystanderProbability',nu=nu, mu = mean, sigma = sigma,shape=max_depth) 

                InterceptSupporterProbability = pm.StudentT('InterceptSupporterProbability',nu=nu, mu = mean, sigma = sigma)
                GrandParentTweetTypeSupporterProbability = pm.StudentT('GrandParentTweetTypeSupporterProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                ParentTweetTypeSupporterProbability = pm.StudentT('ParentTweetTypeSupporterProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                HopToRootSupporterProbability = pm.StudentT('HopToRootSupporterProbability',nu=nu, mu = mean, sigma =sigma,shape=max_depth)

                s0 = InterceptAttackerProbability + GrandParentTweetTypeAttackerProbability[self.child_type_input['parent_tweet_type']] + ParentTweetTypeAttackerProbability[self.child_type_input['tweet_type']] + HopToRootAttackerProbability[self.child_type_input['hop_to_root']]
                s1 = InterceptBystanderProbability + GrandParentTweetTypeBystanderProbability[self.child_type_input['parent_tweet_type']] + ParentTweetTypeBystanderProbability[self.child_type_input['tweet_type']] + HopToRootBystanderProbability[self.child_type_input['hop_to_root']]
                s2 = InterceptSupporterProbability + GrandParentTweetTypeSupporterProbability[self.child_type_input['parent_tweet_type']] + ParentTweetTypeSupporterProbability[self.child_type_input['tweet_type']] + HopToRootSupporterProbability[self.child_type_input['hop_to_root']]
                s3 = np.zeros(N)
                s = pm.math.stack([s0, s1, s2, s3]).T

                p_ = pm.math.softmax(s, axis=1)
                child_type = pm.Categorical("child_type", p=p_, observed=self.child_type_output, total_size = N)
                mean_field_2 = pm.fit(method = 'advi', obj_optimizer = pm.adagrad_window(learning_rate=1e-3))

        else:
            with pm.Model() as ChildTypeModelSample:
                parent_tweet_type, tweet_type, hop_to_root, output = pm.Minibatch(self.child_type_input['parent_tweet_type'], self.child_type_input['tweet_type'], self.child_type_input['hop_to_root'], self.child_type_output, batch_size=batch_size)

                InterceptAttackerProbability = pm.StudentT('InterceptAttackerProbability',nu=nu, mu = mean, sigma = sigma)
                GrandParentTweetTypeAttackerProbability = pm.StudentT('GrandParentTweetTypeAttackerProbability',nu=nu, mu = mean, sigma = sigma, shape=4)
                ParentTweetTypeAttackerProbability = pm.StudentT('ParentTweetTypeAttackerProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                HopToRootAttackerProbability = pm.StudentT('HopToRootAttackerProbability',nu=nu, mu = mean, sigma = sigma,shape=max_depth)
                
                InterceptBystanderProbability = pm.StudentT('InterceptBystanderProbability',nu=nu, mu = mean, sigma = sigma)
                GrandParentTweetTypeBystanderProbability = pm.StudentT('GrandParentTweetTypeBystanderProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                ParentTweetTypeBystanderProbability = pm.StudentT('ParentTweetTypeBystanderProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                HopToRootBystanderProbability = pm.StudentT('HopToRootBystanderProbability',nu=nu, mu = mean, sigma = sigma,shape=max_depth) 

                InterceptSupporterProbability = pm.StudentT('InterceptSupporterProbability',nu=nu, mu = mean, sigma = sigma)
                GrandParentTweetTypeSupporterProbability = pm.StudentT('GrandParentTweetTypeSupporterProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                ParentTweetTypeSupporterProbability = pm.StudentT('ParentTweetTypeSupporterProbability',nu=nu, mu = mean, sigma = sigma,shape=4)
                HopToRootSupporterProbability = pm.StudentT('HopToRootSupporterProbability',nu=nu, mu = mean, sigma =sigma,shape=max_depth)

                s0 = InterceptAttackerProbability + GrandParentTweetTypeAttackerProbability[parent_tweet_type] + ParentTweetTypeAttackerProbability[tweet_type] + HopToRootAttackerProbability[hop_to_root]
                s1 = InterceptBystanderProbability + GrandParentTweetTypeBystanderProbability[parent_tweet_type] + ParentTweetTypeBystanderProbability[tweet_type] + HopToRootBystanderProbability[hop_to_root]
                s2 = InterceptSupporterProbability + GrandParentTweetTypeSupporterProbability[parent_tweet_type] + ParentTweetTypeSupporterProbability[tweet_type] + HopToRootSupporterProbability[hop_to_root]
                s3 = np.zeros(batch_size)
                s = pm.math.stack([s0, s1, s2, s3]).T

                p_ = pm.math.softmax(s, axis=1)
                child_type = pm.Categorical("child_type", p=p_, observed=output, total_size = batch_size)
                mean_field_2 = pm.fit(method = 'advi', obj_optimizer = pm.adagrad_window(learning_rate=1e-3))

        with ChildTypeModelSample:
            self.child_type_trace = mean_field_2.sample(5000)

        with ChildTypeModelSample:
            self.posterior_predictive_samples = pm.sample_posterior_predictive(self.child_type_trace, extend_inferencedata=True)
        
        with ChildTypeModelSample:
            pm.compute_log_likelihood(self.child_type_trace)

        result = pm.model_to_graphviz(ChildTypeModelSample)
        # result.render(filename='model_graph', format='png')

    def posterior_ground_truth_distribution(self):
        result = self.posterior_predictive_samples.posterior_predictive.to_array()
        print(result.shape)
        predictions = result[0][0][0]
        counter = 0
        self.posterior_frequencies = {key: 0 for key in self.prior_frequencies}
        for i, output in enumerate(self.outputs):
            if output.parent_tweet_type is None:
                continue
            for child_type in output.child_tweet_type:
                if child_type == 4:
                    break
                if (output.parent_tweet_type, output.tweet_type, output.hop_to_root, int(predictions[counter])) in self.posterior_frequencies:
                    self.posterior_frequencies[(output.parent_tweet_type, output.tweet_type, output.hop_to_root, int(predictions[counter]))] += 1
                    counter += 1
                else:
                    self.posterior_frequencies[(output.parent_tweet_type, output.tweet_type, output.hop_to_root, int(predictions[counter]))] = 1
                    counter += 1
        self.posterior_frequencies_sorted = sorted(self.posterior_frequencies.items())
        #posterior_ground_truth_differences = {key: (self.prior_frequencies[key], self.posterior_frequencies.get(key)) for key in self.prior_frequencies if key[2] <=4}
        #print(sorted(posterior_ground_truth_differences.items()))

    def model_quality(self):
        print(az.waic(self.child_type_trace))
        print(az.loo(self.child_type_trace))
        result = self.posterior_predictive_samples.posterior_predictive.to_array()
        predictions = result[0][0][0]
        freq_dict_predicted = {0:0, 1:0, 2:0, 3:0}
        for i in range(len(predictions)):
            if int(predictions[i]) not in freq_dict_predicted:
                freq_dict_predicted[int(predictions[i])] = 1
            else:
                freq_dict_predicted[int(predictions[i])] += 1

        # bins = list(freq_dict_predicted.keys())
        # counts = list(freq_dict_predicted.values())

        df = pd.DataFrame(freq_dict_predicted.items(), columns=["Value", "Frequency"])

        plt.figure(figsize=(10, 6))
        sns.histplot(data=df, x="Value", weights="Frequency", bins=len(df), kde=True)
        sns.despine()

        
        plt.xlabel('Child labels')
        plt.ylabel('Counts')
        # plt.title(f'Histogram of posterior predictive outcome for Journalist {self.journalist_index}')
        plt.tight_layout()
        plt.savefig(f'{self.journalist_index}_posterior.pdf', format='pdf')
        plt.show()

        

        # plt.bar(bins, counts, color='blue', alpha=0.7)

        # plt.title('Histogram of posterior predictive outcome')
        # plt.xlabel('Child labels')
        # plt.ylabel('Counts')
        # sns.despine()
        # plt.grid(axis='y')

        # plt.tight_layout()
        # plt.show()
        
        freq_dict_expected = {0:0, 1:0, 2:0, 3:0}
        expected = random.choices(self.child_type_output, k=len(predictions))
        for element in expected:
            if element not in freq_dict_expected:
                freq_dict_expected[element] = 1
            else:
                freq_dict_expected[element] += 1
        
        df = pd.DataFrame(freq_dict_expected.items(), columns=["Value", "Frequency"])

        plt.figure(figsize=(10, 6))
        sns.histplot(data=df, x="Value", weights="Frequency", bins=len(df), kde=True)
        sns.despine()

        plt.xlabel('Child labels')
        plt.ylabel('Counts')
        # plt.title(f'Histogram of ground truth outcome distribution for Journalist {self.journalist_index}')
        plt.tight_layout()
        plt.savefig(f'{self.journalist_index}_ground_truth.pdf', format='pdf')
        plt.show()
        
        # bins = list(freq_dict_expected.keys())
        # counts = list(freq_dict_expected.values())

        # plt.figure(figsize=(10, 6))

        # plt.bar(bins, counts, color='blue', alpha=0.7)

        # plt.title('Histogram of expected data')
        # plt.xlabel('Bins')
        # plt.ylabel('Counts')
        # sns.despine()
        # plt.grid(axis='y')

        # plt.tight_layout()
        # plt.show()

        # for element in enumerate(self.prior_frequencies_sorted):
        #     if element[1][0] not in freq_dict.keys():
        #         freq_dict[element[1][0]] = [element[1][1], 0]
        #     else:
        #         freq_dict[element[1][0]] = [element[1][1], freq_dict[element[1][0]][1]]
        # expected_outcomes = []
        # observed_outcomes = []
        # for key in freq_dict.keys():
        #     expected_outcomes.append(freq_dict[key][0])
        #     observed_outcomes.append(freq_dict[key][1])

        # expected_outcomes = np.array(expected_outcomes)
        # observed_outcomes = np.array(observed_outcomes)
        # chi_square_distance = np.sum((expected_outcomes - observed_outcomes)**2 / (expected_outcomes+observed_outcomes))
        # print("Chi-Square Distance:", chi_square_distance)      
    
    def posterior_predictive_outcomes(self):
        tweet_class_coding = {0: "A", 1: "B", 2: "S", 3: "J"}
        child_type_probability = {}
        for i in range(4):
            for j in range(4):
                for k in range(1,5):
                    output_a = self.child_type_trace.posterior.GrandParentTweetTypeAttackerProbability[:,:,i] + self.child_type_trace.posterior.ParentTweetTypeAttackerProbability[:,:,j] + self.child_type_trace.posterior.InterceptAttackerProbability + self.child_type_trace.posterior.HopToRootAttackerProbability[:,:,k]
                    output_b = self.child_type_trace.posterior.GrandParentTweetTypeBystanderProbability[:,:,i] + self.child_type_trace.posterior.ParentTweetTypeBystanderProbability[:,:,j] + self.child_type_trace.posterior.InterceptBystanderProbability + self.child_type_trace.posterior.HopToRootBystanderProbability[:,:,k]
                    output_s = self.child_type_trace.posterior.GrandParentTweetTypeSupporterProbability[:,:,i] + self.child_type_trace.posterior.ParentTweetTypeSupporterProbability[:,:,j] + self.child_type_trace.posterior.InterceptSupporterProbability + self.child_type_trace.posterior.HopToRootSupporterProbability[:,:,k]

                    pre_probability_arr = np.array([output_a, output_b, output_s, np.zeros_like(output_a)])
                    pre_probability_arr = np.squeeze(pre_probability_arr).T

                    child_type_probability[f"{str(tweet_class_coding[i])}_{str(tweet_class_coding[j])}_{str(k)}"] = s(pre_probability_arr, axis = 1)
        
        self.child_type_outcomes = {}
        for i in range(4):
            for j in range(4):
                for k in range(1,5):
                    if k == 1 and i != 3:
                        continue
                    self.child_type_outcomes[f"{str(tweet_class_coding[i])}_{str(tweet_class_coding[j])}_{str(k)}"] = np.zeros(5000)
                    for l in range(child_type_probability[f"{str(tweet_class_coding[i])}_{str(tweet_class_coding[j])}_{str(k)}"].shape[0]):
                        # print(child_type_probability[f"{str(tweet_class_coding[i])}_{str(tweet_class_coding[j])}_{str(k)}"][l].shape, child_type_probability[f"{str(tweet_class_coding[i])}_{str(tweet_class_coding[j])}_{str(k)}"][l])
                        self.child_type_outcomes[f"{str(tweet_class_coding[i])}_{str(tweet_class_coding[j])}_{str(k)}"][l] = random.choices([0,1,2,3], weights = child_type_probability[f"{str(tweet_class_coding[i])}_{str(tweet_class_coding[j])}_{str(k)}"][l], k = 1)[0]


    def stacked_bar_plot(self, char, depth = 0):
        label_map = {0 : "Attacker (A)", 1 : "Bystander (B)", 2 : "Supporter (S)", 3 : "Journalist (J)"}
        if depth == 1:
            df_outcomes = pd.DataFrame(self.child_type_outcomes)
            df_filtered = df_outcomes.filter(regex=r'.*1$')
        else:
            df_outcomes = pd.DataFrame(self.child_type_outcomes)
            df_filtered = df_outcomes.filter(regex=fr'^[{char}].*[^1]$')
        value_counts = df_filtered.apply(lambda col: col.value_counts(normalize=True).reindex([0, 1, 2, 3], fill_value=0))
        fig, ax = plt.subplots(figsize=(12, 3))
        palette = sns.color_palette("pastel", 4)
        colors = [palette[3], palette[2], palette[0], palette[1]] 
        bottom = np.zeros(len(df_filtered.columns))
        new_axis_labels = []
        for item in df_filtered.columns:
            latex_string = f"${item[0]}_{str(int(item[-1])-1)} \\rightarrow {item[2]}_{item[-1]}$"
            new_axis_labels.append(latex_string)
        for value in [0, 1, 2, 3]:
            ax.bar(new_axis_labels, value_counts.loc[value], bottom=bottom, color=colors[value], label=f'{label_map[value]}')
            bottom += value_counts.loc[value]

        ax.set_xlabel("Path History ($GrandparentLabel_{GrandparentDepth} \\rightarrow ParentLabel_{ParentDepth}$)")
        ax.set_ylabel('Composition of Child label')
        ax.set_title('Distribution of Child label given Grandparent label, Parent label, and Depth')

        
        ax.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left')
        ax = plt.gca()  # Get the current axis
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        # Set background color to light grey
        ax.set_facecolor('#F0F0F0')

        plt.tight_layout()
        
        plt.savefig(f'{self.journalist_name}_{char}_{depth}.pdf', format='pdf')
        plt.show()
        

        return value_counts

    def plot_stacked_bar_plot(self):
        value_counts_A = self.stacked_bar_plot('A')
        value_counts_B = self.stacked_bar_plot('B')
        value_counts_S = self.stacked_bar_plot('S')
        value_counts_J = self.stacked_bar_plot('J')
        value_counts_J_1 = self.stacked_bar_plot('J', 1)

        return [value_counts_A, value_counts_B, value_counts_S, value_counts_J, value_counts_J_1]
        

                
            
            


In [8]:
class_objects = []
journalist_value_counts = []
plt.rcParams['font.size'] = 12        
plt.rcParams['axes.titlesize'] = 16    
plt.rcParams['axes.labelsize'] = 12
journalist_index = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']

for i in range(len(journalist_names)):
    obj = Pymc_Model(file_names[i], journalist_names[i], journalist_user_ids[i], journalist_index[i])
    obj.obtain_conversation_df()
    obj.obtain_specific_conversation()
    obj.setting_up_conversation_trees()
    obj.obtain_sets_of_user_types()
    obj.assigning_depth_and_type_to_nodes()
    obj.update_tree_node_map()
    obj.converting_tree_node_map()
    obj.obtain_model_input_output_data()
    if i == 0:
        obj.convert_data_to_csv(0)
    else:
        obj.convert_data_to_csv(1)
    # obj.bayesian_modelling()
    # # obj.posterior_ground_truth_distribution()
    # obj.model_quality()
    # obj.posterior_predictive_outcomes()
    # journalist_value_counts.append(obj.plot_stacked_bar_plot())
    # class_objects.append(obj)


# journalists_same_path_history = []
# print(len(journalist_value_counts))
# for j in range(len(journalist_value_counts[0])):
#     journalists_same_path_history.append(journalist_value_counts[0][j], journalist_value_counts[1][j], journalist_value_counts[2][j], journalist_value_counts[3][j])
    
all_journalists_same_path_history = []
for i in range(len(journalist_value_counts[0])):
    temp_arr = []
    for j in range(len(journalist_value_counts)):
        temp_arr.append(journalist_value_counts[j][i])
    all_journalists_same_path_history.append(temp_arr)

for i, path_history in enumerate(all_journalists_same_path_history):
    label_map = {0 : "Attacker (A)", 1 : "Bystander (B)", 2 : "Supporter (S)", 3 : "Journalist (J)"}
    mean_df = sum(path_history) / len(path_history)

    variance_df = sum((df - mean_df) ** 2 for df in path_history) / len(path_history)

    fig, ax = plt.subplots(figsize=(12, 3))
    palette = sns.color_palette("pastel", 4)
    colors = [palette[3], palette[2], palette[0], palette[1]] 
    bottom = np.zeros(len(mean_df.columns))
    new_axis_labels = []
    for item in mean_df.columns:
        latex_string = f"${item[0]}_{str(int(item[-1])-1)} \\rightarrow {item[2]}_{item[-1]}$"
        new_axis_labels.append(latex_string)
    for value in [0, 1, 2, 3]:
        ax.bar(new_axis_labels, mean_df.loc[value], bottom=bottom, yerr=variance_df.loc[value], color=colors[value], label=f'{label_map[value]}', capsize=5)
        bottom += mean_df.loc[value]

    ax.set_xlabel("Path History ($GrandparentLabel_{GrandparentDepth} \\rightarrow ParentLabel_{ParentDepth}$)")
    ax.set_ylabel('Composition of Child label')
    ax.set_title('Distribution of Child label given Grandparent label, Parent label, and Depth')


    ax.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax = plt.gca()  # Get the current axis
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    # Set background color to light grey
    ax.set_facecolor('#F0F0F0')

    plt.tight_layout()
    plt.savefig(f'{i}.pdf', format='pdf', dpi=300)
    plt.show()
    





10632
Number of rows in df_clean for journalist Alice_Su: 10632
Number of unique conversations in this set of data: 419
Number of tweets authored by journalist Alice_Su:394
4286
Number of rows in df_clean for journalist Lingling_Wei: 4286
Number of unique conversations in this set of data: 452
Number of tweets authored by journalist Lingling_Wei:378
40003
Number of rows in df_clean for journalist Marianna_Spring: 40003
Number of unique conversations in this set of data: 689
Number of tweets authored by journalist Marianna_Spring:3334
1908
Number of rows in df_clean for journalist Mei Fong: 1908
Number of unique conversations in this set of data: 443
Number of tweets authored by journalist Mei Fong:206
378
Number of rows in df_clean for journalist Muyi_Xiao: 378
Number of unique conversations in this set of data: 16
Number of tweets authored by journalist Muyi_Xiao:33
10463
Number of rows in df_clean for journalist Sagarika Ghose: 10463
Number of unique conversations in this set of data

  df = pd.read_csv(self.file_name)


55283
Number of rows in df_clean for journalist Sally Kohn: 55283
Number of unique conversations in this set of data: 1884
Number of tweets authored by journalist Sally Kohn:0


IndexError: list index out of range