This notebook contains the code for the analysis of false news on Weibo (and for the comparisons with Twitter mentioned alongside it).

Required software/language:

Python 3.7.3

MongoDB 4.2.8

Required Python Packages:

pymongo==3.7.0

pandas==0.23.4

numpy==1.15.1

scipy==1.1.0

seaborn==0.9.0

matplotlib==3.1.0


The function plot_CCDF() is partially based on the code of (5).
The functions kdeplot, _univariate_kdeplot, and _statsmodels_univariate_kde are modified versions of the seaborn code.


# Imports, Settings, and Connection to MongoDB

In [None]:
# Connect to MongoDB
# NOTE(review): `ObjectId` is not referenced anywhere in the visible part of this notebook.
from pymongo import MongoClient
from bson.objectid import ObjectId
# Local MongoDB instance on port 27017; the URI carries no credentials.
client = MongoClient('mongodb://localhost:27017')

# Database and the two collections every later cell reads from:
# `og_coll` holds the original posts, `rp_coll` the reposts.
db = client.weibo_fn
og_coll = db.orig_posts
rp_coll = db.reposts

In [None]:
# Imports
import copy
import warnings

import numpy as np
import pandas as pd
from scipy import integrate, signal, stats
from scipy.stats import spearmanr

try:
    import statsmodels.nonparametric.api as smnp
    _has_statsmodels = True
except ImportError:
    _has_statsmodels = False

In [None]:
# Figure settings
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.transforms as transforms
import matplotlib.gridspec as gridspec

# Global matplotlib defaults: font family and base font size.
plt.rcParams['font.family']=['Arial']
plt.rcParams.update({'font.size': 20})

# Default figure size (inches) and resolution for on-screen and saved figures.
plt.rcParams['figure.figsize'] = (10, 7.5)
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

# Seaborn theme: fully transparent axes background, then the 'white' style.
# NOTE(review): sns.set() applies seaborn's own rc defaults, so it may override the
# font settings made just above, and set_style('white') may in turn reset the
# axes.facecolor passed on the previous line -- confirm the effective rcParams.
sns.set(rc={"axes.facecolor": (0, 0, 0, 0)})
sns.set_style('white')

In [None]:
def get_int(x, y):
    """Return the area under the curve ``y(x)`` using the trapezoidal rule.

    Parameters
    ----------
    x : sequence of numbers
        Sample positions.
    y : sequence of numbers
        Curve values at the positions in ``x`` (same length as ``x``).

    Returns
    -------
    float
        The integral of ``y`` over ``x``.
    """
    # SciPy renamed `integrate.trapz` to `integrate.trapezoid` and later removed the
    # old name; use the new one when it exists and fall back to the old one otherwise.
    trapezoid = getattr(integrate, 'trapezoid', None) or integrate.trapz
    return trapezoid(y, x)

def log_10_product(x, pos):
    """Format a tick value for the log-scaled CCDF axes.

    Intended for ``matplotlib.ticker.FuncFormatter``.

    Parameters
    ----------
    x : number
        Tick value.
    pos : int or None
        Tick position supplied by matplotlib; not used.

    Returns
    -------
    str
        * values below 1 are printed with just enough decimals (2, 3 or 4) to
          show their leading digit; anything at or below 0.00009 falls back to
          the compact ``%g`` form,
        * 10000 / 100000 / 1000000 are abbreviated to ``10K`` / ``100K`` / ``1000K``,
        * every other value is printed as an integer.
    """
    if x < 1:
        if x > 0.009:
            return '%1.2f' % (x)
        if x > 0.0009:
            return '%1.3f' % (x)
        if x > 0.00009:
            return '%1.4f' % (x)
        # Previously this case fell through and returned None, which is not a
        # valid tick label; always hand a string back to the formatter.
        return '%g' % (x)

    z = '%1i' % (x)
    abbreviations = {'10000': '10K', '100000': '100K', '1000000': '1000K'}
    return abbreviations.get(z, z)
        
def abs_loc(x, pos):
    """Tick formatter callback: label a tick with the magnitude of its value.

    ``pos`` is the tick position passed by matplotlib and is ignored.
    """
    magnitude = abs(x)
    return magnitude

In [None]:
# The nine domains in the fixed order used for iterating and for laying out tables.
# NOTE(review): the `diff_capacity` list defined further down ([1..9]) appears to assume
# this order is the diffusion-capacity ranking -- confirm before reordering.
ordered_domains = ['Politics', 'Finance & Business', 'Military', 
                  'Culture & Sports & Entertainment', 'Society & Life', 
                  'Disasters & Accidents', 'Education & Examinations', 'Science & Technology',
                  'Health & Medicine']

In [None]:
# Line colour of each domain in the CCDF plots (looked up by plot_CCDF).
domain2color ={
    'Finance & Business': 'blue',
    'Society & Life': 'orange',
    'Culture & Sports & Entertainment': 'pink',
    'Disasters & Accidents': 'red',
    'Science & Technology': 'gray',
    'Health & Medicine': 'green',
    'Education & Examinations': 'cyan',
    'Politics': 'brown',
    'Military': 'olive'
}

# Domain-level Distribution (Fig. 1, Table 11, Section 3.4)

In [None]:
# The Twitter Data from Vosoughi et al [69].
# Counts per domain. The key order matters: the Weibo counts are collected by iterating
# over these keys, and the Spearman cell slices the first seven values, which leaves out
# the last two entries (the two domains with a count of 0 here).
domain_dist_tw ={
    'Politics': 27600,
    'Society & Life': 16458,
    'Science & Technology': 12043,
    'Finance & Business': 11086,
    'Military': 8054,
    'Culture & Sports & Entertainment': 6046,
    'Disasters & Accidents': 1318,
    'Health & Medicine': 0,
    'Education & Examinations': 0
}

In [None]:
# Schema migration: rename the `topic` field of every original post to `domain`,
# the field name that all later queries in this notebook use.
# This writes to the database.
og_coll.update_many({},{'$rename': { "topic": "domain" }})

In [None]:
# The Weibo data.
# One count query per domain; the keys (and their order) are taken from the Twitter
# dictionary so that the two distributions line up entry by entry.
domain_dist_wb = {
    domain: og_coll.count_documents({'domain': domain})
    for domain in domain_dist_tw.keys()
}

domain_dist_wb

In [None]:
# Spearman's rho in Section 3.4
# Both dictionaries share the same key order (the Weibo one is built by iterating over
# the Twitter one), so their value lists are aligned position by position.
tw_counts = list(domain_dist_tw.values())
wb_counts = list(domain_dist_wb.values())

# Nine domains
print('9 domains:', spearmanr(tw_counts, wb_counts))

# Seven domains (excluding Health & Medicine and Education & Examinations,
# which are the last two keys and are therefore dropped by the [:7] slice)
print('7 domains:', spearmanr(tw_counts[:7], wb_counts[:7]))

# merge Health & Medicine into Science & Technology, 
# and Education & Examinations into Society & Life for alignment
domain_dist_wb_merged = copy.deepcopy(domain_dist_wb)
domain_dist_wb_merged['Science & Technology'] += domain_dist_wb_merged['Health & Medicine']
domain_dist_wb_merged['Society & Life'] += domain_dist_wb_merged['Education & Examinations']
# Correlate against the *merged* Weibo counts. The previous version passed the
# unmerged dictionary here and so merely repeated the seven-domain result.
wb_counts_merged = list(domain_dist_wb_merged.values())
print('after merging:', spearmanr(tw_counts[:7], wb_counts_merged[:7]))


In [None]:
# Data for Fig. 1 and Table 12.
# NOTE(review): the section heading above cites Table 11 -- confirm which number is right.
total_wb = sum(domain_dist_wb.values())
total_tw = sum(domain_dist_tw.values())

# One record per (domain, platform): absolute count and share of that platform's posts.
# Twitter rows come first, then the Weibo rows, both in the Twitter key order.
domain_dists = []
for domain in domain_dist_tw.keys():
    n_tw = domain_dist_tw[domain]
    domain_dists.append({'Domain': domain,
                         'Platform': 'Twitter',
                         'N': n_tw,
                         '%': n_tw / total_tw})

domain_dists2 = []
for domain in domain_dist_tw.keys():
    n_wb = domain_dist_wb[domain]
    domain_dists2.append({'Domain': domain,
                          'Platform': 'Weibo',
                          'N': n_wb,
                          '%': n_wb / total_wb})

domain_dists = pd.DataFrame(domain_dists + domain_dists2)
domain_dists = domain_dists[['Domain', 'Platform', 'N', '%']]
domain_dists

In [None]:
# Fig. 1
# Back-to-back horizontal bar chart on one axes: the Twitter shares are drawn as
# positive percentages, the Weibo shares are multiplied by -100 so that their bars
# extend to the opposite side.
fig, ax = plt.subplots(figsize = (10,7.5))    

bplt = sns.barplot(y = domain_dists['Domain'], x = domain_dists[domain_dists['Platform'] == 'Twitter']['%'] * 100, \
                       data = domain_dists, orient = "h", color = '#A41088', ax = ax)

# No `ax` is passed here, so seaborn draws on the current axes.
bplt2 = sns.barplot(y = domain_dists['Domain'], x = domain_dists[domain_dists['Platform'] == 'Weibo']['%'] * -100,
                   data = domain_dists, orient = "h", color = '#ED693C')

bplt.set_xlabel("%Posts", fontsize = 20)
bplt.set_ylabel("Domain", fontsize = 20)
bplt.set_yticklabels(domain_dists['Domain'], rotation = 0, fontsize = 20)

# Manual legend: one colour patch per platform.
t_patch = mpatches.Patch(color='#A41088', label='Twitter')
w_patch = mpatches.Patch(color='#ED693C', label='Weibo')
plt.legend(handles=[t_patch, w_patch], loc=4, ncol = 1, prop={'size':20})


# Disabled: annotate every bar with its absolute post count.
# for i, d in enumerate(domain_dists['Domain']):
#     p = float(domain_dists[(domain_dists['Platform'] == 'Twitter') & (domain_dists['Domain'] == d)]['%'] * 100)
#     N = int(domain_dists[(domain_dists['Platform'] == 'Twitter') & (domain_dists['Domain'] == d)]['N'])
#     bplt.text(p+0.5, i+0.15, N, fontsize = 15)
#     p = float(domain_dists[(domain_dists['Platform'] == 'Weibo') & (domain_dists['Domain'] == d)]['%'] * 100)
#     N = int(domain_dists[(domain_dists['Platform'] == 'Weibo') & (domain_dists['Domain'] == d)]['N'])
#     bplt2.text(p-3, i+0.15, N, fontsize = 15)
    
# abs_loc labels the negative (Weibo) side with positive numbers; the tick labels are
# then also written out explicitly.
formatter = plt.FuncFormatter(abs_loc)
ax.xaxis.set_major_formatter(formatter)
ax.xaxis.set_ticklabels([40, 30, 20, 10, 0, 10, 20, 30, 40], fontsize=20)
# Assumes the ./figs directory already exists.
plt.savefig('./figs/fig_1.pdf', bbox_inches='tight', format='pdf')

# Diffusion (Section 4)

In [None]:
# Area under each domain's CCDF curve, keyed by measure name and then by domain.
# The inner dictionaries are filled as a side effect of plot_CCDF and are read by the
# diffusion-capacity ranking cell, so the four plotting cells must run before that one.
ints = {
    'Size':{},
    'Maximum Depth':{},
    'Maximum Breadth':{},
    'Number of Engaged Users':{}
}

In [None]:
def measure_stat(measure_name):
    """Collect one cascade measure for every original post, grouped by domain.

    Parameters
    ----------
    measure_name : str
        Name of the document field to read (e.g. ``'size'``). Each value is
        converted with ``int``.

    Returns
    -------
    dict
        Maps every domain in ``ordered_domains`` to the list of values of the
        posts in that domain. Domains without posts map to an empty list.

    Raises
    ------
    KeyError
        If a post lacks ``domain`` or ``measure_name``, or if its domain is not
        one of ``ordered_domains``.
    """
    data = {domain: [] for domain in ordered_domains}

    # Only the two fields that are actually read are fetched from MongoDB.
    for og in og_coll.find({}, {'domain': 1, measure_name: 1}):
        data[og['domain']].append(int(og[measure_name]))

    return data

In [None]:
def _ccdf_points(values):
    """Return the empirical CCDF of ``values`` as two parallel lists ``(xs, ps)``.

    ``xs`` holds the distinct values in ascending order and ``ps[i]`` is the
    percentage of observations that are greater than or equal to ``xs[i]``.
    The input is not modified. An empty input yields two empty lists.
    """
    ordered = sorted(values)
    total = float(len(ordered))
    xs, ps = [], []
    for idx, val in enumerate(ordered):
        # At the first occurrence of a value, everything from this index on is >= it.
        if not xs or val != xs[-1]:
            xs.append(val)
            ps.append(((len(ordered) - idx) / total) * 100)
    return xs, ps


def plot_CCDF(data, measure_name, output_path):
    """Plot the per-domain CCDF of one cascade measure on log-log axes and save it.

    Parameters
    ----------
    data : dict
        Maps a domain name to a list of observed values (as returned by
        ``measure_stat``). The lists are left untouched.
    measure_name : str
        Used as the x-axis label. When it is one of the keys of the module-level
        ``ints`` dictionary, the area under each domain's curve is stored in
        ``ints[measure_name][domain]``; other names are plotted without recording
        an area.
    output_path : str
        Path of the PDF file to write.

    Notes
    -----
    A new, empty figure is opened at the end so that the next plot does not draw
    on top of this one.
    """
    ax = plt.subplot(111)
    ax.set_xscale('log')
    ax.set_yscale('log')
    formatter = plt.FuncFormatter(log_10_product)
    ax.xaxis.set_major_formatter(formatter)
    ax.yaxis.set_major_formatter(formatter)

    legends = []
    for domain, v in data.items():
        xf, pf = _ccdf_points(v)
        ax.plot(xf, pf, '-', color=domain2color[domain], linewidth=1.5)
        # Record the area only for the known measures. Errors raised while
        # integrating are no longer swallowed, because a silently missing area
        # would corrupt the ranking table that is built from `ints`.
        if measure_name in ints:
            ints[measure_name][domain] = get_int(xf, pf)
        legends.append(domain)
    plt.xlabel(measure_name, fontsize=20)
    plt.ylabel("CCDF (%)", fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=20)
    plt.xticks(rotation='horizontal')
    plt.legend(legends, loc='lower left', fontsize=15, frameon=False)
    plt.xlim(xmin=1)
    plt.ylim(ymax=100)
    plt.savefig(output_path, bbox_inches='tight', format='pdf')
    plt.figure()

## Size (Fig. 2(a))

In [None]:
# Per-domain lists of the `size` field of every original post.
sizes = measure_stat('size')

In [None]:
# Draws and saves the CCDF; plot_CCDF also stores the per-domain areas in ints['Size'].
plot_CCDF(sizes, 'Size', './figs/fig_2A.pdf')

## Maximum Depth (Fig. 2B)

In [None]:
# Per-domain lists of the `max_depth` field of every original post.
max_depths = measure_stat('max_depth')

In [None]:
# Draws and saves the CCDF; plot_CCDF also stores the per-domain areas in ints['Maximum Depth'].
plot_CCDF(max_depths, 'Maximum Depth', './figs/fig_2B.pdf')

## Maximum Breadth (Fig. 2C)

In [None]:
# Per-domain lists of the `max_breadth` field of every original post.
max_breadths = measure_stat('max_breadth')

In [None]:
# Draws and saves the CCDF; plot_CCDF also stores the per-domain areas in ints['Maximum Breadth'].
plot_CCDF(max_breadths, 'Maximum Breadth', './figs/fig_2C.pdf')

## Number of Engaged Users (Fig. 2D)

In [None]:
# Per-domain lists of the `unique_user` field of every original post.
neu = measure_stat('unique_user')

In [None]:
# Draws and saves the CCDF; plot_CCDF also stores the per-domain areas in ints['Number of Engaged Users'].
plot_CCDF(neu, 'Number of Engaged Users', './figs/fig_2D.pdf')

## Rankings for Diffusion Capacity (Table 2)

In [None]:
# Normalized Area (NA)
# Requires the four plot_CCDF cells above to have run: they fill `ints` with the area
# under each domain's CCDF curve. Every measure is scaled by its largest area, and
# 'All measures' accumulates the sum of the scaled areas across the measures.
normed_ints = {}
normed_ints['All measures'] = {domain:0 for domain in ordered_domains}
for measure, stat in ints.items():
    max_int = np.max(list(stat.values()))
    normed_stat = {k : (v / max_int) for k, v in stat.items()}
    normed_ints[measure] = normed_stat
    normed_ints['All measures'] = {k : normed_ints['All measures'][k] + (v / max_int) 
                                   for k, v in stat.items()}

# Rank (R): 1 = largest normalized area. Equal values share the same rank because
# list.index returns the position of the first match.
rank_ints = {}
for measure, data in normed_ints.items():
    values = list(data.values())
    values.sort(reverse=True)
    ranks = {k : (int(values.index(v)) + 1) for k, v in data.items()}
    rank_ints[measure] = ranks

# Assemble the table: one row per domain, an '<measure>_NA' and an '<measure>_R'
# column for each measure.
diff_capacity_table = {domain:{} for domain in ordered_domains}
measures = ['Size', 'Maximum Depth', 'Maximum Breadth', 
            'Number of Engaged Users', 'All measures']
for domain in ordered_domains:
    obj = {}
    for mea in measures:
        obj[mea + '_NA'] = normed_ints[mea][domain]
        obj[mea + '_R'] = rank_ints[mea][domain]
    diff_capacity_table[domain] = obj
diff_capacity_table = pd.DataFrame(diff_capacity_table).T
diff_capacity_table

# Role of Engaged Users (Section 5)

## User Characteristics (Section 5.1)

### Gender (Fig. 3)

In [None]:
# Gender of the posters per domain, plus an 'All domains' row.
# Posts published by verified organizations are left out of the count.
gender_dist = {name: {'male': 0, 'female': 0}
               for name in ordered_domains + ['All domains']}

for og in og_coll.find({}):
    if og['userType'] == 'Verified Organization':
        continue
    for bucket in (og['domain'], 'All domains'):
        gender_dist[bucket][og['userGender']] += 1

# Shares are derived from the two raw counts of each row.
for counts in gender_dist.values():
    domain_total = sum(counts.values())
    counts['%male'] = counts['male'] / domain_total
    counts['%female'] = 1 - counts['%male']

# Reference row with fixed shares and no raw counts.
gender_dist['Weibo registered users'] = {'male':'-', 'female':'-', '%male':0.57, '%female':0.43}
gender_dist = pd.DataFrame(gender_dist).T
gender_dist

In [None]:
# plot
# Fig. 3: stacked-looking horizontal bars built from two overlaid bar plots.
sns.set_context({"figure.figsize": (10,10)})
 
#Plot 1 - background - "total" (top) series
# A full-width (100%) bar for each of the 11 rows of gender_dist; the part left visible
# after the overlay stands for the female share.
top_plot = sns.barplot(y = gender_dist.index, x = np.ones(11) * 100.0, 
                       orient="h", color='#A41088')

#Plot 2 - overlay - "bottom" series
# The male share, drawn on top of the background bars.
bottom_plot = sns.barplot(y = gender_dist.index, x = gender_dist['%male']* 100.0, 
                          orient="h", color='#ED693C')

bottom_plot.set_xlabel("%user", fontsize = 20)
bottom_plot.set_ylabel("Domain", fontsize = 20)
bottom_plot.set_yticklabels(gender_dist.index, rotation=0, fontsize=20)
bottom_plot.set_xticklabels([0, 20,40,60,80,100], fontsize=20)
bottom_plot.set(xlim=(0,100), ylim=(10.4,-0.4))

w_patch = mpatches.Patch(color='#ED693C', label='Male')
t_patch = mpatches.Patch(color='#A41088', label='Female')
plt.legend(handles=[t_patch, w_patch], loc=4, ncol = 1, prop={'size':20})

# Dashed white reference line at 57%, the male share of the 'Weibo registered users'
# row, with its value written next to the axis.
bottom_plot.vlines(57, -0.4, 10.4, linestyles='--', colors='#ffffff', label='wo')
bottom_plot.text(53.8, 10.87, '57', fontsize=20)

# Assumes the ./figs directory already exists.
plt.savefig('./figs/fig_3.pdf', bbox_inches='tight', format='pdf')

### Age (Table 3 and Fig. 4)

In [None]:
# Table 3
# Ages of the posters per domain, plus an 'All domains' list.
# Posts whose userBirthYear is -1 are skipped by the query.
age_dist = {domain:[] for domain in ordered_domains}
age_dist['All domains'] = []
for og in og_coll.find({'userBirthYear':{'$ne':-1}}):
    # age is calculated when a user published the collected post
    age = og['pubYear'] - og['userBirthYear']
    # Keep only ages strictly between 6 and 100.
    if age > 6 and age < 100:
        age_dist[og['domain']].append(age)
        age_dist['All domains'].append(age)

In [None]:
# Table 3: summary statistics of the posters' ages for every domain and for all domains.
age_stat = {domain:{} for domain in ordered_domains}
age_stat['All domains'] = {}
for domain in age_stat.keys():
    ages = age_dist[domain]
    total = len(ages)
    age_stat[domain]['Average'] = np.average(ages)
    age_stat[domain]['Upper Quartile'] = np.quantile(ages, .75)
    # Three disjoint age bands. Age 65 belongs to the middle band only; the last band
    # used `>= 65` before, which counted 65-year-olds twice and contradicted its label.
    age_stat[domain]['N(<30)'] = np.sum([x < 30 for x in ages])
    age_stat[domain]['N(30~65)'] = np.sum([30 <= x <= 65 for x in ages])
    age_stat[domain]['N(>65)'] = np.sum([x > 65 for x in ages])
    age_stat[domain]['%(<30)'] = age_stat[domain]['N(<30)'] / total
    age_stat[domain]['%(30~65)'] = age_stat[domain]['N(30~65)'] / total
    age_stat[domain]['%(>65)'] = age_stat[domain]['N(>65)'] / total
age_stat = pd.DataFrame(age_stat).T
age_stat = age_stat[['Average', 'Upper Quartile', 'N(<30)', '%(<30)', 'N(30~65)', '%(30~65)',
             'N(>65)', '%(>65)']]
age_stat

In [None]:
def kdeplot(data, data2=None, shade=False, vertical=False, kernel="gau",
            bw="scott", gridsize=100, cut=3, clip=None, legend=True,
            cumulative=False, shade_lowest=True, cbar=False, cbar_ax=None,
            cbar_kws=None, ax=None, **kwargs):
    """Draw a univariate kernel density estimate of ``data`` and return the axes.

    ``data`` (and ``data2`` when given) is converted to a float64 array; lists are
    turned into arrays first. The drawing itself is delegated to
    ``_univariate_kdeplot`` on ``ax``, or on the current axes when ``ax`` is None.

    ``data2``, ``shade_lowest``, ``cbar``, ``cbar_ax`` and ``cbar_kws`` are accepted
    but play no part in the drawing done here.
    """
    target_ax = plt.gca() if ax is None else ax

    def _as_float64(values):
        # Lists have no .astype, so wrap them in an array first.
        arr = np.asarray(values) if isinstance(values, list) else values
        return arr.astype(np.float64)

    data = _as_float64(data)
    if data2 is not None:
        data2 = _as_float64(data2)

    return _univariate_kdeplot(data, shade, vertical, kernel, bw, gridsize,
                               cut, clip, legend, target_ax,
                               cumulative=cumulative, **kwargs)

def _scipy_univariate_kde(data, bw, gridsize, cut, clip):
    """Compute a univariate Gaussian KDE with scipy (used when statsmodels is missing).

    Returns ``(grid, density)``: ``gridsize`` evenly spaced points that extend
    ``cut`` bandwidths beyond the data range (limited to ``clip``) and the
    density evaluated at those points.
    """
    kde = stats.gaussian_kde(data, bw_method=bw)
    if isinstance(bw, str):
        # Turn the rule name into an absolute bandwidth so that the support can
        # be padded by `cut` bandwidths on either side.
        rule = "scotts" if bw == "scott" else bw
        bw = getattr(kde, "%s_factor" % rule)() * np.std(data)
    support_min = max(data.min() - bw * cut, clip[0])
    support_max = min(data.max() + bw * cut, clip[1])
    grid = np.linspace(support_min, support_max, gridsize)
    return grid, kde(grid)


def _univariate_kdeplot(data, shade, vertical, kernel, bw, gridsize, cut,
                        clip, legend, ax, cumulative=False, **kwargs):
    """Plot a univariate kernel density estimate on one of the axes.

    Only the shaded area is drawn (when ``shade`` is true): the region under the
    curve is filled in slices coloured along the 'plasma_r' colormap. No outline
    is left on the axes.

    Parameters
    ----------
    data : numpy array (or object with the same ``var``/``min``/``max`` API)
        Observations.
    shade : bool
        Whether to fill the area under the curve.
    vertical : bool
        If true, density is put on the x axis and the values on the y axis.
    kernel, bw, gridsize, cut, clip, cumulative
        Passed to the KDE routine (statsmodels when available, scipy otherwise).
    legend : bool
        Accepted for signature compatibility; not used.
    ax : matplotlib axes
        Axes to draw on.
    **kwargs
        Forwarded to ``ax.plot`` after ``label`` and ``facecolor`` are removed.

    Returns
    -------
    The axes ``ax``.

    Raises
    ------
    ImportError
        If ``cumulative`` is requested without statsmodels installed.
    """
    # Sort out the clipping
    if clip is None:
        clip = (-np.inf, np.inf)

    # Calculate the KDE
    if np.nan_to_num(data.var()) == 0:
        # Don't try to compute KDE on singular data
        msg = "Data must have variance to compute a kernel density estimate."
        warnings.warn(msg, UserWarning)
        x, y = np.array([]), np.array([])

    elif _has_statsmodels:
        # Prefer using statsmodels for kernel flexibility
        x, y = _statsmodels_univariate_kde(data, kernel, bw,
                                           gridsize, cut, clip,
                                           cumulative=cumulative)
    else:
        # Fall back to scipy if missing statsmodels
        if kernel != "gau":
            kernel = "gau"
            msg = "Kernel other than `gau` requires statsmodels."
            warnings.warn(msg, UserWarning)
        if cumulative:
            raise ImportError("Cumulative distributions are currently "
                              "only implemented in statsmodels. "
                              "Please install statsmodels.")
        x, y = _scipy_univariate_kde(data, bw, gridsize, cut, clip)

    # Make sure the density is nonnegative
    y = np.amax(np.c_[np.zeros_like(y), y], axis=1)

    # Flip the data if the plot should be on the y axis
    if vertical:
        x, y = y, x

    # These two entries must not reach ax.plot.
    kwargs.pop("label", None)
    kwargs.pop("facecolor", None)

    # Draw and immediately remove a throwaway line. Nothing visible remains, but the
    # axes' colour cycle advances exactly as it did before.
    line, = ax.plot(x, y, **kwargs)
    line.remove()

    # Shade the area under the curve slice by slice, one colour per grid point.
    # Skipped when the curve is empty (zero-variance data), where the colour
    # normalisation would otherwise fail.
    if shade and len(x) > 1:
        colors = plt.get_cmap('plasma_r')(norm(x))
        for i in range(x.shape[0] - 1):
            ax.fill_between(x[i:], y[i:], color=colors[i])

    # Set the density axis minimum to 0
    if vertical:
        ax.set_xlim(0, auto=None)
    else:
        ax.set_ylim(0, auto=None)

    ax.set_xlabel(ax.get_xlabel(), fontsize=20)
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=20)

    return ax

def _statsmodels_univariate_kde(data, kernel, bw, gridsize, cut, clip,
                                cumulative=False):
    """Fit a statsmodels ``KDEUnivariate`` and return ``(grid, curve)``.

    ``grid`` is the support of the fitted estimator; ``curve`` is its CDF when
    ``cumulative`` is true and its density otherwise.
    """
    estimator = smnp.KDEUnivariate(data)
    # The FFT flag is switched on for the Gaussian kernel only.
    use_fft = (kernel == "gau")
    estimator.fit(kernel, bw, use_fft, gridsize=gridsize, cut=cut, clip=clip)

    grid = estimator.support
    curve = estimator.cdf if cumulative else estimator.density
    return grid, curve

def norm(x):
    """Min-max scale ``x``: subtract its minimum and divide by (maximum - minimum)."""
    lowest = np.min(x)
    highest = np.max(x)
    return (x - lowest) / (highest - lowest)

In [None]:
# Figure 4
# Long-format table with one row per (domain, age) observation, as needed by FacetGrid.
age_dist_list = []
for domain, ages in age_dist.items():
    for age in ages:
        age_dist_list.append({'Domain':domain, 'Age':age})
age_dist_list = pd.DataFrame(age_dist_list)
sns.set_context({"figure.figsize": (10, 10)})

# Top-to-bottom order of the rows in the figure.
domain_order = ['Politics', 'Military', 'Finance & Business', 'Culture & Sports & Entertainment', 
               'Science & Technology', 'Health & Medicine', 'Society & Life', 
               'Education & Examinations','Disasters & Accidents', 'All domains']

# NOTE(review): `pal` is not used anywhere else in this cell.
pal = sns.cubehelix_palette(10, light=.7, reverse = True)

# One row per domain, sharing both axes; each row is drawn by the notebook's own
# kdeplot (defined above), not by seaborn's.
age_plot = sns.FacetGrid(age_dist_list, row="Domain", hue="Domain", 
                         aspect=10, height=1.5, xlim=(0,100),
                         sharex=True, sharey=True, 
                         row_order = domain_order)
age_plot = age_plot.map(kdeplot, "Age", clip_on=False, shade=True, alpha=1)

age_plot.fig.subplots_adjust(hspace=.05)

# Row labels placed by hand in figure coordinates, in the same order as domain_order.
age_plot.fig.text(0.65, 0.9, 'Politics', fontsize=20)
age_plot.fig.text(0.65, 0.81, 'Military', fontsize=20)
age_plot.fig.text(0.65, 0.72, 'Finance & Business', fontsize=20)
age_plot.fig.text(0.65, 0.63, 'Culture & Sports & Entertainment', fontsize=20)
age_plot.fig.text(0.65, 0.535, 'Science & Technology', fontsize=20)
age_plot.fig.text(0.65, 0.445, 'Health & Medicine', fontsize=20)
age_plot.fig.text(0.65, 0.35, 'Society & Life', fontsize=20)
age_plot.fig.text(0.65, 0.26, 'Education & Examinations', fontsize=20)
age_plot.fig.text(0.65, 0.17, 'Disasters & Accidents', fontsize=20)
age_plot.fig.text(0.65, 0.08, 'All domains', fontsize=20)

# Remove the automatic facet titles, the y ticks and the left spine.
age_plot.set_titles("")
age_plot.set(yticks=[])
age_plot.despine(left=True)

# Assumes the ./figs directory already exists.
plt.savefig('./figs/fig_4.pdf', bbox_inches='tight', format='pdf')

### Account (Table 4, 5, and 6)

#### Table 4

In [None]:
# Share of starters (_%st) and of reposts (_%rp) per account type, for every domain
# and for all domains together.
account_dist = {domain:{} for domain in ordered_domains}
account_dist['All domains'] = {}
type_order = ['Unverified User', 'Verified Individual',
             'Verified Organization']
# `userType` query value for each entry of type_order, in the same order.
# Both kinds of ordinary user count as 'Unverified User'.
user_type_queries = [
    {'$in':['Ordinary User','Active Ordinary User']},
    'Verified Individual',
    'Verified Organization',
]
total_st_cnt = [0, 0, 0]
total_rp_cnt = [0, 0, 0]

for domain in ordered_domains:
    st_cnt = []
    rp_cnt = []
    for user_type in user_type_queries:
        n_posts = 0
        cascade_total = 0
        for og in og_coll.find({'domain':domain, 'userType':user_type}):
            n_posts += 1
            cascade_total += og['size']
        st_cnt.append(n_posts)
        # `size` is reduced by one per post, leaving the number of reposts.
        rp_cnt.append(cascade_total - n_posts)

    total_st_cnt = [running + new for running, new in zip(total_st_cnt, st_cnt)]
    total_rp_cnt = [running + new for running, new in zip(total_rp_cnt, rp_cnt)]
    st_p = [cnt / sum(st_cnt) for cnt in st_cnt]
    rp_p = [cnt / sum(rp_cnt) for cnt in rp_cnt]

    for type_name, rp_share, st_share in zip(type_order, rp_p, st_p):
        account_dist[domain][type_name + '_%rp'] = rp_share
        account_dist[domain][type_name + '_%st'] = st_share

# Calculate the total
total_st_p = [cnt / sum(total_st_cnt) for cnt in total_st_cnt]
total_rp_p = [cnt / sum(total_rp_cnt) for cnt in total_rp_cnt]
for type_name, rp_share, st_share in zip(type_order, total_rp_p, total_st_p):
    account_dist['All domains'][type_name + '_%rp'] = rp_share
    account_dist['All domains'][type_name + '_%st'] = st_share

account_dist = pd.DataFrame(account_dist).T
account_dist = account_dist[
    ['Unverified User_%st', 'Unverified User_%rp',
     'Verified Individual_%st', 'Verified Individual_%rp',
     'Verified Organization_%st', 'Verified Organization_%rp']
]
account_dist

#### Table 5

In [None]:
# Distribution of repost labels among verified organizations, per organization domain
# (`userDomain`) and in total.
org_repost_beliefs = {}
# Running totals per label, initialised for every label present in the repost collection.
global_aggs = {k:0 for k in rp_coll.distinct('label')}

# Row and column order of the final table.
beliefs_order = ['believe', 'debunk', 'DNB', 'doubt', 'unknown']
orgs_order = ['Police', 'Government', 'Media', 'Company',
              'School', 'Social Org.', 'Total']

match = {'userType':'Verified Organization'}
# NOTE(review): `group` is not referenced anywhere else in this cell.
group = {'_id':'$userDomain', 'label':{'$sum':1}}

for udom in rp_coll.distinct('userDomain', match):
    # Count the reposts of this organization domain by label.
    aggs = {}
    for agg in rp_coll.aggregate([{'$match':{'userType':'Verified Organization',
                                             'userDomain':udom}},
                                  {'$group':{'_id':'$label', 
                                             'label':{'$sum':1}}}]):
        aggs[agg['_id']] = agg['label']
        global_aggs[agg['_id']] += agg['label']
    # Turn the counts into shares within this organization domain.
    local_total = sum(aggs.values())
    normed_aggs = {k:v/local_total for k,v in aggs.items()}
    org_repost_beliefs[udom] = normed_aggs
global_total = sum(global_aggs.values())
org_repost_beliefs['Total'] = {k:v/global_total for k,v in global_aggs.items()}

# Labels as rows, organization domains as columns, both in the fixed order above.
# Selecting by these lists raises KeyError if one of the names is absent from the data.
org_repost_beliefs = pd.DataFrame(org_repost_beliefs).T
org_repost_beliefs = org_repost_beliefs[beliefs_order]
org_repost_beliefs = org_repost_beliefs.T
org_repost_beliefs = org_repost_beliefs[orgs_order]
org_repost_beliefs


#### Table 6

In [None]:
# Per domain: how many original posts were reposted with the label 'believe' by at
# least one verified organization, compared with all original posts whose size is > 1.
org_fooled_dist = {domain:{'Number of the fooling-organization posts':0} 
                   for domain in ordered_domains}
fooling_org_posts_total = 0
column_order = [
    'Number of the fooling-organization posts',
    'Proportion of the fooling-organization posts',
    'Number of all reposted false original posts',
    'Proportion of all reposted false original posts',
    'Difference'
]

# Distinct original posts (by forwardedId) that a verified organization reposted with
# the label 'believe'.
repostedIds = set()
for rp in rp_coll.find({'userType' : 'Verified Organization',
                        'label':'believe'}):
    repostedIds.add(rp['forwardedId'])

# Look up the domain of each of those original posts.
# NOTE(review): find_one returns None when no post has this weiboId, which would raise
# a TypeError here -- confirm every forwardedId has a matching original post.
for rpid in repostedIds:
    domain = og_coll.find_one({'weiboId':rpid})['domain']
    org_fooled_dist[domain]['Number of the fooling-organization posts'] += 1
    fooling_org_posts_total +=1

# Number of original posts with size > 1, across all domains.
og_total = og_coll.count_documents({'size':{'$gt':1}})

for domain in ordered_domains:
    org_fooled_dist[domain]['Proportion of the fooling-organization posts']=\
    org_fooled_dist[domain]['Number of the fooling-organization posts'] / fooling_org_posts_total
    
    org_fooled_dist[domain]['Number of all reposted false original posts'] = \
    og_coll.count_documents({'domain':domain, 'size':{'$gt':1}})
    
    org_fooled_dist[domain]['Proportion of all reposted false original posts'] = \
    org_fooled_dist[domain]['Number of all reposted false original posts'] / og_total
    
    # Positive when the domain is over-represented among the fooling-organization posts.
    org_fooled_dist[domain]['Difference'] = \
    org_fooled_dist[domain]['Proportion of the fooling-organization posts'] - \
    org_fooled_dist[domain]['Proportion of all reposted false original posts']
    
org_fooled_dist = pd.DataFrame(org_fooled_dist).T
org_fooled_dist = org_fooled_dist[column_order]
org_fooled_dist


## User Emotions (Section 5.2, Fig. 5)

In [None]:
# Emotion keys as stored in the emotion sub-documents of the posts, and the matching
# capitalised labels (same order) used as table column names.
emotions = ['disgust', 'like', 'anger', 'sadness', 'surprise', 'joy', 'fear']
emotions_cap = ['Disgust', 'Like', 'Anger', 'Sadness', 'Surprise', 'Joy', 'Fear']
# Reference ranks 1..9 that the emotion ranks are correlated against.
# NOTE(review): presumably the diffusion-capacity rank of each domain in the order of
# `ordered_domains` -- confirm against the ranking table.
diff_capacity = [1,2,3,4,5,6,7,8,9]

### Whole Emotional Intensity (Section 5.2)

In [None]:
# Original Posts
# Sum the total emotional intensity of the original posts per domain ...
orig_post_emo_int = dict.fromkeys(ordered_domains, 0)
for og in og_coll.find({}):
    orig_post_emo_int[og['domain']] += og['emo_weiboContent']['total']

# ... and divide by the number of original posts in that domain.
mean_emo_int = {}
for t in ordered_domains:
    mean_emo_int[t] = orig_post_emo_int[t] / og_coll.count_documents({'domain':t})
orig_post_emo_int = mean_emo_int

# Rank 1 = highest mean intensity; ranks are listed in the order of ordered_domains.
sorted_orig_post_emo_int = sorted(orig_post_emo_int.values(), reverse=True)
orig_post_emo_int_rank = [sorted_orig_post_emo_int.index(v)+1 for v in orig_post_emo_int.values()]

print(orig_post_emo_int_rank)
print(spearmanr(diff_capacity, orig_post_emo_int_rank))

In [None]:
# Reposts
# Number of reposts per domain: every cascade contributes its size minus one.
rp_domain = {}
for t in ordered_domains:
    rp_domain[t] = sum((og['size'] - 1) for og in og_coll.find({'domain':t}))

# Sum the total emotional intensity of the reposts per domain; only posts with
# size > 1 are read.
repost_emo_int = dict.fromkeys(ordered_domains, 0)
for og in og_coll.find({'size':{'$gt':1}}):
    repost_emo_int[og['domain']] += og['emo_reposts']['total']

# Mean intensity per repost.
mean_repost_emo_int = {}
for t in ordered_domains:
    mean_repost_emo_int[t] = repost_emo_int[t] / rp_domain[t]
repost_emo_int = mean_repost_emo_int

# Rank 1 = highest mean intensity; ranks are listed in the order of ordered_domains.
sorted_repost_emo_int = sorted(repost_emo_int.values(), reverse=True)
repost_emo_int_rank = [sorted_repost_emo_int.index(v)+1 for v in repost_emo_int.values()]

print(repost_emo_int_rank)
print(spearmanr(diff_capacity,repost_emo_int_rank))

### Emotion Ranks of the Domains (Table 13, Table 14, and Figure 5)

In [None]:
# Emotion distribution for the original posts
# Table 13
orig_post_emo_dist = {domain: dict.fromkeys(emotions, 0) for domain in ordered_domains}

# Per post, add each emotion's share of the post's total intensity.
# Posts whose total is 0 contribute nothing.
for og in og_coll.find({}):
    post_total = og['emo_weiboContent']['total']
    if post_total == 0:
        continue
    domain_shares = orig_post_emo_dist[og['domain']]
    for emo in emotions:
        domain_shares[emo] += (og['emo_weiboContent'][emo] / post_total)

# Average the summed shares over all original posts of the domain.
for domain in ordered_domains:
    domain_n = og_coll.count_documents({'domain':domain})
    orig_post_emo_dist[domain] = {e:(v / domain_n) for e, v in orig_post_emo_dist[domain].items()}

orig_post_emo_dist = pd.DataFrame(orig_post_emo_dist).T
orig_post_emo_dist = orig_post_emo_dist[emotions]
# The labels are wrapped in an outer list on purpose: the ranking cell that follows
# unpacks each value with `v[0]` and depends on the column structure this produces.
orig_post_emo_dist.columns = [emotions_cap]
orig_post_emo_dist
 

In [None]:
# Fig. 5(Left)
# For every emotion, rank the domains by their mean share (1 = highest).
orig_post_emo_rank = {}
for emo in emotions_cap:
    # Each row of the selection is a one-element list; take its single value.
    emo_vals = [row[0] for row in orig_post_emo_dist[emo].values.tolist()]
    sorted_emo_vals = sorted(emo_vals, reverse=True)
    orig_post_emo_rank[emo] = {
        domain: sorted_emo_vals.index(emo_vals[i]) + 1
        for i, domain in enumerate(ordered_domains)
    }

# Domains as rows (in ordered_domains order), emotions as columns.
orig_post_emo_rank = pd.DataFrame(orig_post_emo_rank).T
orig_post_emo_rank = orig_post_emo_rank[ordered_domains].T
orig_post_emo_rank

In [None]:
# Caption of Table 13
# Spearman correlation between each emotion's domain ranks and diff_capacity.
for e in emotions_cap:
    print(e, spearmanr(orig_post_emo_rank[e], diff_capacity))

In [None]:
# Emotion distribution for the reposts
# Table 14
repost_emo_dist = {domain: dict.fromkeys(emotions, 0) for domain in ordered_domains}

# Per cascade with size > 1, add each emotion's share of the reposts' total intensity.
# Cascades whose total is 0 contribute nothing.
for og in og_coll.find({'size':{'$gt':1}}):
    repost_total = og['emo_reposts']['total']
    if repost_total == 0:
        continue
    domain_shares = repost_emo_dist[og['domain']]
    for emo in emotions:
        domain_shares[emo] += (og['emo_reposts'][emo] / repost_total)

# NOTE(review): the sums above cover only posts with size > 1, while the divisor below
# counts every original post of the domain -- confirm this denominator is intended.
for domain in ordered_domains:
    domain_n = og_coll.count_documents({'domain':domain})
    repost_emo_dist[domain] = {e:(v / domain_n) for e, v in repost_emo_dist[domain].items()}

repost_emo_dist = pd.DataFrame(repost_emo_dist).T
repost_emo_dist = repost_emo_dist[emotions]
# The labels are wrapped in an outer list on purpose: the ranking cell that follows
# unpacks each value with `v[0]` and depends on the column structure this produces.
repost_emo_dist.columns = [emotions_cap]
repost_emo_dist
 

In [None]:
# Fig. 5 (Middle)
# For every emotion, rank the domains by their mean share in the reposts (1 = highest).
repost_emo_rank = {}
for emo in emotions_cap:
    # Each row of the selection is a one-element list; take its single value.
    emo_vals = [row[0] for row in repost_emo_dist[emo].values.tolist()]
    sorted_emo_vals = sorted(emo_vals, reverse=True)
    repost_emo_rank[emo] = {
        domain: sorted_emo_vals.index(emo_vals[i]) + 1
        for i, domain in enumerate(ordered_domains)
    }

# Domains as rows (in ordered_domains order), emotions as columns.
repost_emo_rank = pd.DataFrame(repost_emo_rank).T
repost_emo_rank = repost_emo_rank[ordered_domains].T
repost_emo_rank

In [None]:
# Caption of Table 14
# Spearman correlation between each emotion's domain ranks and diff_capacity.
for e in emotions_cap:
    print(e, spearmanr(repost_emo_rank[e], diff_capacity))

In [None]:
# Fig. 5 Plot
# Three panels side by side on a 1x9 grid: two rank heatmaps (4 columns each) and a
# narrow panel (1 column) with the rank profiles.
fig = plt.figure(tight_layout=True, figsize=(19,9))
gs = gridspec.GridSpec(1,9)

# subplot 1 (Left)
# Heatmap of the emotion ranks of the original posts; drawn on the axes just created.
ax = fig.add_subplot(gs[0,0:4])
orig_post_emo_rank_ht = sns.heatmap(data=orig_post_emo_rank, cbar=False, annot=True, annot_kws={"size": 15})


orig_post_emo_rank_ht.set_xlabel("Emotion of the Original Posts", fontsize = 15)
orig_post_emo_rank_ht.set_ylabel("Domain", fontsize = 15)
orig_post_emo_rank_ht.set_xticklabels(orig_post_emo_rank_ht.get_xticklabels(), fontsize = 15)
orig_post_emo_rank_ht.set_yticklabels(orig_post_emo_rank_ht.get_yticklabels(), fontsize = 15)

# subplot 2 (Middle)
# Heatmap of the emotion ranks of the reposts; the domain labels are left out here.
ax = fig.add_subplot(gs[0,4:8])
repost_emo_rank_ht = sns.heatmap(data=repost_emo_rank, cbar=False, annot=True, annot_kws={"size": 15})

repost_emo_rank_ht.set_xlabel("Emotion of the Reposts", fontsize = 15)
repost_emo_rank_ht.set_xticklabels(repost_emo_rank_ht.get_xticklabels(), fontsize = 15)
repost_emo_rank_ht.set_yticklabels([])

# subplot 3 (Right)
# The three rank profiles are plotted against the index 0..8 and rotated by 270 degrees
# around the point (3.5, 4.5) through an extra transform applied before the data transform.
ax = fig.add_subplot(gs[0,8])
base = plt.gca().transData
rot = transforms.Affine2D().rotate_deg_around(3.5, 4.5, 270)
# ax = axes[2]
# Reference profile: the values 0..8.
line_diffusion = ax.plot(range(9), color='#690098', alpha=.5, transform = rot + base)
ax.fill_between(x = range(0,9,1), y1 = range(9), color='#690098',
                alpha=.1, transform = rot + base)

# Emotional-intensity ranks of the original posts, shifted to start at 0.
line_orig_post = ax.plot(np.array(orig_post_emo_int_rank) - 1, '--', color='#F38A3E', alpha=.5, transform = rot + base)
ax.fill_between(x = range(0,9,1), y1 = np.array(orig_post_emo_int_rank) - 1, color='#F38A3E',
                alpha=.4, transform = rot + base)

# Emotional-intensity ranks of the reposts, shifted to start at 0.
line_repost = ax.plot(np.array(repost_emo_int_rank) - 1, '-.', color='#BE2F65', alpha=.5, transform = rot + base)
ax.fill_between(x = range(0,9,1), y1 = np.array(repost_emo_int_rank) - 1, color='#BE2F65',
                alpha=.4, transform = rot + base)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)

# Legend entries are matched to the three lines by plotting order.
plt.legend(['Diffusion', 'Orig. Post EI', 'Repost EI'], 
           bbox_to_anchor=(0.5, 0.04),loc='upper center', borderaxespad=0,
           frameon=False, labelspacing=-0.01, borderpad=0)
# NOTE(review): the tick positions 0 and 7 are hand-placed for the rotated plot --
# confirm they line up with ranks 1 and 9.
plt.xticks([0, 7], ( '1','9'))
plt.xlabel('Rank', fontsize=15)
plt.yticks([])


# Assumes the ./figs directory already exists.
plt.savefig('./figs/fig_5.pdf', bbox_inches='tight', format='pdf')


## User Behaviors (Table 7, 8, and 9)

### Number of Reposts for Reply (Table 7)

In [None]:
# `replyCnt` of every original post with gotForward > 0, per domain and in total.
reply_dist = {name: [] for name in ordered_domains + ['Total']}

for og in og_coll.find({'gotForward':{'$gt':0}}):
    for bucket in (og['domain'], 'Total'):
        reply_dist[bucket].append(og['replyCnt'])

# Note: the 'Standard Error' column holds np.std, i.e. the population standard
# deviation of the values, not the standard error of the mean.
data_stat = pd.DataFrame([
    {'Domain': name, 'Mean': np.average(reply_cnts), 'Standard Error': np.std(reply_cnts)}
    for name, reply_cnts in reply_dist.items()
])
data_stat = data_stat[['Domain', 'Mean', 'Standard Error']]
data_stat

### Cascade Concentration (Table 7)

In [None]:
# `concentration` of every original post, per domain and in total.
cc_dist = {name: [] for name in ordered_domains + ['Total']}

for og in og_coll.find({}):
    for bucket in (og['domain'], 'Total'):
        cc_dist[bucket].append(og['concentration'])

# Mean concentration, scaled by 100.
data_stat = pd.DataFrame([
    {'Domain': name, 'Mean': np.average(concentrations) * 100.0}
    for name, concentrations in cc_dist.items()
])
data_stat = data_stat[['Domain', 'Mean']]
data_stat

### Starter Engagement (Table 8)

In [None]:
# Starter engagement per domain: `starterRepostCnt` + 1 for every original post with
# gotForward > 0. Because of the +1, a stored value of 2 means one repost by the starter.
se_dist = {domain:[] for domain in ordered_domains}

for og in og_coll.find({'gotForward':{'$gt':0}}):
    se_dist[og['domain']].append(og['starterRepostCnt'] + 1)

# Area under each domain's CCDF curve, computed the same way as in plot_CCDF:
# for each distinct value d, p is the percentage of observations >= d.
# Note that se.sort() sorts the lists stored in se_dist in place.
se_ints = {domain:0  for domain in ordered_domains}
for domain, se in se_dist.items():
    total=float(len(se))
    xf = []
    pf = []
    se.sort()
    counts = list(set(se))
    counts.sort()
    for d in counts:
        ind = se.index(d)
        count = len(se[ind:])
        p = (count / float(total)) * 100
        xf.append(d) 
        pf.append(p)
    se_ints[domain] = get_int(xf, pf)
    
# Normalized area (NA): each area divided by the largest one; rank (R): 1 = largest.
normed_se_ints = {domain : (se_ints[domain] / max(se_ints.values())) 
                  for domain in ordered_domains}
values = list(normed_se_ints.values())
values.sort(reverse=True)
rank_se_ints = {k : (int(values.index(v)) + 1) for k, v in normed_se_ints.items()}

# Summary table. 'Standard Error' holds np.std, i.e. the population standard deviation.
# The thresholds 2 and 11 correspond to >= 1 and >= 10 reposts because of the +1 above.
data_stat = {}
for k, v in se_dist.items():
    l = len(se_dist[k])
    data_stat[k]={
        'Mean': np.average(v),
        'Standard Error': np.std(v),
        'Min': np.min(v),
        'Median': int(np.median(v)),
        'Max': np.max(v),
        '% of having >= 1 repost': np.sum(list(map(lambda x : x >= 2, se_dist[k]))) / l * 100.0,
#         '% of having >= 5 reposts': np.sum(list(map(lambda x : x >= 6, se_dist[k]))) / l * 100.0,
        '% of having >= 10 reposts': np.sum(list(map(lambda x : x >= 11, se_dist[k]))) / l * 100.0,
        'NA':normed_se_ints[k], 
        'R': rank_se_ints[k],
    }
data_stat = pd.DataFrame(data_stat).T
data_stat = data_stat[['Mean', 'Standard Error', 
                       'Min', 'Median', 'Max',
                       '% of having >= 1 repost', 
#                        '% of having >= 5 reposts', 
                       '% of having >= 10 reposts',
                       'NA', 'R']]
data_stat

### Starters' Reposts Content Analysis (Table 9)

In [None]:
# Per domain (and for all domains): in how many cascades the starter reposted, and in
# how many of those the starter's reposts carry a 'DNB', 'doubt' or 'debunk' label.
st_repost_dist = {domain:{'Number of cascades in which starters expressed disbeliefs':0,
                         'Number of cascades that starters reposted':0} 
                   for domain in ordered_domains}
st_repost_dist['All domains'] =  \
    {'Number of cascades in which starters expressed disbeliefs':0, \
     'Number of cascades that starters reposted':0} 
column_order = ['Number of cascades in which starters expressed disbeliefs',
                'Number of cascades that starters reposted',
                'Disbelief rate',
                'Rank']

# Only posts that have the field are read.
# NOTE(review): the field name is spelled 'starterRerpostBeliefs' -- presumably this is
# how it is stored in the database; verify before correcting the spelling.
for og in og_coll.find({'starterRerpostBeliefs':{'$exists':True}}):
    st_repost_dist[og['domain']]['Number of cascades that starters reposted'] += 1
    st_repost_dist['All domains']['Number of cascades that starters reposted'] += 1
    beliefs = og['starterRerpostBeliefs']
    if 'DNB' in beliefs or 'doubt' in beliefs or 'debunk' in beliefs:
        st_repost_dist[og['domain']]['Number of cascades in which starters expressed disbeliefs'] += 1
        st_repost_dist['All domains']['Number of cascades in which starters expressed disbeliefs'] += 1

# Disbelief rate = disbelief cascades / cascades with starter reposts.
# Raises ZeroDivisionError for a domain without any cascade that its starter reposted.
for domain in st_repost_dist.keys():
    st_repost_dist[domain]['Disbelief rate'] = \
    st_repost_dist[domain]['Number of cascades in which starters expressed disbeliefs'] / \
    st_repost_dist[domain]['Number of cascades that starters reposted']

# Rank the nine domains by disbelief rate (1 = highest); 'All domains' gets no rank.
rates = []
for domain in ordered_domains:
    rates.append(st_repost_dist[domain]['Disbelief rate'])
rates.sort(reverse=True)
for domain in ordered_domains:
    st_repost_dist[domain]['Rank'] = rates.index(st_repost_dist[domain]['Disbelief rate']) + 1 

st_repost_dist = pd.DataFrame(st_repost_dist).T
st_repost_dist = st_repost_dist[column_order]
st_repost_dist
