In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import texthero
from scipy.stats import poisson
from tqdm import tqdm

In [None]:
# Load the gzip-compressed, pickled corpus DataFrame.
# NOTE(review): read_pickle unpickles arbitrary Python objects, which can execute
# code on load -- only point this at a file from a trusted source.
# Later cells read a 'documents' column from every row and unpack each entry as
# (doc_id, text, date, sub_reddit, labels).
df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')

In [None]:
# Histograms of post length for posts that carry exactly one label.
# Each dict stores {<number of words in the cleaned post>: <number of posts>, ...}
rn_post_length_words = {}   # real-news posts
fn_post_length_words = {}   # fake-news posts
all_post_length_words = {}  # real-news and fake-news posts together

# Only the 'documents' column is needed, so iterate it directly instead of
# materialising a full row Series per record with iterrows().
for documents in tqdm(df['documents'], total=len(df)):
    for doc_id, text, date, sub_reddit, labels in documents:
        # Posts with zero or several labels are skipped here.
        if len(labels) != 1:
            continue

        # The second field of a label is the fake-news flag (1 == fake news).
        is_fake_news = (labels[0][1] == 1)

        # texthero.clean works on a Series, so wrap the single text and take
        # the one cleaned string back out.
        cleaned_text = texthero.clean(pd.Series(text))[0]

        # Count whitespace-separated tokens. Calling len() on the cleaned
        # string itself would give the number of characters, not words.
        n_words = len(cleaned_text.split())

        group_counts = fn_post_length_words if is_fake_news else rn_post_length_words
        group_counts[n_words] = group_counts.get(n_words, 0) + 1
        all_post_length_words[n_words] = all_post_length_words.get(n_words, 0) + 1


In [None]:
# Scatter plot of one post-length histogram: x = post length, y = number of posts.
# The histogram is selected together with the wording used in the title, so the
# title always names the data that is actually drawn.
# Alternatives: (all_post_length_words, 'all labeled'), (fn_post_length_words, 'fake-news')
dic, dic_label = rn_post_length_words, 'real-news' # change dic here

plt.figure(figsize=(10,8))
x_axis = sorted(dic)  # distinct post lengths, ascending
# Number of posts observed for each length, aligned with x_axis.
y_axis = np.array([dic[length] for length in x_axis], dtype=float)

plt.scatter(x_axis, y_axis, marker='x')
plt.xlim([10,1600])
plt.title(f'Distribution in {dic_label} posts')
plt.xlabel('Number of words')
plt.ylabel('Number of posts')
# NOTE(review): savefig does not create './overview/'; the directory must exist.
plt.savefig('./overview/word_length_dist.png')
plt.show()


# Empirical cumulative distribution of post length, one line per group.
plt.figure(figsize=(10,8))

# Each line gets its own label so the legend cannot drift out of sync with the
# plotting order. A dedicated loop variable is used so the `dic` selection of
# the scatter plot is not overwritten.
cumulative_groups = [
    ('all', all_post_length_words),
    ('fake news', fn_post_length_words),
    ('real news', rn_post_length_words),
]
for group_label, length_counts in cumulative_groups:
    x_axis = sorted(length_counts)  # distinct post lengths, ascending
    counts = np.array([length_counts[length] for length in x_axis], dtype=float)
    # Share of posts whose length is <= the corresponding x value.
    y_axis = np.cumsum(counts) / counts.sum()

    plt.plot(x_axis, y_axis, label=group_label)

plt.xlim([30,1600])
# Optional: plt.xscale('log') spreads out the short-post region.
plt.title('Cumulative Distribution')
plt.xlabel('Number of words')
# The y values are normalised by the group total, i.e. a fraction in [0, 1].
plt.ylabel('Cumulative fraction of posts')
plt.legend()
# NOTE(review): savefig does not create './overview/'; the directory must exist.
plt.savefig('./overview/word_length_cum.png')
plt.show()

In [None]:
# Posts with multiple labels: for each such post, tally how many of its labels
# are flagged as fake news and how many are not.
posts = {'x': [], 'y': []} # x: rn, y:fn

for _, record in tqdm(df.iterrows(), total = len(df)):
    for doc_id, text, date, sub_reddit, labels in record['documents']:
        # Single-label and unlabeled posts are not part of this tally.
        if len(labels) <= 1:
            continue

        # A label whose fake-news flag equals 1 counts as fake news; every
        # other label counts as real news.
        fake_count = sum(1 for page, fn_news, bias, factual in labels if fn_news == 1)
        real_count = len(labels) - fake_count

        posts['x'].append(real_count)
        posts['y'].append(fake_count)

In [None]:
# Scatter the per-post tallies: real-news count on x, fake-news count on y.
p = pd.DataFrame(posts)

fig, ax = plt.subplots(figsize=(10,8))
ax.scatter(p['x'], p['y'], marker='x')
ax.set_xlabel('Real-News Links')
ax.set_ylabel('Fake-News Links')
ax.set_title('Distribution of fake and real News in posts with multiple Links')
fig.savefig('./overview/multi_links.png')