In [1]:
import pandas as pd

def _average_replies_per_tweet(engagement_df, label):
    """Return the mean number of rows per distinct ``tweet_id``.

    Prints the total row count and the number of distinct tweets using the
    same ``total_<label>_replies`` / ``num_<label>_tweets`` labels as before.

    Args:
        engagement_df: DataFrame with a ``tweet_id`` column, one row per reply.
        label: Word inserted into the printed labels (``"fake"`` or ``"real"``).

    Returns:
        Total grouped rows divided by the number of distinct tweets, or
        ``0.0`` when there are no tweets to average over.
    """
    # One entry per distinct tweet_id; pandas' groupby leaves out rows whose
    # key is missing, so the total here can be smaller than len(engagement_df).
    replies_per_tweet = engagement_df.groupby('tweet_id').size()
    total_replies = replies_per_tweet.sum()
    print(f"total_{label}_replies", total_replies)
    num_tweets = len(replies_per_tweet)
    print(f"num_{label}_tweets", num_tweets)

    # Avoid 0 / 0, which would yield NaN and poison every later product.
    if num_tweets == 0:
        return 0.0
    return total_replies / num_tweets


def bayesian_analysis_manual(fake_news_file, real_news_file):
    """Compute a Bayes-style share of reply engagement attributable to fake news.

    Both CSV files must contain a ``tweet_id`` column with one row per reply.
    The priors are each file's share of the combined row count.  The values
    reported as "likelihoods" are the average number of replies per tweet
    (they are averages, not probabilities, and may exceed 1), so the
    posterior is the prior-weighted share of average replies that belongs to
    the fake-news file.

    Args:
        fake_news_file: Path (or anything ``pd.read_csv`` accepts) to the
            fake-news replies CSV.
        real_news_file: Path (or anything ``pd.read_csv`` accepts) to the
            real-news replies CSV.

    Returns:
        dict with the keys ``'Prior P(h)'``, ``'Prior P(¬h)'``,
        ``'Likelihood P(e|h)'``, ``'Likelihood P(e|¬h)'``,
        ``'Marginal Likelihood P(e)'`` and ``'Posterior P(h|e)'``.
        The posterior is NaN when the marginal is zero (no groupable
        replies on either side), because the ratio is undefined.

    Raises:
        ValueError: If both files contain no rows, so no prior can be formed.
    """
    # Load the data
    fake_news_engagement_df = pd.read_csv(fake_news_file)
    real_news_engagement_df = pd.read_csv(real_news_file)

    # Step 1: Calculate Prior Probabilities
    num_fake_rows = len(fake_news_engagement_df)
    num_real_rows = len(real_news_engagement_df)
    total_news = num_fake_rows + num_real_rows
    print("Total news", total_news)
    if total_news == 0:
        raise ValueError(
            "Both input files are empty; prior probabilities are undefined."
        )
    fake_news_proportion = num_fake_rows / total_news
    print("fake news engagement", num_fake_rows)
    real_news_proportion = num_real_rows / total_news
    print("real news engagement", num_real_rows)

    # Step 2: Calculate the "likelihoods" (average replies per tweet)
    avg_fake_news_replies_manual = _average_replies_per_tweet(
        fake_news_engagement_df, "fake"
    )
    avg_real_news_replies_manual = _average_replies_per_tweet(
        real_news_engagement_df, "real"
    )

    # Step 3: Calculate Marginal Likelihood
    fake_evidence = avg_fake_news_replies_manual * fake_news_proportion
    real_evidence = avg_real_news_replies_manual * real_news_proportion
    P_e = fake_evidence + real_evidence

    # Step 4: Apply Bayes' Theorem to calculate P(h|e)
    # With no evidence on either side the ratio is 0 / 0: report NaN
    # explicitly instead of raising or emitting a numpy warning.
    P_h_given_e = fake_evidence / P_e if P_e != 0 else float('nan')

    # Return the results
    return {
        'Prior P(h)': fake_news_proportion,
        'Prior P(¬h)': real_news_proportion,
        'Likelihood P(e|h)': avg_fake_news_replies_manual,
        'Likelihood P(e|¬h)': avg_real_news_replies_manual,
        'Marginal Likelihood P(e)': P_e,
        'Posterior P(h|e)': P_h_given_e
    }

# Reply CSVs for fake and real COVID-19 news, taken from the CoAID
# directory named 05-01-2020 on the author's machine.
fake_news_file_5jan = '/Users/hariharanduraisingh/Downloads/jupyter_notebook/CoAID/05-01-2020/NewsFakeCOVID-19_tweets_replies.csv'
real_news_file_5jan = '/Users/hariharanduraisingh/Downloads/jupyter_notebook/CoAID/05-01-2020/NewsRealCOVID-19_tweets_replies.csv'

# Run the analysis; the function prints its intermediate counts itself.
results = bayesian_analysis_manual(
    fake_news_file_5jan,
    real_news_file_5jan,
)

# Report each returned quantity on its own "name: value" line.
for key in results:
    value = results[key]
    print(f'{key}: {value}')


Total news 69836
fake news engagement 5721
real news engagement 64115
total_fake_replies 5721
num_fake_tweets 1266
total_real_replies 64115
num_real_tweets 13481
Prior P(h): 0.08192049945586803
Prior P(¬h): 0.918079500544132
Likelihood P(e|h): 4.518957345971564
Likelihood P(e|¬h): 4.755952822490913
Marginal Likelihood P(e): 4.736538034685667
Posterior P(h|e): 0.07815734616524023
