In [None]:
# author: Jana Lasser & Almog Simchon

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import statsmodels.formula.api as smf

# LME regression NewsGuard score on belief & truth similarity

**Note**: for this script to work, you first have to run `tweet_collection/wrangle_data.ipynb` with the code for the dictionary robustness analysis included. This produces an output file `US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip` that includes the honesty component similarities for the 100 perturbed dictionary versions. The data-wrangling cell below reads the file named in `fname` (currently `tweets.csv.gzip`), so make sure `fname` points to that output file.

## Data wrangling

In [3]:
# input and output folders (both point at the tweets data directory)
src = "../../data/tweets"
dst = "../../data/tweets"
fname = "tweets.csv.gzip"

# fixed variables from different embeddings and dictionary versions:
# 100 truth score columns followed by 100 belief score columns
fixed_variables = [
    f"avg_{component}_score_{i}"
    for component in ("truth", "belief")
    for i in range(100)
]

cols = [
    "author_id",  # data grouping: independent random variable
    "party",      # characteristic of author: independent fixed variable
    "NG_score",   # dependent variable
] + fixed_variables

# read only the columns needed for the regressions; author ids stay strings
tweets = pd.read_csv(
    Path(src) / fname,
    usecols=cols,
    dtype={"author_id": str},
    compression="gzip",
)

  tweets = pd.read_csv(


In [4]:
# Short names for the score columns: avg_truth_score_<i> -> truth_<i> and
# avg_belief_score_<i> -> belief_<i>, truth entries first, then belief.
fixed_variables_name_map = {
    f"avg_{component}_score_{version}": f"{component}_{version}"
    for component in ("truth", "belief")
    for version in range(100)
}

In [5]:
# apply the short column names defined in fixed_variables_name_map
tweets = tweets.rename(fixed_variables_name_map, axis="columns")

In [6]:
# remove independents: keep only tweets by Democrats and Republicans
is_major_party = tweets["party"].isin(["Democrat", "Republican"])
tweets = tweets[is_major_party]
# remove tweets without NG, belief or truth score (any missing value)
tweets = tweets.dropna()

In [7]:
# Number of tweets per author as a two-column frame ("author_id", "count").
# The column names are pinned explicitly via rename_axis / reset_index(name=...)
# because a bare value_counts().reset_index() labels its columns differently
# across pandas versions ("index"/"author_id" in 1.x, "author_id"/"count" in
# 2.x); renaming based on the 1.x labels yields two "count" columns and no
# "author_id" column under 2.x.
tweet_counts = (
    tweets["author_id"]
    .value_counts()
    .rename_axis("author_id")
    .reset_index(name="count")
)

In [8]:
# keep only authors that contributed more than one tweet
multi_tweet_authors = tweet_counts.loc[tweet_counts["count"] > 1, "author_id"]
tweets = tweets[tweets["author_id"].isin(multi_tweet_authors)]

In [9]:
# Dependent variable: NG_score divided by 100.
# assign() returns a new frame instead of writing a column into the filtered
# slice produced by the preceding boolean indexing, which avoids pandas'
# SettingWithCopyWarning while giving the same resulting columns.
tweets = tweets.assign(NG=tweets["NG_score"] / 100)

In [10]:
# mean-center every renamed belief / truth score column
for score_col in fixed_variables_name_map.values():
    column_mean = tweets[score_col].mean()
    tweets[score_col] = tweets[score_col].sub(column_mean)

In [11]:
# rebind to an independent copy of the frame (deep copy is the default);
# presumably to detach it from the earlier filtered slices -- confirm
tweets = tweets.copy(deep=True)

## Calculate estimates with perturbed dictionaries

In [None]:
# Fit one linear mixed-effects model per perturbed dictionary version (0..99)
# and store the fixed-effect estimates and p-values, one row per run.
# Rows are collected in a list and turned into a DataFrame once at the end
# instead of growing a DataFrame with pd.concat inside the loop.

# parameters whose names do not depend on the dictionary version
basic_stats = [
    "Intercept",
    "party[T.Republican]",
]

records = []
for i in range(100):
    print(i)  # progress indicator
    # output column prefix -> name of the fitted parameter for this version
    stats = {
        "belief": f"belief_{i}",
        "truth": f"truth_{i}",
        "belief:party[T.Republican]": f"belief_{i}:party[T.Republican]",
        "truth:party[T.Republican]": f"truth_{i}:party[T.Republican]",
        "belief:truth": f"belief_{i}:truth_{i}",
        "belief:truth:party[T.Republican]": f"belief_{i}:truth_{i}:party[T.Republican]"
    }

    # random effects are grouped by author
    md = smf.mixedlm(
        f"NG ~ 1 + belief_{i} * truth_{i} + belief_{i} * truth_{i} * party",
        tweets,
        groups=tweets["author_id"],
        re_formula=f"~belief_{i} * truth_{i}"
    )
    res = md.fit(method=["lbfgs"], maxiter=30000)

    row = {"run": i}
    for stat in basic_stats:
        row[stat + "_estimate"] = res.params[stat]
        row[stat + "_pval"] = res.pvalues[stat]
    for stat, param in stats.items():
        row[stat + "_estimate"] = res.params[param]
        row[stat + "_pval"] = res.pvalues[param]
    records.append(row)

results = pd.DataFrame(records)
results.to_csv(Path(dst, "LME_results_dictionary_robustness.csv"), index=False)

## Calculate estimates with dictionaries reduced by one

In [54]:
# input and output folders (both point at the tweets data directory)
src = "../../data/tweets"
dst = "../../data/tweets"

# honesty component scores for the 37 reduced-by-one dictionary versions
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_honesty_component_scores_glove_reducebyone.csv.gzip"

# fixed variables from different embeddings and dictionary versions:
# 37 truth score columns followed by 37 belief score columns
fixed_variables = [
    f"avg_{component}_score_{i}"
    for component in ("truth", "belief")
    for i in range(37)
]

cols = [
    "id",
    "author_id",  # data grouping: independent random variable
] + fixed_variables

scores = pd.read_csv(
    Path(src) / fname,
    usecols=cols,
    dtype={"author_id": str, "id": str},
    compression="gzip",
)

# tweet-level data: party, NewsGuard score and the full-dictionary scores
fname = "US_politician_tweets_2010-11-06_to_2022-12-31.csv.gzip"
tweets = pd.read_csv(
    Path(src) / fname,
    usecols=["id", "party", "NG_score", "avg_belief_score", "avg_truth_score"],
    dtype={"id": str},
    compression="gzip",
)

In [55]:
# keep only tweets that have a NewsGuard score
has_ng_score = tweets["NG_score"].notna()
tweets = tweets[has_ng_score]

In [56]:
# attach the reduced-dictionary scores to each tweet via the tweet id; the
# left join leaves NaNs for tweets without a match, and dropna() then removes
# those together with any other row that contains a missing value
tweets = tweets.merge(scores, how="left", on="id").dropna()

In [57]:
# Short names for the score columns: avg_truth_score_<i> -> truth_<i> and
# avg_belief_score_<i> -> belief_<i>, truth entries first, then belief.
fixed_variables_name_map = {
    f"avg_{component}_score_{version}": f"{component}_{version}"
    for component in ("truth", "belief")
    for version in range(37)
}

In [58]:
# apply the short column names defined in fixed_variables_name_map
tweets = tweets.rename(fixed_variables_name_map, axis="columns")

In [59]:
# remove independents: keep only tweets by Democrats and Republicans
is_major_party = tweets["party"].isin(["Democrat", "Republican"])
tweets = tweets[is_major_party]
# remove tweets without NG, belief or truth score (any missing value)
tweets = tweets.dropna()

In [60]:
# Number of tweets per author as a two-column frame ("author_id", "count").
# The column names are pinned explicitly via rename_axis / reset_index(name=...)
# because a bare value_counts().reset_index() labels its columns differently
# across pandas versions ("index"/"author_id" in 1.x, "author_id"/"count" in
# 2.x); renaming based on the 1.x labels yields two "count" columns and no
# "author_id" column under 2.x.
tweet_counts = (
    tweets["author_id"]
    .value_counts()
    .rename_axis("author_id")
    .reset_index(name="count")
)

In [61]:
# keep only authors that contributed more than one tweet
multi_tweet_authors = tweet_counts.loc[tweet_counts["count"] > 1, "author_id"]
tweets = tweets[tweets["author_id"].isin(multi_tweet_authors)]

In [62]:
# Dependent variable: NG_score divided by 100.
# assign() returns a new frame instead of writing a column into the filtered
# slice produced by the preceding boolean indexing, which avoids pandas'
# SettingWithCopyWarning while giving the same resulting columns.
tweets = tweets.assign(NG=tweets["NG_score"] / 100)

In [63]:
# mean-center every renamed belief_<i> / truth_<i> score column
# NOTE(review): avg_belief_score and avg_truth_score are not part of
# fixed_variables_name_map, so they are left uncentered here although the
# models further down use them as predictors -- confirm this is intended.
for score_col in fixed_variables_name_map.values():
    column_mean = tweets[score_col].mean()
    tweets[score_col] = tweets[score_col].sub(column_mean)

In [64]:
# rebind to an independent copy of the frame (deep copy is the default);
# presumably to detach it from the earlier filtered slices -- confirm
tweets = tweets.copy(deep=True)

In [70]:
# keyword lists of the two dictionaries, in the order stored in the files;
# they are indexed by run number when labelling the result rows below
truth_keywords = list(
    pd.read_csv("../../data/utilities/truth_seeking_p=0.05_swapped_wn_def_example.csv")["truth_seeking"]
)
belief_keywords = list(
    pd.read_csv("../../data/utilities/belief_speaking_p=0.05_swapped_wn_def_example.csv")["belief_speaking"]
)

In [None]:
def _fit_lme_stats(data, formula, re_formula, param_names, suffix):
    """Fit one mixed-effects model and return its fixed-effect statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the formula variables and the ``author_id`` column,
        which is used as the grouping variable.
    formula : str
        Model formula passed to ``smf.mixedlm``.
    re_formula : str
        Random-effects formula passed to ``smf.mixedlm``.
    param_names : dict
        Maps an output column prefix to the name of the fitted parameter
        whose estimate and p-value are read.
    suffix : str
        Inserted between the prefix and ``_estimate`` / ``_pval`` in the
        output keys.

    Returns
    -------
    dict
        ``{prefix}_{suffix}_estimate`` and ``{prefix}_{suffix}_pval`` for
        every entry of ``param_names``, in the order given.
    """
    md = smf.mixedlm(
        formula,
        data,
        groups=data["author_id"],
        re_formula=re_formula
    )
    res = md.fit(method=["lbfgs"], maxiter=30000)
    fitted = {}
    for prefix, param in param_names.items():
        fitted[f"{prefix}_{suffix}_estimate"] = res.params[param]
        fitted[f"{prefix}_{suffix}_pval"] = res.pvalues[param]
    return fitted


# parameters whose names do not depend on the dictionary version; they are
# reported under their own name
basic_stats = [
    "Intercept",
    "party[T.Republican]",
]
basic_param_names = {stat: stat for stat in basic_stats}

# rows are collected in lists and converted to DataFrames once at the end
belief_records = []
truth_records = []

for i in range(37):
    print(i)  # progress indicator
    # reduced belief dictionary version i combined with the full truth score
    stats1 = {
        "belief": f"belief_{i}",
        "truth": "avg_truth_score",
        "belief:party[T.Republican]": f"belief_{i}:party[T.Republican]",
        "truth:party[T.Republican]": "avg_truth_score:party[T.Republican]",
        "belief:truth": f"belief_{i}:avg_truth_score",
        "belief:truth:party[T.Republican]": f"belief_{i}:avg_truth_score:party[T.Republican]"
    }
    # full belief score combined with reduced truth dictionary version i
    stats2 = {
        "belief": "avg_belief_score",
        "truth": f"truth_{i}",
        "belief:party[T.Republican]": "avg_belief_score:party[T.Republican]",
        "truth:party[T.Republican]": f"truth_{i}:party[T.Republican]",
        "belief:truth": f"avg_belief_score:truth_{i}",
        "belief:truth:party[T.Republican]": f"avg_belief_score:truth_{i}:party[T.Republican]"
    }

    # changing belief-speaking dictionary
    row = {"run": i, "keyword": belief_keywords[i]}
    row.update(_fit_lme_stats(
        tweets,
        f"NG ~ 1 + belief_{i} * avg_truth_score + belief_{i} * avg_truth_score * party",
        f"~belief_{i} * avg_truth_score",
        {**basic_param_names, **stats1},
        "belief",
    ))
    belief_records.append(row)

    # changing truth-seeking dictionary
    row = {"run": i, "keyword": truth_keywords[i]}
    row.update(_fit_lme_stats(
        tweets,
        f"NG ~ 1 + avg_belief_score * truth_{i} + avg_belief_score * truth_{i} * party",
        f"~avg_belief_score * truth_{i}",
        {**basic_param_names, **stats2},
        "truth",
    ))
    truth_records.append(row)

results_belief = pd.DataFrame(belief_records)
results_truth = pd.DataFrame(truth_records)

results_belief.to_csv(Path(dst, "LME_results_dictionary_belief_reducedbyone.csv"), index=False)
results_truth.to_csv(Path(dst, "LME_results_dictionary_truth_reducedbyone.csv"), index=False)