In [None]:
# author: Jana Lasser & Almog Simchon

In [94]:
import pandas as pd
import numpy as np
from pathlib import Path
import statsmodels.formula.api as smf

# LME regression of NewsGuard score on belief & truth similarity

**Note**: For this notebook to work, you first have to run `tweet_collection/wrangle_data.ipynb` with the code for the dictionary robustness analysis included. This produces the output file `US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip`, which includes the honesty component similarities for the 100 perturbed dictionary versions.

## Data wrangling

In [125]:
# Input and output locations (currently the same folder) and the input file.
src = "../../data/tweets"
dst = "../../data/tweets"
fname = "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"

# Fixed variables from the different embeddings and dictionary versions:
# all 100 truth-score columns first, followed by all 100 belief-score columns.
fixed_variables = [
    f"avg_{component}_score_{version}"
    for component in ("truth", "belief")
    for version in range(100)
]

# Columns to read from the tweet file.
cols = [
    "retweeted",  # used to filter out retweets
    "author_id",  # data grouping: independent random variable
    "party",  # characteristic of author: independent fixed variable
    "NG_score",  # dependent variable
    *fixed_variables,
]

# Read only the columns listed in `cols`; author IDs are parsed as strings.
# The compression is passed explicitly because the file name ends in ".gzip"
# rather than the ".gz" suffix pandas infers gzip from.
tweets_path = Path(src) / fname
tweets = pd.read_csv(
    tweets_path,
    usecols=cols,
    dtype={"author_id": str},
    compression="gzip",
)

In [126]:
# Map the long score column names to short names, e.g.
# "avg_truth_score_7" -> "truth_7" and "avg_belief_score_7" -> "belief_7".
# Truth entries are inserted before belief entries.
fixed_variables_name_map = {}
for component in ("truth", "belief"):
    for i in range(100):
        fixed_variables_name_map[f"avg_{component}_score_{i}"] = f"{component}_{i}"

In [127]:
# Apply the short "truth_<i>" / "belief_<i>" names to the score columns.
tweets = tweets.rename(fixed_variables_name_map, axis="columns")

In [128]:
# Row filters: keep original tweets (no retweets) from the two major parties.
not_retweet = tweets["retweeted"] == False
major_party = tweets["party"].isin(["Democrat", "Republican"])  # remove independents

tweets = (
    tweets[not_retweet & major_party]
    .drop(columns=["retweeted"])  # only needed for the retweet filter
    .dropna()  # remove tweets without NG, belief or truth score
)

In [129]:
# Number of remaining tweets per author, as a frame with the columns
# "author_id" and "count".
# The column names are set explicitly via rename_axis / reset_index(name=...)
# because the default names produced by value_counts().reset_index() differ
# between pandas versions ("index"/"author_id" before 2.0,
# "author_id"/"count" from 2.0 on). Renaming based on the old defaults
# silently yields two "count" columns and no "author_id" on pandas >= 2.0.
tweet_counts = (
    tweets["author_id"]
    .value_counts()
    .rename_axis("author_id")
    .reset_index(name="count")
)

In [130]:
# Keep only tweets from authors with more than one tweet.
has_multiple_tweets = tweet_counts["count"] > 1
multi_tweet_authors = tweet_counts.loc[has_multiple_tweets, "author_id"]
tweets = tweets[tweets["author_id"].isin(multi_tweet_authors)]

In [131]:
# Dependent variable for the regression: NG_score divided by 100
# (presumably rescaling a 0-100 score to 0-1 -- TODO confirm the score range).
tweets = tweets.assign(NG=tweets["NG_score"] / 100)

In [132]:
# Mean-centre every belief / truth similarity column: subtract the column's
# mean over the remaining tweets from each value.
for score_col in fixed_variables_name_map.values():
    column_mean = tweets[score_col].mean()
    tweets[score_col] = tweets[score_col] - column_mean

In [133]:
# Rebind `tweets` to a deep copy of itself (presumably to detach it from the
# filtered intermediates above before model fitting -- TODO confirm intent).
tweets = tweets.copy(deep=True)

## Calculate estimates with perturbed dictionaries

In [None]:
# Fit one linear mixed-effects model per perturbed dictionary version (run i)
# and store the fixed-effect estimates and p-values, one row per run, in
# "LME_results_dictionary_robustness.csv".

# Fixed effects whose coefficient name does not depend on the run index.
basic_stats = [
    "Intercept",
    "party[T.Republican]",
]

# One dict per run; the DataFrame is built once after the loop instead of
# being re-created with pd.concat in every iteration.
records = []
for i in range(100):
    print(i)  # progress indicator

    # Map run-independent output labels to this run's coefficient names, so
    # that the result columns are called the same for every dictionary version.
    stats = {
        "belief": f"belief_{i}",
        "truth": f"truth_{i}",
        "belief:party[T.Republican]": f"belief_{i}:party[T.Republican]",
        "truth:party[T.Republican]": f"truth_{i}:party[T.Republican]",
        "belief:truth": f"belief_{i}:truth_{i}",
        "belief:truth:party[T.Republican]": f"belief_{i}:truth_{i}:party[T.Republican]",
    }

    # NG regressed on belief, truth, party and their interactions; tweets are
    # grouped by author, with random effects specified by `re_formula`.
    # Note: in formula syntax `a * b * c` already expands to all main effects
    # and interactions, so the leading `belief * truth` terms are repeated;
    # the formula is left exactly as in the original analysis.
    md = smf.mixedlm(
        f"NG ~ 1 + belief_{i} * truth_{i} + belief_{i} * truth_{i} * party",
        tweets,
        groups=tweets["author_id"],
        re_formula=f"~belief_{i} * truth_{i}",
    )
    res = md.fit(method=["lbfgs"], maxiter=30000)

    row = {"run": i}
    for stat in basic_stats:
        row[stat + "_estimate"] = res.params[stat]
        row[stat + "_pval"] = res.pvalues[stat]
    for label, param_name in stats.items():
        row[label + "_estimate"] = res.params[param_name]
        row[label + "_pval"] = res.pvalues[param_name]
    records.append(row)

results = pd.DataFrame(records)
results.to_csv(Path(dst, "LME_results_dictionary_robustness.csv"), index=False)