In [None]:
# author: Jana Lasser & Almog Simchon

In [2]:
import pandas as pd
import numpy as np
from os.path import join
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import matplotlib.pyplot as plt

# OLS regression articles

## Regression table functionality

In [9]:
# Opening part of the LaTeX regression table: starts the table/tabular
# environments and writes the column headings of the coefficient rows.
# Every backslash is doubled so the literal contains no invalid escape
# sequences; the resulting text is plain LaTeX.
table_header = '''
\\begin{table}[]
    \\footnotesize
    \\centering
    \\begin{tabular}{l|c|c|c|c|c|c}
            & coef. & std. err. & $t$ & $P>\\vert t \\vert$ & [0.025 & 0.975] \\\\
            \\toprule
'''
# Closing part of the table: final rule, end of the tabular environment and
# placeholder caption/label that still have to be filled in by hand.
table_footer = '''
            \\bottomrule
    \\end{tabular}
    \\caption{TODO}
    \\label{tab:TODO}
\\end{table}
'''

def _summary_row(left_label, left_value, right_label, right_value, decimals):
    """Format one LaTeX row holding two labelled model statistics."""
    # The value is wrapped in braces so that \multicolumn receives the whole
    # number as its third argument instead of only the first character.
    return (
        f"\t\t\\multicolumn{{2}}{{l}}{{{left_label}}} & "
        f"\\multicolumn{{1}}{{r}}{{{left_value:1.{decimals}f}}} & "
        f"\\multicolumn{{2}}{{l}}{{{right_label}}} & "
        f"\\multicolumn{{2}}{{r}}{{{right_value:1.{decimals}f}}} \\\\ \n"
    )


def make_regression_table(res, data, dep_col, name_map, decimals=3):
    """Render a fitted regression result as a LaTeX table.

    The table consists of ``table_header``, one row per model term
    (coefficient, standard error, t value, p value and both confidence
    bounds), a rule, six rows of model-level statistics and ``table_footer``.

    Parameters
    ----------
    res : fitted results object
        Must provide ``conf_int()``, ``params``, ``bse``, ``tvalues``,
        ``pvalues``, ``rsquared``, ``rsquared_adj``, ``mse_model``, ``aic``,
        ``ssr``, ``bic``, ``llf``, ``fvalue``, ``f_pvalue`` and ``resid``.
    data : pandas.DataFrame
        Frame holding the dependent variable; used for its mean and
        standard deviation.
    dep_col : str
        Name of the dependent-variable column in ``data``.
    name_map : dict
        Maps every term name in ``res.conf_int().index`` to the LaTeX label
        printed in the first column.
    decimals : int, optional
        Number of digits after the decimal point for every number in the
        table (default 3).

    Returns
    -------
    str
        The complete LaTeX source of the table.

    Raises
    ------
    KeyError
        If a model term has no entry in ``name_map``.
    """
    conf_int = res.conf_int()
    parts = [table_header]

    # One row per model term, in the order reported by the fit.
    for var in conf_int.index:
        values = (
            res.params[var],
            res.bse[var],
            res.tvalues[var],
            res.pvalues[var],
            conf_int.loc[var, 0],  # lower confidence bound
            conf_int.loc[var, 1],  # upper confidence bound
        )
        cells = " & ".join(f"{value:1.{decimals}f}" for value in values)
        parts.append(f"\t\t{name_map[var]} & {cells} \\\\ \n")

    parts.append("\t\t\\bottomrule \n")

    # NOTE(review): mean and S.D. are taken over every row of `data`. If the
    # fit dropped rows (e.g. because of missing values) these two numbers
    # describe a slightly different sample than the model - confirm.
    dep_var = data[dep_col]
    summary_stats = [
        ("R-squared", res.rsquared, "Mean dependent var", dep_var.mean()),
        ("Adjusted R-squared", res.rsquared_adj, "S.D. dependent var", dep_var.std()),
        ("Model MSE", res.mse_model, "AIC", res.aic),
        ("Sum squared resid", res.ssr, "BIC", res.bic),
        ("Log-likelihood", res.llf, "F-statistic", res.fvalue),
        ("Durbin-Watson stat", durbin_watson(res.resid), "Prob(F-statistic)", res.f_pvalue),
    ]
    parts.extend(_summary_row(*row, decimals) for row in summary_stats)

    parts.append(table_footer)
    return "".join(parts)

## OLS regression of NewsGuard score on belief & truth similarity

In [3]:
src = "../../data/articles"
fname = "articles.csv.gzip"
articles = pd.read_csv(join(src, fname), compression="gzip")
# Keep only articles that were shared by a single party. The explicit copy
# turns the filtered selection into an independent frame, so assigning to its
# columns afterwards neither raises SettingWithCopyWarning nor depends on
# whether pandas returned a view of the unfiltered data.
articles = articles[articles["party_count"] == 1].copy()

In [4]:
# Rescale the NewsGuard score by a factor of 1/100 ...
articles["NG_score"] = articles["NG_score"].div(100)
# ... and shift both similarity scores so that each has zero mean.
for score_col in ("avg_belief_score", "avg_truth_score"):
    articles[score_col] = articles[score_col].sub(articles[score_col].mean())

In [5]:
# Regress the NewsGuard score on party, both similarity scores and all of
# their interactions, then show the coefficient table of the fit.
mod = smf.ols(
    data=articles,
    formula='NG_score ~ avg_belief_score + avg_truth_score + party + party * avg_belief_score + party * avg_truth_score + party * avg_belief_score * avg_truth_score',
)
res = mod.fit()
print(res.summary2().tables[1])

                                                       Coef.  Std.Err.  \
Intercept                                           0.945342  0.000428   
party[T.Republican]                                -0.093835  0.000676   
avg_belief_score                                   -0.002277  0.007632   
party[T.Republican]:avg_belief_score               -0.339251  0.012057   
avg_truth_score                                     0.016746  0.008399   
party[T.Republican]:avg_truth_score                 0.172206  0.013259   
avg_belief_score:avg_truth_score                    0.146949  0.020161   
party[T.Republican]:avg_belief_score:avg_truth_... -0.063404  0.035111   

                                                              t  \
Intercept                                           2208.290601   
party[T.Republican]                                 -138.898371   
avg_belief_score                                      -0.298330   
party[T.Republican]:avg_belief_score                 -28.138367  

In [7]:
# Display the full summary of the fit: model-level statistics, the
# coefficient table and the residual diagnostics.
res.summary()

0,1,2,3
Dep. Variable:,NG_score,R-squared:,0.132
Model:,OLS,Adj. R-squared:,0.132
Method:,Least Squares,F-statistic:,3319.0
Date:,"Tue, 06 Dec 2022",Prob (F-statistic):,0.0
Time:,08:37:43,Log-Likelihood:,102680.0
No. Observations:,152532,AIC:,-205300.0
Df Residuals:,152524,BIC:,-205300.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9453,0.000,2208.291,0.000,0.945,0.946
party[T.Republican],-0.0938,0.001,-138.898,0.000,-0.095,-0.093
avg_belief_score,-0.0023,0.008,-0.298,0.765,-0.017,0.013
party[T.Republican]:avg_belief_score,-0.3393,0.012,-28.138,0.000,-0.363,-0.316
avg_truth_score,0.0167,0.008,1.994,0.046,0.000,0.033
party[T.Republican]:avg_truth_score,0.1722,0.013,12.987,0.000,0.146,0.198
avg_belief_score:avg_truth_score,0.1469,0.020,7.289,0.000,0.107,0.186
party[T.Republican]:avg_belief_score:avg_truth_score,-0.0634,0.035,-1.806,0.071,-0.132,0.005

0,1,2,3
Omnibus:,61732.956,Durbin-Watson:,1.344
Prob(Omnibus):,0.0,Jarque-Bera (JB):,359930.215
Skew:,-1.864,Prob(JB):,0.0
Kurtosis:,9.537,Cond. No.,131.0


In [11]:
# Number of rows in the filtered article frame.
# NOTE(review): the model summary reports fewer observations than this,
# presumably because rows with missing values are dropped when fitting - confirm.
len(articles)

153910

In [10]:
# LaTeX labels for the model terms, keyed by the term names reported by the
# fitted model. D_b / D_t are the labels used for the belief and truth scores.
# Every \times is wrapped in $...$ because the command is only valid in math
# mode.
name_map = {
    "Intercept":"Intercept",
    "party[T.Republican]":"Republican",
    "avg_belief_score":"$D_\\mathrm{b}$",
    "avg_belief_score:avg_truth_score":"$D_\\mathrm{b}$ $\\times$ $D_\\mathrm{t}$",
    "party[T.Republican]:avg_belief_score":"Republican $\\times$ $D_\\mathrm{b}$",
    "avg_truth_score":"$D_\\mathrm{t}$",
    "party[T.Republican]:avg_truth_score":"Rep. $\\times$ $D_\\mathrm{t}$",
    "party[T.Republican]:avg_belief_score:avg_truth_score":"Rep. $\\times$ $D_\\mathrm{b}$ $\\times$ $D_\\mathrm{t}$"
}

# Target location of the rendered table.
dst, fname = "../../tables", "OLS_table_article_NG_score.txt"

# Render the LaTeX source for the fitted model and store it on disk.
tab = make_regression_table(res, articles, "NG_score", name_map)
with open(join(dst, fname), "w") as f:
    f.write(tab)

## Prediction

In [31]:
# Grid of score values for which predictions are generated.
honesty_component_score = np.arange(-1.0, 0.4, 0.01)
N = len(honesty_component_score)


def _prediction_grid(varied_col, fixed_col, party):
    """Build model input in which one score sweeps the grid while the other
    score is held at its mean over all articles, for a single party."""
    return pd.DataFrame({
        varied_col: honesty_component_score,
        "party": [party] * N,
        fixed_col: [articles[fixed_col].mean()] * N,
    })


# Belief score varies, truth score fixed.
belief_pred_df_dem = _prediction_grid("avg_belief_score", "avg_truth_score", "Democrat")
belief_pred_df_rep = _prediction_grid("avg_belief_score", "avg_truth_score", "Republican")
# Truth score varies, belief score fixed.
truth_pred_df_dem = _prediction_grid("avg_truth_score", "avg_belief_score", "Democrat")
truth_pred_df_rep = _prediction_grid("avg_truth_score", "avg_belief_score", "Republican")

In [32]:
def _prediction_summary(pred_input, party, component):
    """Predict from the fitted model and return the summary frame
    (alpha=0.05) tagged with the party and the varied honesty component."""
    summary = res.get_prediction(pred_input).summary_frame(alpha=0.05)
    summary["party"] = party
    summary["honesty_component"] = component
    return summary


belief_pred_dem = _prediction_summary(belief_pred_df_dem, "Democrat", "belief")
belief_pred_rep = _prediction_summary(belief_pred_df_rep, "Republican", "belief")
truth_pred_dem = _prediction_summary(truth_pred_df_dem, "Democrat", "truth")
truth_pred_rep = _prediction_summary(truth_pred_df_rep, "Republican", "truth")

In [33]:
# Attach the score grid to the column of the component that was varied, leave
# the other component empty, and give the prediction columns plotting names.
plot_names = {"mean":"NG", "mean_ci_lower":"ymin", "mean_ci_upper":"ymax"}
frames_by_component = (
    ("belief", [belief_pred_dem, belief_pred_rep]),
    ("truth", [truth_pred_dem, truth_pred_rep]),
)
for component, frames in frames_by_component:
    for df in frames:
        df["belief"] = honesty_component_score if component == "belief" else np.nan
        df["truth"] = honesty_component_score if component == "truth" else np.nan
        df["honesty_component"] = component
        # The frames are shared with the surrounding notebook, so rename in place.
        df.rename(columns=plot_names, inplace=True)

In [34]:
# Stack the four prediction frames into one long table and write it as CSV.
dst = "../../data/articles"
fname = "OLS_predictions_articles.csv"
cols = ["NG", "belief", "truth", "ymin", "ymax", "party", "honesty_component"]
prediction_frames = [belief_pred_dem, belief_pred_rep, truth_pred_dem, truth_pred_rep]
predictions = pd.concat(
    [frame[cols] for frame in prediction_frames],
    ignore_index=True,
)
predictions.to_csv(join(dst, fname), index=False)