In [None]:
# author: Jana Lasser & Almog Simchon

In [2]:
import pandas as pd
import numpy as np
from os.path import join
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import matplotlib.pyplot as plt

# OLS regression on articles

## Regression table functionality

In [8]:
# LaTeX text emitted before the coefficient rows: opens the table and tabular
# environments (seven columns) and writes the column header row.
# Raw strings keep every backslash literal, so the LaTeX commands are written
# exactly as they appear in the output and no invalid escape sequences
# (such as "\e", "\c" or "\l") are left for Python to warn about.
table_header = r'''
\begin{table}[]
    \footnotesize
    \centering
    \begin{tabular}{l|c|c|c|c|c|c}
            & coef. & std. err. & $t$ & $P>\vert t \vert$ & [0.025 & 0.975] \\
            \toprule
'''
# LaTeX text emitted after the last table row: closes the tabular and table
# environments. Caption and label are "TODO" placeholders to be filled in by
# hand in the generated file.
table_footer = r'''
            \bottomrule
    \end{tabular}
    \caption{TODO}
    \label{tab:TODO}
\end{table}
'''

def _summary_row(left_label, left_value, right_label, right_value, fmt):
    """Build one LaTeX row holding two labelled model statistics.

    The four cells span 2 + 1 + 2 + 2 = 7 columns. Each value is wrapped in
    braces so that it is passed to ``\\multicolumn`` as a single argument.
    """
    left = format(left_value, fmt)
    right = format(right_value, fmt)
    return (
        f"\t\t\\multicolumn{{2}}{{l}}{{{left_label}}} & "
        f"\\multicolumn{{1}}{{r}}{{{left}}} & "
        f"\\multicolumn{{2}}{{l}}{{{right_label}}} & "
        f"\\multicolumn{{2}}{{r}}{{{right}}} \\\\ \n"
    )


def make_regression_table(res, data, dep_col, name_map, decimals=3):
    """Render a fitted regression as a LaTeX table string.

    The result is ``table_header``, one row per model term (coefficient,
    standard error, t value, p value, confidence bounds), a rule, six rows of
    model-level statistics, and ``table_footer``.

    Parameters
    ----------
    res : fitted regression results object
        Must provide ``conf_int()`` (a DataFrame indexed by term name with the
        lower and upper bound as its two columns), ``params``, ``bse``,
        ``tvalues``, ``pvalues``, ``rsquared``, ``rsquared_adj``,
        ``mse_model``, ``ssr``, ``llf``, ``aic``, ``bic``, ``fvalue``,
        ``f_pvalue`` and ``resid``.
    data : pandas.DataFrame
        Data containing the dependent variable; used only for its mean and
        standard deviation.
    dep_col : str
        Name of the dependent-variable column in ``data``.
    name_map : dict
        Maps every term name in ``res`` to the label printed in the table.
    decimals : int, default 3
        Number of digits after the decimal point for every number.

    Returns
    -------
    str
        The complete LaTeX table.

    Raises
    ------
    KeyError
        If a model term has no entry in ``name_map``.
    """
    fmt = f"1.{decimals}f"
    conf_int = res.conf_int()

    # One row per model term; the bounds are read positionally from the two
    # confidence-interval columns.
    coef_rows = []
    for var, (ci_low, ci_hi) in zip(conf_int.index, conf_int.to_numpy()):
        values = (
            res.params[var],
            res.bse[var],
            res.tvalues[var],
            res.pvalues[var],
            ci_low,
            ci_hi,
        )
        numbers = " & ".join(format(value, fmt) for value in values)
        coef_rows.append(f"\t\t{name_map[var]} & {numbers} \\\\ \n")

    dep_values = data[dep_col]
    dw = durbin_watson(res.resid)
    # (left label, left value, right label, right value) for each summary row.
    summary_stats = [
        ("R-squared", res.rsquared, "Mean dependent var", dep_values.mean()),
        ("Adjusted R-squared", res.rsquared_adj, "S.D. dependent var", dep_values.std()),
        ("Model MSE", res.mse_model, "AIC", res.aic),
        ("Sum squared resid", res.ssr, "BIC", res.bic),
        ("Log-likelihood", res.llf, "F-statistic", res.fvalue),
        ("Durbin-Watson stat", dw, "Prob(F-statistic)", res.f_pvalue),
    ]
    summary_rows = [_summary_row(*stats, fmt) for stats in summary_stats]

    return "".join(
        [table_header, *coef_rows, "\t\t\\bottomrule \n", *summary_rows, table_footer]
    )

## OLS regression of NewsGuard score on belief & truth similarity

In [21]:
# Read the gzip-compressed article table from the data directory.
src = "../../data/articles"
fname = "articles.csv.gzip"
articles_path = join(src, fname)
articles = pd.read_csv(articles_path, compression="gzip")

In [22]:
# Rescale the NewsGuard score by a factor of 1/100 and mean-centre both
# similarity scores.
# NOTE: this overwrites the columns in place, so running the cell a second
# time rescales NG_score again.
articles["NG_score"] = articles["NG_score"].div(100)
for score_col in ("avg_belief_score", "avg_truth_score"):
    articles[score_col] = articles[score_col].sub(articles[score_col].mean())

In [23]:
# Regress the NewsGuard score on party, both similarity scores and their
# interactions, then print the coefficient table of the fit.
ols_formula = (
    'NG_score ~ avg_belief_score + avg_truth_score + party'
    ' + party * avg_belief_score + party * avg_truth_score'
    ' + party * avg_belief_score * avg_truth_score'
)
mod = smf.ols(formula=ols_formula, data=articles)
res = mod.fit()
coef_table = res.summary2().tables[1]
print(coef_table)

                                                       Coef.  Std.Err.  \
Intercept                                           0.943744  0.000486   
party[T.Republican]                                -0.088548  0.000760   
avg_belief_score                                   -0.011986  0.008490   
party[T.Republican]:avg_belief_score               -0.336685  0.013252   
avg_truth_score                                     0.002057  0.009377   
party[T.Republican]:avg_truth_score                 0.160069  0.014647   
avg_belief_score:avg_truth_score                    0.004126  0.032290   
party[T.Republican]:avg_belief_score:avg_truth_... -0.045341  0.051273   

                                                              t  \
Intercept                                           1942.412846   
party[T.Republican]                                 -116.501203   
avg_belief_score                                      -1.411827   
party[T.Republican]:avg_belief_score                 -25.406374  

In [24]:
# Show the full summary of the fitted model as this cell's output.
res.summary()

0,1,2,3
Dep. Variable:,NG_score,R-squared:,0.117
Model:,OLS,Adj. R-squared:,0.117
Method:,Least Squares,F-statistic:,2361.0
Date:,"Thu, 01 Dec 2022",Prob (F-statistic):,0.0
Time:,15:49:23,Log-Likelihood:,81989.0
No. Observations:,124231,AIC:,-164000.0
Df Residuals:,124223,BIC:,-163900.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9437,0.000,1942.413,0.000,0.943,0.945
party[T.Republican],-0.0885,0.001,-116.501,0.000,-0.090,-0.087
avg_belief_score,-0.0120,0.008,-1.412,0.158,-0.029,0.005
party[T.Republican]:avg_belief_score,-0.3367,0.013,-25.406,0.000,-0.363,-0.311
avg_truth_score,0.0021,0.009,0.219,0.826,-0.016,0.020
party[T.Republican]:avg_truth_score,0.1601,0.015,10.929,0.000,0.131,0.189
avg_belief_score:avg_truth_score,0.0041,0.032,0.128,0.898,-0.059,0.067
party[T.Republican]:avg_belief_score:avg_truth_score,-0.0453,0.051,-0.884,0.377,-0.146,0.055

0,1,2,3
Omnibus:,50930.766,Durbin-Watson:,1.331
Prob(Omnibus):,0.0,Jarque-Bera (JB):,293444.231
Skew:,-1.897,Prob(JB):,0.0
Kurtosis:,9.503,Cond. No.,174.0


In [30]:
# Maps each term name of the fitted model to the LaTeX label printed in the
# regression table. Every "\times" sits inside its own $...$ pair, because the
# command is only valid in math mode.
name_map = {
    "Intercept":"Intercept",
    "party[T.Republican]":"Republican",
    "avg_belief_score":"$D_\\mathrm{b}$",
    "avg_belief_score:avg_truth_score":"$D_\\mathrm{b}$ $\\times$ $D_\\mathrm{t}$",
    "party[T.Republican]:avg_belief_score":"Republican $\\times$ $D_\\mathrm{b}$",
    "avg_truth_score":"$D_\\mathrm{t}$",
    "party[T.Republican]:avg_truth_score":"Rep. $\\times$ $D_\\mathrm{t}$",
    "party[T.Republican]:avg_belief_score:avg_truth_score":"Rep. $\\times$ $D_\\mathrm{b}$ $\\times$ $D_\\mathrm{t}$"
}

# Render the fitted model as a LaTeX table and write it to the tables folder.
dst = "../../tables"
fname = "OLS_table_article_NG_score.txt"
tab = make_regression_table(res, articles, "NG_score", name_map)
with open(join(dst, fname), "w") as table_file:
    table_file.write(tab)

## Prediction

In [31]:
# Grid of score values over which predictions are made.
honesty_component_score = np.arange(-1.0, 0.4, 0.01)
N = len(honesty_component_score)


def _prediction_inputs(varied_col, fixed_col, party):
    """Build the model input for one party and one varied score.

    ``varied_col`` sweeps over ``honesty_component_score`` while ``fixed_col``
    is held at its mean in ``articles``; ``party`` is constant for all rows.
    """
    return pd.DataFrame({
        varied_col: honesty_component_score,
        "party": [party] * N,
        fixed_col: [articles[fixed_col].mean()] * N,
    })


belief_pred_df_dem = _prediction_inputs("avg_belief_score", "avg_truth_score", "Democrat")
belief_pred_df_rep = _prediction_inputs("avg_belief_score", "avg_truth_score", "Republican")
truth_pred_df_dem = _prediction_inputs("avg_truth_score", "avg_belief_score", "Democrat")
truth_pred_df_rep = _prediction_inputs("avg_truth_score", "avg_belief_score", "Republican")

In [32]:
def _predicted_summary(inputs, party, component):
    """Predict from the fitted model and return the labelled summary frame.

    The summary frame is requested with ``alpha=0.05`` and gains two constant
    columns, ``party`` and ``honesty_component``.
    """
    summary = res.get_prediction(inputs).summary_frame(alpha=0.05)
    summary["party"] = party
    summary["honesty_component"] = component
    return summary


belief_pred_dem = _predicted_summary(belief_pred_df_dem, "Democrat", "belief")
belief_pred_rep = _predicted_summary(belief_pred_df_rep, "Republican", "belief")
truth_pred_dem = _predicted_summary(truth_pred_df_dem, "Democrat", "truth")
truth_pred_rep = _predicted_summary(truth_pred_df_rep, "Republican", "truth")

In [33]:
# Attach the swept score to the column of the varied component (the other
# component column is NaN) and rename the prediction columns for export.
# The frames are modified in place because later cells use the same names.
export_names = {"mean":"NG", "mean_ci_lower":"ymin", "mean_ci_upper":"ymax"}
frames_by_component = (
    ("belief", [belief_pred_dem, belief_pred_rep]),
    ("truth", [truth_pred_dem, truth_pred_rep]),
)
for component, frames in frames_by_component:
    for df in frames:
        df["belief"] = honesty_component_score if component == "belief" else np.nan
        df["truth"] = honesty_component_score if component == "truth" else np.nan
        df["honesty_component"] = component
        df.rename(columns=export_names, inplace=True)

In [34]:
# Stack the four prediction frames (restricted to the export columns) into a
# single table with a fresh index and save it as CSV.
cols = ["NG", "belief", "truth", "ymin", "ymax", "party", "honesty_component"]
prediction_frames = [belief_pred_dem, belief_pred_rep, truth_pred_dem, truth_pred_rep]
predictions = pd.concat(
    [frame[cols] for frame in prediction_frames]
).reset_index(drop=True)

dst = "../../data/articles"
fname = "OLS_predictions_articles.csv"
predictions.to_csv(join(dst, fname), index=False)