In [25]:
%run './model/multi_corpus.py'
%run './constants.py'


import polars as pl
# import pandas as pd
import os
from scipy.stats import norm
import arviz as az

In [26]:
def p_value_stars(p_value):
    """Return the significance marker for a p-value.

    Thresholds are inclusive and checked from strictest to loosest:
    ``<= 0.001`` -> ``'***'``, ``<= 0.01`` -> ``'**'``, ``<= 0.05`` -> ``'*'``.
    Anything else (including NaN, for which every comparison is false)
    yields a single space so that table cells keep a uniform suffix.
    """
    cutoffs = (
        (0.001, '***'),
        (0.01, '**'),
        (0.05, '*'),
    )
    for upper_bound, marker in cutoffs:
        if p_value <= upper_bound:
            return marker
    return ' '

In [27]:
def load_trace_summaries(traces_path):
    """Build per-field coefficient and standard-error tables from saved traces.

    Every entry of ``traces_path`` is read with ``az.from_netcdf`` and
    summarised with ``az.summary(kind="stats")``. For each file two
    two-column frames are produced, keyed on ``'beta'`` and with the value
    column named after the file (the text before the first ``'.'``):

    * coefficient frame: posterior ``mean`` followed by significance stars;
    * standard-error frame: posterior ``sd`` wrapped in parentheses.

    Parameters
    ----------
    traces_path : str
        Directory holding one trace file per field.

    Returns
    -------
    tuple[list, list]
        ``(coef_frames, se_frames)``, one frame per file, in sorted file-name
        order.

    Raises
    ------
    FileNotFoundError
        If the directory contains no entries.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting row/column order reproducible between runs and machines.
    file_names = sorted(os.listdir(traces_path))
    if not file_names:
        raise FileNotFoundError(f'no trace files found in {traces_path!r}')

    coef_frames = []
    se_frames = []

    for file_name in file_names:
        field_name = file_name.split('.')[0]
        trace = az.from_netcdf(os.path.join(traces_path, file_name))

        summary = az.summary(trace, kind="stats")
        summary.index.name = 'beta'

        # NOTE(review): norm.sf(|z|) is the upper-tail probability, i.e. a
        # one-tailed p-value. Confirm this is intended; a two-tailed test
        # would be 2 * norm.sf(|z|) and would change some of the stars.
        stats = (
            pl.from_pandas(summary, include_index=True)
            .with_columns(pl.col('beta').str.replace('_', ' '))
            .with_columns((pl.col('mean') / pl.col('sd')).round(4).alias('z_score'))
            .with_columns(pl.col('z_score').abs().apply(norm.sf).round(4).alias('p_value'))
            .with_columns(pl.col('p_value').apply(p_value_stars).alias('significance'))
        )

        coef_frames.append(
            stats.select(
                pl.col('beta'),
                pl.concat_str(
                    [pl.col('mean'), pl.col('significance')]
                ).alias(field_name),
            )
        )
        se_frames.append(
            stats.select(
                pl.col('beta'),
                pl.concat_str(
                    [pl.lit('('), pl.col('sd'), pl.lit(')')]
                ).alias(field_name),
            )
        )

    return coef_frames, se_frames


def join_on_beta(frames):
    """Inner-join a list of per-field frames on ``'beta'`` into one wide frame.

    The first frame supplies the row order; each further frame contributes
    one column. Raises ``ValueError`` when ``frames`` is empty.
    """
    if not frames:
        raise ValueError('no per-field tables to join')

    wide = frames[0]
    for frame in frames[1:]:
        wide = wide.join(frame, on='beta', how='inner')
    return wide


# Co-occurrence network traces.
traces_path = os.path.join(OUTPUT_PATH, 'co_occurence_traces')
co_occurence_coef, co_occurence_se = load_trace_summaries(traces_path)
co_occurence_df_coef = join_on_beta(co_occurence_coef)
co_occurence_df_se = join_on_beta(co_occurence_se)

# Co-citation network traces (same pipeline, different directory).
traces_path = os.path.join(OUTPUT_PATH, 'co_citation_traces')
co_citation_coef, co_citation_se = load_trace_summaries(traces_path)
co_citation_df_coef = join_on_beta(co_citation_coef)
co_citation_df_se = join_on_beta(co_citation_se)

# Field names in the order used for the LaTeX header cells built further down.
columns = sorted(co_citation_df_coef.select(pl.all().exclude('beta')).columns)

In [29]:
# Put the co-citation and co-occurrence coefficients side by side. Both frames
# use the field name as column name, so the right-hand (co-occurrence) columns
# are disambiguated with an explicit '_right' suffix.
coef_joined = co_citation_df_coef.join(
    co_occurence_df_coef, how='left', on='beta', suffix='_right'
)

# Order columns as field_1, field_1_right, field_2, field_2_right, ...
# Sorting on (base name, is-right) instead of the raw string keeps each pair
# adjacent even when one field name is a prefix of another; a plain sort would
# place 'Genetics & Genomics' between 'Genetics' and 'Genetics_right'.
coef_field_columns = sorted(
    (name for name in coef_joined.columns if name != 'beta'),
    key=lambda name: (name.removesuffix('_right'), name.endswith('_right')),
)

coef_df = (
    coef_joined
    .select(['beta'] + coef_field_columns)
    # Keep only the first word of the coefficient name as the row label.
    .with_columns(pl.col('beta').str.split(' ').arr.first())
    # Collapse every row into one ' & '-separated LaTeX table row.
    .select(pl.concat_list(pl.all()).arr.join(' & ').alias('all_fields'))
)
coef_df['all_fields'].to_list()

['Density & -3.712*** & -5.224*** & -4.884*** & -4.97*** & -3.93*** & -4.203*** & -2.193*** & -4.793*** & -3.509*** & -5.313*** & -3.747*** & -5.434*** & -4.262*** & -5.543*** & -4.337*** & -5.081*** & -5.346*** & -6.778*** & -5.016*** & -4.924*** & -3.811*** & -5.013*** & -2.486*** & -4.592*** & -3.794*** & -5.132*** & -2.581*** & -4.825*** & -4.482*** & -5.157*** & -2.984*** & -5.485***',
 'Triangles & 3.353*** & 0.972*** & 2.489*** & 0.664** & 2.684*** & 2.685*** & 0.546* & 0.168  & 1.274** & -0.498  & 1.732*** & 2.435*** & 1.978** & 2.385*** & 1.996*** & 0.977** & 0.614  & 0.632  & 2.565*** & 1.334*** & 2.985*** & 1.402*** & 2.494*** & 1.26*** & 1.137* & 1.447*** & 2.483*** & 2.067*** & 1.023*** & 0.583* & 2.227*** & 2.455***',
 'Stars & 0.061*** & 0.007*** & 0.01*** & 0.011*** & 0.018*** & 0.013*** & 0.013  & 0.006*** & -0.027  & 0.003*** & -0.013  & 0.013*** & 0.193*** & 0.012*** & 0.043*** & 0.004*** & 0.003** & 0.002*** & 0.013*** & 0.005*** & 0.021*** & 0.006*** & 0.047*** & 0

In [30]:
# Put the co-citation and co-occurrence standard errors side by side. Both
# frames use the field name as column name, so the right-hand (co-occurrence)
# columns are disambiguated with an explicit '_right' suffix.
se_joined = co_citation_df_se.join(
    co_occurence_df_se, how='left', on='beta', suffix='_right'
)

# Order columns as field_1, field_1_right, field_2, field_2_right, ...
# Sorting on (base name, is-right) instead of the raw string keeps each pair
# adjacent even when one field name is a prefix of another.
se_field_columns = sorted(
    (name for name in se_joined.columns if name != 'beta'),
    key=lambda name: (name.removesuffix('_right'), name.endswith('_right')),
)

se_df = (
    se_joined
    .select(['beta'] + se_field_columns)
    # The row label is left out here: in the table the standard-error line
    # sits under the coefficient line and starts with an empty first cell.
    .select(pl.concat_list(pl.all().exclude('beta')).arr.join(' & ').alias('all_fields'))
)
se_df

all_fields
str
"""(0.379) & (0.3…"
"""(0.633) & (0.2…"
"""(0.009) & (0.0…"
"""(1.007) & (0.9…"
"""(1.009) & (1.0…"
"""(0.989) & (1.0…"
"""(1.015) & (1.0…"
"""(0.997) & (0.9…"
"""(1.021) & (0.9…"
"""(0.318) & (0.0…"


In [31]:
# Extract the formatted rows into new names instead of rebinding coef_df /
# se_df, so this cell can be re-run without first re-running the cells that
# build the frames.
coef_rows = coef_df['all_fields'].to_list()
se_rows = se_df['all_fields'].to_list()

# Each coefficient row is followed by its standard-error row (with an empty
# first cell) and some vertical space. The LaTeX row terminator is spelled
# '\\\\' (two literal backslashes) rather than relying on the invalid '\ '
# escape sequence. strict=True makes a coefficient/standard-error length
# mismatch an error instead of silently dropping rows.
row_str = ''.join(
    f'{coef_row} \\\\ \n & {se_row} \\\\ \n \\addlinespace[0.5em] \n'
    for coef_row, se_row in zip(coef_rows, se_rows, strict=True)
)

print(row_str)

Density & -3.712*** & -5.224*** & -4.884*** & -4.97*** & -3.93*** & -4.203*** & -2.193*** & -4.793*** & -3.509*** & -5.313*** & -3.747*** & -5.434*** & -4.262*** & -5.543*** & -4.337*** & -5.081*** & -5.346*** & -6.778*** & -5.016*** & -4.924*** & -3.811*** & -5.013*** & -2.486*** & -4.592*** & -3.794*** & -5.132*** & -2.581*** & -4.825*** & -4.482*** & -5.157*** & -2.984*** & -5.485*** \\ 
 & (0.379) & (0.349) & (0.339) & (0.323) & (0.385) & (0.246) & (0.302) & (0.298) & (0.492) & (0.34) & (0.365) & (0.308) & (0.536) & (0.285) & (0.398) & (0.245) & (0.663) & (0.501) & (0.286) & (0.252) & (0.294) & (0.277) & (0.361) & (0.288) & (0.438) & (0.336) & (0.644) & (0.292) & (0.346) & (0.34) & (0.312) & (0.318) \\ 
 \addlinespace[0.5em] 
Triangles & 3.353*** & 0.972*** & 2.489*** & 0.664** & 2.684*** & 2.685*** & 0.546* & 0.168  & 1.274** & -0.498  & 1.732*** & 2.435*** & 1.978** & 2.385*** & 1.996*** & 0.977** & 0.614  & 0.632  & 2.565*** & 1.334*** & 2.985*** & 1.402*** & 2.494*** & 1.26*** 

In [32]:
# Second header line: one "co-citation / co-occurrence" label pair per field,
# each pair on its own line of the generated LaTeX.
sub_header_pair = '$G_{c}$ & $G_{o}$'
new_sub_columns = ' \n& '.join([sub_header_pair] * len(columns))
print(new_sub_columns)

$G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$ 
& $G_{c}$ & $G_{o}$


In [33]:
# Column spec: two centred columns for every field.
alignments = 'c' * (len(columns) * 2)
print(alignments)

cccccccccccccccccccccccccccccccc


In [34]:

# First header line: every field name spans its two data columns. Longer names
# are broken over two lines inside a nested tabular:
#   * names containing ' & ' break after the (escaped) ampersand;
#   * other multi-word names break after the first word;
#   * single-word names are used as-is.
# maxsplit=1 keeps the two-way unpacking valid for names with more than two
# words or more than one ampersand.
header_cells = []
for col in columns:
    if ' & ' in col:
        first, second = col.split(' & ', 1)
        label = f'{first} \\& \\\\ {second}'
    elif ' ' in col:
        first, second = col.split(' ', 1)
        label = f'{first} \\\\ {second}'
    else:
        label = col
    header_cells.append(
        f'\\multicolumn{{2}}{{c}}{{ \\begin{{tabular}}{{cc}} {label} \\end{{tabular}} }}'
    )

new_columns = ' \n& '.join(header_cells)
print(new_columns)

\multicolumn{2}{c}{ \begin{tabular}{cc} Artificial \\ Intelligence \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Economics \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Ethnic \& \\ Cultural Studies \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Gender \\ Studies \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Genetics \& \\ Genomics \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Geometry \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Geophysics \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Human Resources \& \\ Organizations \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Immunology \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} International \\ Business \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Language \& \\ Linguistics \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc} Material \\ Engineering \end{tabular} } 
& \multicolumn{2}{c}{ \begin{tabular}{cc}

In [35]:
# One \cmidrule under each field header. Column 1 is the row label, so the
# field pairs occupy columns (2-3), (4-5), ...
pair_starts = range(2, len(columns) * 2 + 1, 2)
mid_rules = ''.join(
    f'\\cmidrule(lr){{{start}-{start + 1}}} \n' for start in pair_starts
)

print(mid_rules)

\cmidrule(lr){2-3} 
\cmidrule(lr){4-5} 
\cmidrule(lr){6-7} 
\cmidrule(lr){8-9} 
\cmidrule(lr){10-11} 
\cmidrule(lr){12-13} 
\cmidrule(lr){14-15} 
\cmidrule(lr){16-17} 
\cmidrule(lr){18-19} 
\cmidrule(lr){20-21} 
\cmidrule(lr){22-23} 
\cmidrule(lr){24-25} 
\cmidrule(lr){26-27} 
\cmidrule(lr){28-29} 
\cmidrule(lr){30-31} 
\cmidrule(lr){32-33} 



In [36]:
# Assemble the complete tabular environment from the pieces built above:
# one left-aligned label column followed by two centred columns per field.
# Every backslash meant for LaTeX is written as an escaped '\\' so the
# template contains no invalid escape sequences, and the '\\\\' row terminator
# no longer depends on a trailing space to avoid becoming a line continuation.
# The generated LaTeX is unchanged apart from that insignificant trailing
# space on the two header rows.
table_str = f"""
\\begin{{tabular}}{{l*{{{len(columns)*2}}}{{c}}}}
\\toprule
\\addlinespace[0.7em]
& {new_columns} \\\\
{mid_rules}
& {new_sub_columns} \\\\
\\midrule
\\midrule
\\addlinespace[0.5em]
{row_str}
\\bottomrule
\\end{{tabular}}

"""

# Write-only access is enough; the explicit encoding keeps the output
# independent of the platform default.
with open(os.path.join(LATEX_TABLE_PATH, 'ergm.tex'), "w", encoding="utf-8") as tex_file:
    tex_file.write(table_str)