In [1]:
import os
from typing import List, Dict, Set, Optional

import numpy as np
import pandas as pd
from pylo.language.lp import Clause as PyloClause
from kbc_pul.project_info import data_dir
from kbc_pul.amie.amie_output_rule_extraction import get_amie_rules_from_rule_tsv_file
from artificial_bias_experiments.amie_rule_learning import get_amie_rule_tsv_filename
from artificial_bias_experiments.evaluation.confidence_comparison.df_utils import ColumnNamesInfo
from artificial_bias_experiments.noisy_prop_scores.sar_popularity.experiment_info import \
    NoisyPropScoresSARPopularityExperimentInfo
from artificial_bias_experiments.noisy_prop_scores.sar_popularity.load_popularity_df_rule_wrappers import \
    get_rule_wrappers_as_dataframe_noisy_prop_scores_sar_popularity
from artificial_bias_experiments.noisy_prop_scores.sar_popularity.noise_definition import \
    NoiseTypeSARPopularity
from artificial_bias_experiments.noisy_prop_scores.sar_popularity.noisy_prop_scores_sar_popularity_file_naming import \
    NoisyPropScoresSARPopularityFileNamer

from kbc_pul.confidence_naming import ConfidenceEnum
from kbc_pul.data_structures.rule_wrapper import RuleWrapper, contains_rule_predicting_relation, \
    get_pylo_rule_from_string, is_pylo_rule_recursive
from kbc_pul.experiments_utils.file_utils import print_file_exists
from kbc_pul.experiments_utils.load_df_ground_truth import get_df_ground_truth
from kbc_pul.project_info import project_dir as kbc_e_metrics_project_dir

%load_ext autoreload
%autoreload 2

# known prop score, SAR Popularity

## Import the rule wrappers

In [2]:
dataset_name: str = "yago3_10"

# log_growth_rate_list: List[float] = [0.01, 0.1, 1]
log_growth_rate_list: List[float] = [0.01, 0.05, 0.1, 0.5,  1]
noise_type: NoiseTypeSARPopularity = NoiseTypeSARPopularity.FRACTION_OF_LOG_GROWTH_RATE
# noise_fractions_for_log_growth_rate_list: List[float] = [1.0, 0.5, 1.5]
noise_fractions_for_log_growth_rate_list: List[float] = [1.0, 0.9, 1.1]
noise_levels: List[float] = noise_fractions_for_log_growth_rate_list
noise_level_no_noise: float = 1.0


is_pca_version: bool = False

In [3]:
filename_ground_truth_dataset: str = os.path.join(
    data_dir, dataset_name, 'cleaned_csv', 'train.csv'
)
separator_ground_truth_dataset = "\t"
df_ground_truth: pd.DataFrame = get_df_ground_truth(filename_ground_truth_dataset, separator_ground_truth_dataset)
target_relation_list: List[str] = list(sorted(df_ground_truth["Rel"].unique()))

amie_min_std_confidence: float = 0.1
amie_rule_tsv_filename = get_amie_rule_tsv_filename(
    filename_ground_truth_dataset=filename_ground_truth_dataset,
    dataset_name=dataset_name,
    min_std_confidence=amie_min_std_confidence
)

amie_rule_wrappers: List[RuleWrapper] =  get_amie_rules_from_rule_tsv_file(amie_rule_tsv_filename)
len(amie_rule_wrappers)


149

In [4]:
image_dir: str = NoisyPropScoresSARPopularityFileNamer.get_dir_images(
    use_pca=is_pca_version,
    dataset_name=dataset_name,
)
if not os.path.exists(image_dir):
    os.makedirs(image_dir)
print_file_exists(image_dir)

? file exists: /home/joschout/Documents/Repos/KUL-PUL/images/artificial_bias_experiments/known_prop_scores/sar_popularity/not_pca/yago3_10
-> True


## Convert rule wrappers to dataframe

In [5]:
df_rule_wrappers_list: List[pd.DataFrame] = []

for target_relation in target_relation_list:
    if contains_rule_predicting_relation(amie_rule_wrappers,
                                         target_relation):

        for log_growth_rate in log_growth_rate_list:
            try:
                experiment_info = NoisyPropScoresSARPopularityExperimentInfo(
                    dataset_name=dataset_name,
                    target_relation=target_relation,
                    is_pca_version=is_pca_version,
                    log_growth_rate=log_growth_rate,
                    noise_type=noise_type,
                    noise_levels=noise_levels

                )

                root_dir_experiment_settings_specific: str = NoisyPropScoresSARPopularityFileNamer.get_dir_experiment_specific(
                    experiment_info
                )
                df_rule_wrappers_single_log_growth_rate: pd.DataFrame = get_rule_wrappers_as_dataframe_noisy_prop_scores_sar_popularity(
                    root_dir_experiment_settings_specific=root_dir_experiment_settings_specific,
                    target_relation=target_relation,
                )
                df_rule_wrappers_list.append(df_rule_wrappers_single_log_growth_rate)
            except Exception as err:
                print(f"Exception at {target_relation} and {log_growth_rate}")
                # raise err
    else:
        print(f"No rules predicting {target_relation}")

df_rule_wrappers_all_targets = pd.concat(
    df_rule_wrappers_list, axis=0
)

No rules predicting edited
No rules predicting hasacademicadvisor
No rules predicting hasmusicalrole
No rules predicting haswebsite
No rules predicting influences
Exception at isaffiliatedto and 0.5
Exception at isaffiliatedto and 1
No rules predicting isknownfor
Exception at playsfor and 0.5
Exception at playsfor and 1
No rules predicting wrotemusicfor


In [6]:
df_rule_wrappers_all_targets.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,random_trial_index,Rule,Nb supported predictions,Body size,$conf$,CWA,$\frac{\left| \mathbf{R}\right|}{\left| \mathbf{R_s}\right|} conf$ $p$,PCA $p$,$\frac{\left| \mathbf{R}\right|}{\left| \mathbf{R_s}\right|} conf$ $p^{-1}$,PCA ${p^{-1}}$,IPW,IPW-PCA $p$,IPW-PCA ${p^{-1}}$,ICW
0,actedin,0.01,1.0,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.347231,0.092511,0.104007,0.070707,0.098981,0.080305,0.080305,0.193671
1,actedin,0.01,0.9,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.347231,0.092511,0.104007,0.070707,0.109925,0.080297,0.080297,0.214369
2,actedin,0.01,1.1,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.347231,0.092511,0.104007,0.070707,0.090033,0.080314,0.080314,0.176271
3,actedin,0.01,1.0,9,"actedin(A,B) :- directed(A,B)",41,5481,0.101806,0.00748,0.347231,0.09447,0.104007,0.070568,0.114317,0.102902,0.102902,0.182105
4,actedin,0.01,0.9,9,"actedin(A,B) :- directed(A,B)",41,5481,0.101806,0.00748,0.347231,0.09447,0.104007,0.070568,0.126966,0.102905,0.102905,0.201642


In [7]:
df_rule_wrappers_all_targets.columns

Index(['target_relation', 'log_growth_rate', 'noise_level',
       'random_trial_index', 'Rule', 'Nb supported predictions', 'Body size',
       '$conf$', 'CWA',
       '$\frac{\left| \mathbf{R}\right|}{\left| \mathbf{R_s}\right|} conf$ $p$',
       'PCA $p$',
       '$\frac{\left| \mathbf{R}\right|}{\left| \mathbf{R_s}\right|} conf$ $p^{-1}$',
       'PCA ${p^{-1}}$', 'IPW', 'IPW-PCA $p$', 'IPW-PCA ${p^{-1}}$', 'ICW'],
      dtype='object')

In [8]:
column_names_logistics: List[str] = [
    'target_relation',
    'log_growth_rate',
    'noise_level',
    'random_trial_index',
    'Rule',
]

## 2. Only keep a subset of rules
### 2.0 Only keep the additive noise levels we are interested in

In [9]:
df_rule_wrappers_all_targets = df_rule_wrappers_all_targets[
    df_rule_wrappers_all_targets['noise_level'].isin(noise_levels)
]

### 2.1. Only keep the non-recursive rules; drop recursive rules

In [10]:


def is_rule_recursive(rule_string: str) -> bool:
        pylo_rule: PyloClause = get_pylo_rule_from_string(rule_string)
        is_rule_recursive = is_pylo_rule_recursive(pylo_rule)
        return is_rule_recursive


mask_recursive_rules = df_rule_wrappers_all_targets.apply(
    lambda row: is_rule_recursive(row["Rule"]),
    axis=1
)

In [11]:
print(len(df_rule_wrappers_all_targets))
df_rule_wrappers_all_targets: pd.DataFrame = df_rule_wrappers_all_targets[~mask_recursive_rules]
print(len(df_rule_wrappers_all_targets))

22078
7168


### 2.3 Drop the Pair-positive columns (both directions)

In [12]:
df_rule_wrappers_all_targets.drop(
        [ConfidenceEnum.TRUE_CONF_BIAS_YS_ZERO_S_TO_O.value,
         ConfidenceEnum.TRUE_CONF_BIAS_YS_ZERO_O_TO_S.value],
        axis=1,
        inplace=True,
        errors='ignore'
)
df_rule_wrappers_all_targets.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,random_trial_index,Rule,Nb supported predictions,Body size,$conf$,CWA,PCA $p$,PCA ${p^{-1}}$,IPW,IPW-PCA $p$,IPW-PCA ${p^{-1}}$,ICW
0,actedin,0.01,1.0,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.092511,0.070707,0.098981,0.080305,0.080305,0.193671
1,actedin,0.01,0.9,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.092511,0.070707,0.109925,0.080297,0.080297,0.214369
2,actedin,0.01,1.1,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.092511,0.070707,0.090033,0.080314,0.080314,0.176271
3,actedin,0.01,1.0,9,"actedin(A,B) :- directed(A,B)",41,5481,0.101806,0.00748,0.09447,0.070568,0.114317,0.102902,0.102902,0.182105
4,actedin,0.01,0.9,9,"actedin(A,B) :- directed(A,B)",41,5481,0.101806,0.00748,0.09447,0.070568,0.126966,0.102905,0.102905,0.201642


### 2.4 Drop the IPW-PCA columns (both directions)

In [13]:
df_rule_wrappers_all_targets.drop(
        [ConfidenceEnum.IPW_PCA_CONF_S_TO_O.value,
         ConfidenceEnum.IPW_PCA_CONF_O_TO_S.value],
        axis=1,
        inplace=True,
        errors='ignore'
)
df_rule_wrappers_all_targets.head()


Unnamed: 0,target_relation,log_growth_rate,noise_level,random_trial_index,Rule,Nb supported predictions,Body size,$conf$,CWA,PCA $p$,PCA ${p^{-1}}$,IPW,ICW
0,actedin,0.01,1.0,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.092511,0.070707,0.098981,0.193671
1,actedin,0.01,0.9,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.092511,0.070707,0.109925,0.214369
2,actedin,0.01,1.1,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.092511,0.070707,0.090033,0.176271
3,actedin,0.01,1.0,9,"actedin(A,B) :- directed(A,B)",41,5481,0.101806,0.00748,0.09447,0.070568,0.114317,0.182105
4,actedin,0.01,0.9,9,"actedin(A,B) :- directed(A,B)",41,5481,0.101806,0.00748,0.09447,0.070568,0.126966,0.201642


### 2.4 Drop everything with less than 10 trials

In [14]:
group_by_list =  [
    "target_relation",
    "filter_relation",
    'true_other',
    'noisy_other',
    "Rule",
    "random_trial_index"
]
df_count_trials: pd.DataFrame = df_rule_wrappers_all_targets[
      [
        'target_relation',
        'log_growth_rate',
        'noise_level',
        'random_trial_index',
        'Rule',
    ]
].groupby(
    [
    'target_relation',
    'log_growth_rate',
    'noise_level',
    'Rule',
    ]
).count().reset_index()

In [15]:
df_less_than_ten_trials: pd.DataFrame = df_count_trials[df_count_trials["random_trial_index"].values != 10]
df_less_than_ten_trials

Unnamed: 0,target_relation,log_growth_rate,noise_level,Rule,random_trial_index
408,isaffiliatedto,0.05,0.9,"isaffiliatedto(A,B) :- playsfor(A,B)",5
411,isaffiliatedto,0.1,1.0,"isaffiliatedto(A,B) :- playsfor(A,B)",2
412,isaffiliatedto,0.1,1.1,"isaffiliatedto(A,B) :- playsfor(A,B)",2
671,playsfor,0.05,0.9,"playsfor(A,B) :- isaffiliatedto(A,B)",5
674,playsfor,0.1,1.0,"playsfor(A,B) :- isaffiliatedto(A,B)",2
675,playsfor,0.1,1.1,"playsfor(A,B) :- isaffiliatedto(A,B)",2


In [16]:
df_rule_wrappers_all_targets = df_rule_wrappers_all_targets[
    ~(
        (
            (df_rule_wrappers_all_targets["target_relation"] == "isaffiliatedto")
            &
            (df_rule_wrappers_all_targets["Rule"]=="isaffiliatedto(A,B) :- playsfor(A,B)")
        )
        |
        (
            (df_rule_wrappers_all_targets["target_relation"] == "playsfor")
            &
            (df_rule_wrappers_all_targets["Rule"]=="playsfor(A,B) :- isaffiliatedto(A,B)")
        )
  )

]
df_rule_wrappers_all_targets.head()


Unnamed: 0,target_relation,log_growth_rate,noise_level,random_trial_index,Rule,Nb supported predictions,Body size,$conf$,CWA,PCA $p$,PCA ${p^{-1}}$,IPW,ICW
0,actedin,0.01,1.0,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.092511,0.070707,0.098981,0.193671
1,actedin,0.01,0.9,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.092511,0.070707,0.109925,0.214369
2,actedin,0.01,1.1,8,"actedin(A,B) :- directed(A,B)",42,5481,0.101806,0.007663,0.092511,0.070707,0.090033,0.176271
3,actedin,0.01,1.0,9,"actedin(A,B) :- directed(A,B)",41,5481,0.101806,0.00748,0.09447,0.070568,0.114317,0.182105
4,actedin,0.01,0.9,9,"actedin(A,B) :- directed(A,B)",41,5481,0.101806,0.00748,0.09447,0.070568,0.126966,0.201642


**Now, we have the full dataframe**

****

## Calculate $[conf(R) - \widehat{conf}(R)]$

In [17]:
true_conf: ConfidenceEnum = ConfidenceEnum.TRUE_CONF

conf_estimators_list: List[ConfidenceEnum] = [
    ConfidenceEnum.CWA_CONF,
    ConfidenceEnum.ICW_CONF,
    ConfidenceEnum.PCA_CONF_S_TO_O,
    ConfidenceEnum.PCA_CONF_O_TO_S,
     ConfidenceEnum.IPW_CONF,
]
all_confs_list: List[ConfidenceEnum] = [ConfidenceEnum.TRUE_CONF ] + conf_estimators_list

column_names_all_confs: List[str] = [
    conf.get_name()
    for conf in all_confs_list
]

In [18]:
df_rule_wrappers_all_targets = df_rule_wrappers_all_targets[
    column_names_logistics + column_names_all_confs
]
df_rule_wrappers_all_targets.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,random_trial_index,Rule,$conf$,CWA,ICW,PCA $p$,PCA ${p^{-1}}$,IPW
0,actedin,0.01,1.0,8,"actedin(A,B) :- directed(A,B)",0.101806,0.007663,0.193671,0.092511,0.070707,0.098981
1,actedin,0.01,0.9,8,"actedin(A,B) :- directed(A,B)",0.101806,0.007663,0.214369,0.092511,0.070707,0.109925
2,actedin,0.01,1.1,8,"actedin(A,B) :- directed(A,B)",0.101806,0.007663,0.176271,0.092511,0.070707,0.090033
3,actedin,0.01,1.0,9,"actedin(A,B) :- directed(A,B)",0.101806,0.00748,0.182105,0.09447,0.070568,0.114317
4,actedin,0.01,0.9,9,"actedin(A,B) :- directed(A,B)",0.101806,0.00748,0.201642,0.09447,0.070568,0.126966


In [19]:
df_conf_estimators_no_noise = df_rule_wrappers_all_targets[
    df_rule_wrappers_all_targets["noise_level"] == noise_level_no_noise
]
df_conf_estimators_no_noise.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,random_trial_index,Rule,$conf$,CWA,ICW,PCA $p$,PCA ${p^{-1}}$,IPW
0,actedin,0.01,1.0,8,"actedin(A,B) :- directed(A,B)",0.101806,0.007663,0.193671,0.092511,0.070707,0.098981
3,actedin,0.01,1.0,9,"actedin(A,B) :- directed(A,B)",0.101806,0.00748,0.182105,0.09447,0.070568,0.114317
6,actedin,0.01,1.0,4,"actedin(A,B) :- directed(A,B)",0.101806,0.005473,0.14581,0.080214,0.052817,0.0955
9,actedin,0.01,1.0,6,"actedin(A,B) :- directed(A,B)",0.101806,0.005656,0.145839,0.096273,0.050489,0.092966
12,actedin,0.01,1.0,1,"actedin(A,B) :- directed(A,B)",0.101806,0.006751,0.167064,0.087264,0.065954,0.086594


In [20]:
column_names_info =ColumnNamesInfo(
        true_conf=true_conf,
        column_name_true_conf=true_conf.get_name(),
        conf_estimators=conf_estimators_list,
        column_names_conf_estimators=[
            col.get_name()
            for col in conf_estimators_list
        ],
        column_names_logistics=column_names_logistics
    )


In [21]:
def get_df_rulewise_squared_diffs_between_true_conf_and_conf_estimator(
        df_rule_wrappers: pd.DataFrame,
        column_names_info: ColumnNamesInfo
) -> pd.DataFrame:
    df_rulewise_diffs_between_true_conf_and_conf_estimator: pd.DataFrame = df_rule_wrappers[
        column_names_info.column_names_logistics
    ]

    col_name_estimator: str
    for col_name_estimator in column_names_info.column_names_conf_estimators:
        df_rulewise_diffs_between_true_conf_and_conf_estimator \
            = df_rulewise_diffs_between_true_conf_and_conf_estimator.assign(
                **{
                    col_name_estimator: (
                        (df_rule_wrappers[column_names_info.column_name_true_conf]
                            - df_rule_wrappers[col_name_estimator]) ** 2
                    )
                }
            )
    return df_rulewise_diffs_between_true_conf_and_conf_estimator

df_conf_squared_errors: pd.DataFrame = get_df_rulewise_squared_diffs_between_true_conf_and_conf_estimator(
    df_rule_wrappers=df_rule_wrappers_all_targets,
    column_names_info = column_names_info
)
df_conf_squared_errors.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,random_trial_index,Rule,CWA,ICW,PCA $p$,PCA ${p^{-1}}$,IPW
0,actedin,0.01,1.0,8,"actedin(A,B) :- directed(A,B)",0.008863,0.008439,8.6e-05,0.000967,8e-06
1,actedin,0.01,0.9,8,"actedin(A,B) :- directed(A,B)",0.008863,0.01267,8.6e-05,0.000967,6.6e-05
2,actedin,0.01,1.1,8,"actedin(A,B) :- directed(A,B)",0.008863,0.005545,8.6e-05,0.000967,0.000139
3,actedin,0.01,1.0,9,"actedin(A,B) :- directed(A,B)",0.008897,0.006448,5.4e-05,0.000976,0.000157
4,actedin,0.01,0.9,9,"actedin(A,B) :- directed(A,B)",0.008897,0.009967,5.4e-05,0.000976,0.000633


## AVERAGE the PCA(S) and PCA(O)

In [22]:
df_conf_squared_errors["PCA"] = (
    (
            df_conf_squared_errors[ConfidenceEnum.PCA_CONF_S_TO_O.value]
            +
            df_conf_squared_errors[ConfidenceEnum.PCA_CONF_O_TO_S.value]
    ) / 2
)
df_conf_squared_errors.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,random_trial_index,Rule,CWA,ICW,PCA $p$,PCA ${p^{-1}}$,IPW,PCA
0,actedin,0.01,1.0,8,"actedin(A,B) :- directed(A,B)",0.008863,0.008439,8.6e-05,0.000967,8e-06,0.000527
1,actedin,0.01,0.9,8,"actedin(A,B) :- directed(A,B)",0.008863,0.01267,8.6e-05,0.000967,6.6e-05,0.000527
2,actedin,0.01,1.1,8,"actedin(A,B) :- directed(A,B)",0.008863,0.005545,8.6e-05,0.000967,0.000139,0.000527
3,actedin,0.01,1.0,9,"actedin(A,B) :- directed(A,B)",0.008897,0.006448,5.4e-05,0.000976,0.000157,0.000515
4,actedin,0.01,0.9,9,"actedin(A,B) :- directed(A,B)",0.008897,0.009967,5.4e-05,0.000976,0.000633,0.000515


In [23]:
df_conf_squared_errors = df_conf_squared_errors.drop(
        columns=[
            ConfidenceEnum.PCA_CONF_S_TO_O.value,
            ConfidenceEnum.PCA_CONF_O_TO_S.value
        ],
        axis=1,
        errors='ignore'
    )
df_conf_squared_errors.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,random_trial_index,Rule,CWA,ICW,IPW,PCA
0,actedin,0.01,1.0,8,"actedin(A,B) :- directed(A,B)",0.008863,0.008439,8e-06,0.000527
1,actedin,0.01,0.9,8,"actedin(A,B) :- directed(A,B)",0.008863,0.01267,6.6e-05,0.000527
2,actedin,0.01,1.1,8,"actedin(A,B) :- directed(A,B)",0.008863,0.005545,0.000139,0.000527
3,actedin,0.01,1.0,9,"actedin(A,B) :- directed(A,B)",0.008897,0.006448,0.000157,0.000515
4,actedin,0.01,0.9,9,"actedin(A,B) :- directed(A,B)",0.008897,0.009967,0.000633,0.000515


# Now start averaging


In [24]:
df_conf_squared_errors_avg_over_trials: pd.DataFrame = df_conf_squared_errors.groupby(
    by=["target_relation", "log_growth_rate", 'noise_level', "Rule"],
    sort=True,
    as_index=False
).mean()
df_conf_squared_errors_avg_over_trials.drop(
        columns=["random_trial_index"],
        axis=1,
        errors='ignore',
        inplace=True
)

df_conf_squared_errors_avg_over_trials.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,Rule,CWA,ICW,IPW,PCA
0,actedin,0.01,0.9,"actedin(A,B) :- directed(A,B)",0.009002,0.008603,0.000418,0.000861
1,actedin,0.01,1.0,"actedin(A,B) :- directed(A,B)",0.009002,0.005509,0.000152,0.000861
2,actedin,0.01,1.1,"actedin(A,B) :- directed(A,B)",0.009002,0.003455,0.000142,0.000861
3,actedin,0.05,0.9,"actedin(A,B) :- directed(A,B)",0.004771,0.008352,0.00024,0.003427
4,actedin,0.05,1.0,"actedin(A,B) :- directed(A,B)",0.004771,0.005406,7.5e-05,0.003427


In [25]:
df_conf_squared_errors_avg_over_trials_and_rules: pd.DataFrame = df_conf_squared_errors_avg_over_trials.groupby(
    by=["target_relation", "log_growth_rate", 'noise_level',],
    sort=True,
    as_index=False
).mean()
df_conf_squared_errors_avg_over_trials_and_rules.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,CWA,ICW,IPW,PCA
0,actedin,0.01,0.9,0.009002,0.008603,0.000418,0.000861
1,actedin,0.01,1.0,0.009002,0.005509,0.000152,0.000861
2,actedin,0.01,1.1,0.009002,0.003455,0.000142,0.000861
3,actedin,0.05,0.9,0.004771,0.008352,0.00024,0.003427
4,actedin,0.05,1.0,0.004771,0.005406,7.5e-05,0.003427


In [26]:
len(df_conf_squared_errors_avg_over_trials_and_rules)


315

In [27]:
df_conf_squared_errors_avg_over_trials_and_rules_no_noise: pd.DataFrame = df_conf_squared_errors_avg_over_trials_and_rules[
    df_conf_squared_errors_avg_over_trials_and_rules["noise_level"] == noise_level_no_noise
]
df_conf_squared_errors_avg_over_trials_and_rules_no_noise

Unnamed: 0,target_relation,log_growth_rate,noise_level,CWA,ICW,IPW,PCA
1,actedin,0.01,1.0,9.001723e-03,0.005509,1.520177e-04,0.000861
4,actedin,0.05,1.0,4.770706e-03,0.005406,7.463840e-05,0.003427
7,actedin,0.10,1.0,2.170461e-03,0.004424,1.817430e-05,0.009314
10,actedin,0.50,1.0,1.797522e-05,0.000567,9.813232e-07,0.027928
13,actedin,1.00,1.0,2.429983e-07,0.000076,7.645068e-08,0.029955
...,...,...,...,...,...,...,...
301,worksat,0.01,1.0,5.266431e-02,0.008039,8.281230e-03,0.030247
304,worksat,0.05,1.0,3.105694e-02,0.001068,5.492093e-04,0.014150
307,worksat,0.10,1.0,1.485991e-02,0.001621,4.201051e-04,0.005694
310,worksat,0.50,1.0,4.988448e-05,0.000127,3.370548e-06,0.006617


## Subset of noisy_other

In [28]:
noise_levels_not_noiseless: Set[float] = {
    noise_level for noise_level in noise_levels if noise_level != noise_level_no_noise
}

log_growth_rate_list

# true_label_frequencies_set: Set[float] = {
#     0.3, 0.7,
# }
# true_label_frequency_to_estimate_map: Dict[float, Set[float]] = dict()
#
# label_frequency_est_diff: float = 0.1
# label_frequencies_to_keep: Set[float] = set(true_label_frequencies_set)
# for true_label_freq in true_label_frequencies_set:
#     true_label_frequency_to_estimate_map[true_label_freq] = {
#         round(true_label_freq - label_frequency_est_diff, 1),
#         round(true_label_freq + label_frequency_est_diff, 1)
#     }
#     label_frequencies_to_keep.update(true_label_frequency_to_estimate_map[true_label_freq])

[0.01, 0.05, 0.1, 0.5, 1]

In [29]:
df_conf_errors_avg_over_trials_and_rules_noisy = df_conf_squared_errors_avg_over_trials_and_rules[
    df_conf_squared_errors_avg_over_trials_and_rules["noise_level"].isin(noise_levels_not_noiseless)
]
df_conf_squared_errors_avg_over_trials_and_rules.head()

Unnamed: 0,target_relation,log_growth_rate,noise_level,CWA,ICW,IPW,PCA
0,actedin,0.01,0.9,0.009002,0.008603,0.000418,0.000861
1,actedin,0.01,1.0,0.009002,0.005509,0.000152,0.000861
2,actedin,0.01,1.1,0.009002,0.003455,0.000142,0.000861
3,actedin,0.05,0.9,0.004771,0.008352,0.00024,0.003427
4,actedin,0.05,1.0,0.004771,0.005406,7.5e-05,0.003427


In [30]:
len(df_conf_squared_errors_avg_over_trials_and_rules)

315

## Count the rules per $p$

In [31]:
df_n_rules_per_target = df_rule_wrappers_all_targets[["target_relation", "Rule"]].groupby(
    by=['target_relation'],
    # sort=True,
    # as_index=False
)["Rule"].nunique().to_frame().reset_index().rename(
    columns={"Rule" : "# rules"}
)


df_n_rules_per_target.head()

Unnamed: 0,target_relation,# rules
0,actedin,1
1,created,2
2,dealswith,7
3,diedin,1
4,directed,2


****
# Format pretty table

Goal:
* put smallest value per row in BOLT
* per target: noise level +0.0 and 0.01

In [32]:
df_conf_squared_errors_avg_over_trials_and_rules["noise_level"].unique()

array([0.9, 1. , 1.1])

In [33]:
log_growth_rate_to_noise_level_to_df_map: Dict[float, Dict[float, pd.DataFrame]] = dict()
for log_growth_rate in log_growth_rate_list:
    df_log_growth_rate_tmp: pd.DataFrame = df_conf_squared_errors_avg_over_trials_and_rules[
        df_conf_squared_errors_avg_over_trials_and_rules["log_growth_rate"] == log_growth_rate
    ]
    noise_level_to_df_map = dict()
    log_growth_rate_to_noise_level_to_df_map[log_growth_rate] = noise_level_to_df_map

    for noise_level in noise_levels:
        df_log_growth_rate_and_noise_level_tmp: pd.DataFrame = df_log_growth_rate_tmp[
            df_log_growth_rate_tmp["noise_level"] == noise_level
        ]
        noise_level_to_df_map[noise_level] = df_log_growth_rate_and_noise_level_tmp[
            [col for col in df_log_growth_rate_and_noise_level_tmp.columns if col != "noise_level" and col != "log_growth_rate"]
        ]

log_growth_rate_to_noise_level_to_df_map[0.01][noise_level_no_noise].head()



Unnamed: 0,target_relation,CWA,ICW,IPW,PCA
1,actedin,0.009002,0.005509,0.000152,0.000861
16,created,0.083106,0.018787,0.006104,0.018811
31,dealswith,0.0123,0.004319,0.00093,0.004645
46,diedin,0.016054,0.003292,0.001255,0.014843
61,directed,0.100014,0.012173,0.019268,0.027775


In [34]:
from typing import Iterator

noiseless_log_growth_rate_to_df_map = dict()

label_freq_estimators: Iterator[float]
for log_growth_rate in log_growth_rate_list:
    noise_level_to_df_map: Dict[float, pd.DataFrame]  = log_growth_rate_to_noise_level_to_df_map[log_growth_rate]

    df_noiseless_log_growth_rate: pd.DataFrame = noise_level_to_df_map[noise_level_no_noise]

    lower_noise: float = 0.9
    higher_noise: float = 1.1


    df_lower: pd.DataFrame = noise_level_to_df_map[lower_noise][
        ['target_relation', ConfidenceEnum.IPW_CONF.value]
    ].rename(
        columns={
            ConfidenceEnum.IPW_CONF.value: f"{ConfidenceEnum.IPW_CONF.value}_lower"
        }
    )
    # break
    # df_lower.head()
    df_noiseless_log_growth_rate = pd.merge(
        left=df_noiseless_log_growth_rate,
        right=df_lower,
        on="target_relation"
    )


    df_higher = noise_level_to_df_map[higher_noise][
        ['target_relation', ConfidenceEnum.IPW_CONF.value]
    ].rename(
        columns={
            ConfidenceEnum.IPW_CONF.value: f"{ConfidenceEnum.IPW_CONF.value}_higher"
        }
    )

    df_noiseless_log_growth_rate = pd.merge(
        left=df_noiseless_log_growth_rate,
        right=df_higher,
        on="target_relation"
    )

    noiseless_log_growth_rate_to_df_map[log_growth_rate] = df_noiseless_log_growth_rate

noiseless_log_growth_rate_to_df_map[0.01].head()



Unnamed: 0,target_relation,CWA,ICW,IPW,PCA,IPW_lower,IPW_higher
0,actedin,0.009002,0.005509,0.000152,0.000861,0.000418,0.000142
1,created,0.083106,0.018787,0.006104,0.018811,0.006373,0.00738
2,dealswith,0.0123,0.004319,0.00093,0.004645,0.001911,0.000645
3,diedin,0.016054,0.003292,0.001255,0.014843,0.001424,0.00141
4,directed,0.100014,0.012173,0.019268,0.027775,0.028452,0.01469


In [35]:
log_growth_rate_list

[0.01, 0.05, 0.1, 0.5, 1]

In [36]:
first_log_growth_rate_to_include = 0.01
second_log_growth_rate_to_include = 0.1

df_one_row_per_target = pd.merge(
    left=noiseless_log_growth_rate_to_df_map[first_log_growth_rate_to_include],
    right=noiseless_log_growth_rate_to_df_map[second_log_growth_rate_to_include],
    on="target_relation",
    suffixes=(
        f"_{first_log_growth_rate_to_include}",
        f"_{second_log_growth_rate_to_include}")
)
df_one_row_per_target.head()

Unnamed: 0,target_relation,CWA_0.01,ICW_0.01,IPW_0.01,PCA_0.01,IPW_lower_0.01,IPW_higher_0.01,CWA_0.1,ICW_0.1,IPW_0.1,PCA_0.1,IPW_lower_0.1,IPW_higher_0.1
0,actedin,0.009002,0.005509,0.000152,0.000861,0.000418,0.000142,0.00217,0.004424,1.8e-05,0.009314,0.000129,3.7e-05
1,created,0.083106,0.018787,0.006104,0.018811,0.006373,0.00738,0.01979,0.000866,0.000313,0.020267,0.001022,0.000651
2,dealswith,0.0123,0.004319,0.00093,0.004645,0.001911,0.000645,7.7e-05,3.7e-05,1.8e-05,0.002346,2.9e-05,1.9e-05
3,diedin,0.016054,0.003292,0.001255,0.014843,0.001424,0.00141,0.005988,0.000554,2.7e-05,0.008339,0.000207,0.000119
4,directed,0.100014,0.012173,0.019268,0.027775,0.028452,0.01469,0.024723,0.004312,0.000345,0.021924,0.001521,0.000591


## What is the smallest value?

In [37]:
all_values: np.ndarray = df_one_row_per_target[
    [ col
      for col in df_one_row_per_target.columns
      if col != "target_relation"
      ]
].values

min_val = np.amin(all_values)
min_val

7.400770684479044e-06

In [38]:
min_val * 10000

0.07400770684479044

In [39]:
max_val = np.amax(all_values)
max_val

0.2649064640128459

In [40]:
max_val * 10000

2649.064640128459

In [41]:
df_one_row_per_target.head() * 10000

Unnamed: 0,target_relation,CWA_0.01,ICW_0.01,IPW_0.01,PCA_0.01,IPW_lower_0.01,IPW_higher_0.01,CWA_0.1,ICW_0.1,IPW_0.1,PCA_0.1,IPW_lower_0.1,IPW_higher_0.1
0,actedinactedinactedinactedinactedinactedinacte...,90.017227,55.086609,1.520177,8.608356,4.175971,1.415528,21.704609,44.235785,0.181743,93.137442,1.293013,0.373449
1,createdcreatedcreatedcreatedcreatedcreatedcrea...,831.060109,187.870776,61.044488,188.11438,63.730342,73.796016,197.900411,8.659567,3.13354,202.667843,10.221416,6.51433
2,dealswithdealswithdealswithdealswithdealswithd...,123.001596,43.192387,9.301353,46.445522,19.108806,6.45209,0.765903,0.367184,0.177912,23.460657,0.288217,0.193411
3,diedindiedindiedindiedindiedindiedindiedindied...,160.535336,32.919265,12.552012,148.431334,14.242989,14.100801,59.879302,5.535506,0.274268,83.391168,2.071926,1.191496
4,directeddirecteddirecteddirecteddirecteddirect...,1000.139466,121.726191,192.679267,277.754081,284.524519,146.90206,247.225601,43.121416,3.452956,219.239306,15.210912,5.908889


In [42]:
df_one_row_per_target.dtypes

target_relation     object
CWA_0.01           float64
ICW_0.01           float64
IPW_0.01           float64
PCA_0.01           float64
IPW_lower_0.01     float64
IPW_higher_0.01    float64
CWA_0.1            float64
ICW_0.1            float64
IPW_0.1            float64
PCA_0.1            float64
IPW_lower_0.1      float64
IPW_higher_0.1     float64
dtype: object

In [43]:
exponent = 4

multiplication_factor = 10 ** exponent
multiplication_factor

10000

In [44]:
df_one_row_per_target[
    df_one_row_per_target.select_dtypes(include=['number']).columns
] *= multiplication_factor

df_one_row_per_target

Unnamed: 0,target_relation,CWA_0.01,ICW_0.01,IPW_0.01,PCA_0.01,IPW_lower_0.01,IPW_higher_0.01,CWA_0.1,ICW_0.1,IPW_0.1,PCA_0.1,IPW_lower_0.1,IPW_higher_0.1
0,actedin,90.017227,55.086609,1.520177,8.608356,4.175971,1.415528,21.704609,44.235785,0.181743,93.137442,1.293013,0.373449
1,created,831.060109,187.870776,61.044488,188.11438,63.730342,73.796016,197.900411,8.659567,3.13354,202.667843,10.221416,6.51433
2,dealswith,123.001596,43.192387,9.301353,46.445522,19.108806,6.45209,0.765903,0.367184,0.177912,23.460657,0.288217,0.193411
3,diedin,160.535336,32.919265,12.552012,148.431334,14.242989,14.100801,59.879302,5.535506,0.274268,83.391168,2.071926,1.191496
4,directed,1000.139466,121.726191,192.679267,277.754081,284.524519,146.90206,247.225601,43.121416,3.452956,219.239306,15.210912,5.908889
5,exports,31.324256,1.640039,13.096302,16.222405,17.350232,11.161703,0.110069,0.145113,0.107495,3.234288,0.159197,0.085727
6,graduatedfrom,476.786566,37.842347,18.376165,259.993822,23.865551,23.059943,162.240162,1.892679,1.699344,76.74631,8.381784,3.484546
7,happenedin,761.461212,78.07055,14.188279,602.554453,28.706332,16.358138,346.315204,27.663051,1.066135,210.309399,8.486935,6.674478
8,hascapital,72.18114,49.124517,6.667978,404.749582,9.394169,6.579504,19.661899,51.779118,0.509042,654.188579,1.239141,0.603147
9,hasneighbor,110.118481,26.031879,9.379282,12.787871,15.935822,8.664088,0.37278,0.22562,0.156667,51.254143,0.190293,0.172166


In [45]:
df_one_row_per_target.head()

Unnamed: 0,target_relation,CWA_0.01,ICW_0.01,IPW_0.01,PCA_0.01,IPW_lower_0.01,IPW_higher_0.01,CWA_0.1,ICW_0.1,IPW_0.1,PCA_0.1,IPW_lower_0.1,IPW_higher_0.1
0,actedin,90.017227,55.086609,1.520177,8.608356,4.175971,1.415528,21.704609,44.235785,0.181743,93.137442,1.293013,0.373449
1,created,831.060109,187.870776,61.044488,188.11438,63.730342,73.796016,197.900411,8.659567,3.13354,202.667843,10.221416,6.51433
2,dealswith,123.001596,43.192387,9.301353,46.445522,19.108806,6.45209,0.765903,0.367184,0.177912,23.460657,0.288217,0.193411
3,diedin,160.535336,32.919265,12.552012,148.431334,14.242989,14.100801,59.879302,5.535506,0.274268,83.391168,2.071926,1.191496
4,directed,1000.139466,121.726191,192.679267,277.754081,284.524519,146.90206,247.225601,43.121416,3.452956,219.239306,15.210912,5.908889


## Output files definitions

In [46]:
dir_latex_table: str = os.path.join(
    kbc_e_metrics_project_dir,
    "paper_latex_tables",
    'known_prop_scores',
    'sar_popularity'
)

if not os.path.exists(dir_latex_table):
    os.makedirs(dir_latex_table)

filename_tsv_rule_stats = os.path.join(
    dir_latex_table,
    "conf_error_stats_v3.tsv"
)
filename_tsv_single_row_summary = os.path.join(
    dir_latex_table,
    "noisy_sar_popularity_single_row_summary.tsv"
)

## Create single-row summary

In [47]:
df_one_row_in_total: pd.Series = df_one_row_per_target.mean(
)


df_one_row_in_total

  df_one_row_in_total: pd.Series = df_one_row_per_target.mean(


CWA_0.01           458.569530
ICW_0.01           168.455141
IPW_0.01            62.810261
PCA_0.01           264.160768
IPW_lower_0.01      81.817065
IPW_higher_0.01     56.567828
CWA_0.1            172.288258
ICW_0.1             50.979464
IPW_0.1              3.465404
PCA_0.1            182.677279
IPW_lower_0.1        7.355887
IPW_higher_0.1       6.200750
dtype: float64

In [48]:
df_n_rules_per_target.head()

Unnamed: 0,target_relation,# rules
0,actedin,1
1,created,2
2,dealswith,7
3,diedin,1
4,directed,2


In [49]:
df_one_row_in_total["# rules"] = int(df_n_rules_per_target["# rules"].sum())
df_one_row_in_total

CWA_0.01           458.569530
ICW_0.01           168.455141
IPW_0.01            62.810261
PCA_0.01           264.160768
IPW_lower_0.01      81.817065
IPW_higher_0.01     56.567828
CWA_0.1            172.288258
ICW_0.1             50.979464
IPW_0.1              3.465404
PCA_0.1            182.677279
IPW_lower_0.1        7.355887
IPW_higher_0.1       6.200750
# rules             47.000000
dtype: float64

In [50]:
type(df_one_row_in_total)

pandas.core.series.Series

In [51]:
df_one_row_in_total.to_csv(
    filename_tsv_single_row_summary,
    sep = "\t",
    header=None
)

### Now create a pretty table

In [52]:
column_names_info.column_names_conf_estimators

['CWA', 'ICW', 'PCA $p$', 'PCA ${p^{-1}}$', 'IPW']

In [53]:
simplified_column_names_conf_estimators = ['CWA', 'PCA', 'ICW', 'IPW',]

In [54]:
multi_index_columns = [
    ("$p$", ""),
    ("\# R", "")
]
from itertools import product

# conf_upper_cols = column_names_info.column_names_conf_estimators + [
#     f"{ConfidenceEnum.IPW_CONF.value} " + "($\Delta c=-" + f"{label_frequency_est_diff}" + "$)",
#     f"{ConfidenceEnum.IPW_CONF.value} " + "($\Delta c=" + f"{label_frequency_est_diff}" + "$)",
# ]
conf_upper_cols = simplified_column_names_conf_estimators+ [
    f"{ConfidenceEnum.IPW_CONF.value} " + "($-\Delta$)",
    f"{ConfidenceEnum.IPW_CONF.value} " + "($+\Delta$)",
]

c_subcols = [f"$k={first_log_growth_rate_to_include}$", f"$k={second_log_growth_rate_to_include}$"]

multi_index_columns = multi_index_columns + list(product(c_subcols, conf_upper_cols))
# multi_index_list

multi_index_columns = pd.MultiIndex.from_tuples(multi_index_columns)
multi_index_columns

MultiIndex([(     '$p$',                ''),
            (    '\# R',                ''),
            ('$k=0.01$',             'CWA'),
            ('$k=0.01$',             'PCA'),
            ('$k=0.01$',             'ICW'),
            ('$k=0.01$',             'IPW'),
            ('$k=0.01$', 'IPW ($-\Delta$)'),
            ('$k=0.01$', 'IPW ($+\Delta$)'),
            ( '$k=0.1$',             'CWA'),
            ( '$k=0.1$',             'PCA'),
            ( '$k=0.1$',             'ICW'),
            ( '$k=0.1$',             'IPW'),
            ( '$k=0.1$', 'IPW ($-\Delta$)'),
            ( '$k=0.1$', 'IPW ($+\Delta$)')],
           )

In [55]:

rule_counter: int = 1
rule_str_to_rule_id_map: Dict[str, int] = {}


float_precision: int = 1

col_name_conf_estimator: str


pretty_rows: List[List] = []

row_index: int
row: pd.Series

LogGrowthRate = float

def get_dict_with_smallest_estimator_per_log_growth_rate(row: pd.Series) -> Dict[LogGrowthRate, Set[str]]:
    # Find estimator with smallest mean value for label frequency###################
    log_growth_rate_to_set_of_smallest_est_map: Dict[LogGrowthRate, Set[str]] = dict()

    log_growth_rate: LogGrowthRate
    for log_growth_rate in [first_log_growth_rate_to_include, second_log_growth_rate_to_include]:
        o_set_of_col_names_with_min_value: Optional[Set[str]] = None
        o_current_smallest_value: Optional[float] = None
        # Find smallest squared error
        for col_name_conf_estimator in simplified_column_names_conf_estimators:
            current_val: float = row[f"{col_name_conf_estimator}_{log_growth_rate}"]

            if o_set_of_col_names_with_min_value is None or o_current_smallest_value > current_val:
                o_set_of_col_names_with_min_value = {col_name_conf_estimator}
                o_current_smallest_value = current_val
            elif current_val == o_current_smallest_value:
                o_set_of_col_names_with_min_value.update(col_name_conf_estimator)

        log_growth_rate_to_set_of_smallest_est_map[log_growth_rate] = o_set_of_col_names_with_min_value
    return log_growth_rate_to_set_of_smallest_est_map


def format_value_depending_on_whether_it_is_smallest(
        value: float,
        is_smallest: bool,
        float_precision: float,
        use_si: bool = False
)-> str:
    if is_smallest:
        if not use_si:
            formatted_value = "$\\bm{" + f"{value:0.{float_precision}f}" + "}$"
            # formatted_value = "$\\bm{" + f"{value:0.{float_precision}e}" + "}$"
        else:
            formatted_value = "\\textbf{$" + f"\\num[round-precision={float_precision},round-mode=figures,scientific-notation=true]"+\
                              "{"+ str(value) + "}"+ "$}"
    else:
        if not use_si:
            formatted_value = f"${value:0.{float_precision}f}$"
            # formatted_value = f"${value:0.{float_precision}e}$"
        else:
            formatted_value = "$" + f"\\num[round-precision={float_precision},round-mode=figures,scientific-notation=true]"+\
                              "{"+ str(value) + "}"+ "$"

    return formatted_value


estimator_columns = simplified_column_names_conf_estimators + [
    f"{ConfidenceEnum.IPW_CONF.value}_lower",
    f"{ConfidenceEnum.IPW_CONF.value}_higher"
]

# For each row, i.e. for each target relation
for row_index, row in df_one_row_per_target.iterrows():

    # Find estimator with smallest mean value for label frequency###################
    log_growth_rate_to_set_of_smallest_est_map: Dict[float, Set[str]] = get_dict_with_smallest_estimator_per_log_growth_rate(
        row=row
    )
    ##################################################################################
    # Construct the new row
    ######################

    target_relation = row["target_relation"]


    nb_of_rules = df_n_rules_per_target[df_n_rules_per_target['target_relation'] == target_relation][
        "# rules"
    ].iloc[0]

    new_row: List[str] = [
        target_relation,
        nb_of_rules
    ]
    # For each Confidence estimator, get the value at c 0.3 and 0.7
    # for col_name_conf_estimator in estimator_columns:
    #     mean_val_03:float = row[f"{col_name_conf_estimator}_0.3"]
    #     mean_val_07:float = row[f"{col_name_conf_estimator}_0.7"]
    #
    #     new_row_value = (
    #         format_value_depending_on_whether_it_is_smallest(
    #             value=mean_val_03,
    #             is_smallest=col_name_conf_estimator == label_freq_to_smallest_est_map[0.3],
    #             float_precision=float_precision
    #         )
    #         + " / "
    #         + format_value_depending_on_whether_it_is_smallest(
    #             value=mean_val_07,
    #             is_smallest=col_name_conf_estimator == label_freq_to_smallest_est_map[0.7],
    #             float_precision=float_precision
    #         )
    #     )
    #     new_row.append(new_row_value)
    for col_name_conf_estimator in estimator_columns:
        mean_val_first_log_growth_rate:float = row[f"{col_name_conf_estimator}_{first_log_growth_rate_to_include}"]



        new_row_value_first_log_growth_rate = format_value_depending_on_whether_it_is_smallest(
                value=mean_val_first_log_growth_rate,
                is_smallest=(
                        col_name_conf_estimator  in log_growth_rate_to_set_of_smallest_est_map[first_log_growth_rate_to_include]
                             ),
                float_precision=float_precision
        )
        new_row.append(new_row_value_first_log_growth_rate)


    for col_name_conf_estimator in estimator_columns:
        mean_val_second_log_growth_rate:float = row[f"{col_name_conf_estimator}_{second_log_growth_rate_to_include}"]
        new_row_value_second_log_growth_rate = format_value_depending_on_whether_it_is_smallest(
                value=mean_val_second_log_growth_rate,
                is_smallest=(
                        col_name_conf_estimator  in log_growth_rate_to_set_of_smallest_est_map[second_log_growth_rate_to_include]
                             ),
                float_precision=float_precision

        )
        new_row.append(new_row_value_second_log_growth_rate)

    pretty_rows.append(new_row)


df_pretty: pd.DataFrame = pd.DataFrame(
    data=pretty_rows,
    columns=multi_index_columns
)
df_pretty.head()

Unnamed: 0_level_0,$p$,\# R,$k=0.01$,$k=0.01$,$k=0.01$,$k=0.01$,$k=0.01$,$k=0.01$,$k=0.1$,$k=0.1$,$k=0.1$,$k=0.1$,$k=0.1$,$k=0.1$
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,CWA,PCA,ICW,IPW,IPW ($-\Delta$),IPW ($+\Delta$),CWA,PCA,ICW,IPW,IPW ($-\Delta$),IPW ($+\Delta$)
0,actedin,1,$90.0$,$8.6$,$55.1$,$\bm{1.5}$,$4.2$,$1.4$,$21.7$,$93.1$,$44.2$,$\bm{0.2}$,$1.3$,$0.4$
1,created,2,$831.1$,$188.1$,$187.9$,$\bm{61.0}$,$63.7$,$73.8$,$197.9$,$202.7$,$8.7$,$\bm{3.1}$,$10.2$,$6.5$
2,dealswith,7,$123.0$,$46.4$,$43.2$,$\bm{9.3}$,$19.1$,$6.5$,$0.8$,$23.5$,$0.4$,$\bm{0.2}$,$0.3$,$0.2$
3,diedin,1,$160.5$,$148.4$,$32.9$,$\bm{12.6}$,$14.2$,$14.1$,$59.9$,$83.4$,$5.5$,$\bm{0.3}$,$2.1$,$1.2$
4,directed,2,$1000.1$,$277.8$,$\bm{121.7}$,$192.7$,$284.5$,$146.9$,$247.2$,$219.2$,$43.1$,$\bm{3.5}$,$15.2$,$5.9$


In [56]:
df_pretty: pd.DataFrame = df_pretty.sort_values(
    by=["$p$"]
)

df_pretty.head()



Unnamed: 0_level_0,$p$,\# R,$k=0.01$,$k=0.01$,$k=0.01$,$k=0.01$,$k=0.01$,$k=0.01$,$k=0.1$,$k=0.1$,$k=0.1$,$k=0.1$,$k=0.1$,$k=0.1$
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,CWA,PCA,ICW,IPW,IPW ($-\Delta$),IPW ($+\Delta$),CWA,PCA,ICW,IPW,IPW ($-\Delta$),IPW ($+\Delta$)
0,actedin,1,$90.0$,$8.6$,$55.1$,$\bm{1.5}$,$4.2$,$1.4$,$21.7$,$93.1$,$44.2$,$\bm{0.2}$,$1.3$,$0.4$
1,created,2,$831.1$,$188.1$,$187.9$,$\bm{61.0}$,$63.7$,$73.8$,$197.9$,$202.7$,$8.7$,$\bm{3.1}$,$10.2$,$6.5$
2,dealswith,7,$123.0$,$46.4$,$43.2$,$\bm{9.3}$,$19.1$,$6.5$,$0.8$,$23.5$,$0.4$,$\bm{0.2}$,$0.3$,$0.2$
3,diedin,1,$160.5$,$148.4$,$32.9$,$\bm{12.6}$,$14.2$,$14.1$,$59.9$,$83.4$,$5.5$,$\bm{0.3}$,$2.1$,$1.2$
4,directed,2,$1000.1$,$277.8$,$\bm{121.7}$,$192.7$,$284.5$,$146.9$,$247.2$,$219.2$,$43.1$,$\bm{3.5}$,$15.2$,$5.9$


# To file

In [57]:
# dir_latex_table: str = os.path.join(
#     kbc_e_metrics_project_dir,
#     "paper_latex_tables",
#     'known_prop_scores',
#     'scar'
# )
#
# if not os.path.exists(dir_latex_table):
#     os.makedirs(dir_latex_table)

filename_latex_table: str = os.path.join(
    dir_latex_table,
    "confidence-error-table-sar-popularity-agg-per-p.tex"
)
filename_tsv_table: str = os.path.join(
    dir_latex_table,
    "confidence-error-table-sar-popularity-agg-per-p.tsv"
)


with open(filename_latex_table, "w") as latex_ofile:
    with pd.option_context("max_colwidth", 1000):
        latex_ofile.write(
            df_pretty.to_latex(
                column_format="lr|lllllll|lllllll",
                index=False,
                float_format="{:0.3f}".format,
                escape=False,
            #     caption="$[widehat{conf}-conf]^2$ for SCAR. "
            #             "std=standard confidence, "
            #             "PCA (S) = PCA confidence with $s$ as domain, "
            #             "PCA (O) = PCA confidence with $o$  as domain, "
            #             "IPW = PCA confidence with $\hat{e}=e$, "
            #             "IPW +/- $" + f"{label_frequency_est_diff:0.1}" + "$ = IPW confidence with $\hat{e}=e+/-" +  f"{label_frequency_est_diff:0.1}" + "$."
            )
        )

with open(filename_tsv_table, "w") as tsv_ofile:
    tsv_ofile.write(df_pretty.to_csv(
        index=False,
        sep="\t"
    ))

print(filename_latex_table)


/home/joschout/Documents/Repos/KUL-PUL/paper_latex_tables/known_prop_scores/sar_popularity/confidence-error-table-sar-popularity-agg-per-p.tex
