# Purpose

If IRT is valid, the rank order of words by accuracy (averaged across epochs) should be similar across models, so the ICC should be high.



In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Statistical packages for ICC and Spearman rho
from pingouin import intraclass_corr
from scipy.stats import spearmanr
import seaborn as sns

## Pull data from BQ

In [None]:
from google.cloud import bigquery
# Point the BigQuery client at the service-account key, but only when the
# environment has not already been configured, so the notebook also runs on
# machines where GOOGLE_APPLICATION_CREDENTIALS is set externally.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/jupyter/tf/secret/majestic-camp-303620-e8cb3a12037b.json",
)
client = bigquery.Client(location="US", project="majestic-camp-303620")

def load_raw_data():
    """Fetch mean accuracy per (code_name, word) pair from BigQuery.

    Averages `acc` over all rows of `slow_op_10.train` with `unit_time=4.0`,
    grouped by `code_name` and `word`.

    Returns:
        The query result converted with `to_dataframe()`, with columns
        `code_name`, `word` and `acc`.
    """
    sql = """
    SELECT
        code_name,
        word,
        AVG(acc) AS acc, 
    FROM 
        slow_op_10.train
    WHERE 
        unit_time=4.0
    GROUP BY
        code_name,
        word;
    """
    return client.query(sql).to_dataframe()

# Long-format frame: one row per (code_name, word) with accuracy averaged over every epoch
df = load_raw_data()

In [None]:
# ICC table with words as targets, code_name (model) as raters and mean accuracy as the rating
intraclass_corr(df, targets="word", ratings="acc", raters="code_name")

- ICC2 checks the absolute agreement between k raters (models in our case), assuming a single measurement per target
- ICC2 = 0.90644, which is very high, indicating that the accuracy pattern across models is quite consistent

In [None]:
# Reshape long -> wide: one row per word, one accuracy column per code_name
wide_df = pd.pivot_table(
    df,
    values="acc",
    index="word",
    columns="code_name",
).reset_index()

In [None]:
# Rank the words within each of the ten model columns (Slow_OP_10_r0000 .. r0009)
df_rank = pd.DataFrame(
    {f"rank_{i}": wide_df[f"Slow_OP_10_r{i:04d}"].rank() for i in range(10)}
)

In [None]:
# Spearman rho between every pair of rank columns (one column per model)
rho, pval = spearmanr(df_rank)
# Wrap the matrix so the heatmap ticks show the rank column names instead of bare positions
rho_labelled = pd.DataFrame(rho, index=df_rank.columns, columns=df_rank.columns)
ax = sns.heatmap(rho_labelled, cmap="Spectral_r", vmin=-1, vmax=1, annot=True)
ax.set_title("Pairwise Spearman rho of per-word accuracy ranks");

- Spearman rho is a nonparametric measure of rank correlation
- Between each pair of models, rho is between 0.87 and 0.90, showing high consistency across models

# Single epoch

In [None]:
def load_raw_data(epoch=None):
    """Fetch mean accuracy per (code_name, word) pair from BigQuery.

    Averages `acc` over rows of `slow_op_10.train` with `unit_time=4.0`,
    grouped by `code_name` and `word`.

    Args:
        epoch: Epoch to restrict the average to. When None (the default),
            rows from every epoch are averaged, so this definition stays
            compatible with the zero-argument `load_raw_data()` that it
            replaces earlier in the notebook.

    Returns:
        The query result converted with `to_dataframe()`, with columns
        `code_name`, `word` and `acc`.

    Raises:
        TypeError, ValueError: If `epoch` is not None and cannot be
            converted to a float.
    """
    # Coerce to float before interpolating so only a numeric literal can
    # ever reach the SQL text.
    epoch_filter = "" if epoch is None else f" AND epoch = {float(epoch)}"
    query = f"""
    SELECT
        code_name,
        word,
        AVG(acc) AS acc
    FROM
        slow_op_10.train
    WHERE
        unit_time=4.0{epoch_filter}
    GROUP BY
        code_name,
        word;
    """
    query_job = client.query(query)

    return query_job.to_dataframe()

In [None]:
def get_icc(epoch):
    """Return the ICC2 estimate of between-model agreement at one epoch.

    Args:
        epoch: Epoch passed through to `load_raw_data`.

    Returns:
        The `ICC` value of the row whose `Type` is "ICC2" in the table
        returned by `intraclass_corr` (words as targets, `code_name` as
        raters, `acc` as ratings).
    """
    epoch_df = load_raw_data(epoch)
    icc = intraclass_corr(epoch_df, targets="word", ratings="acc", raters="code_name")
    # Select by label rather than by row position so the result does not
    # depend on the order in which the ICC variants are listed.
    return icc.loc[icc["Type"] == "ICC2", "ICC"].iloc[0]
    

In [None]:
# ICC2 at each sampled epoch (10, 20, ..., 100), in ascending epoch order
icc_over_epoch = list(map(get_icc, np.linspace(10, 100, 10)))

In [None]:
def get_acc_over_epoch():
    """Fetch the mean accuracy per epoch from BigQuery.

    Averages `acc` over all rows of `slow_op_10.train` with `unit_time=4.0`,
    grouped by `epoch`.

    Returns:
        The query result converted with `to_dataframe()`, with columns
        `epoch` and `acc`, sorted by ascending `epoch`.
    """
    # ORDER BY makes the row order deterministic; without it the groups may
    # come back in any order, which breaks positional alignment with other
    # per-epoch sequences.
    query = """
    SELECT
        epoch,
        AVG(acc) AS acc
    FROM
        slow_op_10.train
    WHERE
        unit_time=4.0
    GROUP BY
        epoch
    ORDER BY
        epoch;
    """
    query_job = client.query(query)

    return query_job.to_dataframe()

# One row per epoch with the mean accuracy over all rows at unit_time=4.0
acc = get_acc_over_epoch()

In [None]:
# Drop epochs below 10 (ICC2 is computed from epoch 10 onward). Copy so that
# adding a column later modifies an independent frame, not a slice of `acc`.
icc_df = acc.loc[acc.epoch >= 10].copy()

In [None]:
# `icc_over_epoch` is in ascending epoch order, so sort the rows before
# attaching it; `assign` returns a new frame instead of writing into a slice.
icc_df = icc_df.sort_values("epoch").assign(icc2=icc_over_epoch)

In [None]:
# Draw both series on one axis; the y-axis gets a neutral label because it carries two metrics
ax = sns.lineplot(x="epoch", y="acc", data=icc_df, label="acc")
sns.lineplot(x="epoch", y="icc2", data=icc_df, label="icc2", ax=ax)
ax.set(xlabel="epoch", ylabel="value", title="Mean accuracy and ICC2 over epochs");