In [1]:
import json
from copy import deepcopy
from tqdm import trange
from tqdm import tqdm
from scipy.stats import pearsonr, spearmanr, kendalltau
import numpy as np
import random
from collections import defaultdict

In [3]:
# Parse the JSON data file into `d`. Later cells iterate it with `.items()` and
# read a "sys_summs" entry from each value.
with open("/home/sameerj/capstone_misc/data_sms.json") as f_all:
    d = json.loads(f_all.read())

In [4]:
# Global score table, indexed as scores[category][dimension] -> list of values.
# The four categories used by the rest of the notebook are pre-created so that
# each one auto-creates an empty list per dimension on first access.
scores = defaultdict(dict)
scores.update(
    (category, defaultdict(list))
    for category in ("human_score", "unieval", "ice_uniform", "ice_stratified")
)

In [5]:
def unroll_scores_article(article_data):
    """Append one article's per-summary scores to the global ``scores`` table.

    For every dimension, every entry in ``article_data["sys_summs"]`` and
    every score category, the value stored under
    ``summary["scores"]["<category>_<dimension>"]`` is appended to
    ``scores[category][dimension]``.

    Args:
        article_data: mapping with a "sys_summs" entry; each of its values
            must carry a "scores" mapping keyed "<category>_<dimension>".

    Returns:
        None. The module-level ``scores`` table is mutated in place.
    """
    metric_names = ("human_score", "ice_uniform", "ice_stratified", "unieval")
    for dimension in ("coherence", "consistency", "fluency", "relevance"):
        for summary in article_data["sys_summs"].values():
            for metric in metric_names:
                bucket = scores[metric][dimension]
                bucket.append(summary["scores"][f"{metric}_{dimension}"])


In [6]:
def unroll_scores(data):
    """Feed every article record in ``data`` to ``unroll_scores_article``.

    Args:
        data: mapping whose values are per-article records; the keys are
            not used.

    Returns:
        None.
    """
    for record in data.values():
        unroll_scores_article(record)


In [7]:
# Fill the global `scores` table from every article in `d`.
# The values are appended, so re-running this cell without first re-creating
# `scores` adds every entry a second time.
unroll_scores(d)

In [8]:
# Sanity check: how many human coherence scores were collected.
len(scores["human_score"]["coherence"])

1536

In [20]:
# Inspect the collected human coherence scores.
# NOTE(review): this displays the entire list; a slice such as [:10] would keep
# the cell output short.
scores["human_score"]["coherence"]

[0.17,
 0.92,
 0.92,
 0.75,
 0.92,
 0.83,
 0.92,
 0.92,
 0.75,
 0.83,
 0.92,
 0.83,
 0.83,
 0.33,
 0.17,
 0.58,
 0.0,
 0.17,
 0.75,
 0.0,
 0.67,
 0.17,
 0.92,
 0.92,
 0.5,
 0.58,
 0.17,
 0.92,
 0.67,
 0.08,
 0.08,
 0.0,
 0.58,
 0.58,
 0.67,
 0.17,
 0.33,
 0.5,
 0.83,
 0.58,
 1.0,
 0.83,
 0.08,
 0.83,
 0.83,
 0.5,
 0.17,
 0.17,
 0.42,
 0.67,
 0.67,
 0.67,
 0.42,
 0.75,
 0.42,
 0.67,
 0.92,
 0.5,
 0.42,
 0.83,
 0.92,
 0.25,
 0.5,
 0.5,
 0.33,
 1.0,
 1.0,
 0.75,
 1.0,
 0.75,
 1.0,
 0.92,
 0.25,
 1.0,
 0.58,
 1.0,
 0.92,
 0.83,
 0.42,
 0.17,
 0.42,
 0.75,
 0.58,
 0.42,
 0.83,
 0.42,
 0.75,
 0.92,
 0.5,
 0.58,
 1.0,
 0.42,
 0.83,
 0.33,
 0.83,
 0.33,
 0.33,
 1.0,
 0.58,
 0.42,
 0.33,
 0.92,
 0.5,
 0.42,
 0.5,
 0.42,
 0.5,
 0.83,
 0.67,
 0.42,
 0.33,
 0.08,
 0.25,
 0.83,
 0.33,
 0.83,
 0.83,
 0.5,
 0.5,
 0.75,
 0.83,
 0.42,
 0.42,
 0.33,
 0.67,
 0.83,
 0.58,
 0.17,
 0.75,
 0.67,
 0.5,
 0.75,
 0.83,
 0.75,
 0.5,
 0.75,
 0.42,
 1.0,
 0.75,
 0.83,
 0.92,
 0.58,
 0.83,
 0.25,
 0.83,
 1.0,
 0.58,

In [18]:
# Spearman rank correlation between the human and `ice_uniform` consistency
# scores over all collected items.
spearmanr(scores["human_score"]["consistency"], scores["ice_uniform"]["consistency"])

SpearmanrResult(correlation=0.4868035166335241, pvalue=3.363326582975367e-92)

In [19]:
# Spearman rank correlation between the human and `unieval` consistency
# scores over all collected items.
spearmanr(scores["human_score"]["consistency"], scores["unieval"]["consistency"])

SpearmanrResult(correlation=0.27056848436023057, pvalue=3.5385599965712854e-27)

In [40]:
def sigtest(m1, m2, human, dimension, n_resamples=1000, sample_frac=0.8, alpha=0.05):
    """Compare two metrics' Spearman correlation with human scores by random subsampling.

    In each of ``n_resamples`` rounds, a random subset (drawn without
    replacement) holding ``sample_frac`` of the items is selected. Spearman's
    rho between the human scores and each metric is computed on that subset,
    and the round is credited to whichever metric has the strictly higher
    correlation. Rounds where the two correlations are equal, or where one is
    NaN, are credited to neither metric.

    The score lists are read from the module-level ``scores`` table as
    ``scores[name][dimension]``. The number of rounds won by ``m1`` is printed.

    Args:
        m1: key in ``scores`` of the first metric.
        m2: key in ``scores`` of the second metric.
        human: key in ``scores`` of the reference (human) scores.
        dimension: dimension key selecting one list per metric.
        n_resamples: number of subsampling rounds.
        sample_frac: fraction of items drawn in each round.
        alpha: a metric must win more than ``(1 - alpha)`` of the rounds to
            be declared better.

    Returns:
        1 if ``m1`` wins more than ``(1 - alpha) * n_resamples`` rounds,
        -1 if ``m2`` does, otherwise 0.

    Raises:
        ValueError: if the three score lists do not all have the same length.
    """
    target_all = scores[human][dimension]
    pred1_all = scores[m1][dimension]
    pred2_all = scores[m2][dimension]

    # Take the item count from the data rather than assuming a fixed dataset size.
    n_items = len(target_all)
    if not (len(pred1_all) == len(pred2_all) == n_items):
        raise ValueError(
            f"score lists for dimension {dimension!r} differ in length: "
            f"{human}={n_items}, {m1}={len(pred1_all)}, {m2}={len(pred2_all)}"
        )
    subset_size = int(sample_frac * n_items)

    better = 0  # rounds where m1 correlates strictly better than m2
    worse = 0   # rounds where m2 correlates strictly better than m1
    for _ in trange(n_resamples):
        sub_ids = random.sample(range(n_items), subset_size)
        target = [target_all[i] for i in sub_ids]
        pred1 = [pred1_all[i] for i in sub_ids]
        pred2 = [pred2_all[i] for i in sub_ids]
        corr1 = spearmanr(target, pred1)[0]
        corr2 = spearmanr(target, pred2)[0]
        if corr1 > corr2:
            better += 1
        elif corr2 > corr1:
            worse += 1
    print(better)

    # With the defaults this is 950 of 1000 rounds.
    win_threshold = n_resamples - alpha * n_resamples
    if better > win_threshold:
        return 1
    elif worse > win_threshold:
        return -1
    else:
        return 0

In [41]:
# ice_uniform vs unieval: for every dimension, run sigtest and report which
# metric (if either) it declares the winner.
dims = ['consistency', 'fluency', 'relevance', 'coherence']
m1 = 'ice_uniform'
m2 = 'unieval'
human = "human_score"
for dim in dims:
    op = sigtest(m1, m2, human, dim)
    # sigtest returns 1 when m1 wins, -1 when m2 wins; any other value means no winner.
    if op == 1:
        opstr = m1
    elif op == -1:
        opstr = m2
    else:
        opstr = 'neither'

    print(f'{dim} --> {m1} vs {m2}: {opstr}')

100%|██████████| 1000/1000 [00:01<00:00, 581.08it/s]


1000
consistency --> ice_uniform vs unieval: ice_uniform


100%|██████████| 1000/1000 [00:01<00:00, 586.78it/s]


1000
fluency --> ice_uniform vs unieval: ice_uniform


100%|██████████| 1000/1000 [00:01<00:00, 585.75it/s]


1000
relevance --> ice_uniform vs unieval: ice_uniform


100%|██████████| 1000/1000 [00:01<00:00, 583.68it/s]

1000
coherence --> ice_uniform vs unieval: ice_uniform



