In [1]:
import re
from collections import defaultdict

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from bleu import multi_list_bleu, list_bleu
from rouge import Rouge

rouge = Rouge()
wordnet_lemmatizer = WordNetLemmatizer()


In [None]:
def preprocess_text(text, join_again=True):
    """Normalise a piece of text so its words match lexicon entries more easily.

    The text is lowercased, stripped of every character that is neither a
    word character nor whitespace, tokenized with ``word_tokenize`` and each
    token is lemmatized with the module-level ``wordnet_lemmatizer``.

    Args:
        text: The raw string to normalise.
        join_again: When true, return the lemmas joined by single spaces;
            otherwise return them as a list.

    Returns:
        A space-joined string of lemmas, or the list of lemmas.
    """
    # "U.S." is rewritten first: lowercasing plus punctuation stripping
    # would otherwise reduce it to "us".
    normalised = text.replace("U.S.", "USA").lower()
    normalised = re.sub(r'[^\w\s]','',normalised)

    lemmas = [wordnet_lemmatizer.lemmatize(tok) for tok in word_tokenize(normalised)]

    return " ".join(lemmas) if join_again else lemmas

In [3]:
def load_VAD_lexicons(vad_path='data/lexicons/NRC-VAD-Lexicon-Aug2018Release/NRC-VAD-Lexicon.txt'):
    """Load a tab-separated VAD lexicon file into a dictionary.

    Every data row must contain exactly four tab-separated fields: the
    lexicon entry followed by three numeric scores, stored in that order
    under the keys ``'v'``, ``'a'`` and ``'d'``.

    Blank lines (for example the one left by a trailing newline) are
    skipped. If the first non-blank row has four fields but non-numeric
    scores it is treated as a column header and skipped as well.

    Args:
        vad_path: Path of the lexicon file. Defaults to the location used
            by this notebook.

    Returns:
        dict mapping each lexicon entry to
        ``{'v': float, 'a': float, 'd': float}``.

    Raises:
        ValueError: If a data row does not have four fields or one of its
            scores cannot be parsed as a float.
        OSError: If the file cannot be opened.
    """
    vad_dict = {}
    seen_content = False

    # Explicit encoding so the result does not depend on the platform default.
    with open(vad_path, 'r', encoding='utf-8') as infile:
        for line_no, raw_line in enumerate(infile, start=1):
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue

            is_first_row = not seen_content
            seen_content = True

            fields = line.split("\t")
            if len(fields) != 4:
                raise ValueError(
                    "{}:{}: expected 4 tab-separated fields, got {}".format(
                        vad_path, line_no, len(fields)))

            lexicon, v_score, a_score, d_score = fields
            try:
                scores = {
                    'v': float(v_score),
                    'a': float(a_score),
                    'd': float(d_score)
                }
            except ValueError:
                if is_first_row:
                    # Non-numeric scores on the very first row: a header line.
                    continue
                raise ValueError(
                    "{}:{}: non-numeric score in row {!r}".format(
                        vad_path, line_no, line)) from None

            vad_dict[lexicon] = scores

    return vad_dict
    
# Load the lexicon once when this cell runs; the evaluation cell passes this
# module-level `vad_dict` to obtain_instance_bias_scores.
vad_dict = load_VAD_lexicons()

In [4]:

def obtain_instance_bias_scores(instance_tokens, vad_dict):
    """Aggregate valence/arousal statistics over the tokens found in ``vad_dict``.

    Tokens missing from ``vad_dict`` are ignored. For every matched token:

    * its ``'v'`` value is added to the running valence total;
    * its distance from 0.5 is added to the negative total when ``'v'`` is
      below 0.5, otherwise to the positive total;
    * it is placed in one of three buckets by ``'v'`` (above 0.65 positive,
      below 0.35 negative, anything else mid) and its ``'a'`` value is
      added to that bucket's arousal sum.

    Args:
        instance_tokens: Iterable of tokens to score.
        vad_dict: Mapping from token to a dict providing ``'v'`` and ``'a'``.

    Returns:
        A 6-tuple ``(v_score, v_positive, v_negative, matched_count,
        (p_arousal, n_arousal, m_arousal), (positive, negative, mid))``
        where the last element holds the bucketed token lists.
    """
    matched_count = 0
    valence_total = 0
    positive_shift = 0
    negative_shift = 0

    arousal_sum = {"pos": 0, "neg": 0, "mid": 0}
    bucket_tokens = {"pos": [], "neg": [], "mid": []}

    for token in instance_tokens:
        if token not in vad_dict:
            continue

        entry = vad_dict[token]
        valence = entry['v']

        matched_count += 1
        valence_total += valence

        # Distance from the 0.5 midpoint, credited to one side only.
        shift = abs(valence - 0.5)
        if valence < 0.5:
            negative_shift += shift
        else:
            positive_shift += shift

        if valence > 0.65:
            bucket = "pos"
        elif valence < 0.35:
            bucket = "neg"
        else:
            bucket = "mid"

        bucket_tokens[bucket].append(token)
        arousal_sum[bucket] += entry['a']

    arousal_triplet = (arousal_sum["pos"], arousal_sum["neg"], arousal_sum["mid"])
    token_triplet = (bucket_tokens["pos"], bucket_tokens["neg"], bucket_tokens["mid"])

    return (valence_total, positive_shift, negative_shift, matched_count,
            arousal_triplet, token_triplet)
                

In [None]:
# Placeholders: assign the real data here before running the evaluation cell.
# The evaluation cell expects two sequences of strings with the same length,
# paired by position (generation i is compared against gold target i).
gens = None       # TODO: load your generations here
test_tgts = None  # TODO: load your gold target texts here

In [None]:
# Bias calculation WITH calibration - "TEST data"
#
# For every (gold target, generation) pair, only the tokens that appear in
# the generation but not in the gold target are scored against the VAD
# lexicon. Totals and per-instance averages are printed at the end.

assert len(gens) == len(test_tgts), "gens and test_tgts must have the same length"

cnt = len(gens)
test_gen_score_dict = defaultdict(float)

# calculate ROUGE
results = rouge.get_scores(gens, test_tgts, avg=True)

# BLEU is disabled in this notebook. The call that used to be here was
# `list_bleu(test_tgts, gens)`; check its expected argument shape against the
# bleu package before re-enabling it. Assign a value to have it reported.
blue_score = None

# `pred_path` is only used as a header label. Fall back to a placeholder when
# no earlier cell has defined it, instead of failing with a NameError.
report_label = globals().get("pred_path", "<pred_path not set>")

print("="*100)
print(report_label)
print("="*100)

print("[ROUGE 1]", results['rouge-1'])
print("[ROUGE 2]", results['rouge-2'])
print("[ROUGE L]", results['rouge-l'])
if blue_score is not None:
    print("[BLEU]", blue_score)

for tgt, gen in zip(test_tgts, gens):

    # Keep the segment right after the first 'ARTICLE=>' marker, if present.
    if 'ARTICLE=>' in gen:
        gen = gen.split('ARTICLE=>')[1].strip()

    # preprocess for better lexicon matching for bias analysis
    tgt = preprocess_text(tgt)
    gen = preprocess_text(gen)

    # Tokens introduced by the generation that the gold target does not contain.
    gen_minus_gold_tokens = set(word_tokenize(gen)).difference(word_tokenize(tgt))

    gen_v_score, gen_v_positive, gen_v_negative, _, \
    (gen_p_arousal, gen_n_arousal, _), _ \
    = obtain_instance_bias_scores(gen_minus_gold_tokens, vad_dict)

    test_gen_score_dict['v'] += gen_v_score
    test_gen_score_dict['v_positive'] += gen_v_positive
    test_gen_score_dict['v_negative'] += gen_v_negative

    test_gen_score_dict['calibrated_p_arousal'] += gen_p_arousal
    test_gen_score_dict['calibrated_n_arousal'] += gen_n_arousal

    # NOTE(review): other_bias_lexicon_analysis is not defined in the visible
    # part of this notebook - confirm it is provided by another cell/module.
    # It is used here as returning a dict of numeric scores keyed by name.
    gen_other_bias_dict = other_bias_lexicon_analysis(gen_minus_gold_tokens)
    for key in gen_other_bias_dict:
        test_gen_score_dict[key] += gen_other_bias_dict[key]


# Report each accumulated score as "total | average per instance".
for key, value in test_gen_score_dict.items():
    print("[{}]: {:.2f} | {:.2f}".format(key, value, value/cnt))
print()

         