In [1]:
import os
import time
import csv
import json
import numpy as np
import torch as th
import glob

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score

# Make only GPU 0 visible to the CUDA-backed libraries used by this notebook.
os.environ['CUDA_VISIBLE_DEVICES']='0'

In [2]:
from torchmetrics.text.rouge import ROUGEScore

In [3]:
def get_bleu(recover, reference):
    """Sentence-level BLEU of a generated sentence against a single reference.

    Args:
        recover: Generated (hypothesis) sentence; tokenised with ``str.split()``.
        reference: Reference sentence; tokenised with ``str.split()``.

    Returns:
        The score returned by ``sentence_bleu`` using ``SmoothingFunction().method4``.
    """
    # sentence_bleu expects (list_of_references, hypothesis). BLEU is not
    # symmetric -- n-gram precision and the brevity penalty are computed on the
    # hypothesis -- so the reference must go first and the generated text second.
    return sentence_bleu(
        [reference.split()],
        recover.split(),
        smoothing_function=SmoothingFunction().method4,
    )

def selectBest(sentences):
    """Pick the candidate with the highest total BLEU against the other candidates.

    A full pairwise ``get_bleu`` matrix is built, the diagonal (each sentence
    scored against itself) is zeroed, and the sentence whose row sum is largest
    is returned.
    """
    pairwise = [
        [get_bleu(candidate, other) for other in sentences]
        for candidate in sentences
    ]
    # Self-comparisons must not contribute to a candidate's total.
    for k in range(len(pairwise)):
        pairwise[k][k] = 0
    best = np.argmax(np.sum(pairwise, -1))
    return sentences[best]

def diversityOfSet(sentences):
    """Return ``(mean self-BLEU, distinct-4)`` for a set of sentences.

    Self-BLEU is averaged over every unordered pair ``(i, j)`` with ``i < j``;
    when there are no pairs a single 0 is averaged instead. The second value
    is ``distinct_n_gram_inter_sent(sentences, 4)``.
    """
    count = len(sentences)
    pair_scores = [
        get_bleu(sentences[a], sentences[b])
        for a in range(count)
        for b in range(a + 1, count)
    ]
    if not pair_scores:
        pair_scores.append(0)
    div4 = distinct_n_gram_inter_sent(sentences, 4)
    return np.mean(pair_scores), div4


def distinct_n_gram(hypn,n):
    """Mean per-sentence distinct-n ratio (unique n-grams / total n-grams).

    Args:
        hypn: Iterable of sentences; each is tokenised with ``str.split()``.
        n: Order of the n-grams.

    Returns:
        The mean of the per-sentence ratios. A sentence with no n-grams
        contributes 0 to the mean; an empty ``hypn`` returns 0.
    """
    ratios = []
    for hyp in hypn:
        hyp_ngrams = list(nltk.ngrams(hyp.split(), n))
        if not hyp_ngrams:
            # A sentence shorter than n tokens scores 0 on its own; it must not
            # wipe out the ratios of every other sentence by returning early.
            ratios.append(0.0)
            continue
        ratios.append(len(set(hyp_ngrams)) / len(hyp_ngrams))
    if not ratios:
        # np.mean([]) would emit a RuntimeWarning and return nan.
        return 0
    return np.mean(ratios)


def distinct_n_gram_inter_sent(hypn, n):
    """Distinct-n ratio computed over the n-grams of all sentences pooled together.

    Returns unique n-grams divided by total n-grams across ``hypn``, or 0 when
    no n-grams could be formed.
    """
    pooled = []
    for sentence in hypn:
        pooled.extend(nltk.ngrams(sentence.split(), n))
    if not pooled:
        return 0
    return len(set(pooled)) / len(pooled)

def metrics_calculate_S(recovers_S, sources_S, references_S, S):
    """Compute quality and diversity metrics over ``S`` sets of generated sentences.

    Args:
        recovers_S: List of samples; each sample is a list of generated strings.
        sources_S: List of samples; each sample is a list of source strings.
        references_S: List of samples; each sample is a list of reference strings.
        S: Number of samples to consume, taken from the end of the three lists.
            Each list must hold at least ``S`` entries.

    Returns:
        Tuple ``(bleu, rougel, F1, dist1, selfBleu, div4, avg_len)``:
        mean BLEU, mean ROUGE-L F-measure, mean BERTScore F1 (a torch tensor),
        mean distinct-1 of the selected sentence, mean self-BLEU and mean
        distinct-4 across each item's candidates, and mean length of the
        selected sentence.
    """
    sos = '[CLS]'
    eos = '[SEP]'
    sep = '[SEP]'
    pad = '[PAD]'

    rougeScore = ROUGEScore()

    num_items = len(sources_S[0])
    sentenceDict = {i: [] for i in range(num_items)}
    referenceDict = {i: [] for i in range(num_items)}
    sourceDict = {i: [] for i in range(num_items)}

    # Shallow-copy the outer lists so that pop() below does not empty the
    # caller's lists; the inner lists are only read, never written.
    recovers_S = list(recovers_S)
    sources_S = list(sources_S)
    references_S = list(references_S)

    for _ in range(S):
        sources = sources_S.pop()
        references = references_S.pop()
        recovers = recovers_S.pop()
        for i in range(len(sources)):
            # Strip the special tokens before scoring.
            source = sources[i].replace(eos, '').replace(sos, '')
            reference = references[i].replace(eos, '').replace(sos, '').replace(sep, '')
            recover = recovers[i].replace(eos, '').replace(sos, '').replace(sep, '').replace(pad, '')

            sentenceDict[i].append(recover)
            referenceDict[i].append(reference)
            sourceDict[i].append(source)

    # Diversity among the candidates generated for the same item.
    div4 = []
    selfBleu = []
    for candidates in sentenceDict.values():
        if len(candidates) == 0:
            continue
        sb, d4 = diversityOfSet(candidates)
        selfBleu.append(sb)
        div4.append(d4)

    selfBleu = np.mean(selfBleu)
    div4 = np.mean(div4)

    # Keep one candidate per item (the one selectBest picks) and pair it with
    # the first stored reference for that item.
    recovers = []
    references = []
    for k, candidates in sentenceDict.items():
        if len(candidates) == 0 or len(referenceDict[k]) == 0:
            continue
        recovers.append(selectBest(candidates))
        references.append(referenceDict[k][0])

    bleu = []
    rougel = []
    avg_len = []
    dist1 = []
    for reference, recover in zip(references, recovers):
        bleu.append(get_bleu(recover, reference))
        rougel.append(rougeScore(recover, reference)['rougeL_fmeasure'].tolist())
        # NOTE(review): split(' ') also counts the empty strings left where
        # special tokens were removed; kept as-is so reported lengths do not change.
        avg_len.append(len(recover.split(' ')))
        dist1.append(distinct_n_gram([recover], 1))

    _, _, F1 = score(recovers, references, model_type='microsoft/deberta-xlarge-mnli', lang='en', verbose=True)

    bleu = np.mean(bleu)
    rougel = np.mean(rougel)
    F1 = th.mean(F1)
    dist1 = np.mean(dist1)
    avg_len = np.mean(avg_len)
    return bleu, rougel, F1, dist1, selfBleu, div4, avg_len

def create_log_file(filename):
    """Create (or truncate to empty) the log file ``score/{filename}.csv``.

    The parent directory is created when it does not exist yet, so the call
    also works on a fresh checkout.
    """
    path = f'score/{filename}.csv'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Opening in 'w' mode is all that is needed to create/empty the file.
    with open(path, 'w', newline=''):
        pass

def record_metric_score(msg, filename):
    """Append ``msg`` as a single-column row to ``score/{filename}.csv``.

    The parent directory is created when missing, so this can be called
    without a prior ``create_log_file`` call.
    """
    path = f'score/{filename}.csv'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow([msg])

In [4]:
# --- Evaluation configuration ---
ver = 'metabeta_ver'
dataset = 'WA'
checkpoint_path = f'checkpoint/{dataset}/{ver}/inferences'

# One entry is appended per evaluated checkpoint, in the same order as `epochs`.
bleu_list = list()
rougel_list = list()
bert_score_list = list()
dist1_list = list()
selfBleu_list = list()
div4_list = list()
avg_len_list = list()

epochs = [i*1000 for i in range(80,80+1)]
use_ema = True
# MBR size. Only one inference file is loaded per checkpoint (information[0]),
# and metrics_calculate_S pops S entries, so S must not exceed len(information).
S = 1

s_t = time.time()
for epoch in epochs:
    if use_ema:
        filepath = f'{checkpoint_path}/test_inferences_ema_{epoch}.csv'
    else:
        filepath = f'{checkpoint_path}/test_inferences_{epoch}.csv'
    print(filepath)

    information = {}
    with open(filepath, 'r', newline='') as csvfile:
        rows = csv.reader(csvfile)
        recovers = []
        sources = []
        references = []
        for row in rows:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise IndexError.
                continue
            if 'Epoch' in row[0]:
                # Header row: keep its text as the label stored with this file.
                epoch = row[0].replace('\n', '')
                continue
            # The first field is split on ',source:' and ',reference:' after
            # quotes and double spaces are stripped.
            temp = row[0].replace('\n', '').replace('  ', '').replace('\'', '').replace('\"', '').split(',source:')
            recovers.append(temp[0].split('recover:')[1])
            sources.append(temp[1].split(',reference:')[0])
            references.append(temp[1].split(',reference:')[1])
        information[0] = [epoch, recovers, sources, references]
    print(len(recovers), len(sources), len(references))

    # Regroup the loaded data into the list-of-samples layout expected by
    # metrics_calculate_S.
    recovers_S = list()
    sources_S = list()
    references_S = list()
    for i in range(len(information)):
        epoch = information[i][0]
        recovers = information[i][1]
        sources = information[i][2]
        references = information[i][3]

        recovers_S.append(recovers)
        sources_S.append(sources)
        references_S.append(references)

    bleu, rougel, bert_score, dist1, selfBleu, div4, avg_len = metrics_calculate_S(recovers_S, sources_S, references_S, S)
    bleu_list.append(bleu)
    rougel_list.append(rougel)
    bert_score_list.append(bert_score)
    dist1_list.append(dist1)
    selfBleu_list.append(selfBleu)
    div4_list.append(div4)
    avg_len_list.append(avg_len)

e_t = time.time() - s_t
print(f'[Evaluate] Using {e_t/60:.2f} mins.')

BalaGinjo_checkpoint/CC/metaBeta_0315_Rsparse_e1_updateD100_FSM/inferences//test_inferences_ema_50000.csv
128 128 128


Some weights of the model checkpoint at microsoft/deberta-xlarge-mnli were not used when initializing DebertaModel: ['pooler.dense.bias', 'classifier.weight', 'classifier.bias', 'pooler.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.21 seconds, 106.18 sentences/sec
[Evaluate] Using 0.27 mins.


In [5]:
# Print the collected metrics for every evaluated checkpoint.
for i, checkpoint_step in enumerate(epochs):
    print(f'epoch: {checkpoint_step}')
    print(
        f'bleu: {bleu_list[i]:.4f},\n'
        f'rougel: {rougel_list[i]:.4f},\n'
        f'bert score: {bert_score_list[i]:.4f},\n'
        f'dist1: {dist1_list[i]:.4f},\n'
        f'selfBleu: {selfBleu_list[i]:.4f},\n'
        f'div4: {div4_list[i]:.4f},\n'
        f'avg_len: {avg_len_list[i]:.2f}'
    )
    print()