In [1]:
!pwd

/home/anton/optimization-methods/statistics


In [3]:
%cd ../embeddings-for-trees

/home/anton/optimization-methods/embeddings-for-trees


In [4]:
import os
import torch
from data_module.jsonl_data_module import JsonlDataModule
from omegaconf import OmegaConf
from utils.common import PAD, UNK, EOS, SOS

import matplotlib.pyplot as plt

import json
import pickle
import itertools
from collections import defaultdict

from nltk.translate.meteor_score import single_meteor_score
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from rouge import Rouge
from bert_score import score as bert_score

from scipy.stats import wilcoxon, mannwhitneyu
from time import perf_counter

Using backend: pytorch


In [11]:
config = OmegaConf.load("config/tree_lstm_med_10per1.yaml")

In [12]:
data_module = JsonlDataModule(config)
data_module.prepare_data()
data_module.setup()

Dataset is already downloaded


In [13]:
id_to_label = data_module._vocabulary.id_to_label

In [16]:
def ids_to_text(ids):
    res = []
    for idx in ids:
        if idx in [0, 2]:  # PAD, SOS
            continue
        elif idx == 3:  # EOS
            return res
        res.append(id_to_label[idx.item()])
    return res

In [17]:
def get_hyps_refs(predict_file = "/home/anton/opt-exps-data/outputs/25per/SGD_25per/Momentum_test_outputs.pkl",
                  use_first=10):
    start = perf_counter()
    test_dataloader = data_module.test_dataloader()
    predictions = torch.load(f"{predict_file}", map_location=torch.device('cpu'))

    hyps = []; refs = []
    
    if use_first is None:
        x = zip(predictions, test_dataloader)
    else:
        x = itertools.islice(zip(predictions, test_dataloader), use_first)
    
    for batch_our, batch_ref in x:
        batch_ref = batch_ref[0]
        for batch_idx in range(batch_our.size(1)):
            res_our = ids_to_text(batch_our[:, batch_idx])
            res_ref = ids_to_text(batch_ref[:, batch_idx])
            
            hyps.append(' '.join(res_our)); refs.append(' '.join(res_ref))
    finish = perf_counter()
    print(f"Preprocessing time elapsed: {finish - start}, s")

    return hyps, refs

In [21]:
class Metrics:
    def __init__(self, hyps, refs):
        self.hyps = hyps
        self.refs = refs
    
    def get_statistics(self, with_symbolic=False, bert=False):
        result = {
            'scores': {},
            'score': {}
        }
        for func in dir(Metrics):
            if callable(getattr(Metrics, func)) and func[0] != '_' and func != 'get_statistics':
                if (not with_symbolic and 'symb' in func) or (not bert and func == 'bert'):
                    continue
                print(f'Calculating {func} metric')
                start = perf_counter()
                metric_result = getattr(Metrics, func)(self)
                result['scores'].update(metric_result['scores'])
                result['score'].update(metric_result['score'])
                finish = perf_counter()
                print('Metric is calculated. Elapsed time:', finish - start)
    
        return result
    
    def bert(self):
        (P, R, F), hashname = bert_score(self.hyps, self.refs, lang="en", return_hash=True)
        return {
            'scores': {
                'bert-P': P.cpu().detach().numpy(),
                'bert-R': R.cpu().detach().numpy(),
                'bert-F': F.cpu().detach().numpy()
            },
            'score': {
                'bert-P': P.mean().item(),
                'bert-R': R.mean().item(),
                'bert-F': F.mean().item()
            }
        }
    
    def meteor(self):
        scores = []
        for hyp, ref in zip(self.hyps, self.refs):
            scores.append(single_meteor_score(ref, hyp))
        
        return {
            'scores': {
                'meteor': np.array(scores)
            },
            'score': {
                'meteor': np.mean(scores)
            }
        }

    def bleu(self):
        
        def bleu_weights(l):
            length = len(l)
            if length >= 4:
                return (0.25, 0.25, 0.25, 0.25)
            wt = [1 / length for _ in range(length)]
            wt.extend([0 for _ in range(4-length)])
            return tuple(wt)
        
        scores = []
        
        
        
        for hyp, ref in zip(self.hyps, self.refs):
            scores.append(sentence_bleu([ref], hyp, weights=bleu_weights(ref)))
        
        return {
            'scores': {
                'bleu': np.array(scores)
            },
            'score': {
                'bleu': corpus_bleu([[ref] for ref in self.refs], self.hyps)
            }
        }
    
    def symb_rouge_l(self):
        rouge = Rouge(metrics=['rouge-l'])
        scores_dict = defaultdict(list)
        
        hyps_symb = [' '.join(h.replace(' ', '')) for h in hyps]
        refs_symb = [' '.join(r.replace(' ', '')) for r in refs]

        scores = rouge.get_scores(hyps_symb, refs_symb)
        for score in scores:
            batch = {}
            for sub in score:
                batch.update({sub + k.upper(): v for k, v in score[sub].items()})

            for title in batch:
                scores_dict['symb-' + title].append(batch[title])

        scores_dict = dict(scores_dict)
        score_dict = {}
        for title in scores_dict:
            scores_dict[title] = np.array(scores_dict[title])
            score_dict[title] = scores_dict[title].mean()
        
        return {
            'scores': scores_dict,
            'score': score_dict
        }
    
    def rouge(self):
        rouge = Rouge()
        scores_dict = defaultdict(list)

        scores = rouge.get_scores(hyps, refs)
        for score in scores:
            batch = {}
            for sub in score:
                batch.update({sub + k.upper(): v for k, v in score[sub].items()})

            for title in batch:
                scores_dict[title].append(batch[title])

        scores_dict = dict(scores_dict)
        score_dict = {}
        for title in scores_dict:
            scores_dict[title] = np.array(scores_dict[title])
            score_dict[title] = scores_dict[title].mean()
        
        return {
            'scores': scores_dict,
            'score': score_dict
        }

## Check

In [22]:
%%time
hyps, refs = get_hyps_refs()

Preprocessing time elapsed: 14.219671744001971, s
CPU times: user 1.89 s, sys: 466 ms, total: 2.36 s
Wall time: 14.2 s


In [30]:
metrics = Metrics(hyps, refs)
test_metrics = metrics.get_statistics(with_symbolic=False, bert=False)

Calculating bleu metric
Metric is calculated. Elapsed time: 1.0576067639995017
Calculating meteor metric
Metric is calculated. Elapsed time: 0.7424251960037509
Calculating rouge metric
Metric is calculated. Elapsed time: 0.4186929900024552


In [31]:
hyps[:10], refs[:10]

(['get id',
  'on existing',
  'on existing',
  'on existing',
  'on existing',
  'get view',
  'get write',
  'set on touch max',
  'on item item',
  'get context'],
 ['get id',
  'on measure',
  'on measure',
  'on measure',
  'on measure',
  'get view',
  'get text',
  'set on touch listener',
  'on dialog item click',
  'get text view'])

In [33]:
for k, v in test_metrics['scores'].items():
    print(k, v[:10])

bleu [1.00000000e+00 3.87192832e-78 3.87192832e-78 3.87192832e-78
 3.87192832e-78 1.00000000e+00 3.54948106e-01 5.79217451e-01
 2.72661731e-01 3.91236061e-01]
meteor [0.9375     0.25       0.25       0.25       0.25       0.9375
 0.25       0.73611111 0.25641026 0.17241379]
rouge-1F [1.         0.5        0.5        0.5        0.5        1.
 0.5        0.75       0.57142857 0.4       ]
rouge-1P [1.         0.5        0.5        0.5        0.5        1.
 0.5        0.75       0.66666667 0.5       ]
rouge-1R [1.         0.5        0.5        0.5        0.5        1.
 0.5        0.75       0.5        0.33333333]
rouge-2F [1.         0.         0.         0.         0.         1.
 0.         0.66666666 0.         0.        ]
rouge-2P [1.         0.         0.         0.         0.         1.
 0.         0.66666667 0.         0.        ]
rouge-2R [1.         0.         0.         0.         0.         1.
 0.         0.66666667 0.         0.        ]
rouge-lF [1.         0.5        0.5      

## Methods reports

In [47]:
global_methods = [
#     ('Adam', 'adam_global_predictions.pkl'),
#     ('Cyclic SGD', './outputs/global_Cycle_SGD_test_outputs.pkl'),
#     ('Lamb', 'Lamb_global_predictions.pkl'),
#     ('LaRAdam', 'Lookahead_Radam_standart_lr/tensor/Lookahead_RAdam_test_outputs.pkl'),
#     ('LaAdam', './outputs/global_Lookahead_test_outputs.pkl'),
    ('RAdam', 'RAdam_global/tensor/RAdam_test_outputs.pkl'),
    ('SGD', 'SGD_global/tensor/Momentum_test_outputs.pkl'),
    ('LaSGD', 'Lookahead_global_predictions.pkl')
]

In [48]:
if os.path.isfile('/home/anton/data/global_methods_report.data'):

    with open('/home/anton/data/global_methods_report.data', 'rb') as fp:
        global_methods_report = pickle.load(fp)

else:
#     global_methods_report = {}

    for method, path in global_methods:
        hyps, refs = get_hyps_refs(predict_file=DATA_DIR+path, use_first=None)
        metrics = Metrics(hyps, refs)
        global_methods_report[method] = metrics.get_statistics(with_symbolic=True, bert=False)
        print(method, global_methods_report[method])

    with open('/home/anton/data/global_methods_report.data', 'wb') as fp:
        pickle.dump(global_methods_report, fp)

Preprocessing time elapsed: 995.5748151590014, s
Calculating bleu metric
Metric is calculated. Elapsed time: 80.88183521700012
Calculating meteor metric
Metric is calculated. Elapsed time: 53.89030301400089
Calculating rouge metric
Metric is calculated. Elapsed time: 36.001044742000886
Calculating symb_rouge_l metric
Metric is calculated. Elapsed time: 64.01198457699866
RAdam {'scores': {'bleu': array([1.00000000e+000, 1.00000000e+000, 1.00000000e+000, ...,
       6.07291035e-155, 0.00000000e+000, 0.00000000e+000]), 'meteor': array([0.9375, 0.9375, 0.9375, ..., 0.    , 0.    , 0.    ]), 'rouge-1F': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-1P': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-1R': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-2F': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-2P': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-2R': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-lF': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-lP': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge

In [51]:
global_methods_report

{'Adam': {'scores': {'bleu': array([1.00000000e+000, 1.00000000e+000, 1.00000000e+000, ...,
          1.13491276e-231, 8.84484440e-232, 0.00000000e+000]),
   'meteor': array([0.9375, 0.9375, 0.9375, ..., 0.    , 0.    , 0.    ]),
   'rouge-1F': array([1., 1., 1., ..., 0., 0., 0.]),
   'rouge-1P': array([1., 1., 1., ..., 0., 0., 0.]),
   'rouge-1R': array([1., 1., 1., ..., 0., 0., 0.]),
   'rouge-2F': array([1., 1., 1., ..., 0., 0., 0.]),
   'rouge-2P': array([1., 1., 1., ..., 0., 0., 0.]),
   'rouge-2R': array([1., 1., 1., ..., 0., 0., 0.]),
   'rouge-lF': array([1., 1., 1., ..., 0., 0., 0.]),
   'rouge-lP': array([1., 1., 1., ..., 0., 0., 0.]),
   'rouge-lR': array([1., 1., 1., ..., 0., 0., 0.]),
   'symb-rouge-lF': array([1.        , 1.        , 1.        , ..., 0.33333333, 0.18181818,
          0.        ]),
   'symb-rouge-lP': array([1.        , 1.        , 1.        , ..., 0.375     , 0.16666667,
          0.        ]),
   'symb-rouge-lR': array([1. , 1. , 1. , ..., 0.3, 0.2, 0. ]

In [55]:
local_methods = [
#     ('Adadelta', 'Adadelta_local/tensor/Adadelta_test_outputs.pkl'),
#     ('Barzilai-Borwein', 'BB_local/tensor/BB_test_outputs.pkl'),
#     ('LaRAdam', 'LaRAdam_local/tensor/Lookahead_RAdam_test_outputs.pkl'),
#     ('Adam', 'Adam_local/tensor/Adam_test_outputs.pkl'),
    ('SVRG', 'SVRG_local/tensor/SVRG_test_outputs.pkl'),
    ('SWA', 'SWA_local_const/tensor/SWA_test_outputs.pkl'),
]

In [56]:
if os.path.isfile('local_methods_report.data'):

    with open('local_methods_report.data', 'rb') as fp:
        local_methods_report = pickle.load(fp)

else:
#     local_methods_report = {}

    for method, path in local_methods:
        hyps, refs = get_hyps_refs(predict_file=DATA_DIR+path, use_first=None)
        metrics = Metrics(hyps, refs)
        local_methods_report[method] = metrics.get_statistics(with_symbolic=True, bert=False)
        print(method, local_methods_report[method])

    with open('local_methods_report.data', 'wb') as fp:
        pickle.dump(local_methods_report, fp)

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f558cf01310>
Traceback (most recent call last):
  File "/home/anton/opt-met-exps/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1324, in __del__
    self._shutdown_workers()
  File "/home/anton/opt-met-exps/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1316, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.8/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f558cf01310>
Traceback (most recent call last):
  File "/home/anton/opt-met-exps/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1324, in __del__
    self._shutdown_workers()
  File "/home/anton/opt-met-exps/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1316, in _shut

Preprocessing time elapsed: 983.2660813690018, s
Calculating bleu metric
Metric is calculated. Elapsed time: 75.26180082100109
Calculating meteor metric
Metric is calculated. Elapsed time: 44.55030993500259
Calculating rouge metric
Metric is calculated. Elapsed time: 32.516790330999356
Calculating symb_rouge_l metric
Metric is calculated. Elapsed time: 60.80438553199929
SVRG {'scores': {'bleu': array([1.00000000e+000, 1.00000000e+000, 1.00000000e+000, ...,
       4.93832578e-233, 0.00000000e+000, 0.00000000e+000]), 'meteor': array([0.9375, 0.9375, 0.9375, ..., 0.    , 0.    , 0.    ]), 'rouge-1F': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-1P': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-1R': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-2F': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-2P': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-2R': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-lF': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-lP': array([1., 1., 1., ..., 0., 0., 0.]), 'rouge-

In [57]:
for method in global_methods_report:
    print(method, global_methods_report[method]['score'])
    print()

Adam {'bleu': 0.2473695397571092, 'meteor': 0.2442593625233599, 'rouge-1F': 0.35564469697045975, 'rouge-1P': 0.3916712930502008, 'rouge-1R': 0.3481059005420179, 'rouge-2F': 0.14160716195789708, 'rouge-2P': 0.14598144736492577, 'rouge-2R': 0.14106221765945842, 'rouge-lF': 0.36518472954921566, 'rouge-lP': 0.4156373158626803, 'rouge-lR': 0.3479342834775493, 'symb-rouge-lF': 0.5355527538936309, 'symb-rouge-lP': 0.6649887983399876, 'symb-rouge-lR': 0.49367466202792953}

Lamb {'bleu': 0.31576961782899315, 'meteor': 0.28534654241234975, 'rouge-1F': 0.3980293636173818, 'rouge-1P': 0.43506358961377445, 'rouge-1R': 0.39214800029733854, 'rouge-2F': 0.18087299676891758, 'rouge-2P': 0.1867766183357475, 'rouge-2R': 0.18048168409736182, 'rouge-lF': 0.409513294543072, 'rouge-lP': 0.463702259795823, 'rouge-lR': 0.3912147487990936, 'symb-rouge-lF': 0.5733783731760014, 'symb-rouge-lP': 0.6883180458763324, 'symb-rouge-lR': 0.5366558314689206}

LaRAdam {'bleu': 0.36474375955099086, 'meteor': 0.315132914059

In [59]:
for method1, method2 in itertools.combinations(global_methods_report, 2):
    print(method1, '-', method2)
    for score in global_methods_report[method1]['score']:
        print(score, end=' ')
        h1 = None
        if global_methods_report[method1]['score'][score] > global_methods_report[method2]['score'][score]:
            pc = round(100 * (global_methods_report[method1]['score'][score] - global_methods_report[method2]['score'][score]) / global_methods_report[method2]['score'][score], 0)
            print('>', f'({pc}%)', end=' ')
            h1 = 'less'
        else:
            pc = round(100 * (global_methods_report[method2]['score'][score] - global_methods_report[method1]['score'][score]) / global_methods_report[method1]['score'][score], 0)
            print('<', f'({pc}%)', end=' ')
            h1 = 'greater'
            
        print()
        w, p = wilcoxon(
            global_methods_report[method1]['scores'][score],
            global_methods_report[method2]['scores'][score],
            alternative=h1
        )
        print('wilcoxon signed-rank: w =', round(int(w), -4), ', p =', p)
        
        s, p = mannwhitneyu(
            global_methods_report[method1]['scores'][score],
            global_methods_report[method2]['scores'][score],
            alternative=h1
        )
        print('mann-whitneyu: s =', round(int(s), -5), ', p =', p)
        
        print()
    print()

Adam - Lamb
bleu < (28.0%) 
wilcoxon signed-rank: w = 15367670000 , p = 1.0
mann-whitneyu: s = 84408700000 , p = 1.0

meteor < (17.0%) 
wilcoxon signed-rank: w = 4495560000 , p = 1.0
mann-whitneyu: s = 84943200000 , p = 1.0

rouge-1F < (12.0%) 
wilcoxon signed-rank: w = 4255200000 , p = 1.0
mann-whitneyu: s = 85350800000 , p = 1.0

rouge-1P < (11.0%) 
wilcoxon signed-rank: w = 4368150000 , p = 1.0
mann-whitneyu: s = 85624900000 , p = 1.0

rouge-1R < (13.0%) 
wilcoxon signed-rank: w = 1938040000 , p = 1.0
mann-whitneyu: s = 85334500000 , p = 1.0

rouge-2F < (28.0%) 
wilcoxon signed-rank: w = 287180000 , p = 1.0
mann-whitneyu: s = 86504300000 , p = 1.0

rouge-2P < (28.0%) 
wilcoxon signed-rank: w = 289350000 , p = 1.0
mann-whitneyu: s = 86492500000 , p = 1.0

rouge-2R < (28.0%) 
wilcoxon signed-rank: w = 233460000 , p = 1.0
mann-whitneyu: s = 86489700000 , p = 1.0

rouge-lF < (12.0%) 
wilcoxon signed-rank: w = 3983750000 , p = 1.0
mann-whitneyu: s = 85159300000 , p = 1.0

rouge-lP < (12.

mann-whitneyu: s = 86500000000 , p = 1.0

rouge-2F < (15.0%) 
wilcoxon signed-rank: w = 498550000 , p = 1.0
mann-whitneyu: s = 88195000000 , p = 1.0

rouge-2P < (15.0%) 
wilcoxon signed-rank: w = 486670000 , p = 1.0
mann-whitneyu: s = 88218100000 , p = 1.0

rouge-2R < (15.0%) 
wilcoxon signed-rank: w = 395470000 , p = 1.0
mann-whitneyu: s = 88178900000 , p = 1.0

rouge-lF < (8.0%) 
wilcoxon signed-rank: w = 4698840000 , p = 1.0
mann-whitneyu: s = 86683200000 , p = 1.0

rouge-lP < (7.0%) 
wilcoxon signed-rank: w = 4052720000 , p = 1.0
mann-whitneyu: s = 87272200000 , p = 1.0

rouge-lR < (9.0%) 
wilcoxon signed-rank: w = 2201950000 , p = 1.0
mann-whitneyu: s = 86539300000 , p = 1.0

symb-rouge-lF < (6.0%) 
wilcoxon signed-rank: w = 13438080000 , p = 1.0
mann-whitneyu: s = 84952800000 , p = 1.0

symb-rouge-lP < (2.0%) 
wilcoxon signed-rank: w = 12676980000 , p = 1.0
mann-whitneyu: s = 88607500000 , p = 1.0

symb-rouge-lR < (8.0%) 
wilcoxon signed-rank: w = 7595820000 , p = 1.0
mann-whitne

mann-whitneyu: s = 97261200000 , p = 1.0

rouge-lP > (8.0%) 
wilcoxon signed-rank: w = 6408020000 , p = 1.0
mann-whitneyu: s = 95348800000 , p = 1.0

rouge-lR > (15.0%) 
wilcoxon signed-rank: w = 5545930000 , p = 1.0
mann-whitneyu: s = 98070400000 , p = 1.0

symb-rouge-lF > (9.0%) 
wilcoxon signed-rank: w = 24146580000 , p = 1.0
mann-whitneyu: s = 100136700000 , p = 1.0

symb-rouge-lP > (2.0%) 
wilcoxon signed-rank: w = 14962270000 , p = 1.0
mann-whitneyu: s = 92906300000 , p = 1.0

symb-rouge-lR > (13.0%) 
wilcoxon signed-rank: w = 17260600000 , p = 1.0
mann-whitneyu: s = 102467200000 , p = 1.0


LaRAdam - LaSGD
bleu > (49.0%) 
wilcoxon signed-rank: w = 31384200000 , p = 1.0
mann-whitneyu: s = 104937600000 , p = 1.0

meteor > (27.0%) 
wilcoxon signed-rank: w = 12758300000 , p = 1.0
mann-whitneyu: s = 100504000000 , p = 1.0

rouge-1F > (17.0%) 
wilcoxon signed-rank: w = 11501380000 , p = 1.0
mann-whitneyu: s = 98883900000 , p = 1.0

rouge-1P > (9.0%) 
wilcoxon signed-rank: w = 83331700

In [62]:
for method in local_methods_report:
    print(f'LaRAdam_global - {method}_local')
    for score in global_methods_report['LaRAdam']['score']:
        print(score, end=' ')
        h1 = None
        if global_methods_report['LaRAdam']['score'][score] > local_methods_report[method]['score'][score]:
            pc = round(100 * (global_methods_report['LaRAdam']['score'][score] - local_methods_report[method]['score'][score]) / local_methods_report[method]['score'][score], 2)
            print('>', f'({pc}%)', end=' ')
            h1 = 'less'
        else:
            pc = round(100 * (local_methods_report[method]['score'][score] - global_methods_report['LaRAdam']['score'][score]) / global_methods_report['LaRAdam']['score'][score], 2)
            print('<', f'({pc}%)', end=' ')
            h1 = 'greater'
            
        print()
        w, p = wilcoxon(
            global_methods_report['LaRAdam']['scores'][score],
            local_methods_report[method]['scores'][score],
            alternative=h1
        )
        print('wilcoxon signed-rank: w =', round(int(w), -4), ', p =', p)
        
        s, p = mannwhitneyu(
            global_methods_report['LaRAdam']['scores'][score],
            local_methods_report[method]['scores'][score],
            alternative=h1
        )
        print('mann-whitneyu: s =', round(int(s), -5), ', p =', p)
        
        print()
    print()

LaRAdam_global - Adadelta_local
bleu < (1.57%) 
wilcoxon signed-rank: w = 11409070000 , p = 1.0
mann-whitneyu: s = 90417300000 , p = 0.999999923458836

meteor < (1.25%) 
wilcoxon signed-rank: w = 3475030000 , p = 1.0
mann-whitneyu: s = 90629900000 , p = 0.9996891254603857

rouge-1F < (0.18%) 
wilcoxon signed-rank: w = 3251340000 , p = 0.9994266304553264
mann-whitneyu: s = 90972200000 , p = 0.6391436223323287

rouge-1P > (0.67%) 
wilcoxon signed-rank: w = 3145140000 , p = 1.0
mann-whitneyu: s = 91379000000 , p = 0.9995881376175807

rouge-1R < (0.66%) 
wilcoxon signed-rank: w = 1362200000 , p = 1.0
mann-whitneyu: s = 90712100000 , p = 0.9966869535974676

rouge-2F < (2.18%) 
wilcoxon signed-rank: w = 313150000 , p = 1.0
mann-whitneyu: s = 90529800000 , p = 0.9999999909090111

rouge-2P < (2.08%) 
wilcoxon signed-rank: w = 293920000 , p = 1.0
mann-whitneyu: s = 90540700000 , p = 0.9999999821253541

rouge-2R < (2.32%) 
wilcoxon signed-rank: w = 218890000 , p = 1.0
mann-whitneyu: s = 90521800

mann-whitneyu: s = 89208300000 , p = 1.0

symb-rouge-lP > (0.18%) 
wilcoxon signed-rank: w = 7437490000 , p = 1.0
mann-whitneyu: s = 91262700000 , p = 0.9883956915715152

symb-rouge-lR < (2.59%) 
wilcoxon signed-rank: w = 4019410000 , p = 1.0
mann-whitneyu: s = 88453000000 , p = 1.0


