In [77]:
from rouge import Rouge
import glob
import os
import re
import pandas as pd

#goldSummaries = glob.glob(r'..data/raw/OpinosisDataset1.0_0/summaries-gold/*/')
gold_summaries = os.walk(r'../data/raw/OpinosisDataset1.0_0/summaries-gold')
gold_list = next(gold_summaries)[1]

In [78]:
def rouge_evaluation(summaryfiles_dir, summaryDir):
    summaryFiles = glob.glob(summaryfiles_dir)
    rouge_1_scores_list = []
    rouge_2_scores_list = []
    rouge_l_scores_list = []
    for h in summaryFiles:
        foldername_search = re.search(r'[^\\/:*?"<>|\r\n]+$', h)
        foldername = (foldername_search.group()).split('.')[0]
        with open(h, 'r') as f:
            hypothesis = f.read()
            if foldername in gold_list:
                files = glob.glob(summaryDir+foldername+'/*')
                foldername = ' '.join(foldername.split('_'))
                for r in files:
                    with open(r, 'r') as f:
                        reference = f.read()
                        rouge = Rouge()
                        scores = rouge.get_scores(hypothesis, reference)[0]
                        rouge_1_scores_list.append(pd.DataFrame(scores['rouge-1'], index=[foldername]))
                        rouge_2_scores_list.append(pd.DataFrame(scores['rouge-2'], index=[foldername]))
                        rouge_l_scores_list.append(pd.DataFrame(scores['rouge-l'], index=[foldername]))
                        rouge_1_df = pd.concat(rouge_1_scores_list)
                        rouge_2_df = pd.concat(rouge_2_scores_list)
                        rouge_l_df = pd.concat(rouge_l_scores_list)
    return rouge_1_df, rouge_2_df, rouge_l_df

In [79]:
textrank_rouge_1, textrank_rouge_2, textrank_rouge_l = rouge_evaluation(r'../data/processed/textrank/*', '../data/raw/OpinosisDataset1.0_0/summaries-gold/')

In [80]:
lexrank_rouge_1, lexrank_rouge_2, lexrank_rouge_l = rouge_evaluation(r'../data/processed/lexrank/*', '../data/raw/OpinosisDataset1.0_0/summaries-gold/')

In [81]:
opinosis_rouge_1, opinosis_rouge_2, opinosis_rouge_l = rouge_evaluation(r'../data/processed/opinosis/*', '../data/raw/OpinosisDataset1.0_0/summaries-gold/')

In [86]:
from scipy.stats import f_oneway

In [87]:
f, p = f_oneway(textrank_rouge_1, lexrank_rouge_1, opinosis_rouge_l)
print ('One-way ANOVA')
print ('=============')
 
print ('F value:', f)
print ('P value:', p, '\n')

One-way ANOVA
F value: [ 19.05129924  97.56197074 138.40071076]
P value: [8.71384132e-09 3.62617978e-38 1.71626213e-51] 



In [88]:
f, p = f_oneway(lexrank_rouge_2, lexrank_rouge_2, opinosis_rouge_2)
print ('One-way ANOVA')
print ('=============')
 
print ('F value:', f)
print ('P value:', p, '\n')

One-way ANOVA
F value: [3.30353605 9.80952928 0.43931625]
P value: [3.73179369e-02 6.27326588e-05 6.44651755e-01] 



In [89]:
f, p = f_oneway(lexrank_rouge_l, lexrank_rouge_l, opinosis_rouge_l)
print ('One-way ANOVA')
print ('=============')
 
print ('F value:', f)
print ('P value:', p, '\n')

One-way ANOVA
F value: [ 2.66195026 23.52298376  0.53429511]
P value: [7.05076886e-02 1.28197023e-10 5.86317397e-01] 



In [132]:
textrank_rouge_1['algorithm'] = 'textrank'
textrank_rouge_2['algorithm'] = 'textrank'
textrank_rouge_l['algorithm'] = 'textrank'

In [133]:
lexrank_rouge_1['algorithm'] = 'lexrank'
lexrank_rouge_2['algorithm'] = 'lexrank'
lexrank_rouge_l['algorithm'] = 'lexrank'

In [149]:
opinosis_rouge_1['algorithm'] = 'opinosis'
opinosis_rouge_2['algorithm'] = 'opinosis'
opinosis_rouge_l['algorithm'] = 'opinosis'

In [150]:
pd_rouge_1_f_measure = pd.concat([textrank_rouge_1[['algorithm', 'f']],
                                  lexrank_rouge_1[['algorithm', 'f']],
                                  opinosis_rouge_1[['algorithm', 'f']]])

In [151]:
pd_rouge_1_precision = pd.concat([textrank_rouge_1[['algorithm', 'p']],
                                  lexrank_rouge_1[['algorithm', 'p']],
                                  opinosis_rouge_1[['algorithm', 'p']]])

In [152]:
pd_rouge_1_recall = pd.concat([textrank_rouge_1[['algorithm', 'r']],
                                  lexrank_rouge_1[['algorithm', 'r']],
                                  opinosis_rouge_1[['algorithm', 'r']]])

In [153]:
from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                         MultiComparison)

# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_1_f = MultiComparison(pd_rouge_1_f_measure['f'], pd_rouge_1_f_measure['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_1_f.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0117  -0.0125  0.0359 False 
lexrank  textrank -0.0589  -0.0831 -0.0348  True 
opinosis textrank -0.0706  -0.0948 -0.0465  True 
-------------------------------------------------


In [154]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_1_p = MultiComparison(pd_rouge_1_precision['p'], pd_rouge_1_precision['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons
print(MultiComp_rouge_1_p.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0766   0.0421  0.1111  True 
lexrank  textrank -0.1431  -0.1776 -0.1087  True 
opinosis textrank -0.2198  -0.2543 -0.1853  True 
-------------------------------------------------


In [155]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_1_r = MultiComparison(pd_rouge_1_recall['r'], pd_rouge_1_recall['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons
print(MultiComp_rouge_1_r.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower  upper  reject
------------------------------------------------
lexrank  opinosis -0.0299  -0.0608 0.001  False 
lexrank  textrank  0.1572   0.1263 0.1881  True 
opinosis textrank  0.1871   0.1562 0.218   True 
------------------------------------------------


In [156]:
pd_rouge_2_f_measure = pd.concat([textrank_rouge_2[['algorithm', 'f']],
                                  lexrank_rouge_2[['algorithm', 'f']],
                                  opinosis_rouge_2[['algorithm', 'f']]])
pd_rouge_2_precision = pd.concat([textrank_rouge_2[['algorithm', 'p']],
                                  lexrank_rouge_2[['algorithm', 'p']],
                                  opinosis_rouge_2[['algorithm', 'p']]])
pd_rouge_2_recall = pd.concat([textrank_rouge_2[['algorithm', 'r']],
                                  lexrank_rouge_2[['algorithm', 'r']],
                                  opinosis_rouge_2[['algorithm', 'r']]])

In [157]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_2_f = MultiComparison(pd_rouge_2_f_measure['f'], pd_rouge_2_f_measure['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_2_f.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0153   0.0006  0.0299  True 
lexrank  textrank  -0.003  -0.0176  0.0117 False 
opinosis textrank -0.0182  -0.0329 -0.0036  True 
-------------------------------------------------


In [158]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_2_p = MultiComparison(pd_rouge_2_precision['p'], pd_rouge_2_precision['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_2_p.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0401   0.0177  0.0625  True 
lexrank  textrank -0.0218  -0.0442  0.0006 False 
opinosis textrank -0.0619  -0.0843 -0.0395  True 
-------------------------------------------------


In [159]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_2_r = MultiComparison(pd_rouge_2_recall['r'], pd_rouge_2_recall['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_2_r.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower  upper  reject
------------------------------------------------
lexrank  opinosis  0.0053  -0.0142 0.0249 False 
lexrank  textrank  0.0618   0.0422 0.0814  True 
opinosis textrank  0.0565   0.0369 0.076   True 
------------------------------------------------


In [160]:
pd_rouge_l_f_measure = pd.concat([textrank_rouge_l[['algorithm', 'f']],
                                  lexrank_rouge_l[['algorithm', 'f']],
                                  opinosis_rouge_l[['algorithm', 'f']]])
pd_rouge_l_precision = pd.concat([textrank_rouge_l[['algorithm', 'p']],
                                  lexrank_rouge_l[['algorithm', 'p']],
                                  opinosis_rouge_l[['algorithm', 'p']]])
pd_rouge_l_recall = pd.concat([textrank_rouge_l[['algorithm', 'r']],
                                  lexrank_rouge_l[['algorithm', 'r']],
                                  opinosis_rouge_l[['algorithm', 'r']]])

In [162]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_l_f = MultiComparison(pd_rouge_l_f_measure['f'], pd_rouge_l_f_measure['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_l_f.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0196  -0.0009  0.0402 False 
lexrank  textrank -0.0596  -0.0801  -0.039  True 
opinosis textrank -0.0792  -0.0998 -0.0587  True 
-------------------------------------------------


In [163]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_l_p = MultiComparison(pd_rouge_l_precision['p'], pd_rouge_l_precision['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_l_p.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0926   0.0601  0.1251  True 
lexrank  textrank -0.1116  -0.1441 -0.0791  True 
opinosis textrank -0.2041  -0.2366 -0.1716  True 
-------------------------------------------------


In [164]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_l_r = MultiComparison(pd_rouge_l_recall['r'], pd_rouge_l_recall['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_l_r.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower  upper  reject
------------------------------------------------
lexrank  opinosis  -0.01   -0.0386 0.0187 False 
lexrank  textrank  0.1641   0.1354 0.1928  True 
opinosis textrank  0.1741   0.1454 0.2028  True 
------------------------------------------------
