In [107]:
from rouge import Rouge
import glob
import os
import re
import pandas as pd

#goldSummaries = glob.glob(r'..data/raw/OpinosisDataset1.0_0/summaries-gold/*/')
gold_summaries = os.walk(r'../data/raw/OpinosisDataset1.0_0/summaries-gold')
gold_list = next(gold_summaries)[1]

In [108]:
def rouge_evaluation(summaryfiles_dir, summaryDir):
    summaryFiles = glob.glob(summaryfiles_dir)
    rouge_1_scores_list = []
    rouge_2_scores_list = []
    rouge_l_scores_list = []
    for h in summaryFiles:
        foldername_search = re.search(r'[^\\/:*?"<>|\r\n]+$', h)
        foldername = (foldername_search.group()).split('.')[0]
        with open(h, 'r') as f:
            hypothesis = f.read()
            if foldername in gold_list:
                files = glob.glob(summaryDir+foldername+'/*')
                foldername = ' '.join(foldername.split('_'))
                for r in files:
                    with open(r, 'r') as f:
                        reference = f.read()
                        rouge = Rouge()
                        scores = rouge.get_scores(hypothesis, reference)[0]
                        rouge_1_scores_list.append(pd.DataFrame(scores['rouge-1'], index=[foldername]))
                        rouge_2_scores_list.append(pd.DataFrame(scores['rouge-2'], index=[foldername]))
                        rouge_l_scores_list.append(pd.DataFrame(scores['rouge-l'], index=[foldername]))
                        rouge_1_df = pd.concat(rouge_1_scores_list)
                        rouge_2_df = pd.concat(rouge_2_scores_list)
                        rouge_l_df = pd.concat(rouge_l_scores_list)
    return rouge_1_df, rouge_2_df, rouge_l_df

In [109]:
import itertools
def human_rouge():
    rouge_1_scores_list = []
    rouge_2_scores_list = []
    rouge_l_scores_list = []
    for folder in gold_list:
        allFiles = glob.glob('C:/Users/obiam/MSc_Project/msc_project/data/raw/OpinosisDataset1.0_0/summaries-gold/'+folder + '/*')
        for h, r in itertools.combinations(allFiles, 2):
            with open(h, 'r') as f:
                hypothesis = f.read()
            with open(r, 'r') as f:
                reference = f.read()
                rouge = Rouge()
                scores = rouge.get_scores(hypothesis, reference)[0]
                rouge_1_scores_list.append(pd.DataFrame(scores['rouge-1'], index=[folder]))
                rouge_2_scores_list.append(pd.DataFrame(scores['rouge-2'], index=[folder]))
                rouge_l_scores_list.append(pd.DataFrame(scores['rouge-l'], index=[folder]))
                rouge_1_df = pd.concat(rouge_1_scores_list)
                rouge_2_df = pd.concat(rouge_2_scores_list)
                rouge_l_df = pd.concat(rouge_l_scores_list)
    return(rouge_1_df, rouge_2_df, rouge_l_df)

In [110]:
textrank_rouge_1, textrank_rouge_2, textrank_rouge_l = rouge_evaluation(r'../data/processed/textrank/*', '../data/raw/OpinosisDataset1.0_0/summaries-gold/')

In [111]:
lexrank_rouge_1, lexrank_rouge_2, lexrank_rouge_l = rouge_evaluation(r'../data/processed/lexrank/*', '../data/raw/OpinosisDataset1.0_0/summaries-gold/')

In [112]:
opinosis_rouge_1, opinosis_rouge_2, opinosis_rouge_l = rouge_evaluation(r'../data/processed/opinosis/*', '../data/raw/OpinosisDataset1.0_0/summaries-gold/')

In [113]:
human_rouge_1, human_rouge_2, human_rouge_l = human_rouge()

In [114]:
from scipy.stats import f_oneway

In [117]:
f, p = f_oneway(textrank_rouge_1, lexrank_rouge_1, opinosis_rouge_1)
print ('One-way ANOVA')
print ('=============')
 
print ('F value:', f)
print ('P value:', p, '\n')

One-way ANOVA
F value: [ 28.43531232 117.56735763 116.64452713]
P value: [1.31746767e-12 7.73320398e-45 1.54821528e-44] 



In [118]:
f, p = f_oneway(lexrank_rouge_2, lexrank_rouge_2, opinosis_rouge_2)
print ('One-way ANOVA')
print ('=============')
 
print ('F value:', f)
print ('P value:', p, '\n')

One-way ANOVA
F value: [2.38162135 8.90234002 0.04781098]
P value: [9.31374301e-02 1.51837217e-04 9.53317028e-01] 



In [119]:
f, p = f_oneway(lexrank_rouge_l, lexrank_rouge_l, opinosis_rouge_l)
print ('One-way ANOVA')
print ('=============')
 
print ('F value:', f)
print ('P value:', p, '\n')

One-way ANOVA
F value: [ 1.76808725 21.8449424   0.63825938]
P value: [1.71408624e-01 6.20770332e-10 5.28513406e-01] 



In [120]:
textrank_rouge_1['algorithm'] = 'textrank'
textrank_rouge_2['algorithm'] = 'textrank'
textrank_rouge_l['algorithm'] = 'textrank'

In [121]:
lexrank_rouge_1['algorithm'] = 'lexrank'
lexrank_rouge_2['algorithm'] = 'lexrank'
lexrank_rouge_l['algorithm'] = 'lexrank'

In [122]:
opinosis_rouge_1['algorithm'] = 'opinosis'
opinosis_rouge_2['algorithm'] = 'opinosis'
opinosis_rouge_l['algorithm'] = 'opinosis'

In [123]:
human_rouge_1['algorithm'] = 'opinosis'
human_rouge_2['algorithm'] = 'opinosis'
human_rouge_l['algorithm'] = 'opinosis'

In [124]:
pd_rouge_1_f_measure = pd.concat([textrank_rouge_1[['algorithm', 'f']],
                                  lexrank_rouge_1[['algorithm', 'f']],
                                  opinosis_rouge_1[['algorithm', 'f']],
                                 human_rouge_1[['algorithm', 'f']]])

In [125]:
pd_rouge_1_precision = pd.concat([textrank_rouge_1[['algorithm', 'p']],
                                  lexrank_rouge_1[['algorithm', 'p']],
                                  opinosis_rouge_1[['algorithm', 'p']],
                                 human_rouge_1[['algorithm', 'p']]])

In [126]:
pd_rouge_1_recall = pd.concat([textrank_rouge_1[['algorithm', 'r']],
                                  lexrank_rouge_1[['algorithm', 'r']],
                                  opinosis_rouge_1[['algorithm', 'r']],
                              human_rouge_1[['algorithm', 'r']]])

In [127]:
from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                         MultiComparison)

# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_1_f = MultiComparison(pd_rouge_1_f_measure['f'], pd_rouge_1_f_measure['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_1_f.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0218  -0.0039  0.0474 False 
lexrank  textrank -0.0625  -0.0938 -0.0313  True 
opinosis textrank -0.0843  -0.1099 -0.0586  True 
-------------------------------------------------


In [128]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_1_p = MultiComparison(pd_rouge_1_precision['p'], pd_rouge_1_precision['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons
print(MultiComp_rouge_1_p.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0383   0.0066  0.0701  True 
lexrank  textrank -0.1474   -0.186 -0.1087  True 
opinosis textrank -0.1857  -0.2174 -0.1539  True 
-------------------------------------------------


In [129]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_1_r = MultiComparison(pd_rouge_1_recall['r'], pd_rouge_1_recall['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons
print(MultiComp_rouge_1_r.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff lower  upper  reject
-----------------------------------------------
lexrank  opinosis  0.0102  -0.02  0.0403 False 
lexrank  textrank  0.1562  0.1195 0.1929  True 
opinosis textrank  0.146   0.1159 0.1761  True 
-----------------------------------------------


In [156]:
pd_rouge_2_f_measure = pd.concat([textrank_rouge_2[['algorithm', 'f']],
                                  lexrank_rouge_2[['algorithm', 'f']],
                                  opinosis_rouge_2[['algorithm', 'f']]])
pd_rouge_2_precision = pd.concat([textrank_rouge_2[['algorithm', 'p']],
                                  lexrank_rouge_2[['algorithm', 'p']],
                                  opinosis_rouge_2[['algorithm', 'p']]])
pd_rouge_2_recall = pd.concat([textrank_rouge_2[['algorithm', 'r']],
                                  lexrank_rouge_2[['algorithm', 'r']],
                                  opinosis_rouge_2[['algorithm', 'r']]])

In [157]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_2_f = MultiComparison(pd_rouge_2_f_measure['f'], pd_rouge_2_f_measure['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_2_f.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0153   0.0006  0.0299  True 
lexrank  textrank  -0.003  -0.0176  0.0117 False 
opinosis textrank -0.0182  -0.0329 -0.0036  True 
-------------------------------------------------


In [158]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_2_p = MultiComparison(pd_rouge_2_precision['p'], pd_rouge_2_precision['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_2_p.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0401   0.0177  0.0625  True 
lexrank  textrank -0.0218  -0.0442  0.0006 False 
opinosis textrank -0.0619  -0.0843 -0.0395  True 
-------------------------------------------------


In [159]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_2_r = MultiComparison(pd_rouge_2_recall['r'], pd_rouge_2_recall['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_2_r.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower  upper  reject
------------------------------------------------
lexrank  opinosis  0.0053  -0.0142 0.0249 False 
lexrank  textrank  0.0618   0.0422 0.0814  True 
opinosis textrank  0.0565   0.0369 0.076   True 
------------------------------------------------


In [160]:
pd_rouge_l_f_measure = pd.concat([textrank_rouge_l[['algorithm', 'f']],
                                  lexrank_rouge_l[['algorithm', 'f']],
                                  opinosis_rouge_l[['algorithm', 'f']]])
pd_rouge_l_precision = pd.concat([textrank_rouge_l[['algorithm', 'p']],
                                  lexrank_rouge_l[['algorithm', 'p']],
                                  opinosis_rouge_l[['algorithm', 'p']]])
pd_rouge_l_recall = pd.concat([textrank_rouge_l[['algorithm', 'r']],
                                  lexrank_rouge_l[['algorithm', 'r']],
                                  opinosis_rouge_l[['algorithm', 'r']]])

In [162]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_l_f = MultiComparison(pd_rouge_l_f_measure['f'], pd_rouge_l_f_measure['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_l_f.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0196  -0.0009  0.0402 False 
lexrank  textrank -0.0596  -0.0801  -0.039  True 
opinosis textrank -0.0792  -0.0998 -0.0587  True 
-------------------------------------------------


In [163]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_l_p = MultiComparison(pd_rouge_l_precision['p'], pd_rouge_l_precision['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_l_p.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
lexrank  opinosis  0.0926   0.0601  0.1251  True 
lexrank  textrank -0.1116  -0.1441 -0.0791  True 
opinosis textrank -0.2041  -0.2366 -0.1716  True 
-------------------------------------------------


In [164]:
# Set up the data for comparison (creates a specialised object)
MultiComp_rouge_l_r = MultiComparison(pd_rouge_l_recall['r'], pd_rouge_l_recall['algorithm'])

# Show all pair-wise comparisons:

# Print the comparisons

print(MultiComp_rouge_l_r.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower  upper  reject
------------------------------------------------
lexrank  opinosis  -0.01   -0.0386 0.0187 False 
lexrank  textrank  0.1641   0.1354 0.1928  True 
opinosis textrank  0.1741   0.1454 0.2028  True 
------------------------------------------------


In [106]:
human_rouge_1.describe()

Unnamed: 0,f,p,r
count,443.0,443.0,443.0
mean,0.255321,0.275413,0.2749
std,0.186686,0.206015,0.203856
min,0.0,0.0,0.0
25%,0.148148,0.153846,0.142857
50%,0.222222,0.222222,0.235294
75%,0.30303,0.333333,0.333333
max,1.0,1.0,1.0


In [105]:
human_rouge_2.describe()

Unnamed: 0,f,p,r
count,443.0,443.0,443.0
mean,0.087532,0.091925,0.095324
std,0.199262,0.202469,0.211711
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.086957,0.1,0.090909
max,1.0,1.0,1.0


In [104]:
human_rouge_l.describe()

Unnamed: 0,f,p,r
count,443.0,443.0,443.0
mean,0.224163,0.263188,0.263414
std,0.18738,0.203533,0.203813
min,0.0,0.0,0.0
25%,0.122363,0.142857,0.142857
50%,0.185714,0.208333,0.217391
75%,0.267673,0.315789,0.333333
max,1.0,1.0,1.0
