In [34]:
from rouge import Rouge
import glob
import os
import re
import pandas as pd

# Every sub-folder of the gold-summary directory is one topic. os.walk yields
# (root, dirnames, filenames) tuples top-down, so the first tuple's dirnames
# are exactly those topic folder names.
gold_summaries = os.walk(r'../data/raw/OpinosisDataset1.0_0/summaries-gold')
try:
    gold_list = next(gold_summaries)[1]
except StopIteration:
    # os.walk silently yields nothing for a missing or unreadable directory;
    # report that clearly instead of surfacing a bare StopIteration.
    raise FileNotFoundError(
        'Gold summary directory is missing or unreadable: '
        '../data/raw/OpinosisDataset1.0_0/summaries-gold '
        '(resolved relative to the current working directory)'
    ) from None

In [35]:
def rouge_evaluation(summaryfiles_dir, summaryDir):
    """Score generated summaries against every gold summary of the same topic.

    Parameters
    ----------
    summaryfiles_dir : str
        Glob pattern matching the generated summary files. The topic of a file
        is its file name up to the first '.'.
    summaryDir : str
        Path prefix (ending in a path separator) under which each topic has a
        folder of gold summaries.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(rouge_1, rouge_2, rouge_l)``. Each frame has one row per
        (generated summary, gold summary) pair, indexed by the topic name with
        underscores replaced by spaces. When nothing is scored the frames are
        empty with columns ``f``, ``p`` and ``r``.

    Notes
    -----
    Only topics listed in the module-level ``gold_list`` are scored.
    """
    metrics = ('rouge-1', 'rouge-2', 'rouge-l')
    records = {metric: [] for metric in metrics}
    labels = []
    scorer = None  # created on first use; one instance serves every pair

    # Sorted so the row order of the result is reproducible across machines.
    for hypothesis_path in sorted(glob.glob(summaryfiles_dir)):
        # Last path component, whichever separator the platform uses.
        name_match = re.search(r'[^\\/:*?"<>|\r\n]+$', hypothesis_path)
        if name_match is None:
            continue
        topic = name_match.group().split('.')[0]
        if topic not in gold_list:
            continue

        with open(hypothesis_path, 'r') as hypothesis_file:
            hypothesis = hypothesis_file.read()
        label = ' '.join(topic.split('_'))

        for reference_path in sorted(glob.glob(summaryDir + topic + '/*')):
            with open(reference_path, 'r') as reference_file:
                reference = reference_file.read()
            if scorer is None:
                scorer = Rouge()
            scores = scorer.get_scores(hypothesis, reference)[0]
            for metric in metrics:
                records[metric].append(scores[metric])
            labels.append(label)

    def build_frame(rows):
        # Build each frame once, after the loops, instead of re-concatenating
        # the growing list on every scored pair.
        if not rows:
            return pd.DataFrame(columns=['f', 'p', 'r'])
        return pd.DataFrame(rows, index=labels)

    return tuple(build_frame(records[metric]) for metric in metrics)

In [36]:
import itertools
def human_rouge(gold_dir='../data/raw/OpinosisDataset1.0_0/summaries-gold/', topics=None):
    """Score the gold summaries of each topic against one another.

    Every unordered pair of files inside a topic folder is scored once, with
    the first file of the pair as hypothesis and the second as reference.

    Parameters
    ----------
    gold_dir : str, optional
        Directory holding one folder of gold summaries per topic. The default
        is the project-relative location rather than a machine-specific
        absolute path.
    topics : iterable of str, optional
        Topic folder names to process. Defaults to the module-level
        ``gold_list``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(rouge_1, rouge_2, rouge_l)``. Each frame has one row per scored
        pair, indexed by the topic folder name. When nothing is scored the
        frames are empty with columns ``f``, ``p`` and ``r``.
    """
    if topics is None:
        topics = gold_list

    metrics = ('rouge-1', 'rouge-2', 'rouge-l')
    records = {metric: [] for metric in metrics}
    labels = []
    scorer = None  # created on first use; one instance serves every pair

    for topic in topics:
        # Sorted so the hypothesis/reference role of each file in a pair does
        # not depend on the order the file system happens to list them in.
        gold_paths = sorted(glob.glob(os.path.join(gold_dir, topic, '*')))
        for hypothesis_path, reference_path in itertools.combinations(gold_paths, 2):
            with open(hypothesis_path, 'r') as hypothesis_file:
                hypothesis = hypothesis_file.read()
            with open(reference_path, 'r') as reference_file:
                reference = reference_file.read()
            if scorer is None:
                scorer = Rouge()
            scores = scorer.get_scores(hypothesis, reference)[0]
            for metric in metrics:
                records[metric].append(scores[metric])
            labels.append(topic)

    def build_frame(rows):
        # Build each frame once, after the loops, instead of re-concatenating
        # the growing list on every scored pair.
        if not rows:
            return pd.DataFrame(columns=['f', 'p', 'r'])
        return pd.DataFrame(rows, index=labels)

    return tuple(build_frame(records[metric]) for metric in metrics)

In [37]:
# ROUGE scores for the summaries under data/processed/textrank
textrank_rouge_1, textrank_rouge_2, textrank_rouge_l = rouge_evaluation(
    summaryfiles_dir=r'../data/processed/textrank/*',
    summaryDir='../data/raw/OpinosisDataset1.0_0/summaries-gold/',
)

In [38]:
# ROUGE scores for the summaries under data/processed/lexrank
lexrank_rouge_1, lexrank_rouge_2, lexrank_rouge_l = rouge_evaluation(
    summaryfiles_dir=r'../data/processed/lexrank/*',
    summaryDir='../data/raw/OpinosisDataset1.0_0/summaries-gold/',
)

In [39]:
# ROUGE scores for the summaries under data/processed/opinosis
opinosis_rouge_1, opinosis_rouge_2, opinosis_rouge_l = rouge_evaluation(
    summaryfiles_dir=r'../data/processed/opinosis/*',
    summaryDir='../data/raw/OpinosisDataset1.0_0/summaries-gold/',
)

In [40]:
# Gold summaries scored against each other, topic by topic
(human_rouge_1,
 human_rouge_2,
 human_rouge_l) = human_rouge()

In [41]:
from scipy.stats import f_oneway

In [42]:
# One-way ANOVA across the four summary sources, one test per ROUGE-1 score
# column. The numeric columns are selected explicitly so this cell still works
# when re-run after the string 'algorithm' column has been added to the frames,
# and so the printed values are always in f, p, r order.
f, p = f_oneway(*(
    scores[['f', 'p', 'r']]
    for scores in (textrank_rouge_1, lexrank_rouge_1, opinosis_rouge_1, human_rouge_1)
))
print('One-way ANOVA')
print('=============')

print('F value:', f)
print('P value:', p, '\n')

One-way ANOVA
F value: [20.89525625 68.10986342 56.11411163]
P value: [3.47669329e-13 1.49822456e-40 7.37098388e-34] 



In [43]:
# One-way ANOVA across the four summary sources, one test per ROUGE-2 score
# column. The numeric columns are selected explicitly so this cell still works
# when re-run after the string 'algorithm' column has been added to the frames,
# and so the printed values are always in f, p, r order.
f, p = f_oneway(*(
    scores[['f', 'p', 'r']]
    for scores in (textrank_rouge_2, lexrank_rouge_2, opinosis_rouge_2, human_rouge_2)
))
print('One-way ANOVA')
print('=============')

print('F value:', f)
print('P value:', p, '\n')

One-way ANOVA
F value: [12.12490524 14.45845396 12.61935858]
P value: [8.14150842e-08 2.99571665e-09 4.04220755e-08] 



In [45]:
# One-way ANOVA across the four summary sources, one test per ROUGE-L score
# column. The numeric columns are selected explicitly so this cell still works
# when re-run after the string 'algorithm' column has been added to the frames,
# and so the printed values are always in f, p, r order.
f, p = f_oneway(*(
    scores[['f', 'p', 'r']]
    for scores in (textrank_rouge_l, lexrank_rouge_l, opinosis_rouge_l, human_rouge_l)
))
print('One-way ANOVA')
print('=============')

print('F value:', f)
print('P value:', p, '\n')

One-way ANOVA
F value: [39.00089264 65.8851276  56.55286179]
P value: [5.25066593e-24 2.53312482e-39 4.16580614e-34] 



In [46]:
# Tag every textrank score row with its source so the frames can be pooled
for _scores in (textrank_rouge_1, textrank_rouge_2, textrank_rouge_l):
    _scores['algorithm'] = 'textrank'

In [47]:
# Tag every lexrank score row with its source so the frames can be pooled
for _scores in (lexrank_rouge_1, lexrank_rouge_2, lexrank_rouge_l):
    _scores['algorithm'] = 'lexrank'

In [48]:
# Tag every opinosis score row with its source so the frames can be pooled
for _scores in (opinosis_rouge_1, opinosis_rouge_2, opinosis_rouge_l):
    _scores['algorithm'] = 'opinosis'

In [49]:
# Tag every human score row with its source so the frames can be pooled
for _scores in (human_rouge_1, human_rouge_2, human_rouge_l):
    _scores['algorithm'] = 'human'

In [53]:
# Pool the ROUGE-1 'f' scores of all four sources, keeping the source label
pd_rouge_1_f_measure = pd.concat(
    [scores[['algorithm', 'f']]
     for scores in (textrank_rouge_1, lexrank_rouge_1, opinosis_rouge_1, human_rouge_1)]
)

In [54]:
# Pool the ROUGE-1 'p' scores of all four sources, keeping the source label
pd_rouge_1_precision = pd.concat(
    [scores[['algorithm', 'p']]
     for scores in (textrank_rouge_1, lexrank_rouge_1, opinosis_rouge_1, human_rouge_1)]
)

In [55]:
# Pool the ROUGE-1 'r' scores of all four sources, keeping the source label
pd_rouge_1_recall = pd.concat(
    [scores[['algorithm', 'r']]
     for scores in (textrank_rouge_1, lexrank_rouge_1, opinosis_rouge_1, human_rouge_1)]
)

In [56]:
from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                         MultiComparison)

# Group the pooled ROUGE-1 'f' scores by summary source
MultiComp_rouge_1_f = MultiComparison(
    data=pd_rouge_1_f_measure['f'],
    groups=pd_rouge_1_f_measure['algorithm'],
)

# Tukey HSD: print the table of all pairwise comparisons between sources
print(MultiComp_rouge_1_f.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
 human   lexrank  -0.0291  -0.0591  0.0009 False 
 human   opinosis  -0.021   -0.051  0.009  False 
 human   textrank -0.0916  -0.1216 -0.0616  True 
lexrank  opinosis  0.0081  -0.0261  0.0424 False 
lexrank  textrank -0.0625  -0.0967 -0.0283  True 
opinosis textrank -0.0706  -0.1049 -0.0364  True 
-------------------------------------------------


In [57]:
# Group the pooled ROUGE-1 'p' scores by summary source
MultiComp_rouge_1_p = MultiComparison(
    data=pd_rouge_1_precision['p'],
    groups=pd_rouge_1_precision['algorithm'],
)

# Tukey HSD: print the table of all pairwise comparisons between sources
print(MultiComp_rouge_1_p.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
 human   lexrank   -0.02   -0.0569  0.0169 False 
 human   opinosis  0.0524   0.0155  0.0893  True 
 human   textrank -0.1674  -0.2043 -0.1304  True 
lexrank  opinosis  0.0724   0.0303  0.1145  True 
lexrank  textrank -0.1474  -0.1895 -0.1052  True 
opinosis textrank -0.2198  -0.2619 -0.1776  True 
-------------------------------------------------


In [58]:
# Group the pooled ROUGE-1 'r' scores by summary source
MultiComp_rouge_1_r = MultiComparison(
    data=pd_rouge_1_recall['r'],
    groups=pd_rouge_1_recall['algorithm'],
)

# Tukey HSD: print the table of all pairwise comparisons between sources
print(MultiComp_rouge_1_r.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
 human   lexrank  -0.0322  -0.0672  0.0027 False 
 human   opinosis -0.0632  -0.0981 -0.0282  True 
 human   textrank  0.1239   0.089   0.1589  True 
lexrank  opinosis -0.0309  -0.0708  0.0089 False 
lexrank  textrank  0.1562   0.1163  0.196   True 
opinosis textrank  0.1871   0.1472  0.227   True 
-------------------------------------------------


In [59]:
# Pool the ROUGE-2 scores of all four sources, one frame per score column,
# each keeping the source label alongside the score
pd_rouge_2_f_measure = pd.concat(
    [scores[['algorithm', 'f']]
     for scores in (textrank_rouge_2, lexrank_rouge_2, opinosis_rouge_2, human_rouge_2)]
)
pd_rouge_2_precision = pd.concat(
    [scores[['algorithm', 'p']]
     for scores in (textrank_rouge_2, lexrank_rouge_2, opinosis_rouge_2, human_rouge_2)]
)
pd_rouge_2_recall = pd.concat(
    [scores[['algorithm', 'r']]
     for scores in (textrank_rouge_2, lexrank_rouge_2, opinosis_rouge_2, human_rouge_2)]
)

In [60]:
# Group the pooled ROUGE-2 'f' scores by summary source
MultiComp_rouge_2_f = MultiComparison(
    data=pd_rouge_2_f_measure['f'],
    groups=pd_rouge_2_f_measure['algorithm'],
)

# Tukey HSD: print the table of all pairwise comparisons between sources
print(MultiComp_rouge_2_f.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
 human   lexrank  -0.0501  -0.0779 -0.0223  True 
 human   opinosis -0.0371  -0.0649 -0.0092  True 
 human   textrank -0.0553  -0.0831 -0.0275  True 
lexrank  opinosis  0.013   -0.0187  0.0447 False 
lexrank  textrank -0.0052  -0.0369  0.0265 False 
opinosis textrank -0.0182   -0.05   0.0135 False 
-------------------------------------------------


In [61]:
# Group the pooled ROUGE-2 'p' scores by summary source
MultiComp_rouge_2_p = MultiComparison(
    data=pd_rouge_2_precision['p'],
    groups=pd_rouge_2_precision['algorithm'],
)

# Tukey HSD: print the table of all pairwise comparisons between sources
print(MultiComp_rouge_2_p.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
 human   lexrank  -0.0479  -0.0789  -0.017  True 
 human   opinosis -0.0097  -0.0407  0.0212 False 
 human   textrank -0.0716  -0.1025 -0.0407  True 
lexrank  opinosis  0.0382   0.0029  0.0735  True 
lexrank  textrank -0.0237   -0.059  0.0116 False 
opinosis textrank -0.0619  -0.0972 -0.0266  True 
-------------------------------------------------


In [62]:
# Group the pooled ROUGE-2 'r' scores by summary source
MultiComp_rouge_2_r = MultiComparison(
    data=pd_rouge_2_recall['r'],
    groups=pd_rouge_2_recall['algorithm'],
)

# Tukey HSD: print the table of all pairwise comparisons between sources
print(MultiComp_rouge_2_r.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
 human   lexrank  -0.0544  -0.0853 -0.0235  True 
 human   opinosis -0.0526  -0.0835 -0.0217  True 
 human   textrank  0.0038  -0.0271  0.0347 False 
lexrank  opinosis  0.0018  -0.0335  0.0371 False 
lexrank  textrank  0.0583   0.023   0.0935  True 
opinosis textrank  0.0565   0.0212  0.0917  True 
-------------------------------------------------


In [63]:
# Pool the ROUGE-L scores of all four sources, one frame per score column,
# each keeping the source label alongside the score
pd_rouge_l_f_measure = pd.concat(
    [scores[['algorithm', 'f']]
     for scores in (textrank_rouge_l, lexrank_rouge_l, opinosis_rouge_l, human_rouge_l)]
)
pd_rouge_l_precision = pd.concat(
    [scores[['algorithm', 'p']]
     for scores in (textrank_rouge_l, lexrank_rouge_l, opinosis_rouge_l, human_rouge_l)]
)
pd_rouge_l_recall = pd.concat(
    [scores[['algorithm', 'r']]
     for scores in (textrank_rouge_l, lexrank_rouge_l, opinosis_rouge_l, human_rouge_l)]
)

In [64]:
# Group the pooled ROUGE-L 'f' scores by summary source
MultiComp_rouge_l_f = MultiComparison(
    data=pd_rouge_l_f_measure['f'],
    groups=pd_rouge_l_f_measure['algorithm'],
)

# Tukey HSD: print the table of all pairwise comparisons between sources
print(MultiComp_rouge_l_f.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
 human   lexrank  -0.0556  -0.0841 -0.0271  True 
 human   opinosis -0.0397  -0.0682 -0.0111  True 
 human   textrank -0.1189  -0.1474 -0.0903  True 
lexrank  opinosis  0.0159  -0.0166  0.0485 False 
lexrank  textrank -0.0633  -0.0958 -0.0307  True 
opinosis textrank -0.0792  -0.1118 -0.0467  True 
-------------------------------------------------


In [65]:
# Group the pooled ROUGE-L 'p' scores by summary source
MultiComp_rouge_l_p = MultiComparison(
    data=pd_rouge_l_precision['p'],
    groups=pd_rouge_l_precision['algorithm'],
)

# Tukey HSD: print the table of all pairwise comparisons between sources
print(MultiComp_rouge_l_p.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
 human   lexrank  -0.0488  -0.0845  -0.013  True 
 human   opinosis  0.0399   0.0042  0.0756  True 
 human   textrank -0.1642  -0.1999 -0.1285  True 
lexrank  opinosis  0.0887   0.0479  0.1294  True 
lexrank  textrank -0.1155  -0.1562 -0.0747  True 
opinosis textrank -0.2041  -0.2449 -0.1634  True 
-------------------------------------------------


In [66]:
# Group the pooled ROUGE-L 'r' scores by summary source
MultiComp_rouge_l_r = MultiComparison(
    data=pd_rouge_l_recall['r'],
    groups=pd_rouge_l_recall['algorithm'],
)

# Tukey HSD: print the table of all pairwise comparisons between sources
print(MultiComp_rouge_l_r.tukeyhsd().summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
 human   lexrank  -0.0595  -0.0934 -0.0257  True 
 human   opinosis -0.0704  -0.1043 -0.0366  True 
 human   textrank  0.1037   0.0698  0.1375  True 
lexrank  opinosis -0.0109  -0.0495  0.0277 False 
lexrank  textrank  0.1632   0.1246  0.2018  True 
opinosis textrank  0.1741   0.1355  0.2127  True 
-------------------------------------------------


In [67]:
# Summary statistics of the human ROUGE-1 scores, rounded to three decimals
human_rouge_1.describe().round(decimals=3)

Unnamed: 0,f,p,r
count,443.0,443.0,443.0
mean,0.255,0.275,0.275
std,0.187,0.206,0.204
min,0.0,0.0,0.0
25%,0.148,0.154,0.143
50%,0.222,0.222,0.235
75%,0.303,0.333,0.333
max,1.0,1.0,1.0


In [68]:
# Summary statistics of the human ROUGE-2 scores, rounded to three decimals
human_rouge_2.describe().round(decimals=3)

Unnamed: 0,f,p,r
count,443.0,443.0,443.0
mean,0.088,0.092,0.095
std,0.199,0.202,0.212
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.087,0.1,0.091
max,1.0,1.0,1.0


In [69]:
# Summary statistics of the human ROUGE-L scores, rounded to three decimals
human_rouge_l.describe().round(decimals=3)

Unnamed: 0,f,p,r
count,443.0,443.0,443.0
mean,0.224,0.263,0.263
std,0.187,0.204,0.204
min,0.0,0.0,0.0
25%,0.122,0.143,0.143
50%,0.186,0.208,0.217
75%,0.268,0.316,0.333
max,1.0,1.0,1.0
