In [None]:
# Import used modules

import itertools
import json
import os
import re

import pandas as pd
from rouge import Rouge
from tqdm import tqdm_notebook as tqdm

In [None]:
# Configure filepath

# Remember the directory the kernel started in, so that re-running this cell
# does not apply the relative "../model_result/" hop a second time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# The multi-file cell opens its JSON files by bare filename, i.e. relative
# to this directory.
os.chdir(os.path.join(NOTEBOOK_DIR, "../model_result/"))

# Create Evaluation Model

In [None]:
# Init ROUGE-N model

# The metric names double as the score columns of the result tables built
# further down in this notebook.
lst_metrics = [
    'rouge-1',
    'rouge-2',
    'rouge-3',
]
rouge = Rouge(metrics=lst_metrics)

In [None]:
def clean_token(document):
    """Flatten a tokenized document and keep its lowercased word tokens.

    A token is kept when ``re.match('[a-zA-Z]+', token)`` succeeds, i.e. when
    it starts with at least one ASCII letter. Tokens that start with a digit,
    punctuation or any other character (and empty tokens) are dropped.

    NOTE(review): ``re.match`` only anchors at the start of the token, so a
    token such as ``abc123`` is kept. Confirm whether only purely alphabetic
    tokens were intended.

    Parameters
    ----------
    document: two dimensional list
        list of sentences, each sentence being a list of string tokens

    Returns
    -------
    list
        flat list of the kept tokens, lowercased, in document order
    """

    return [
        word.lower()
        for sentence in document
        for word in sentence
        if re.match('[a-zA-Z]+', word)
    ]

In [None]:
def convert(lst_token):
    """Join a list of tokens into one space-separated string

    Parameters
    ----------
    lst_token: list
        list of string tokens

    Returns
    -------
    string
        the tokens in their original order, separated by single spaces;
        an empty list gives an empty string
    """

    separator = " "
    return separator.join(lst_token)

In [None]:
def score(hypothesis, reference):
    """Score a generated summary against its gold standard with ROUGE-N

    Both summaries are passed through ``clean_token`` and ``convert`` so
    that the scorer receives two plain, lowercased, space-separated strings.

    Parameters
    ----------
    hypothesis: two dimensional list
        summary generated by the model, as a list of token lists
    reference: two dimensional list
        gold standard summary, as a list of token lists

    Returns
    -------
    object
        whatever ``rouge.get_scores`` returns for the two strings; the
        callers in this notebook read it as ``result[0][metric]['r']``
    """

    hypo_text = convert(clean_token(hypothesis))
    ref_text = convert(clean_token(reference))
    return rouge.get_scores(hypo_text, ref_text)

# Run Evaluation

## One File

In [None]:
# Calculate the ROUGE-N score for every entry of one file

filename = "" # Fill in the path of the result file that should be checked

# Use a context manager so the file handle is closed once the JSON is parsed
with open(filename) as json_file:
    data = json.load(json_file)

rows = []

for key, entry in data.items():
    try:
        # NOTE: 'hypotesis' is the key name as read from the JSON data;
        # the spelling must stay in sync with the result files.
        res = score(entry['hypotesis'], entry['reference'])
    except Exception:
        # Entries that cannot be scored are skipped; the next cell reports
        # how many. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt / SystemExit stop the cell.
        continue
    # Keep only the recall ('r') value of each metric
    rows.append([key] + [res[0][metric]['r'] for metric in lst_metrics])

df = pd.DataFrame(rows, columns=["id"] + lst_metrics)

df

In [None]:
# Report how many entries were skipped because they could not be scored

nrow = len(data) - len(df)

print("Number of mistaken data: {}".format(nrow))

In [None]:
# Display the final result: the mean of each metric over all scored entries

fin_res = df[lst_metrics].mean()
fin_res

## Multiple Files

In [None]:
# List every dataset/method whose result files should be checked.
# The next cell scores one "<dataset>-<topic>-<embedding>-<similarity>-<method>.json"
# file per combination of these values.

lst_dataset = [
    "IndoSUM",
    "Liputan6",
]
lst_topic_modelling = [
    "LDA",
    "LSA",
    "NMF",
]
lst_embedding = [
    "Word2Vec",
    "FastText",
    "TF-IDF",
    "BoW",
    "BERT",
]
lst_similarity = [
    "Cosine",
    "Euclidean",
    "Jaccard",
]
lst_method = [
    "Individual",
    "Combined",
]

In [None]:
# Compute the ROUGE-N score

# One result file is expected per combination. product() yields the
# combinations in the same order as nested loops would, with the last list
# varying fastest.
lst_combination = list(itertools.product(
    lst_dataset, lst_topic_modelling, lst_embedding, lst_similarity, lst_method
))
counter = len(lst_combination)

lst_res = []

# The context manager closes the progress bar even if a file fails to load
with tqdm(total=counter) as pbar:
    for dataset, topic, embedding, similarity, method in lst_combination:
        filename = "{}-{}-{}-{}-{}.json".format(dataset, topic, embedding, similarity, method)

        # Close the file handle as soon as the JSON has been parsed
        with open(filename) as json_file:
            data = json.load(json_file)

        rows = []
        for key, entry in data.items():
            try:
                # NOTE: 'hypotesis' is the key name as read from the JSON data
                res = score(entry['hypotesis'], entry['reference'])
            except Exception:
                # Skip entries that cannot be scored, without also swallowing
                # KeyboardInterrupt / SystemExit as a bare `except:` would.
                continue
            # Keep only the recall ('r') value of each metric
            rows.append([key] + [res[0][metric]['r'] for metric in lst_metrics])

        # Per-file result: the mean of each metric over the scored entries
        df = pd.DataFrame(rows, columns=["id"] + lst_metrics)
        fin_res = df.loc[:, lst_metrics].mean(axis=0)
        lst_res.append([filename] + [fin_res[metric] for metric in lst_metrics])
        pbar.update(1)

In [None]:
# Display the result: one row per checked file, one column per metric

res_df = pd.DataFrame(lst_res, columns=["Filename", *lst_metrics])
res_df