In [None]:
# Import used modules and configure it

import json
import os
import sys
import warnings

# Add the parent directory to the module search path; presumably this is what
# lets the `extractive_text_summarizer` package be imported in the next cell
# (assumes the notebook is run from a sub-directory of the repository).
sys.path.append('../')
# NOTE(review): this silences *every* warning for the whole session, including
# deprecation warnings from the libraries in use.
warnings.filterwarnings("ignore")

In [None]:
# Import other dependencies

from extractive_text_summarizer.summarizer import Summarizer
from tqdm import tqdm_notebook as tqdm

# Read Dataset

In [None]:
def print_json(json_object):
    """Pretty-print a JSON-serialisable object to standard output.

    Parameters
    ----------
    json_object: json object
        Object to serialise and display

    Returns
    -------
    None
    """

    # Two-space indentation keeps nested structures readable in the notebook.
    print(json.dumps(json_object, indent=2))

In [None]:
def open_jsonl(path):
    """Read a JSON Lines file and return its records as raw strings.

    The file is always decoded as UTF-8 so the result does not depend on the
    platform's default encoding. Lines that contain only whitespace (for
    example a trailing empty line) are skipped, because they are not valid
    JSON records and would make a later ``json.loads`` call fail.

    Parameters
    ----------
    path: string
        jsonl filepath

    Returns
    -------
    list
        list of string formatted json, one entry per non-blank line
        (each string keeps its trailing newline, if any)
    """

    with open(path, 'r', encoding='utf-8') as json_file:
        return [line for line in json_file if line.strip()]

In [None]:
def convert_to_json(data):
    """Parse one JSON document given as text.

    Parameters
    ----------
    data: string
        string formatted json

    Returns
    -------
    json
        the decoded Python object (dict, list, ...)
    """

    parsed = json.loads(data)
    return parsed

In [None]:
def create_indosum_comparasion(lst_par, lst_sum):
    """Flatten a document into sentences and collect its extractive summary

    Parameters
    ----------
    lst_par: three dimensional list
        each index on the first list represent a paragraph in a document
        each index on the second list represent a sentence in a paragraph
    lst_sum: two dimensional list
        same paragraph/sentence layout as `lst_par`; each inner value is a
        boolean telling whether that sentence is included in the extractive
        summary or not

    Returns
    -------
    tuple of two lists
        (all sentences of the document in reading order,
         the sentences whose flag in `lst_sum` is truthy)

    Raises
    ------
    IndexError
        if `lst_sum` refers to a paragraph or sentence that `lst_par` lacks
    """

    # Every sentence of every paragraph, in document order.
    lst_sentence = [sent for par in lst_par for sent in par]

    # Walk the flags and pick the sentence at the same (paragraph, sentence)
    # position; indexing (rather than zip) keeps a shape mismatch loud.
    sum_res = []
    for par_idx, flags in enumerate(lst_sum):
        for sent_idx, is_selected in enumerate(flags):
            if is_selected:
                sum_res.append(lst_par[par_idx][sent_idx])
    return lst_sentence, sum_res

In [None]:
def get_indosum_data(partition, fold, index, dataset_dir="../dataset/IndoSUM/indosum"):
    """Open one indosum document and return the data needed to evaluate it

    Parameters
    ----------
    partition: string
        partition that want to be opened (train/test/dev)
    fold: int
        fold that want to be opened in a partition
    index: int
        position of the document inside the opened fold file
    dataset_dir: string, optional
        directory that holds the ``<partition>.<fold>.jsonl`` files

    Returns
    -------
    tuple
        (list of sentences, list of extractive summary sentences,
         gold standard summary taken from the "summary" field)
    """

    # Fold numbers are zero-padded to two digits in the file name (1 -> "01").
    path = "{}/{}.{}.jsonl".format(dataset_dir, partition, str(fold).zfill(2))
    lst_json = open_jsonl(path)
    json_obj = convert_to_json(lst_json[index])
    lst_par = json_obj["paragraphs"]
    lst_sum = json_obj["gold_labels"]
    gold_label = json_obj["summary"]
    lst_sen, lst_ext = create_indosum_comparasion(lst_par, lst_sum)
    return lst_sen, lst_ext, gold_label

In [None]:
def get_all_indosum_data(lst_fold, lst_partition, dataset_dir="../dataset/IndoSUM/indosum"):
    """Open all json file in all fold in 'lst_fold' and all partition in 'lst_partition'

    Parameters
    ----------
    lst_fold: list
        list of fold that want to be opened
    lst_partition: list
        list of partition that want to be opened
    dataset_dir: string, optional
        directory that holds the ``<partition>.<fold>.jsonl`` files

    Returns
    -------
    dict
        nested mapping ``{fold: {partition: [json, ...]}}``
    int
        count of opened json documents over all folds and partitions
    """

    counter = 0
    res = {}
    for fold in lst_fold:
        by_partition = {}
        for partition in lst_partition:
            # Fold numbers are zero-padded to two digits in the file name.
            path = "{}/{}.{}.jsonl".format(dataset_dir, partition, str(fold).zfill(2))
            documents = [convert_to_json(line) for line in open_jsonl(path)]
            by_partition[partition] = documents
            counter += len(documents)
        res[fold] = by_partition
    return res, counter

# Model Playground

## Helper Function

In [None]:
def create_sentences(lst_word):
    """Convert list of word into a sentence

    Tokens are concatenated in order. A space is inserted between two
    neighbouring tokens only when both are longer than one character, so
    single-character tokens (typically punctuation) stick to their
    neighbours. The final token is treated as the sentence terminator and
    replaced by "." when it contains no letter or digit; otherwise it is a
    real word and is kept, so it is no longer silently lost.

    Parameters
    ----------
    lst_word: list
        list of word (strings)

    Returns
    -------
    string
        a sentence ending with "."; an empty input yields "."
    """

    pieces = []
    # Pair every token with its successor to decide on the separating space.
    for current, following in zip(lst_word, lst_word[1:]):
        pieces.append(current)
        if len(current) > 1 and len(following) > 1:
            pieces.append(" ")
    # Keep the last token unless it is pure punctuation.
    if lst_word and any(ch.isalnum() for ch in lst_word[-1]):
        pieces.append(lst_word[-1])
    return "".join(pieces) + "."

In [None]:
def print_comparasion(doc, gold_label, sum_res, lst_topic_word):
    """Print the document, gold standard, topic words and model result

    Four titled sections are written to standard output in this order:
    "Original Text", "Gold Label", "Topic Word" and "Model Result". Every
    entry of `doc`, `gold_label` and `sum_res` is rendered with
    `create_sentences`; entries of `lst_topic_word` are printed as they are.

    Parameters
    ----------
    doc: two dimensional list
        document that used for the summarization process
    gold_label: two dimensional list
        summary gold standard
    sum_res: two dimensional list
        model result
    lst_topic_word: list
        topic entries, printed one per line without further formatting

    Returns
    -------
    None
    """

    # (section title, rows, whether each row is a token list to be joined)
    sections = (
        ("Original Text", doc, True),
        ("Gold Label", gold_label, True),
        ("Topic Word", lst_topic_word, False),
        ("Model Result", sum_res, True),
    )
    for position, (title, rows, as_sentence) in enumerate(sections):
        if position > 0:
            # A single-space line separates one section from the next.
            print(" ")
        print(title)
        print(" ")
        for row in rows:
            if as_sentence:
                print(create_sentences(row))
            else:
                print(row)

## Run Playground

In [None]:
# Init model: choose the three components handed to Summarizer.

topic_modeling = "LDA"
# Only the model name is set here; the optional keys (pretrained_file,
# batch_size, device) are described in the markdown note after this cell.
vector_space_model = {
    "model_name": "Word2Vec"
}
similarity = "Euclidean"

# `model` is reused by the "Run the model" cell further down.
model = Summarizer(topic_modeling, vector_space_model, similarity)

For the vector space model, 4 keys can be specified:
1) model_name: the vector space model to use <br>
2) pretrained_file: <br>
    - file path of the pretrained model, relative to ../extractive_text_summarizer/vector_space_model.py, or <br>
    - model name on Hugging Face <br>
3) batch_size: batch size for the deep learning model <br>
4) device: GPU device name for the deep learning model <br>

In [None]:
# Open a single document from the dataset

partition = "test"         # one of train/test/dev
fold = 1                    # zero-padded to "01" when the file name is built
index = 1288                # list position of the document inside that fold file

# lst_sent: all sentences, lst_ext: extractive reference, gold_label: "summary" field
lst_sent, lst_ext, gold_label = get_indosum_data(partition, fold, index)

In [None]:
# Run the model

# One topic per entry of the gold summary.
num_of_topic = len(gold_label)
# NOTE(review): presumably the number of words reported per topic - confirm
# against Summarizer.summarize.
num_of_words = 5
ranking_method = "Combined"

# `res` is the summary produced by the model, `lst_topic_word` the topic output.
res, lst_topic_word = model.summarize(lst_sent, num_of_topic, num_of_words, ranking_method)

In [None]:
# Print the document, the gold summary, the topic words and the model result
# one after another so they can be compared by eye.

print_comparasion(lst_sent, gold_label, res, lst_topic_word)

# Create Evaluation File

## Helper Function

In [None]:
def create_filename(dataset,
                    topic_modeling,
                    vector_space_model,
                    similarity,
                    ranking_method):
    """Build the path of the json file that stores a model result

    The file name joins the five experiment settings with "-" and is placed
    in the ``../model_result`` directory.

    Parameters
    ----------
    dataset: string
        used dataset
    topic_modeling: string
        used topic modeling method
    vector_space_model: dict
        used vsm data; only its 'model_name' entry goes into the name
    similarity: string
        used similarity metric
    ranking_method: string
        used ranking method

    Returns
    -------
    string
        filepath of model result in json format
    """

    name_parts = (
        dataset,
        topic_modeling,
        vector_space_model['model_name'],
        similarity,
        ranking_method,
    )
    return "../model_result/{}-{}-{}-{}-{}.json".format(*name_parts)

## Create File 

In [None]:
# Init model for the evaluation run (same settings as the playground cell,
# repeated here so this section can be configured on its own).

topic_modeling = "LDA"
# Only 'model_name' is read when the result file name is built.
vector_space_model = {
    "model_name": "Word2Vec"
}
similarity = "Euclidean"

model = Summarizer(topic_modeling, vector_space_model, similarity)

In [None]:
# Open used dataset: the "test" partition of folds 1 to 5.
# `all_data` maps fold -> partition -> list of documents; `counter` is the
# total number of documents and is used as the progress bar total.

all_data, counter = get_all_indosum_data(range(1, 6), ["test"])

In [None]:
# Create model result file
#
# Runs the summarizer on every loaded document and collects, per document, the
# model output together with the extractive and abstractive references.

num_of_words = 5
ranking_method = "Combined"

# "<fold> - <partition> - <index>" of the document currently being processed;
# inspect it after an exception to see where the run stopped.
last = ""
res = {}
cnt = 0

# The context manager closes the progress bar even when the loop fails.
with tqdm(total=counter) as pbar:
    for fold, partitions in all_data.items():
        for partition, lst_json in partitions.items():
            for index, json_obj in enumerate(lst_json):
                last = "{} - {} - {}".format(fold, partition, str(index))
                lst_par = json_obj["paragraphs"]
                lst_sum = json_obj["gold_labels"]
                gold_label = json_obj["summary"]
                lst_sent, lst_ext = create_indosum_comparasion(lst_par, lst_sum)
                if len(gold_label) == 0:
                    # Skipped documents still count as processed, otherwise
                    # the bar can never reach `counter`.
                    pbar.update(1)
                    continue
                # One topic per entry of the gold summary.
                num_of_topic = len(gold_label)
                result, lst_topic_word = model.summarize(lst_sent, num_of_topic, num_of_words, ranking_method)
                # The key spelling 'hypotesis' is part of the stored file
                # format and is kept unchanged on purpose.
                res[cnt] = {
                    'hypotesis' : result,
                    'reference_ext' : lst_ext,
                    'reference' : gold_label
                }
                pbar.update(1)
                cnt += 1

print("DONE")

In [None]:
# Save model result

filename = create_filename("IndoSUM", topic_modeling, vector_space_model, similarity, ranking_method)

# Create the output directory on first use so a long evaluation run is not
# lost to a FileNotFoundError at the very last step.
output_dir = os.path.dirname(filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# The integer keys of `res` are written as strings by json.dump.
with open(filename, "w", encoding="utf-8") as outfile:
    json.dump(res, outfile)