# Evaluation of ACCESS and Rule-Based Simplification on the BART Summary Output

In [1]:
%load_ext autoreload
%autoreload

## Step 1: Data loading

Load the Straits Times summaries generated by BART.

In [2]:
import pandas as pd

# Read the BART summary table; the file is decoded as ISO-8859-1.
df_st_sum = pd.read_csv(
    "results/system_outputs/bart_summary.csv",
    encoding="ISO-8859-1",
)

# Keep plain-list copies of the two columns used by the rest of the notebook.
st_sum_dict = {
    'article': list(df_st_sum['article']),
    'summary_text': list(df_st_sum['summary_text']),
}

In [3]:
# Print each article next to its BART summary, numbered from 1.
for idx, (article, summary) in enumerate(zip(st_sum_dict['article'], st_sum_dict['summary_text'])):
    print(f"{idx+1}:\n    Article: {article}\n    BART Sum.: {summary}")

1:
    Article: SINGAPORE  The construction of the Johor Bahru-Singapore Rapid Transit System (RTS) Link is progressing well, with 45 per cent of the work on the Singapore side completed. Transport Minister S. Iswaran provided the update on Friday when he visited the work site in Admiralty Road West. He said: We are on track to achieve the completion goal so that the systemcan be operational by the end of 2026, and this is what both sides are working towards. Mr Iswaran described the new link as an important addition to the existing road connectivities between Singapore and Malaysia as it will also promote people to people, economic and other linkages as well.  When the 4km RTS Link shuttle service starts operating, passengers will be able to travel from the Bukit Chagar station in Johor Bahru to the Woodlands North station, or in the reverse direction, in about five minutes.  The train service can serve up to 10,000 passengers per hour in each direction.  Passengers will also be able 

## Step 2: Simplify the Text

Helper function for saving the outputs of the simplifiers.

In [4]:
def save_simplifier_outputs(comp_doc, simp_doc, simp_doc_csv_fp,
                            comp_simp_pairs, comp_simp_pairs_fp):
    """Write a simplifier's outputs to two CSV files and print a short preview.

    Args:
        comp_doc: The original (complex) documents, one entry per document.
        simp_doc: The simplified documents, aligned with ``comp_doc``.
        simp_doc_csv_fp: Destination of the document-level CSV
            (columns ``complex`` and ``simple``).
        comp_simp_pairs: Indexable collection holding, for each document,
            its ``(complex_sentence, simple_sentence)`` pairs.
        comp_simp_pairs_fp: Destination of the sentence-level CSV
            (columns ``doc_id``, ``doc_sent_id``, ``complex``, ``simple``).

    Returns:
        None. The function only writes the two files and prints to stdout.
    """
    # Document-level table: one row per document.
    doc_table = pd.DataFrame({"complex": comp_doc, "simple": simp_doc})
    doc_table.to_csv(simp_doc_csv_fp, index=False)

    # Sentence-level table: flatten the per-document pairs into rows of
    # (document index, sentence index within the document, complex, simple).
    flat_pairs = [
        (doc_no, sent_no, comp_sent, simp_sent)
        for doc_no in range(len(comp_simp_pairs))
        for sent_no, (comp_sent, simp_sent) in enumerate(comp_simp_pairs[doc_no])
    ]
    pair_table = pd.DataFrame({
        "doc_id": [row[0] for row in flat_pairs],
        "doc_sent_id": [row[1] for row in flat_pairs],
        "complex": [row[2] for row in flat_pairs],
        "simple": [row[3] for row in flat_pairs],
    })
    pair_table.to_csv(comp_simp_pairs_fp, index=False)

    # Preview the sentence pairs of the first few documents (at most 5).
    rule = "============================================================"
    preview_count = min(5, len(comp_simp_pairs))
    for doc_no in range(preview_count):
        print(rule)
        print()

        print(f"Document {doc_no}:")
        for comp_sent, simp_sent in comp_simp_pairs[doc_no]:
            print(f"Orig: {comp_sent}")
            print(f"-> Simp: {simp_sent}")
        print()
        print(rule)

Use ACCESS to simplify the BART-generated summaries.

In [5]:
from simplertimes import simplify

# Build the ACCESS simplifier through the project's `simplify` factory.
# NOTE(review): the NLTK package checks shown in this cell's output are
# presumably triggered by the import or by this call - confirm which.
access_simplifier = simplify.create_simplifier(simplify.ACCESS)

[nltk_data] Downloading package perluniprops to
[nltk_data]     C:\Users\hansg\AppData\Roaming\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hansg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hansg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Simplify every BART summary with ACCESS. The first result is later saved as
# the simplified documents, the second as the per-document
# (complex, simple) sentence pairs.
access_simp_doc_ST_SUM, access_comp_simp_pairs_ST_SUM = access_simplifier.simplify_documents(st_sum_dict["summary_text"])

# Drop the notebook's reference to the ACCESS simplifier so its memory can be
# reclaimed. Re-running this cell alone raises NameError; re-run the cell that
# creates the simplifier first.
del access_simplifier

2023-04-16 13:19:55 | INFO | fairseq_cli.generate | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name'

In [7]:
# Save ACCESS outputs: the document-level table goes to access_doc_simp.csv and
# the sentence-pair table to access_doc_sent_pair_simp.csv (both relative to
# the current working directory). The call also prints a preview of the pairs.
save_simplifier_outputs(st_sum_dict["summary_text"], access_simp_doc_ST_SUM, "access_doc_simp.csv",
                        access_comp_simp_pairs_ST_SUM, "access_doc_sent_pair_simp.csv")


Document 0:
Orig: The Johor Bahru-Singapore Rapid Transit System (RTS) Link is progressing well.
-> Simp: The Johor Bahru-Singapore Rapid Transit System (RTS) Link does not work well.
Orig: 45 per cent of the work on the Singapore side has been completed.
-> Simp: 45% of the work on the Singapore side has been completed.
Orig: Transport Minister S. Iswaran provided the update on Friday when he visited the work site.
-> Simp: Transport Minister S. Iswaran gave the update on Friday when he went to the work site.
Orig: He said: We are on track to achieve the completion goal.
-> Simp: He said: We are on track to get the completion goal.


Document 1:
Orig: Bankman-Fried, 31, entered the plea to the new, 13-count indictment through his lawyer.
-> Simp: Bankman-Fried, 31, went back to the new, 13-count indictment through his lawyer.
Orig: He had earlier pleaded not guilty to eight counts of fraud and conspiracy.
-> Simp: He had earlier said that he was not guilty to eight counts of fraud an

In [8]:
# Build the DEPSYM simplifier through the same project `simplify` factory used
# for ACCESS.
depsym_simplifier = simplify.create_simplifier(simplify.DEPSYM)

In [9]:
# Simplify every BART summary with DEPSYM. The first result is later saved as
# the simplified documents, the second as the per-document
# (complex, simple) sentence pairs.
depsym_simp_doc_ST_SUM, depsym_comp_simp_pairs_ST_SUM = depsym_simplifier.simplify_documents(st_sum_dict["summary_text"])

# Drop the notebook's reference to the DEPSYM simplifier so its memory can be
# reclaimed. Re-running this cell alone raises NameError; re-run the cell that
# creates the simplifier first.
del depsym_simplifier

2023-04-16 13:23:07 | INFO | stanza | Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-16 13:23:11 | INFO | stanza | Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

2023-04-16 13:23:11 | INFO | stanza | Using device: cuda
2023-04-16 13:23:11 | INFO | stanza | Loading: tokenize
2023-04-16 13:23:11 | INFO | stanza | Loading: pos
2023-04-16 13:23:12 | INFO | stanza | Loading: lemma
2023-04-16 13:23:12 | INFO | stanza | Loading: constituency
2023-04-16 13:23:13 | INFO | stanza | Loading: depparse
2023-04-16 13:23:13 | INFO | stanza | Loading: sentiment
2023-04-16 13:23:14 | INFO | stanza | Loading: ner
2023-04-16 13:23:15 | INFO | stanza | Done loading processors!


Simplifying document 1
Simplifying document 2
Simplifying document 3
Simplifying document 4
Simplifying document 5
Simplifying document 6
Simplifying document 7
Simplifying document 8
Simplifying document 9
Simplifying document 10
Simplifying document 11
Simplifying document 12
Simplifying document 13
Simplifying document 14
Simplifying document 15
Simplifying document 16
Simplifying document 17
Simplifying document 18
Simplifying document 19
Simplifying document 20


In [10]:
# Save DEPSYM outputs: the document-level table goes to depsym_doc_simp.csv and
# the sentence-pair table to depsym_doc_sent_pair_simp.csv (both relative to
# the current working directory). The call also prints a preview of the pairs.
save_simplifier_outputs(st_sum_dict["summary_text"], depsym_simp_doc_ST_SUM, "depsym_doc_simp.csv",
                        depsym_comp_simp_pairs_ST_SUM, "depsym_doc_sent_pair_simp.csv")


Document 0:
Orig: The Johor Bahru-Singapore Rapid Transit System (RTS) Link is progressing well.
-> Simp: The Johor Bahru-Singapore rapid transit system (). The Johor Bahru-Singapore rapid transit system are RTS. Link is progressing well.
Orig: 45 per cent of the work on the Singapore side has been completed.
-> Simp: 45 per cent of the work on the Singapore side has been completed.
Orig: Transport Minister S. Iswaran provided the update on Friday when he visited the work site.
-> Simp: Transport Minister S. Iswaran provided the update on Friday. This was when he visited the work site.
Orig: He said: We are on track to achieve the completion goal.
-> Simp: He said: We are on track to achieve the completion goal.


Document 1:
Orig: Bankman-Fried, 31, entered the plea to the new, 13-count indictment through his lawyer.
-> Simp: Bankman-fried entered the plea to the new 13-count indictment through his lawyer. Bankman-fried was 31.
Orig: He had earlier pleaded not guilty to eight counts 

## Step 3: Compare Performance

#### Comparing ACCESS and DEPSYM

Both simplification methods are evaluated on the summaries generated by BART. Because no reference simplifications exist for these summaries, only the Flesch-Kincaid Grade Level (FKGL), which needs no reference, is used to measure simplification quantitatively. A qualitative analysis of each method's errors is given in the report.

In [11]:
from textstat import flesch_kincaid_grade
import numpy as np

def calc_fkgl_report(orig_docs, simp_docs) -> pd.DataFrame:
    """Build a per-document FKGL comparison of original vs. simplified text.

    Each document is scored with ``flesch_kincaid_grade``. The relative
    reduction of a document is ``1 - simp_fkgl / orig_fkgl``, so positive
    values mean the simplified text scored lower than the original.

    Args:
        orig_docs: The original documents (strings).
        simp_docs: The simplified documents, aligned with ``orig_docs``.

    Returns:
        A DataFrame with one row per document and the columns ``orig_fkgl``,
        ``simp_fkgl``, ``reduction`` and ``ave_reduction``. ``ave_reduction``
        holds the mean reduction in its first row and empty strings elsewhere.
        For empty input the frame has these four columns and no rows.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(orig_docs) == len(simp_docs), \
        "orig_docs and simp_docs must contain the same number of documents"

    orig_scores = []
    simp_scores = []
    reduction = []
    defined_reductions = []
    for orig_doc, simp_doc in zip(orig_docs, simp_docs):
        orig_fkgl = flesch_kincaid_grade(orig_doc)
        simp_fkgl = flesch_kincaid_grade(simp_doc)
        orig_scores.append(orig_fkgl)
        simp_scores.append(simp_fkgl)

        if orig_fkgl == 0:
            # The ratio is undefined for a zero baseline; record NaN instead
            # of letting the division raise ZeroDivisionError.
            reduction.append(float("nan"))
        else:
            # NOTE: a negative baseline flips the sign of this ratio, so such
            # rows need to be interpreted with care.
            value = 1 - (simp_fkgl / orig_fkgl)
            reduction.append(value)
            defined_reductions.append(value)

    # The average goes in the first cell only; the rest of the column is
    # padded with empty strings. Undefined (NaN) rows are left out of the mean.
    ave_column = [""] * len(reduction)
    if reduction:
        if defined_reductions:
            ave_column[0] = np.mean(defined_reductions)
        else:
            ave_column[0] = float("nan")

    return pd.DataFrame.from_dict({
        "orig_fkgl": orig_scores,
        "simp_fkgl": simp_scores,
        "reduction": reduction,
        "ave_reduction": ave_column,
    })

# Score both simplifiers against the same BART summaries.
access_fkgl_ST_SUM = calc_fkgl_report(st_sum_dict['summary_text'], access_simp_doc_ST_SUM)
depsym_fkgl_ST_SUM = calc_fkgl_report(st_sum_dict['summary_text'], depsym_simp_doc_ST_SUM)

# Show and persist the ACCESS report.
print("ACCESS FKGL Report:", access_fkgl_ST_SUM, sep="\n")
access_fkgl_ST_SUM.to_csv("access_fkgl_report.csv", index=False)

# Blank line between the two reports, then show and persist the DEPSYM report.
print()
print("DEPSYM FKGL Report:", depsym_fkgl_ST_SUM, sep="\n")
depsym_fkgl_ST_SUM.to_csv("depsym_fkgl_report.csv", index=False)

ACCESS FKGL Report:
    orig_fkgl  simp_fkgl  reduction ave_reduction
0         4.8        3.7   0.229167       0.13108
1         6.9        5.7   0.173913              
2        12.4        9.5   0.233871              
3         6.1        4.0   0.344262              
4         6.3        5.9   0.063492              
5         6.6        6.8  -0.030303              
6         4.6        4.7  -0.021739              
7         7.2        7.0   0.027778              
8         8.7        7.9   0.091954              
9         8.3        7.8   0.060241              
10        7.0        6.7   0.042857              
11       10.6        6.6   0.377358              
12       11.1       10.0   0.099099              
13        9.2        9.0   0.021739              
14        9.6        8.4   0.125000              
15        9.2        8.2   0.108696              
16        8.2        7.3   0.109756              
17        8.7        6.4   0.264368              
18        4.6        4.5   0.0