# W266 Final Project - Qualitative Analysis of Generated Summaries

**Description:** 

- This notebook generates 60 samples (20 each from the short, medium and long input-length subsets of the test set) for qualitative analysis
- Results are set out in a separate Word document.

# 1. Set-up

In [1]:
import evaluate
from pprint import pprint

## General plotting
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import nltk

## Managing memory
import gc
import pickle

## Text processing
import re
import numpy as np
from scipy import stats as st

In [11]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer

In [105]:
## Loading the ROUGE metric through `load_metric` (imported from `datasets` above).
## `show_model_results` further down reads each score as
## `score[<rouge type>].mid.precision / .recall / .fmeasure`, so it depends on the
## score objects returned by this metric's `compute`.
## NOTE(review): `evaluate` is imported but unused; `evaluate.load("rouge")` presumably
## returns a different result format - confirm before migrating.
rouge = load_metric("rouge")

  rouge = load_metric("rouge")


# 2. Loading X-Science Dataset (Test Set only)

## 2.1 Loading the dataset

In [3]:
## Report which device torch will use when running locally:
## the GPU if CUDA is available, otherwise the CPU
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('Using device:', device)
print()

Using device: cuda



In [4]:
## Separator used when joining the source articles into one input string
## (X-Science does not ship the source articles already concatenated)
DOC_SEP = " ||||| "

## Only the test split of the dataset is needed in this notebook
xsci_test = load_dataset('multi_x_science_sum', split='test')

Found cached dataset multi_x_science_sum (C:/Users/JustinTo/.cache/huggingface/datasets/multi_x_science_sum/default/1.1.0/2876ec0401f8f5c5acf7f4857dbc8d6229a390ab428321ab848f03f14b7f9729)


## 2.2 Preprocessing

- Tokenization is not necessary here, because the model/baseline outputs that are compared against the test labels are already plain text.
- So we only need to pre-process the X-Science dataset into the form we want, e.g. replacing the numbered citation markers (`@cite_N`) in the labels with a generic `@cite`.

In [5]:
## Matches numbered citation markers such as "@cite_12"; the preprocessing
## functions substitute every match with the generic "@cite"
pat = re.compile(pattern="@cite_[0-9]+")

In [6]:
def preprocess_dataset(example):
    """Convert one X-Science record into model input text and a cleaned label.

    Args:
        example: a single record providing "abstract", "related_work" and
            "ref_abstract" (whose "abstract" entry is iterated as a sequence
            of cited abstracts).

    Returns:
        dict with two keys:
          - "abstracts": the main abstract (text after the last "| Abstract: "
            marker, or the whole string when the marker is absent) followed by
            every non-empty cited abstract, all joined with DOC_SEP.
          - "related_work": the label with each numbered citation marker
            replaced by "@cite".
    """
    main_abstract = example["abstract"].split("| Abstract: ")[-1]
    # Empty cited abstracts are dropped so that no empty document is joined in
    cited_abstracts = [text for text in example["ref_abstract"]["abstract"] if text]

    return {
        "abstracts": main_abstract + DOC_SEP + DOC_SEP.join(cited_abstracts),
        "related_work": pat.sub("@cite", example["related_work"]),
    }

In [7]:
def preprocess_dataset_batched(example):
    """Batched counterpart of the per-record preprocessing.

    Args:
        example: a batch in column form; "abstract", "ref_abstract" and
            "related_work" are each iterated as one entry per record.

    Returns:
        dict of lists with the keys "abstracts", "related_work" and
        "main_article":
          - "abstracts": per record, the main abstract (text after the last
            "| Abstract: " marker) followed by its non-empty cited abstracts,
            joined with DOC_SEP.
          - "related_work": per record, the label with numbered citation
            markers replaced by "@cite".
          - "main_article": per record, the untouched "abstract" field, kept
            for calculating the degree of copying.
    """
    # Pair every main abstract with its cited abstracts once; both the
    # "abstracts" and the "main_article" columns are derived from these pairs
    paired = list(zip(example["abstract"], example["ref_abstract"]))

    return {
        "abstracts": [
            main.split("| Abstract: ")[-1]
            + DOC_SEP
            + DOC_SEP.join([text for text in cited["abstract"] if text])
            for main, cited in paired
        ],
        "related_work": [
            pat.sub("@cite", label) for label in example["related_work"]
        ],
        "main_article": [main for main, _ in paired],
    }

In [8]:
## Build the processed test set: the original columns are removed, so only the
## columns returned by the preprocessing function remain.
## The batched function is applied with batch_size=1, i.e. one record per call.
xsci_test_processed = xsci_test.map(
    preprocess_dataset_batched,
    batched=True,
    batch_size=1,
    remove_columns=xsci_test.column_names,
)



  0%|          | 0/5093 [00:00<?, ?ba/s]

## 2.3 Separating the Dataset into Short, Medium & Long Input Length Samples

In [12]:
def get_tokenizer(host_tokenizer: str):
    """Load and return the pretrained tokenizer identified by `host_tokenizer`.

    Only a tokenizer is returned (no model is loaded here).

    Args:
        host_tokenizer: name or path handed to `AutoTokenizer.from_pretrained`.
    """
    # NOTE(review): `use_cache` and `gradient_checkpointing` look like model
    # options rather than tokenizer options - confirm whether they are needed.
    tokenizer = AutoTokenizer.from_pretrained(
        host_tokenizer,
        use_cache=False,
        gradient_checkpointing=True,
    )
    return tokenizer


## Tokenizer used to measure the token lengths of the test samples
centrum_tokenizer = get_tokenizer(host_tokenizer="ratishsp/Centrum")

## Register the document separator as a special token
## (the cell output is the value returned by `add_tokens`)
centrum_tokenizer.add_tokens(DOC_SEP, special_tokens=True)

1

In [13]:
## Estimating the token length of the test data set
len_inputs = []
len_labels = []
num_articles = []

for sample in xsci_test_processed['abstracts']:
    temp = centrum_tokenizer(sample, return_tensors="pt")
    len_inputs.append(temp.input_ids.shape[1])
    num_articles.append((sample.count(DOC_SEP)+1))

for sample in xsci_test_processed['related_work']:
    temp = centrum_tokenizer(sample, return_tensors="pt")
    len_labels.append(temp.input_ids.shape[1])
                        
assert len(len_inputs) == len(len_labels) == len(num_articles)

print(f"Completed, number of samples processed is: {len(len_inputs)}")

Completed, number of samples processed is: 5093


In [14]:
## Showing quartiles
q1 = np.quantile(len_inputs, [0,0.25,0.5,0.75,1])
q2 = np.quantile(len_labels, [0,0.25,0.5,0.75,1])
q3 = np.quantile(num_articles, [0,0.25,0.5,0.75,1])

print(f"The quartiles in terms of input lengths are: {(q1[1], q1[2], q1[3])}")
print(f"The quartiles in terms of label lengths are: {(q2[1], q2[2], q2[3])}")
print(f"The quartiles in terms of number of articles are: {(q3[1], q3[2], q3[3])}")

The quartiles in terms of input lengths are: (486.0, 735.0, 1150.0)
The quartiles in terms of label lengths are: (96.0, 138.0, 184.0)
The quartiles in terms of number of articles are: (2.0, 4.0, 6.0)


In [18]:
## Number of samples whose input is shorter than the lower quartile.
## The positions are taken straight from the comparison mask, so no hard-coded
## dataset size is needed (the test set holds 5093 samples, not 5094).
len(np.flatnonzero(np.array(len_inputs) < q1[1]))

1273

In [19]:
## Breaking up test set into three parts: short (<lower quartile); medium (in between quartiles); long (>upper quartile)
short_samples = xsci_test_processed.select(np.where(len_inputs < q1[1])[0])
medium_samples = xsci_test_processed.select(np.where(np.logical_and(q1[1]<=len_inputs, len_inputs < q1[3]))[0])
long_samples = xsci_test_processed.select(np.where(len_inputs >= q1[3])[0])

short_samples, medium_samples, long_samples

(Dataset({
     features: ['related_work', 'abstracts', 'main_article'],
     num_rows: 1273
 }),
 Dataset({
     features: ['related_work', 'abstracts', 'main_article'],
     num_rows: 2546
 }),
 Dataset({
     features: ['related_work', 'abstracts', 'main_article'],
     num_rows: 1274
 }))

In [21]:
## Indice for mapping the index of the subdata sets back to the original
indices = {
    'short':  np.array(range(5094))[np.where(len_inputs < q1[1])[0]],
    'medium': np.array(range(5094))[np.where(np.logical_and(q1[1]<=len_inputs, len_inputs < q1[3]))],
    'long':   np.array(range(5094))[np.where(len_inputs >= q1[3])]
}

indices

{'short': array([   1,    2,    5, ..., 5085, 5086, 5090]),
 'medium': array([   0,    3,    4, ..., 5088, 5089, 5091]),
 'long': array([   9,   21,   22, ..., 5067, 5079, 5092])}

## 2.4 Selecting random samples (20 samples each)

In [48]:
np.random.seed(20230409)  # Fixed seed so that the same samples are drawn on every run

## Sub-datasets keyed by subset name; the key order fixes the order of the random draws
subset_datasets = {
    'short':  short_samples,
    'medium': medium_samples,
    'long':   long_samples
}

## 20 distinct, sorted positions within each sub-dataset
random_arrays = {
    name: np.sort(np.random.choice(range(len(indices[name])), size=20, replace=False))
    for name in subset_datasets
}

## The drawn rows of each sub-dataset
selected_samples = {
    name: dataset.select(random_arrays[name])
    for name, dataset in subset_datasets.items()
}

## The same draws expressed as positions in the original test set
original_indices = {
    name: indices[name][drawn]
    for name, drawn in random_arrays.items()
}

In [49]:
## Display the selected sub-datasets (one entry per subset name)
selected_samples

{'short': Dataset({
     features: ['related_work', 'abstracts', 'main_article'],
     num_rows: 20
 }),
 'medium': Dataset({
     features: ['related_work', 'abstracts', 'main_article'],
     num_rows: 20
 }),
 'long': Dataset({
     features: ['related_work', 'abstracts', 'main_article'],
     num_rows: 20
 })}

In [50]:
## Display, per subset, the original test-set positions of the selected samples
original_indices

{'short': array([  43,  483,  556,  599,  620,  686,  828, 1127, 1816, 2179, 2317,
        2536, 3027, 3036, 3130, 3157, 3524, 4160, 4191, 4820]),
 'medium': array([ 235,  638,  831,  845, 1165, 1542, 1894, 2011, 2697, 2846, 2916,
        3404, 3419, 3771, 4046, 4263, 4371, 4717, 4858, 5068]),
 'long': array([  76,  485, 1368, 1864, 1872, 1916, 2333, 2542, 2563, 2698, 2741,
        2914, 3010, 3045, 3247, 3739, 4038, 4261, 4510, 4713])}

In [52]:
## Saving results for easier running
with open("misc_data/random_samples_qualitative_analysis.pkl", "wb") as f:
    pickle.dump((original_indices, selected_samples), f)

# 3. Qualitative Analysis

## 3.1 Loading Generated Summaries

In [99]:
## dictionary mapping to file locations of pickled summaries
filepaths = {
    'Base LED (16k)':  "answers_revised/baselines/LED_base_16384tokens.pkl",
    'Large LED (16k)': "answers_revised/baselines/LED_large_16384tokens.pkl",
    'Centrum (4k)':    "answers_revised/baselines/Centrum_4096tokens.pkl",
    'Finetuned LED':   "answers_revised/epoch2/LED_xsci_finetuned_run10.pkl",
    'Finetuned Cent':  "answers_revised/centrum/Centrum_finetuned_norepeat4_run2.pkl",
    '2-Step (Cent)':   "misc_data/XSci_test_2step_CENTRUM.pkl"
}

model_results = {}
for model, path in filepaths.items():
    with open(path, "rb") as f:
        model_results[model] = pickle.load(f)
        

In [100]:
## Check which models' summaries were loaded
model_results.keys()

dict_keys(['Base LED (16k)', 'Large LED (16k)', 'Centrum (4k)', 'Finetuned LED', 'Finetuned Cent', '2-Step (Cent)'])

## 3.2 Function for Printing Results

In [131]:
def _bold(text):
    """Wrap `text` in the ANSI escape codes that switch bold on and off again."""
    return "\033[1m" + text + "\033[0m"


def _rouge_report(title, aggregate):
    """Format the mid precision / recall / f-measure of one aggregated ROUGE score."""
    mid = aggregate.mid
    return (
        f"- {title}:\n {round(mid.precision, 4)} (prec) \n "
        f"{round(mid.recall, 4)} (recall) \n "
        f"{round(mid.fmeasure, 4)} (f-1)"
    )


def show_model_results(subset, subset_index):
    """Print one selected sample with its label and every model's summary.

    For the chosen sample this prints, in order: a header with the original
    test-set index, the source abstracts (numbered from 1), the label, and for
    each entry of `model_results` the ROUGE-2 / ROUGE-L scores against the
    label followed by the generated summary.

    Args:
        subset: key into `original_indices` ('short', 'medium' or 'long').
        subset_index: position of the sample within that subset's selection.

    Raises:
        KeyError: if `subset` is not a key of `original_indices`.
        IndexError: if `subset_index` is outside the subset's selection.
    """
    original_index = int(original_indices[subset][subset_index])

    # Fetch the dataset row once; it does not change between models, so there
    # is no need to look it up again inside the loop below.
    row = xsci_test_processed[original_index]
    reference = row['related_work']

    print(_bold(f"-----Showing Results for: {subset.capitalize()} Sample; Number {subset_index}-----\n"))
    print(_bold("Original index: ") + f"{original_index}")

    ## Abstracts
    print(_bold("\nAbstracts:"))
    for position, abstract in enumerate(row['abstracts'].split(DOC_SEP), start=1):
        print(_bold(f"({position}):"))
        print(abstract)

    ## Labels
    print(_bold("\nLabel:"))
    print(reference)

    ## Model Summaries
    # Each value of `model_results` is indexed by the original test-set index
    # (presumably a sequence of summaries aligned with the test set - confirm).
    for model, summary in model_results.items():
        print(_bold(f"\n{model}:"))

        prediction = summary[original_index]
        score = rouge.compute(predictions=[prediction],
                              references=[reference],
                              rouge_types=["rouge2", "rougeL"],
                              use_stemmer=True)
        print(_bold("Rouge scores:"))
        print(_rouge_report("Rouge 2", score['rouge2']))
        print(_rouge_report("Rouge L", score['rougeL']))
        print(_bold("Summary:"))
        print(prediction)

## 3.3 Showing Results

In [162]:
## Cell for repeated running: choose a subset and a sample, then re-run

## Keep exactly one of the three `subset` assignments active
# subset = 'short'
# subset = 'medium'
subset = 'long'

# Remember: index should be from 0 to 19 (inclusive), as 20 samples were selected per subset
subset_index = 19

show_model_results(subset=subset, subset_index=subset_index)

[1m-----Showing Results for: Long Sample; Number 19-----
[0m
[1mOriginal index: [0m4713
[1m
Abstracts:[0m
[1m(1):[0m
Gabor filters (GFs) play an important role in many application areas for the enhancement of various types of images and the extraction of Gabor features. For the purpose of enhancing curved structures in noisy images, we introduce curved GFs that locally adapt their shape to the direction of flow. These curved GFs enable the choice of filter parameters that increase the smoothing power without creating artifacts in the enhanced image. In this paper, curved GFs are applied to the curved ridge and valley structures of low-quality fingerprint images. First, we combine two orientation-field estimation methods in order to obtain a more robust estimation for very noisy images. Next, curved regions are constructed by following the respective local orientation. Subsequently, these curved regions are used for estimating the local ridge frequency. Finally, curved GFs are d