# Generate Hallucinated Summaries

In [1]:
import os
import pandas as pd
import numpy as np
import time

# local helpers from the functions/ folder
from functions.load_subsampled_data import load_first_n_rows
from functions.call_llama import generate_chat_completion, create_hallucinated_summaries

In [2]:
# Source/target file locations for the train split, keyed by split name.
file_paths_train = {
    "train": dict(src="cnndm/train.src", tgt="cnndm/train.tgt"),
}

# Load a 7000-row subsample of the train split.
datasets_train = load_first_n_rows(
    7000,
    file_paths=file_paths_train,
)


Subsample files for train with 7000 rows already exist. Loading them.


In [3]:
# Tiny 5-row subsample of the train split, used for testing.
datasets_train_5 = load_first_n_rows(
    5,
    file_paths=file_paths_train,
)

Subsample files for train with 5 rows already exist. Loading them.


In [4]:
# Source/target file locations for the test and validation splits, keyed by split name.
file_paths_test_val = {
    "test": dict(src="cnndm/test.src", tgt="cnndm/test.tgt"),
    "valid": dict(src="cnndm/valid.src", tgt="cnndm/valid.tgt"),
}

# Load a 1000-row subsample of each of the two splits.
datasets_test_val = load_first_n_rows(
    1000,
    file_paths=file_paths_test_val,
)

Subsample files for test with 1000 rows already exist. Loading them.
Subsample files for valid with 1000 rows already exist. Loading them.


In [14]:
# Preview the first rows of the train, test and validation subsamples, in that order.
for split_frames, split_name in (
    (datasets_train, "train"),
    (datasets_test_val, "test"),
    (datasets_test_val, "valid"),
):
    print(split_frames[split_name].head())

                                              source  \
0  Editor's note: In our Behind the Scenes series...   
1  LONDON, England (Reuters) -- Harry Potter star...   
2  MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...   
3  BAGHDAD, Iraq (CNN) -- Dressed in a Superman s...   
4  WASHINGTON (CNN) -- Doctors removed five small...   

                                              target  
0  Mentally ill inmates in Miami are housed on th...  
1  Harry Potter star Daniel Radcliffe gets £20M f...  
2  NEW: "I thought I was going to die," driver sa...  
3  Parents beam with pride, can't stop from smili...  
4  Five small polyps found during procedure; "non...  
                                              source  \
0  Marseille, France (CNN)The French prosecutor l...   
1  The Palestinian Authority officially became th...   
2  Governments around the world are using the thr...   
3  On May 28, 2014, some 7,000 people gathered in...   
4  Seventy years ago, Anne Frank died of typhus i... 

In [15]:
# Whitespace-delimited word counts for the "source" and "target" columns
# of the train subsample (each value is stringified before splitting).
train_df = datasets_train["train"]
source_word_count, target_word_count = (
    train_df[column].apply(lambda text: len(str(text).split()))
    for column in ("source", "target")
)

source_word_count.head()


0    696
1    453
2    739
3    704
4    412
Name: source, dtype: int64

In [16]:
# One value per line: mean target length, max target length, mean source length (in words).
print(
    target_word_count.mean(),
    target_word_count.max(),
    source_word_count.mean(),
    sep="\n",
)

40.778285714285715
71
616.3961428571429


## Create Train Set Hallucinations

In [8]:
# Smoke-test the generation helper on the 5-row subsample before the full runs.
bad_path = "test_case"  # not passed to the call below; presumably an invalid path kept for manual checks
good_path = "cnndm/fake_summary/check_function.csv"

train_sample_df = datasets_train_5["train"]
fake_train_5 = create_hallucinated_summaries(
    df=train_sample_df,
    source_col="source",
    target_col="target",
    output_file_name=good_path,
)

path check
Processing 5/5 rows...
Progress saved to cnndm/fake_summary/check_function.csv after 5 rows.

Processing complete!


In [None]:
# Generate hallucinated summaries for the train subsample; the helper is
# handed the CSV path it should write its results to.
output_file_train = "cnndm/fake_summary/train_hallucinated.csv"

fake_train = create_hallucinated_summaries(
    df=datasets_train["train"],
    source_col="source",
    target_col="target",
    output_file_name=output_file_train,
)


## Create Test and Validation Set Hallucinations

In [None]:
# Generate hallucinated summaries for the test subsample; the helper is
# handed the CSV path it should write its results to.
output_file_test = "cnndm/fake_summary/test_hallucinated.csv"

fake_test = create_hallucinated_summaries(
    df=datasets_test_val["test"],
    source_col="source",
    target_col="target",
    output_file_name=output_file_test,
)

In [None]:
# Generate hallucinated summaries for the validation subsample; the helper is
# handed the CSV path it should write its results to.
output_file_val = "cnndm/fake_summary/val_hallucinated.csv"

fake_val = create_hallucinated_summaries(
    df=datasets_test_val["valid"],
    source_col="source",
    target_col="target",
    output_file_name=output_file_val,
)

 **The commented-out code below was originally used to generate and save the fake summaries. Since then, I have added an option to save the data directly via the function.**

In [19]:
# fake_train = create_hallucinated_summaries(
#     df=datasets_train["train"], 
#     source_col="source", 
#     target_col="target"
# )


Processing 7000/7000 rows...
Processing complete!


In [21]:
# df_hallucinated_train = pd.DataFrame(fake_train, columns=["fake_summary"])

# # Save to CSV
# output_file_train = "cnndm/fake_summary/train_hallucinated.csv"
# df_hallucinated_train.to_csv(output_file_train, index=False, encoding="utf-8")

# print(f"List of hallucinated summaries saved to {output_file_train}")

List of hallucinated summaries saved to cnndm/fake_summary/train_hallucinated.csv


In [22]:
# fake_test = create_hallucinated_summaries(
#     df=datasets_test_val["test"], 
#     source_col="source", 
#     target_col="target"
# )


Processing 1000/1000 rows...
Processing complete!


In [23]:
# df_hallucinated_test = pd.DataFrame(fake_test, columns=["fake_summary"])

# # Save to CSV
# output_file_test = "cnndm/fake_summary/test_hallucinated.csv"
# df_hallucinated_test.to_csv(output_file_test, index=False, encoding="utf-8")

# print(f"List of hallucinated summaries saved to {output_file_test}")

List of hallucinated summaries saved to cnndm/fake_summary/test_hallucinated.csv


In [None]:
# fake_val = create_hallucinated_summaries(
#     df=datasets_test_val["valid"], 
#     source_col="source", 
#     target_col="target"
# )

Processing 1000/1000 rows...
Processing complete!


In [26]:
# df_hallucinated_val = pd.DataFrame(fake_val, columns=["fake_summary"])

# # Save to CSV
# output_file_val = "cnndm/fake_summary/val_hallucinated.csv"
# df_hallucinated_val.to_csv(output_file_val, index=False, encoding="utf-8")

# print(f"List of hallucinated summaries saved to {output_file_val}")

List of hallucinated summaries saved to cnndm/fake_summary/val_hallucinated.csv
