# Generate Hallucinated Summaries

In [1]:
import os
import pandas as pd
import numpy as np
import time

# from folder functions
from functions.load_subsampled_data import load_first_n_rows
from functions.call_llama import create_hallucinated_summaries

In [2]:
# Load subsampled training data.
# The loader takes a mapping of split name -> raw CSV path and its result is
# indexed by the same split name further down in this notebook.
file_paths_train = dict(train="cnndm/train_raw.csv")

# Tiny 5-row subset, used for testing.
datasets_train_5 = load_first_n_rows(5, file_paths=file_paths_train)

Creating subset file for train with 5 rows.


In [3]:

# Training subsample: first 7000 rows of the train split.
datasets_train = load_first_n_rows(
    7000,
    file_paths=file_paths_train,
)


Creating subset file for train with 7000 rows.


In [4]:
# Display the 5-row testing subset to inspect which columns were loaded.
datasets_train_5["train"]

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a


In [5]:
# Load subsampled test and validation data.
# Keys are the split names used to index the loaded result later on.
file_paths_test_val = dict(
    test="cnndm/test_raw.csv",
    valid="cnndm/valid_raw.csv",
)

# First 1000 rows of each split listed above.
datasets_test_val = load_first_n_rows(1000, file_paths=file_paths_test_val)

Creating subset file for test with 1000 rows.
Creating subset file for valid with 1000 rows.


In [6]:
# Print the first rows of each loaded split: train, then test, then valid.
for split_frame in (
    datasets_train["train"],
    datasets_test_val["test"],
    datasets_test_val["valid"],
):
    print(split_frame.head())

                                             article  \
0  LONDON, England (Reuters) -- Harry Potter star...   
1  Editor's note: In our Behind the Scenes series...   
2  MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...   
3  WASHINGTON (CNN) -- Doctors removed five small...   
4  (CNN)  -- The National Football League has ind...   

                                          highlights  \
0  Harry Potter star Daniel Radcliffe gets £20M f...   
1  Mentally ill inmates in Miami are housed on th...   
2  NEW: "I thought I was going to die," driver sa...   
3  Five small polyps found during procedure; "non...   
4  NEW: NFL chief, Atlanta Falcons owner critical...   

                                         id  
0  42c027e4ff9730fbb3de84c1af0d2c506e41c3e4  
1  ee8871b15c50d0db17b0179a6d2beab35065f1e9  
2  06352019a19ae31e527f37f7571c6dd7f0c5da37  
3  24521a2abb2e1f5e34e6824e0f9e56904a2b0e88  
4  7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a  
                                             artic

In [8]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Per-row word counts for the articles (source) and highlights (target)
# of the training subsample.
train_frame = datasets_train["train"]
source_word_count = train_frame["article"].apply(_word_count)
target_word_count = train_frame["highlights"].apply(_word_count)

source_word_count.head()


0    455
1    698
2    743
3    414
4    973
Name: article, dtype: int64

In [9]:
# One value per line: mean highlight length, max highlight length,
# mean article length (all in words).
print(
    target_word_count.mean(),
    target_word_count.max(),
    source_word_count.mean(),
    sep="\n",
)

44.15642857142857
75
621.6441428571428


## Create Train Set Hallucinations

In [16]:
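# NOTE(review): the commented-out call in this cell passes source_col="source" and
# target_col="target", but the frame displayed earlier has columns "article" and
# "highlights" -- confirm the column names before re-enabling it.
# bad_path = "test_case"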
# good_path = "cnndm/fake_summary/check_function.csv"

# fake_train_5 = create_hallucinated_summaries(
#     df=datasets_train_5["train"], 
#     source_col="source", 
#     target_col="target",
#     output_file_name=good_path
# )

In [20]:
# CSV that receives the hallucinated summaries for the training subsample.
# In the recorded run the function logged a progress save to this file
# after every 250 rows.
output_file_train = "cnndm/fake_summary/train_hallucinated.csv"

train_call_kwargs = dict(
    df=datasets_train["train"],
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_train,
)
fake_train = create_hallucinated_summaries(**train_call_kwargs)


path check
Processing 250/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 250 rows.
Processing 500/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 500 rows.
Processing 750/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 750 rows.
Processing 887/7000 rows...

KeyboardInterrupt: 

## Create Test and Validation Set Hallucinations

In [None]:
# CSV that receives the hallucinated summaries for the test subsample.
output_file_test = "cnndm/fake_summary/test_hallucinated.csv"

test_frame = datasets_test_val["test"]
fake_test = create_hallucinated_summaries(
    df=test_frame,
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_test,
)

In [None]:
# CSV that receives the hallucinated summaries for the validation subsample.
output_file_val = "cnndm/fake_summary/val_hallucinated.csv"

valid_frame = datasets_test_val["valid"]
fake_val = create_hallucinated_summaries(
    df=valid_frame,
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_val,
)

 **The commented-out code below was used to generate and save the fake summaries. Since then, I have added an option to save the data directly via the function.**

In [19]:
# fake_train = create_hallucinated_summaries(
#     df=datasets_train["train"], 
#     source_col="source", 
#     target_col="target"
# )


Processing 7000/7000 rows...
Processing complete!


In [21]:
# df_hallucinated_train = pd.DataFrame(fake_train, columns=["fake_summary"])

# # Save to CSV
# output_file_train = "cnndm/fake_summary/train_hallucinated.csv"
# df_hallucinated_train.to_csv(output_file_train, index=False, encoding="utf-8")

# print(f"List of hallucinated summaries saved to {output_file_train}")

List of hallucinated summaries saved to cnndm/fake_summary/train_hallucinated.csv


In [22]:
# fake_test = create_hallucinated_summaries(
#     df=datasets_test_val["test"], 
#     source_col="source", 
#     target_col="target"
# )


Processing 1000/1000 rows...
Processing complete!


In [23]:
# df_hallucinated_test = pd.DataFrame(fake_test, columns=["fake_summary"])

# # Save to CSV
# output_file_test = "cnndm/fake_summary/test_hallucinated.csv"
# df_hallucinated_test.to_csv(output_file_test, index=False, encoding="utf-8")

# print(f"List of hallucinated summaries saved to {output_file_test}")

List of hallucinated summaries saved to cnndm/fake_summary/test_hallucinated.csv


In [None]:
# fake_val = create_hallucinated_summaries(
#     df=datasets_test_val["valid"], 
#     source_col="source", 
#     target_col="target"
# )

Processing 1000/1000 rows...
Processing complete!


In [26]:
# df_hallucinated_val = pd.DataFrame(fake_val, columns=["fake_summary"])

# # Save to CSV
# output_file_val = "cnndm/fake_summary/val_hallucinated.csv"
# df_hallucinated_val.to_csv(output_file_val, index=False, encoding="utf-8")

# print(f"List of hallucinated summaries saved to {output_file_val}")

List of hallucinated summaries saved to cnndm/fake_summary/val_hallucinated.csv
