# Generate Hallucinated Summaries

In [1]:
import os
import pandas as pd
import numpy as np
import time

# project-local helpers from the functions/ folder
from functions.load_subsampled_data import load_n_rows
from functions.call_llama import create_hallucinated_summaries

In [2]:
# Load a 7000-row subset of the filtered training CSV; the result is keyed by split name.
file_paths_train = {"train": "cnndm/subsamples/train_filtered.csv"}
datasets_train = load_n_rows(7000, file_paths=file_paths_train)


Subset file for train with 7000 rows already exists. Loading it.


In [3]:
# Inspect the loaded training split; as the cell's last expression it is rendered as the cell output.
datasets_train["train"]

Unnamed: 0,article,highlights,id
0,(CNN) -- Police and FBI agents are investigati...,Empty anti-tank weapon turns up in front of Ne...,613d6311ec2c1985bd44707d1796d275452fe156
1,"BREMEN, Germany -- Carlos Alberto, who scored ...",Werder Bremen pay a club record $10.7 million ...,77d7c8cf2a9432e395d629371a12790c563c19f7
2,WASHINGTON (CNN) -- Vice President Dick Cheney...,President Bush will have a routine colonoscopy...,35f0e33de7923036a97ac245d899f990bda5e242
3,"SAN FRANCISCO, California (CNN) -- A magnitud...","2,000 customers without electricity, power com...",2ad31cae96512af5105b9b23f9b681dc732b2605
4,(CNN) -- At least 14 people were killed and 60...,Bomb victims waiting for presidential visit .\...,bf0cd4ccacd4fe045995338f4c44d9cf18000226
...,...,...,...
6995,(CNN) -- A senior Pakistani Taliban leader was...,NEW: Pakistani Taliban official confirms airst...,e045841f1c3cf8e8d37cc8177782224a388bb1c2
6996,(CNN) -- Chelsea captain John Terry was charge...,English FA charge John Terry over his clash wi...,73b44ec461215a7c416966f13ef0c7d37b3c9e53
6997,(CNN) -- Four people died when a school bus co...,A school bus and a tractor-trailer collided ne...,80d6c97749106e38bc49847cfff821f051a89d2d
6998,(EW.com) -- First thing's first: TLC's new rat...,"TLC's ""Here Comes Honey Boo Boo"" is gaining in...",7b8ea83f819706b31d55c1bacb3967da885127be


In [4]:
# Load 1000-row subsets of the filtered test and validation CSVs, keyed by split name.
file_paths_test_val = dict(
    test="cnndm/subsamples/test_filtered.csv",
    valid="cnndm/subsamples/valid_filtered.csv",
)

datasets_test_val = load_n_rows(1000, file_paths=file_paths_test_val)

Creating subset file for test with 1000 rows.
Creating subset file for valid with 1000 rows.


In [7]:
# Build the inference split from the validation CSV.
# NOTE(review): start=1005 presumably skips the rows already taken for the 1000-row
# "valid" subset so the two sets do not overlap -- confirm against load_n_rows.
# NOTE(review): the recorded run reports shape (95, 3) although 1100 rows are requested;
# verify whether the source file is exhausted or a stale cached subset is being loaded.
file_paths_inf = {"inference": "cnndm/subsamples/valid_filtered.csv"}

dataset_inf = load_n_rows(1100, file_paths=file_paths_inf, start=1005)
print(dataset_inf["inference"].shape)

Subset file for inference with 1100 rows already exists. Loading it.
(95, 3)


In [5]:
# Preview the first rows of the train, test and valid splits, one frame after another.
print(
    datasets_train["train"].head(),
    datasets_test_val["test"].head(),
    datasets_test_val["valid"].head(),
    sep="\n",
)

                                             article  \
0  (CNN) -- Police and FBI agents are investigati...   
1  BREMEN, Germany -- Carlos Alberto, who scored ...   
2  WASHINGTON (CNN) -- Vice President Dick Cheney...   
3  SAN FRANCISCO, California (CNN)  -- A magnitud...   
4  (CNN) -- At least 14 people were killed and 60...   

                                          highlights  \
0  Empty anti-tank weapon turns up in front of Ne...   
1  Werder Bremen pay a club record $10.7 million ...   
2  President Bush will have a routine colonoscopy...   
3  2,000 customers without electricity, power com...   
4  Bomb victims waiting for presidential visit .\...   

                                         id  
0  613d6311ec2c1985bd44707d1796d275452fe156  
1  77d7c8cf2a9432e395d629371a12790c563c19f7  
2  35f0e33de7923036a97ac245d899f990bda5e242  
3  2ad31cae96512af5105b9b23f9b681dc732b2605  
4  bf0cd4ccacd4fe045995338f4c44d9cf18000226  
                                             artic

In [6]:
# Whitespace-delimited word count per row for the "article" (source) and
# "highlights" (target) columns of the training split.
source_word_count = datasets_train["train"]["article"].map(
    lambda text: len(str(text).split())
)
target_word_count = datasets_train["train"]["highlights"].map(
    lambda text: len(str(text).split())
)

source_word_count.head()


0    349
1    298
2    235
3    193
4    251
Name: article, dtype: int64

In [7]:
# Mean and maximum highlight length, then mean article length (all in words), one per line.
print(
    target_word_count.mean(),
    target_word_count.max(),
    source_word_count.mean(),
    sep="\n",
)

40.22442857142857
85
256.3102857142857


## Baseline Project

### Create Train Set Hallucinations

In [None]:
# Baseline run on the training split: articles are the source text, highlights the
# reference summaries; results are written to the CSV path below.
output_file_train = "cnndm/fake_summary/train_hallucinated_base.csv"

fake_train = create_hallucinated_summaries(
    df=datasets_train["train"],
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_train,
)

path check
Processing 250/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated_base.csv after 250 rows.
Processing 500/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated_base.csv after 500 rows.
Processing 750/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated_base.csv after 750 rows.
Processing 1000/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated_base.csv after 1000 rows.
Processing 1250/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated_base.csv after 1250 rows.
Processing 1500/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated_base.csv after 1500 rows.
Processing 1750/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated_base.csv after 1750 rows.
Processing 2000/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated_base.csv after 2000 rows.
Processing 2250/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated_base.csv after 2250 ro

### Create Test and Validation Set Hallucinations

In [11]:
# Baseline run on the test split.
# NOTE(review): the recorded log for this cell mentions "test_hallucinated.csv", which
# does not match the path below -- the output appears to come from an earlier run; re-run to refresh.
output_file_test = "cnndm/fake_summary/test_hallucinated_base.csv"

fake_test = create_hallucinated_summaries(
    df=datasets_test_val["test"],
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_test,
)

path check
Processing 250/1000 rows...
Progress saved to cnndm/fake_summary/test_hallucinated.csv after 250 rows.
Processing 500/1000 rows...
Progress saved to cnndm/fake_summary/test_hallucinated.csv after 500 rows.
Processing 750/1000 rows...
Progress saved to cnndm/fake_summary/test_hallucinated.csv after 750 rows.
Processing 1000/1000 rows...
Progress saved to cnndm/fake_summary/test_hallucinated.csv after 1000 rows.

Processing complete!


In [12]:
# Baseline run on the validation split.
# NOTE(review): the recorded log for this cell mentions "val_hallucinated.csv", which
# does not match the path below -- the output appears to come from an earlier run; re-run to refresh.
output_file_val = "cnndm/fake_summary/val_hallucinated_base.csv"

fake_val = create_hallucinated_summaries(
    df=datasets_test_val["valid"],
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_val,
)

path check
Processing 250/1000 rows...
Progress saved to cnndm/fake_summary/val_hallucinated.csv after 250 rows.
Processing 500/1000 rows...
Progress saved to cnndm/fake_summary/val_hallucinated.csv after 500 rows.
Processing 750/1000 rows...
Progress saved to cnndm/fake_summary/val_hallucinated.csv after 750 rows.
Processing 1000/1000 rows...
Progress saved to cnndm/fake_summary/val_hallucinated.csv after 1000 rows.

Processing complete!


### Create Inference Set Hallucinations

In [None]:
# Baseline run on the inference split; results are written to the CSV path below.
output_file_inf = "cnndm/fake_summary/inf_hallucinated_base.csv"

# Bound to its own name (`fake_inf`) and kept separate from `fake_val`, so the
# validation-split result held in `fake_val` is not overwritten by the inference run.
fake_inf = create_hallucinated_summaries(
    df=dataset_inf["inference"],
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_inf,
)

## Extended Project

### Create Train Set Hallucinations

In [None]:
# Extended run on the training split: same call as the baseline, with add_tokens=True.
# NOTE(review): the recorded log for this cell mentions "train_hallucinated.csv", which
# does not match the path below -- the output appears to come from an earlier run; re-run to refresh.
output_file_train = "cnndm/fake_summary/train_hallucinated_ext.csv"

fake_train = create_hallucinated_summaries(
    df=datasets_train["train"],
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_train,
    add_tokens=True,
)


path check
Processing 250/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 250 rows.
Processing 500/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 500 rows.
Processing 750/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 750 rows.
Processing 1000/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 1000 rows.
Processing 1250/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 1250 rows.
Processing 1500/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 1500 rows.
Processing 1750/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 1750 rows.
Processing 2000/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 2000 rows.
Processing 2250/7000 rows...
Progress saved to cnndm/fake_summary/train_hallucinated.csv after 2250 rows.
Processing 2500/7000 rows...
Progress sav

### Create Test and Validation Set Hallucinations

In [None]:
# Extended run on the test split: same call as the baseline, with add_tokens=True.
output_file_test = "cnndm/fake_summary/test_hallucinated_ext.csv"

fake_test = create_hallucinated_summaries(
    df=datasets_test_val["test"],
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_test,
    add_tokens=True,
)

In [None]:
# Extended run on the validation split: same call as the baseline, with add_tokens=True.
output_file_val = "cnndm/fake_summary/val_hallucinated_ext.csv"

fake_val = create_hallucinated_summaries(
    df=datasets_test_val["valid"],
    source_col="article",
    target_col="highlights",
    output_file_name=output_file_val,
    add_tokens=True,
)