# Generate synthetic data for fiction

Uses the OpenAI API to generate paraphases and summaries of fiction.

In [20]:
import pandas as pd
import textwrap
import random

import backoff
import openai

def print_wrapped_text(text, width=70):
    texts = text.split('\n')
    if len(texts) > 1:
        for t in texts:
            print_wrapped_text(t, width=70)

    else:
        text = texts[0]
        wrapper = textwrap.TextWrapper(width=width)
        wrapped_text = wrapper.fill(text)
        print(wrapped_text)
    
with open('credentials.txt', encoding = 'utf-8') as f:
    organization = f.readline().strip()
    api_key = f.readline().strip()
    
client = openai.OpenAI(organization=organization, api_key=api_key)

#### Load the file that needs augmenting

The data we're about to load was created locally. It's very large and we can't realistically generate that much, so let's sample a subset.

In [21]:
meta = pd.read_csv('training_pairs_fiction.tsv', sep='\t')
meta = meta.sample(150000).reset_index(drop=True)
print(meta.shape)
meta.head()

  meta = pd.read_csv('training_pairs_fiction.tsv', sep='\t')


(150000, 5)


Unnamed: 0,paperId,year,anchor,positive,category
0,uc2.ark+=13960=t4jm23x05,1896,Over the pure white limestone and shells of th...,There was rigged on the stern of the boat a pr...,successive
1,20951,1971,He plucked Ulysses’ infant son from his nurse’...,And that is why we must all go now to the aid ...,successive
2,11252,1929,Were you able to connect him with that too?” “...,"The murderer, secure in his conviction that su...",successive
3,nyp.33433082304480,1915,"But the Regent refused to listen to them, and ...",But there were very few who felt and thought l...,successive
4,nyp.33433074863048,1908,"Your little pupils, too, I have met — "" Mr. Nu...",It did not take her a second to do the sum — b...,synthetic-pair


## The possible categories

single-synthetic: a single chunk paired with the word "generate," which is a signal to generate a
        paraphrase of the chunk using an LLM. 8% of all pairs.

successive: two successive chunks from the same article. 88% of all pairs.
        
synthetic-pair: a successive pair of chunks from the same article, of which one should be replaced
        by a synthetic paraphrase. Only 2% of all pairs.

In [22]:
category_counts = meta['category'].value_counts()
category_counts_table = pd.DataFrame({'Category': category_counts.index, 'Count': category_counts.values})
category_counts_table

Unnamed: 0,Category,Count
0,successive,135911
1,synthetic-pair,7103
2,single-synthetic,6986


In [23]:
# Divide meta into a list of dataframes, each 15,000 rows long
# so we can do this in stages and don't lose everything if the connection drops

meta_list = [meta[i:i+15000] for i in range(0, meta.shape[0], 15000)]

### The function that actually calls the API

We surround this with ```backoff``` instructions to handle errors.

In [24]:
@backoff.on_exception(
    backoff.expo,
    openai.RateLimitError,
    max_time=60,  # Set a maximum wait time in seconds (adjust as needed)
    giveup=lambda e: False  # This prevents giving up on retries
)
def completions_with_backoff(**kwargs):
    global client
    try:
        return client.chat.completions.create(**kwargs)
    except openai.APIError as e:
        print(f"Error: {e}")
        raise  # Re-raise the error to trigger the retry mechanism

#### System prompts

We have five different system prompts to create slightly different flavors of summary.

In [44]:
system_prompt_A = "You are a helpful editor condensing passages of fiction. Your goal is to \
perserve the meaning of the original text while reducing its length to four or five sentences. \
Omit details as necessary. Don't add any editorial comments or questions. \
Reply only with your rephrased summary of the passage."

system_prompt_B = "You are a helpful editor condensing passages of fiction. Your goal is to \
preserve the meaning of the text while reducing its length to two or three sentences. \
Omit details as necessary, and condense dialogue. Don't add any editorial comments or questions. \
Reply only with your rephrased summary of the passage."

system_prompt_C = "You are an editor who disguises the identity of characters in fiction. When \
given a passage of fiction, you give three numbered replies. \n\
1. A single personal name in the passage that you select to be changed--or 'none' if the passage does not contain names. \n\
2. A new invented name that will replace it (or 'none' if 1 was 'none'). \n\
3. The original passage, with all instances of the first name replaced by the second. \
If the passage does not contain personal names, this will be the original passage unchanged."

user_C = "Lisa and Stanley discuss the newcomer Todd Graham, who took over Alistair Hubbard's business. \
Lisa questions Graham's belonging in their community, prompting Stanley to agree that Graham doesn't fit the old-fashioned charm of \
Bellville. Stanley recalls a conversation hinting at foul play regarding Hubbard's death and subtly asks Lisa about the situation."

assistant_C = "1. Lisa. \n\
2. Karen. \n\
3. Karen and Stanley discuss the newcomer Todd Graham, who took over Alistair Hubbard's business. \
Karen questions Graham's belonging in their community, prompting Stanley to agree that Graham doesn't fit the old-fashioned charm of \
Bellville. Stanley recalls a conversation hinting at foul play regarding Hubbard's death and subtly asks Karen about the situation."


In [45]:
print_wrapped_text(assistant_C)

1. Lisa.
2. Karen.
3. Karen and Stanley discuss the newcomer Todd Graham, who took over
Alistair Hubbard's business. Karen questions Graham's belonging in
their community, prompting Stanley to agree that Graham doesn't fit
the old-fashioned charm of Bellville. Stanley recalls a conversation
hinting at foul play regarding Hubbard's death and subtly asks Karen
about the situation.


In [28]:
def submit_prompt(system_prompt, user1, temperature):
    
    prompt = [{"role": "system", "content": system_prompt},
              {"role": "user", "content": user1}]

    p = list(prompt)
    # print(p)
    try:
        completion = completions_with_backoff(
            model = "gpt-3.5-turbo",
            messages = p,
            max_tokens = 768,
            temperature = temperature
        )
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        
    return completion

In [29]:
def submit_complex_prompt(system_prompt, user2, assistant1, user3, temperature):
    
    prompt = [{"role": "system", "content": system_prompt},
              {"role": "user", "content": user2},
              {"role": "assistant", "content": assistant1},
              {"role": "user", "content": user3}]

    p = list(prompt)
    # print(p)
    try:
        completion = completions_with_backoff(
            model = "gpt-3.5-turbo",
            messages = p,
            max_tokens = 1024,
            temperature = temperature
        )
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        
    return completion

## Actually generate synthetic data

In [48]:
# all_new_dfs = []
successes = 1
failures = 0

for meta_idx, meta in enumerate(meta_list[6: ]):
    print('Chunk of meta:', meta_idx + 6)
    ctr = 0

    # we save rows as a list of dictionaries, where keys are
    # paperId, year, anchor, positive, system_prompt, category, and temperature
    # system_prompt and temperature will be blank for rows that
    # do not require synthetic data

    new_rows = []

    wordcount = 0

    for idx, row in meta.iterrows():

        ctr += 1
        if ctr % 500 == 0:
            print(ctr, wordcount, round(successes / (successes + failures), 3))
        
        category = row['category']
        year = row['year']
        
        if category == 'successive':
            new_row = {'paperId': row['paperId'], 'year': year, 'anchor': row['anchor'], 
                    'positive': row['positive'], 'category': category, 'system_prompt': '', 'temperature': ''}
            new_rows.append(new_row)
            continue    # we don't need to generate synthetic data for these categories
        
        if row['anchor'] == 'generate':
            to_replace = 'anchor'
        elif row['positive'] == 'generate':
            to_replace = 'positive'
        elif category == 'synthetic-pair':
            to_replace = random.choices(['anchor', 'positive'], weights=[0.5, 0.5])[0]
        else:
            print('error')
            continue

        system_prompt = random.choices([system_prompt_A, system_prompt_B], weights=[0.5, 0.5])[0]
        
        if system_prompt == system_prompt_A:
            promptname = 'A'
        elif system_prompt == system_prompt_B:
            promptname = 'B'
        else: 
            promptname = 'C'    

        # The single-synthetic category is where we replace either the anchor or the positive
        # with a synthetic summary of the *other* cell, producing a pair where one element
        # is a direct paraphrase of the other.
        
        # The synthetic-pair category is where we replace one of the cells with a synthetic summary
        # of itself, producing a pair of passages adjacent in the original article, one of them
        # paraphrased.

        if category == 'single-synthetic':
            if to_replace == 'anchor':
                user1 = row['positive']   # the text to be paraphrased comes from the non-replaced cell
            else:
                user1 = row['anchor']
        elif category == 'synthetic-pair':
            if to_replace == 'anchor':
                user1 = row['anchor'] # the text to be paraphrased comes from the replaced cell
            else:
                user1 = row['positive']

        temperature = round(random.uniform(0.5, 0.7), 3)
        
        try:
            completion = submit_prompt(system_prompt, user1, temperature)
        except Exception as e:
            print(f"Error: {e}")
            continue

        summary = completion.choices[0].message.content

        if promptname == 'A':
            user1 = summary
            success = True
            try:
                completion = submit_complex_prompt(system_prompt_C, user_C, assistant_C, user1, temperature)
                renamed = completion.choices[0].message.content
            except Exception as e:
                print(f"Error: {e}")
                success = False
        
            if success and '3.' in renamed:
                summary = renamed.split('3.')[1].strip().replace('\n', ' ')
                switcheroo = renamed.split('3.')[0].replace('\n', ' ')
            else:
                success = False
            
            if success:
                successes += 1
                promptname = switcheroo
            else:
                failures += 1

        # print_wrapped_text(row['anchor'])
        # print('---')
        # print_wrapped_text(row['positive'])
        # print('---')
        # print_wrapped_text(summary)
        # print(to_replace, category, temperature)  
        # dummy = input("Press Enter to continue...")  

        if to_replace == 'anchor':
            new_row = {'paperId': row['paperId'], 'year': year, 'anchor': summary, 
                    'positive': row['positive'], 'system_prompt': promptname, 
                    'category': category, 'temperature': temperature}
        else:
            new_row = {'paperId': row['paperId'], 'year': year, 'anchor': row['anchor'], 
                    'positive': summary, 'system_prompt': promptname,
                    'category': category, 'temperature': temperature}
            
        new_rows.append(new_row)

        wordcount += len(summary.split())
        wordcount += len(user1.split())
        wordcount += len(system_prompt.split())

    columns_sequence = ['paperId', 'year', 'anchor', 'positive', 'category', 'system_prompt', 'temperature']
    new_df = pd.DataFrame(new_rows, columns=columns_sequence)
    outfile = f'fiction_pairs_{meta_idx + 6}.tsv'
    new_df.to_csv(outfile, sep='\t', index=False)
    all_new_dfs.append(new_df)
    print("Word count:", wordcount)

Chunk of meta: 6
500 15206 0.536
1000 30550 0.592
1500 45015 0.506
2000 64728 0.51
2500 82410 0.467
3000 98504 0.497
3500 113037 0.5
4000 127925 0.511
4500 141593 0.481
5000 158879 0.487
5500 177388 0.479
6000 195011 0.471
6500 209628 0.47
7000 225816 0.469
7500 240994 0.475
8000 254928 0.473
8500 270414 0.475
9000 284942 0.468
9500 296461 0.471
10000 311531 0.472
10500 326631 0.482
11000 345367 0.481
11500 360678 0.48
12000 375531 0.477
12500 390330 0.471
13000 406090 0.474
13500 420764 0.472
14000 437682 0.475
14500 451453 0.479
15000 473348 0.477
Word count: 473348
Chunk of meta: 7
500 14853 0.481
1000 30806 0.482
1500 50037 0.487
2000 65372 0.488
2500 78727 0.489
3000 92483 0.487
3500 113650 0.49
4000 130047 0.493
4500 143726 0.493
5000 160055 0.497
5500 172269 0.496
6000 185064 0.494
6500 198011 0.5
7000 211934 0.503
7500 234102 0.503
8000 251199 0.502
8500 266939 0.503
9000 282099 0.505
9500 297304 0.504
10000 315414 0.502
10500 331172 0.5
11000 346799 0.502
11500 363847 0.502
12

In [49]:
# Concatenate all data frames vertically
concatenated_df = pd.concat(all_new_dfs, axis=0)

concatenated_df.to_csv('synth_fiction_pairs150k.tsv', sep='\t', index=False)

In [50]:
concatenated_df.shape

(150000, 7)

In [15]:
concatenated_df.columns

Index(['paperId', 'year', 'anchor', 'positive', 'system_prompt',
       'temperature'],
      dtype='object')

In [51]:
import pandas as pd

# Count cells with tab or newline characters in 'anchor' column
anchor_count = concatenated_df['anchor'].str.contains('\t|\n').sum()

# Count cells with tab or newline characters in 'positive' column
positive_count = concatenated_df['positive'].str.contains('\t|\n').sum()

# Print the counts
print("Number of cells with tab or newline characters in 'anchor' column:", anchor_count)
print("Number of cells with tab or newline characters in 'positive' column:", positive_count)

Number of cells with tab or newline characters in 'anchor' column: 0
Number of cells with tab or newline characters in 'positive' column: 0


In [53]:
from collections import Counter
prompttype = Counter()

for idx, row in concatenated_df.iterrows():
    prompt = row['system_prompt']
    if pd.isnull(prompt) or prompt == '':
        prompttype['none'] += 1
    elif prompt == 'A':
        prompttype['A'] += 1
    elif prompt == 'B':
        prompttype['B'] += 1
    else:
        prompttype['C'] += 1

prompttype

    

Counter({'none': 135911, 'B': 7052, 'A': 5622, 'C': 1415})

In [36]:
older = pd.read_csv('old_final_pairs.tsv', sep='\t')
system_prompt_counts = older['system_prompt'].value_counts().reset_index()
system_prompt_counts.columns = ['System Prompt', 'Count']
system_prompt_counts

Unnamed: 0,System Prompt,Count
0,A,3861
1,C,953
2,B,745
