In [1]:
import cohere
import json
import io
import warnings

import pandas as pd
from IPython.display import display
from PIL import Image

from stability_sdk import client
import stability_sdk.interfaces.gooseai.generation.generation_pb2 as generation

import sys
if '..' not in sys.path:
    sys.path.append('..')
    
import os
import random
import numpy as np


from app.story_generator.segmentation import StoryDivider

In [2]:
def get_random_story():
    """Return one randomly chosen entry from the fairy-tale collection.

    The collection is read from ``../data/stories/fairy_tales.json``, resolved
    against the current working directory (``os.path.abspath('')``).

    Returns:
        A single element of the parsed JSON list, picked with ``random.choice``.
    """
    working_dir = os.path.abspath('')
    json_path = os.path.join(working_dir, os.pardir, "data", "stories", "fairy_tales.json")
    with open(json_path, "rb") as handle:
        return random.choice(json.load(handle))


def get_segmented_story(n_pages=None, sentences_per_page=3):
    """Pick a random story and split it into segments with ``StoryDivider``.

    Args:
        n_pages: forwarded unchanged to ``divide_story_into_segments``.
        sentences_per_page: forwarded unchanged to ``divide_story_into_segments``.

    Returns:
        Tuple ``(story_id, segments)`` where ``story_id`` is read from the
        divider and ``segments`` is whatever ``divide_story_into_segments``
        returned.
    """
    picked = get_random_story()
    divider = StoryDivider(picked["title"], picked["text"])
    pages = divider.divide_story_into_segments(
        n_pages=n_pages,
        sentences_per_page=sentences_per_page,
    )
    return divider.story_id, pages

# APIs

In [3]:
# Build the Cohere and Stability clients from keys kept in local text files
# one directory above the notebook.
# .strip() removes surrounding whitespace - most importantly the trailing
# newline editors append to text files - which would otherwise be sent to the
# service as part of the key.
with open('../cohere_api_key.txt', 'r') as f:
    cohere_api_key = f.read().strip()
co = cohere.Client(cohere_api_key)

with open('../stability_api_key.txt', 'r') as f:
    stability_api_key = f.read().strip()
stability_api = client.StabilityInference(
    key=stability_api_key,
    verbose=True,
)

# Remove the plain-text keys from the notebook namespace once the clients exist.
del cohere_api_key
del stability_api_key

# EDA

In [4]:
# with open('../data/stories/fairy_tales.json') as f:
#     data = json.load(f)
#     # text = data['text']
#     # print(type(text), len(text))
# data = data[0]
# title = data['title']
# text = data['text']
# text

In [5]:
# story_title_1, random_story_1 = get_segmented_story(sentences_per_page=5)
# random_story_2 = random_story_1

# while random_story_1[0] == random_story_2[0]:
#     story_title_2, random_story_2 = get_segmented_story(sentences_per_page=5)

# Generating stories

In [6]:
def generate(prompt, model="xlarge", num_generations=5, temperature=0.7,
             max_tokens=2000, stop_sequences=['<end>']):
    """Sample completions for ``prompt`` through the Cohere client and rank them.

    Args:
        prompt: text sent to ``co.generate``.
        model: model name forwarded to the client.
        num_generations: number of completions requested.
        temperature: sampling temperature forwarded to the client.
        max_tokens: generation length limit forwarded to the client.
        stop_sequences: stop sequences forwarded to the client.

    Returns:
        pandas.DataFrame with the columns ``generation`` (completion text) and
        ``likelihood`` (sum of the per-token likelihood values of that
        completion). Duplicate texts are dropped and rows are ordered from the
        highest to the lowest likelihood with a fresh 0..n-1 index.

    Side effects:
        Sets ``pd.options.display.max_colwidth`` to 200.
    """
    response = co.generate(
        prompt=prompt,
        model=model,
        num_generations=num_generations,
        temperature=temperature,
        max_tokens=max_tokens,
        stop_sequences=stop_sequences,
        return_likelihoods='GENERATION',
    )

    # Materialise once so texts and scores are guaranteed to line up.
    candidates = list(response.generations)
    texts = [candidate.text for candidate in candidates]
    scores = [
        sum(token.likelihood for token in candidate.token_likelihoods)
        for candidate in candidates
    ]

    pd.options.display.max_colwidth = 200
    ranked = (
        pd.DataFrame({'generation': texts, 'likelihood': scores})
        .drop_duplicates(subset=['generation'])
        .sort_values('likelihood', ascending=False, ignore_index=True)
    )
    return ranked

In [7]:
def generate_image(image_prompt):
    """Request image(s) for ``image_prompt`` from the Stability client and show them inline.

    Every artifact of type ``generation.ARTIFACT_IMAGE`` in the response is
    opened with PIL and passed to IPython's ``display``. A warning is issued
    for each artifact whose ``finish_reason`` equals ``generation.FILTER``.

    Args:
        image_prompt: prompt passed to ``stability_api.generate``.

    Returns:
        None.
    """
    # the object returned is a python generator
    answers = stability_api.generate(prompt=image_prompt)

    # iterating over the generator produces the api response
    for resp in answers:
        for artifact in resp.artifacts:
            if artifact.finish_reason == generation.FILTER:
                # The two literals are concatenated by the parser, so the
                # separating space has to be part of the first one.
                warnings.warn(
                    "Your request activated the API's safety filters and could not be processed. "
                    "Please modify the prompt and try again.")
            if artifact.type == generation.ARTIFACT_IMAGE:
                img = Image.open(io.BytesIO(artifact.binary))
                display(img)


In [8]:
def add_newline_at_the_end(text):
    """Return ``text`` with a newline appended unless it already ends with one.

    An empty string yields ``'\\n'``; ``str.endswith`` is used instead of
    ``text[-1]`` so that empty input does not raise ``IndexError``.
    """
    if not text.endswith('\n'):
        text += '\n'
    return text

# def generate_prompt_for_story_start(header: str,  examples: list, titles: list = None, story_title: str = None,
#                                     pre_example_string: str = 'Answer: ', stop_token: str ='<end>', max_tokens: int = 1000):
#     prompt = add_newline_at_the_end(header)
    
#     for i, ex in enumerate(examples):
#         if titles is not None:
#             prompt += 'Title: ' + add_newline_at_the_end(titles[i])
#         prompt += pre_example_string
#         prompt += add_newline_at_the_end(ex)
#         prompt += add_newline_at_the_end(stop_token)
        
#     if titles is not None:
#         prompt += 'Title: '
#         if story_title is not None:
#             prompt += add_newline_at_the_end(story_title)
#             prompt += pre_example_string
#     else:
#         prompt += pre_example_string
    
    
#     estimated_tokens_number = len(prompt.split(' ')) + len(ex.split(' '))
#     estimated_tokens_number *= 2 # let's assume word is on averate 2 tokens
#     assert estimated_tokens_number < max_tokens, f'Estimated number of tokens was {estimated_tokens_number} which is more than specified max number of tokens ({max_tokens})' 
    
#     return prompt

In [9]:
def get_n_stories(n=3):
    """Return ``n`` distinct, randomly chosen entries from the fairy-tale collection.

    The collection is read from ``../data/stories/fairy_tales.json``, resolved
    against the current working directory. Sampling is done without
    replacement via ``np.random.choice``.

    Args:
        n: number of stories to draw; must not exceed the collection size.

    Returns:
        list with ``n`` elements of the parsed JSON list.

    Raises:
        AssertionError: if ``n`` is larger than the number of stored stories.
    """
    json_path = os.path.join(os.path.abspath(''), os.pardir, "data", "stories", "fairy_tales.json")
    with open(json_path, "rb") as handle:
        all_stories = json.load(handle)

    assert n <= len(all_stories), f'Tried to get {n} stories while is only {len(all_stories)} stories in the database'
    sampled = np.random.choice(all_stories, size=n, replace=False)
    return list(sampled)

def get_segment_of_stories(stories: list, segment_idx: int, handle_too_big_index=True,
                           n_pages=None, sentences_per_page=3):
    """Extract the segment at position ``segment_idx`` from every story.

    Each story is split with ``StoryDivider.divide_story_into_segments`` and a
    single segment is kept.

    Args:
        stories: iterable of dicts with the keys ``"title"`` and ``"text"``.
        segment_idx: index of the segment to keep (negative values index from
            the end, as with any Python sequence).
        handle_too_big_index: when true and ``segment_idx`` is past the end of a
            story, fall back to that story's second-to-last segment (or its
            only segment when there is just one) instead of raising.
        n_pages: forwarded unchanged to ``divide_story_into_segments``.
        sentences_per_page: forwarded unchanged to ``divide_story_into_segments``.

    Returns:
        list of ``{'title': <divider.story_id>, 'text': <segment>}`` dicts, one
        per input story, in input order.

    Raises:
        IndexError: if the index is out of range and the fallback is disabled,
            or if a story produced no segments at all.
    """
    segmented_stories = []
    for s in stories:
        seg = StoryDivider(s["title"], s["text"])
        segmented = seg.divide_story_into_segments(n_pages=n_pages, sentences_per_page=sentences_per_page)

        index = segment_idx
        if handle_too_big_index and segment_idx >= len(segmented):
            # The fallback index used to be computed and then ignored; use it,
            # clamped at 0 so a one-segment story does not wrap around.
            index = max(len(segmented) - 2, 0)

        segmented_stories.append({'title': seg.story_id, 'text': segmented[index]})

    return segmented_stories

def generate_prompt_for_story_start(stories: list, keys_to_use: list, header: str, parameters: list = None,
                                    stop_token: str ='<end>', max_tokens: int = 1000):
    """Build a few-shot prompt from example stories plus a partially filled final entry.

    Layout of the result::

        <header>
        <blank line>
        <Key>: <value>        (one line per key, for every example story)
        <stop_token>
        <Key>: <parameter>    (final entry, one line per given parameter)
        <Last key>:           (left open for the model to complete)

    Keys are rendered with ``str.capitalize()``. If a parameter is empty or
    ``None`` the prompt ends right after that key's ``"<Key>: "`` prefix, so the
    model completes that field first.

    Args:
        stories: example dicts; each must contain every key in ``keys_to_use``.
        keys_to_use: field names in output order; the last one is the field the
            model should generate.
        header: instruction text placed at the top of the prompt.
        parameters: values for all keys except the last one. ``None`` is
            treated as "no value given for any key".
        stop_token: separator line written after every example.
        max_tokens: upper bound for the rough token estimate of the prompt.

    Returns:
        The assembled prompt string.

    Raises:
        AssertionError: if ``parameters`` has the wrong length or the estimated
            token count reaches ``max_tokens``.
    """
    n_leading_keys = len(keys_to_use) - 1
    if parameters is None:
        # The signature advertises None as valid; zip() below cannot take it.
        parameters = [None] * n_leading_keys
    assert len(parameters) == n_leading_keys
    # assert keys_to_use[-1] == 'text'

    prompt = add_newline_at_the_end(header) + '\n'

    # Measure the examples in words - the same unit as the prompt estimate
    # below - rather than in characters.
    total_story_words = 0
    for story in stories:
        total_story_words += len(story[keys_to_use[-1]].split(' '))
        for key in keys_to_use:
            prompt += f'{key.capitalize()}: {story[key]}'
            prompt = add_newline_at_the_end(prompt)
        prompt += add_newline_at_the_end(stop_token)
    # Guard against an empty example list (previously a ZeroDivisionError).
    avg_story_length = total_story_words // len(stories) if stories else 0

    # Explicit flag instead of inspecting the loop variable afterwards, which
    # is unbound when there are no leading keys.
    all_parameters_given = True
    for key, param in zip(keys_to_use[:-1], parameters):
        prompt += f'{key.capitalize()}: '
        if not param:
            all_parameters_given = False
            break
        prompt += add_newline_at_the_end(param)

    if all_parameters_given:
        prompt += f'{keys_to_use[-1].capitalize()}: '

    # Prompt words plus the expected length of the answer (average example).
    estimated_tokens_number = len(prompt.split(' ')) + avg_story_length
    estimated_tokens_number *= 2 # let's assume a word is on average 2 tokens
    assert estimated_tokens_number < max_tokens, f'Estimated number of tokens was {estimated_tokens_number} which is more than specified max number of tokens ({max_tokens})'

    return prompt

def print_result(result, parameters, keys_used):
    """Print every generation in ``result`` behind the prompt tail it completed.

    The tail mirrors the final entry of the prompt: one ``"<Key>: <parameter>"``
    line per given parameter followed by ``"<Last key>:"``. If a parameter is
    empty, the tail stops right after that key's ``"<Key>: "`` prefix.

    Args:
        result: DataFrame with the columns ``generation`` and ``likelihood``.
        parameters: values that were supplied for all keys but the last one.
        keys_used: the key names used to build the prompt.
    """
    prefix = ''
    for param, key in zip(parameters, keys_used[:-1]):
        prefix += f'{key.capitalize()}: '
        if not param:
            break
        prefix = add_newline_at_the_end(prefix + param)
    else:
        # Reached only when no parameter was missing.
        prefix += f'{keys_used[-1].capitalize()}:'

    separator = '-' * 50
    for i, row in result.iterrows():
        print(separator)
        print('likelihood:', row['likelihood'])
        print(prefix + row["generation"])

In [10]:
# Settings shared by the generate(...) calls in the cells below.
MAX_TOKENS = 1000  # passed to generate() as max_tokens
STOP_SEQUENCES = ['<end>']  # passed as stop_sequences; the first entry is also the example separator in the prompts
TEMPERATURE = 0.7  # passed to generate() as temperature
MODEL = 'xlarge'  # passed to generate() as model

<h2>Generating beginning of the story</h2>

In [11]:
# Few-shot examples: the opening segment (index 0) of three randomly drawn stories.
example_stories = get_n_stories(n=3)
example_beginnings = get_segment_of_stories(stories=example_stories, segment_idx=0)

# Prompt layout: every example is rendered as "Title: ..." / "Text: ...";
# the new story supplies only its title, leaving "Text: " for the model.
header = 'Exercise: Generate the beginning of the story for children based on a information given.'
keys_to_use = ['title', 'text']
parameters = ['Two Little Pigs That Tried To Learn Programming']

prompt = generate_prompt_for_story_start(
    stories=example_beginnings,
    keys_to_use=keys_to_use,
    header=header,
    parameters=parameters,
    stop_token=STOP_SEQUENCES[0],
)
print(prompt)

Exercise: Generate the beginning of the story for children based on a information given.

Title: Hansel and Gretel
Text: Once upon a time, a brother and sister named Hansel and Gretel lived in a hut in the woods with their father who was a poor woodcutter and their mother. Their parents were very poor and had barely enough food to eat One day, their parents sent them off into the woods in search of greener pastures. Their mother cried as she sent them off but they could not take care of them any longer
<end>
Title: Red Riding Hood
Text: Once upon a time, there was a little girl who lived with her mother very close to the woods. The little girl was called Little Red Riding Hood because of the red hat her grandmother gifted her. She liked this hat very much and wore it everywhere
<end>
Title: The Three Little Pigs
Text: Once upon a time, there was a farmer with three little pigs. He did not have enough food to take care of his pigs so he sent them away to take care of themselves. The fir

In [12]:
# len(prompt)

In [13]:
# examples = [random_story_1[0], random_story_2[0]]
# titles = [story_title_1, story_title_2]
# story_title = 'The Dragon and prince penguin' # should be None if there is 
# MAX_TOKENS = 1000

In [14]:
# prompt = generate_prompt_for_story_start(header, examples, titles=titles, story_title=story_title)

In [15]:
# Sample candidate story beginnings for the prompt built above.
beg_result = generate(prompt, model=MODEL, temperature=TEMPERATURE,
                      max_tokens=MAX_TOKENS, stop_sequences=STOP_SEQUENCES)

[0m


In [16]:
# Print each candidate beginning behind the title line that was fed into the prompt.
print_result(beg_result, parameters, keys_to_use)

--------------------------------------------------
likelihood: -15.659687083299996
Title: Two Little Pigs That Tried To Learn Programming
Text: Once upon a time, there was a programmer who had two little pigs who wanted to learn programming
<end>
--------------------------------------------------
likelihood: -35.930106164600005
Title: Two Little Pigs That Tried To Learn Programming
Text: Once upon a time, there were two little pigs who decided to go to school to learn programming. The first little pig learned Java and C++ and the second little pig learned Python and HTML
<end>
--------------------------------------------------
likelihood: -57.76527549570001
Title: Two Little Pigs That Tried To Learn Programming
Text: Once upon a time, there was a father who had two little pigs. He was a software developer and he wanted his two little pigs to be like him when they grew up. He bought two laptops for them and taught them how to use the keyboard. He also taught them how to type words
<end>

<h2>Generating continuation - strategy 1: Structured prompt</h2>

In [17]:
# One-shot example: the second segment (index 1) of the first sampled story,
# paired with that story's beginning as the "Previous Part".
example_continuations = get_segment_of_stories(example_stories, 1)[:1]
for i, cont in enumerate(example_continuations):
    cont['Previous Part'] = example_beginnings[i]['text']
    # Move the segment from 'text' to 'Continuation'.
    cont['Continuation'] = cont.pop('text')

header = 'Exercise: Generate the continuation of the story for children based on the title and previous part given.'
keys_to_use = ['title', 'Previous Part', 'Continuation']
# The new story contributes its title and the top-ranked generated beginning
# (stop token removed); the continuation is left for the model.
parameters = [
    'Two Little Pigs That Tried To Learn Programming',
    beg_result.iloc[0]['generation'].replace(STOP_SEQUENCES[0], ''),
]

prompt = generate_prompt_for_story_start(
    stories=example_continuations,
    keys_to_use=keys_to_use,
    header=header,
    parameters=parameters,
)
print(prompt)

Exercise: Generate the continuation of the story for children based on the title and previous part given.

Title: Hansel and Gretel
Previous part: Once upon a time, a brother and sister named Hansel and Gretel lived in a hut in the woods with their father who was a poor woodcutter and their mother. Their parents were very poor and had barely enough food to eat One day, their parents sent them off into the woods in search of greener pastures. Their mother cried as she sent them off but they could not take care of them any longer
Continuation:  Hansel and Gretel took a few pebbles and some bread crumbs with them. That night the two children had nowhere to sleep. They wandered in the forest for days, looking for food and a place to stay
<end>
Title: Two Little Pigs That Tried To Learn Programming
Previous part:  Once upon a time, there was a programmer who had two little pigs who wanted to learn programming
Continuation: 


In [18]:
# Sample candidate continuations for the structured prompt.
cont_result = generate(prompt, model=MODEL, temperature=TEMPERATURE,
                       max_tokens=MAX_TOKENS, stop_sequences=STOP_SEQUENCES)

[0m


In [19]:
# Print each candidate continuation behind the title and previous part used in the prompt.
print_result(cont_result, parameters, keys_to_use)

--------------------------------------------------
likelihood: -30.182892406
Title: Two Little Pigs That Tried To Learn Programming
Previous part:  Once upon a time, there was a programmer who had two little pigs who wanted to learn programming
Continuation:  The two little pigs had no experience in programming. One day, they went to the programmer and told him that they wanted to learn programming.
<end>
--------------------------------------------------
likelihood: -35.941682281000006
Title: Two Little Pigs That Tried To Learn Programming
Previous part:  Once upon a time, there was a programmer who had two little pigs who wanted to learn programming
Continuation:  The two little pigs went to the programmer and asked him to teach them programming. The programmer agreed and gave them some books to read. The two little pigs went to their room and started reading the books
<end>
--------------------------------------------------
likelihood: -56.594793010000004
Title: Two Little Pigs That

<h2>Generating continuation - strategy 2: No examples</h2>

In [20]:
# prompt
# Prompt without examples: a title line followed directly by the top-ranked
# generated beginning (stop token removed).
prompt = ''.join([
    f'Story title: {parameters[0]}\n',
    beg_result.iloc[0]['generation'].replace(STOP_SEQUENCES[0], ''),
])
print(prompt)

Story title: Two Little Pigs That Tried To Learn Programming
 Once upon a time, there was a programmer who had two little pigs who wanted to learn programming



In [21]:

# Sample continuations for the example-free prompt.
cont_result_2 = generate(prompt, model=MODEL, temperature=TEMPERATURE,
                         max_tokens=MAX_TOKENS, stop_sequences=STOP_SEQUENCES)

[0m


In [22]:
# Show every example-free continuation appended to the prompt it was generated from.
for i, row in cont_result_2.iterrows():
    print('-'*50)
    print(prompt + row['generation'])

--------------------------------------------------
Story title: Two Little Pigs That Tried To Learn Programming
 Once upon a time, there was a programmer who had two little pigs who wanted to learn programming
. So the programmer sent the pigs to school to learn how to program. But after three years, they were not learning very well. They were learning how to program in COBOL.

The pigs said, "Please send us to a school that teaches us how to program in a language that is more modern." So the programmer sent the pigs to a new school.

This new school taught them how to program in C++. After three years, the pigs were not learning how to program in C++ very well. They were not learning very quickly.

The pigs said, "Please send us to a school that teaches us how to program in a language that is more modern." So the programmer sent the pigs to a new school.

This new school taught them how to program in Java. After three years, the pigs were not learning how to program in Java very well.

<strong>Seems like the first approach is better so let's go with that</strong>

<h2>Generating ending</h2>

In [44]:

# NOTE: the example pairs the first two fragments of a story with its last fragment.
# Using the last three fragments, or a summary of the story so far plus the
# last fragment, would probably work better.
example_endings = get_segment_of_stories(example_stories, -1)[:1]
for i, ending in enumerate(example_endings):
    ending['Previous Part'] = example_continuations[0]['Previous Part'] + example_continuations[0]['Continuation']
    # Move the final segment from 'text' to 'Story Ending'.
    ending['Story Ending'] = ending.pop('text')
    print(ending)

header = 'Exercise: Finish the story for children based on the title and previous part given.'
keys_to_use = ['title', 'Previous Part', 'Story Ending']

# NOTE: the whole story generated so far is placed in the prompt. That may be
# impossible in the application (the original note stops mid-sentence here;
# presumably because the prompt keeps growing with the story - TODO confirm).
generated_so_far = ''.join(
    result.iloc[0]['generation'].replace(STOP_SEQUENCES[0], '')
    for result in (beg_result, cont_result)
)
parameters = ['Two Little Pigs That Tried To Learn Programming', generated_so_far]

prompt = generate_prompt_for_story_start(
    stories=example_endings,
    keys_to_use=keys_to_use,
    header=header,
    parameters=parameters,
)
print(prompt)

{'title': 'Hansel and Gretel', 'Previous Part': 'Once upon a time, a brother and sister named Hansel and Gretel lived in a hut in the woods with their father who was a poor woodcutter and their mother. Their parents were very poor and had barely enough food to eat One day, their parents sent them off into the woods in search of greener pastures. Their mother cried as she sent them off but they could not take care of them any longer Hansel and Gretel took a few pebbles and some bread crumbs with them. That night the two children had nowhere to sleep. They wandered in the forest for days, looking for food and a place to stay', 'Story Ending': ' They were too scared to stop for the witch may catch up to them. Finally, they managed to find their way back home and gave jewels to their parents. Thanks to the clever children! The family was never poor and hungry again'}
Exercise: Finish the story for children based on the title and previous part given.

Title: Hansel and Gretel
Previous part:

In [45]:
# Sample candidate endings for the prompt built above.
end_result = generate(prompt, model=MODEL, temperature=TEMPERATURE,
                      max_tokens=MAX_TOKENS, stop_sequences=STOP_SEQUENCES)

[0m


In [46]:
# Print each candidate ending behind the title and the story generated so far.
print_result(end_result, parameters, keys_to_use)

--------------------------------------------------
likelihood: -36.45579794900001
Title: Two Little Pigs That Tried To Learn Programming
Previous part:  Once upon a time, there was a programmer who had two little pigs who wanted to learn programming
  The two little pigs had no experience in programming. One day, they went to the programmer and told him that they wanted to learn programming.
Story ending:  The two little pigs were good at programming and eventually became programmers themselves. The programmer was very proud of them and told everyone how smart they were.
<end>
--------------------------------------------------
likelihood: -44.714665995000004
Title: Two Little Pigs That Tried To Learn Programming
Previous part:  Once upon a time, there was a programmer who had two little pigs who wanted to learn programming
  The two little pigs had no experience in programming. One day, they went to the programmer and told him that they wanted to learn programming.
Story ending:  The p

<h1>Old</h1>

In [23]:
# print(result.info())

In [24]:
# result.iloc[0]['generation']

In [25]:
# print(result['generation'][0])

In [26]:
# test_string = """
# Once upon a time, there was a farmer with three little pigs. He did not have enough food to take care of his pigs so he sent them away to take care of themselves. The first little pig was walking on the road. Suddenly he saw a man with some straws. He could  build a house with the straws, he said to himself and asked the man to give him his straws. The man was kind so he gave the first little pig his straws. The pig used the straws to build a straw house and danced around. Suddenly, a big bad wo
# """

In [27]:
# len(test_string)