In [1]:
import cohere
import json
import io
import warnings

import pandas as pd
from IPython.display import display
from PIL import Image

from stability_sdk import client
import stability_sdk.interfaces.gooseai.generation.generation_pb2 as generation

import sys
if '..' not in sys.path:
    sys.path.append('..')
    
import os
import random
import numpy as np


from app.story_generator.segmentation import StoryDivider

In [2]:
def get_random_story():
    """Pick one story at random from the fairy-tale collection.

    The JSON file is looked up at ``../data/stories/fairy_tales.json``
    relative to the current working directory.

    Returns:
        One element of the parsed JSON document, chosen with ``random.choice``.
    """
    path_parts = (os.path.abspath(''), os.pardir, "data", "stories", "fairy_tales.json")
    with open(os.path.join(*path_parts), "rb") as story_file:
        all_stories = json.load(story_file)
    return random.choice(all_stories)


def get_segmented_story(n_pages=None, sentences_per_page=3):
    """Draw a random story and split it into segments.

    Args:
        n_pages: Forwarded to ``StoryDivider.divide_story_into_segments``.
        sentences_per_page: Forwarded to the same call.

    Returns:
        A ``(story_id, segments)`` tuple: ``story_id`` is read from the
        divider, ``segments`` is whatever the divider returned.
    """
    chosen = get_random_story()
    divider = StoryDivider(chosen["title"], chosen["text"])
    pages = divider.divide_story_into_segments(
        n_pages=n_pages,
        sentences_per_page=sentences_per_page,
    )
    return divider.story_id, pages

# APIs

In [3]:
# Read the API keys from text files one directory above the notebook.
# Surrounding whitespace is stripped: a key file saved by an editor usually ends
# with a newline, and that newline would otherwise become part of the key.
with open('../cohere_api_key.txt', 'r') as f:
    cohere_api_key = f.read().strip()
co = cohere.Client(cohere_api_key)

with open('../stability_api_key.txt', 'r') as f:
    stability_api_key = f.read().strip()
stability_api = client.StabilityInference(
    key=stability_api_key,
    verbose=True,
)

# Remove the plain-text keys from the kernel namespace once the clients are built.
del cohere_api_key
del stability_api_key

# EDA

In [4]:
# with open('../data/stories/fairy_tales.json') as f:
#     data = json.load(f)
#     # text = data['text']
#     # print(type(text), len(text))
# data = data[0]
# title = data['title']
# text = data['text']
# text

In [5]:
# story_title_1, random_story_1 = get_segmented_story(sentences_per_page=5)
# random_story_2 = random_story_1

# while random_story_1[0] == random_story_2[0]:
#     story_title_2, random_story_2 = get_segmented_story(sentences_per_page=5)

# Generating stories

In [6]:
def generate(prompt, model="xlarge",
             num_generations=5, temperature=0.7,
             max_tokens=2000, stop_sequences=['--'], **kwargs):
    """Sample completions for ``prompt`` and rank them by summed token likelihood.

    Args:
        prompt: Text sent to the model.
        model, num_generations, temperature, max_tokens, stop_sequences:
            Forwarded unchanged to ``co.generate``.
        **kwargs: Any additional keyword arguments for ``co.generate``.

    Returns:
        A DataFrame with columns ``generation`` and ``likelihood`` (sum of the
        per-token likelihoods), de-duplicated on ``generation`` and sorted with
        the highest likelihood first; the index is reset.

    Side effects:
        Sets ``pd.options.display.max_colwidth`` to 200.
    """
    response = co.generate(
        model=model,
        prompt=prompt,
        return_likelihoods='GENERATION',
        stop_sequences=stop_sequences,
        max_tokens=max_tokens,
        temperature=temperature,
        num_generations=num_generations,
        **kwargs)

    candidates = list(response.generations)
    texts = [candidate.text for candidate in candidates]
    # One score per candidate: the sum of its token likelihoods.
    scores = [
        sum(token.likelihood for token in candidate.token_likelihoods)
        for candidate in candidates
    ]

    pd.options.display.max_colwidth = 200
    return (
        pd.DataFrame({'generation': texts, 'likelihood': scores})
        .drop_duplicates(subset=['generation'])
        .sort_values('likelihood', ascending=False, ignore_index=True)
    )

In [7]:
def generate_image(image_prompt):
    """Request an image for ``image_prompt`` and display every image returned.

    Args:
        image_prompt: Text prompt passed to ``stability_api.generate``.

    Returns:
        None. Images are shown with ``display``; a ``UserWarning`` is issued
        for every artifact whose finish reason is ``generation.FILTER``.
    """
    # The object returned is a Python generator.
    answers = stability_api.generate(
        prompt=image_prompt
    )

    # Iterating over the generator produces the API responses.
    for resp in answers:
        for artifact in resp.artifacts:
            if artifact.finish_reason == generation.FILTER:
                # Adjacent string literals are concatenated verbatim, so the
                # separating space has to be written explicitly.
                warnings.warn(
                    "Your request activated the API's safety filters and could not be processed. "
                    "Please modify the prompt and try again.")
            if artifact.type == generation.ARTIFACT_IMAGE:
                img = Image.open(io.BytesIO(artifact.binary))
                display(img)


In [8]:
# prompt = """"Illustration of open blank book for children icon, stock image
# """

# generate_image(prompt)

In [9]:
def add_newline_at_the_end(text):
    """Return ``text`` with a newline appended unless it already ends with one.

    An empty string yields ``'\\n'``; ``str.endswith`` is used instead of
    indexing the last character so that empty input does not raise IndexError.
    """
    if not text.endswith('\n'):
        text += '\n'
    return text

# def generate_prompt_for_story(header: str,  examples: list, titles: list = None, story_title: str = None,
#                                     pre_example_string: str = 'Answer: ', stop_token: str ='<end>', max_tokens: int = 1000):
#     prompt = add_newline_at_the_end(header)
    
#     for i, ex in enumerate(examples):
#         if titles is not None:
#             prompt += 'Title: ' + add_newline_at_the_end(titles[i])
#         prompt += pre_example_string
#         prompt += add_newline_at_the_end(ex)
#         prompt += add_newline_at_the_end(stop_token)
        
#     if titles is not None:
#         prompt += 'Title: '
#         if story_title is not None:
#             prompt += add_newline_at_the_end(story_title)
#             prompt += pre_example_string
#     else:
#         prompt += pre_example_string
    
    
#     estimated_tokens_number = len(prompt.split(' ')) + len(ex.split(' '))
#     estimated_tokens_number *= 2 # let's assume word is on averate 2 tokens
#     assert estimated_tokens_number < max_tokens, f'Estimated number of tokens was {estimated_tokens_number} which is more than specified max number of tokens ({max_tokens})' 
    
#     return prompt

In [10]:
def get_n_stories(n=3):
    """Sample ``n`` distinct stories from the fairy-tale collection.

    The collection is read from ``../data/stories/fairy_tales.json`` relative
    to the current working directory.

    Args:
        n: Number of stories to draw without replacement.

    Returns:
        A list with ``n`` elements of the parsed JSON document.

    Raises:
        AssertionError: If ``n`` exceeds the number of stories in the file.
    """
    database_file = os.path.join(
        os.path.abspath(''), os.pardir, "data", "stories", "fairy_tales.json")
    with open(database_file, "rb") as handle:
        available = json.load(handle)

    assert n <= len(available), f'Tried to get {n} stories while is only {len(available)} stories in the database'
    picked = np.random.choice(available, size=n, replace=False)
    return list(picked)

def get_segment_of_stories(stories: list, segment_idx: int, handle_too_big_index=True,
                           n_pages=None, sentences_per_page=3):
    """Extract one segment from each story.

    Args:
        stories: Dicts with ``"title"`` and ``"text"`` entries.
        segment_idx: Index of the segment to take; negative values index from
            the end, as with any Python sequence.
        handle_too_big_index: When true, a ``segment_idx`` past the end of a
            story falls back to an existing segment instead of raising.
        n_pages: Forwarded to ``StoryDivider.divide_story_into_segments``.
        sentences_per_page: Forwarded to the same call.

    Returns:
        A list of ``{'title': <story_id>, 'text': <segment>}`` dicts, one per
        input story and in the same order.

    Raises:
        IndexError: If the index is out of range and no fallback applies
            (fallback disabled, or the story produced no segments).
    """
    segmented_stories = []
    for story in stories:
        divider = StoryDivider(story["title"], story["text"])
        segments = divider.divide_story_into_segments(
            n_pages=n_pages, sentences_per_page=sentences_per_page)

        index = segment_idx
        if handle_too_big_index and index >= len(segments):
            # Fall back to the second-to-last segment (the value the original
            # code computed but never applied), clamped so that a one-segment
            # story uses its only segment.
            # NOTE(review): presumably "-2" avoids using the closing segment
            # as a non-ending example; confirm the intent.
            index = max(len(segments) - 2, 0)

        segmented_stories.append({'title': divider.story_id, 'text': segments[index]})

    return segmented_stories

def generate_prompt_for_story(stories: list, keys_to_use: list, header: str, parameters: list = None,
                              stop_token: str ='--', max_tokens: int = 2048):
    """Build a few-shot prompt from example stories plus a partly filled new entry.

    Every example contributes one ``"<Key>: <value>"`` line per key in
    ``keys_to_use`` followed by a ``stop_token`` line. The new entry is then
    started from ``parameters``, which are matched positionally against
    ``keys_to_use[:-1]``:

    * a truthy parameter is written after its key label;
    * the first falsy parameter leaves its label open and ends the prompt, so
      the model continues from that field;
    * if no parameter was falsy (including when ``parameters`` is empty), the
      label of the last key is appended for the model to complete. This
      mirrors how ``print_result`` rebuilds the same prefix.

    Args:
        stories: Example dicts containing every key in ``keys_to_use``.
        keys_to_use: Field names in prompt order; the last one is the field to
            be generated.
        header: Optional instruction placed first, followed by a blank line.
        parameters: Values for the leading keys of the new entry.
        stop_token: Separator line written after each example.
        max_tokens: Upper bound for the rough token estimate.

    Returns:
        The prompt string.

    Raises:
        AssertionError: If there are more parameters than leading keys, or the
            estimated token count reaches ``max_tokens``.
    """
    if parameters is None:
        parameters = []

    assert len(parameters) <= len(keys_to_use) - 1

    target_key = keys_to_use[-1]
    prompt = (add_newline_at_the_end(header) + '\n') if header else ''

    # Track the size of the example answers in words, the same unit used for
    # the prompt below (the estimate previously mixed characters with words).
    total_target_words = 0
    for story in stories:
        total_target_words += len(str(story[target_key]).split(' '))
        for key in keys_to_use:
            prompt += f'{key.title()}: {story[key]}'
            prompt = add_newline_at_the_end(prompt)
        prompt += add_newline_at_the_end(stop_token)
    # Guard against a zero-shot call with no examples.
    avg_target_words = total_target_words // len(stories) if stories else 0

    # An explicit flag replaces reading the loop variable after the loop, which
    # raised UnboundLocalError when ``parameters`` was empty.
    stopped_early = False
    for key, param in zip(keys_to_use[:-1], parameters):
        prompt += f'{key.title()}: '
        if not param:
            stopped_early = True
            break
        prompt += add_newline_at_the_end(param)

    if not stopped_early:
        prompt += f'{target_key.title()}: '

    # Prompt words plus the expected answer length, in words.
    estimated_tokens_number = len(prompt.split(' ')) + avg_target_words
    estimated_tokens_number *= 2  # let's assume a word is on average 2 tokens
    assert estimated_tokens_number < max_tokens, f'Estimated number of tokens was {estimated_tokens_number} which is more than specified max number of tokens ({max_tokens})'

    return prompt

def print_result(result, parameters, keys_used, max_words=1500):
    """Print every generation together with the prompt fields that preceded it.

    The prefix is rebuilt from ``parameters`` and ``keys_used``: each truthy
    parameter is written after its key label; the first falsy parameter leaves
    its label open and ends the prefix; if none was falsy, the label of the
    last key is appended.

    Args:
        result: DataFrame with ``generation`` and ``likelihood`` columns.
        parameters: Values that were supplied for the leading keys.
        keys_used: Field names in prompt order; the last one is the generated field.
        max_words: Texts with more space-separated words than this are
            replaced by the line ``Text too long``.
    """
    prefix = ''
    for value, label in zip(parameters, keys_used[:-1]):
        prefix += f'{label.title()}: '
        if not value:
            break
        prefix = add_newline_at_the_end(prefix + value)
    else:
        # Reached only when the loop finished without hitting a falsy value.
        prefix += f'{keys_used[-1].title()}:'

    separator = '-' * 50
    for _, row in result.iterrows():
        print(separator)
        print('likelihood:', row['likelihood'])

        full_text = prefix + row["generation"]
        if len(full_text.split(' ')) <= max_words:
            print(full_text)
        else:
            print('Text too long')

In [11]:
# Shared sampling settings passed to the generate() calls below.
MAX_TOKENS = 300  # passed as max_tokens
STOP_SEQUENCES = ['--']  # its first entry is also used as the separator between prompt examples
TEMPERATURE = 0.5
MODEL = 'xlarge'
# Passed as `p=` and forwarded to co.generate through **kwargs.
# NOTE(review): presumably the nucleus (top-p) threshold despite the "MIN" in the name - confirm against the Cohere API.
MIN_P = 0.8
FREQ_PENALTY = 0.3  # passed as frequency_penalty
PRESENCE_PENALTY = 0.5  # passed as presence_penalty

<h2>Generating a summary of the story</h2>

In [36]:
# Draw the few-shot examples; later cells reuse the same stories.
example_stories = get_n_stories(3)

# NOTE(review): every example story must provide a 'summary' entry - confirm
# that the data file contains one.
keys_to_use = ['title', 'summary']
header = 'Exercise: Write a short summary of a story for children based on title given.'
title = 'the adventures of miss koala and mister penguin'.title()
# The title is the one field supplied for the new entry, so the prompt ends with
# "Title: <title>" followed by an open "Summary: " label. It was previously
# built but never passed on (parameters was an empty list).
parameters = [title]

prompt = generate_prompt_for_story(example_stories, keys_to_use, header, parameters, stop_token=STOP_SEQUENCES[0])
print(prompt)

AssertionError: 

In [13]:
# Sample summary candidates for the current prompt with the shared settings.
# generate() returns them as a DataFrame sorted by summed token likelihood.
summary_result = generate(
    prompt,
    max_tokens=MAX_TOKENS,
    stop_sequences=STOP_SEQUENCES,
    temperature=TEMPERATURE,
    model=MODEL, 
    p=MIN_P,
    frequency_penalty=FREQ_PENALTY,
    presence_penalty=PRESENCE_PENALTY
)

[0m


In [14]:
# Show each summary candidate after the prompt fields it continues; texts over 500 words are skipped.
print_result(summary_result, parameters, keys_to_use, max_words=500)

--------------------------------------------------
likelihood: -91.52001648955002
Title: The Adventures Of Miss Koala And Mister Penguin
Summary: Mr Penguin went on an adventure with Miss Koala as they walked together through desert, jungle, sea, forest, and mountain before returning home where they snuggled up together in bed with their books!
--
--------------------------------------------------
likelihood: -102.35307267940996
Title: The Adventures Of Miss Koala And Mister Penguin
Summary: Miss Koala goes on vacation with her friend Mister Penguin when she meets some other penguins who want her to become their leader. She agrees, but soon realizes that being leader means being bossy all day long! She goes back home where Mister Penguin is waiting for her with flowers and candy!
--
--------------------------------------------------
likelihood: -118.16484076289996
Title: The Adventures Of Miss Koala And Mister Penguin
Summary: Mister Penguin travels around Europe from England to Italy,

<h2>Generating the beginning of the story</h2>

In [15]:
# Use the first segment (index 0) of every example story as a sample beginning.
example_beginnings = get_segment_of_stories(example_stories, 0)

keys_to_use = ['title', 'text']
# NOTE(review): "based on a information given" is a grammar slip; it is left as-is
# here because this text is sent to the model and changing it alters the prompt.
header = 'Exercise: Generate the beginning of the story for children based on a information given.'
title = 'the fearsome princess and a cowardly dragon'.title()
# Only the title is supplied; the 'text' field is left for the model to write.
parameters = [title]

prompt = generate_prompt_for_story(example_beginnings, keys_to_use, header, parameters, stop_token=STOP_SEQUENCES[0])
print(prompt)

Exercise: Generate the beginning of the story for children based on a information given.

Title: The Ugly Duckling
Text: Once upon a time, there was a duck. She lived in the forest. One day she laid some eggs
--
Title: The Three Little Pigs
Text: Once upon a time, there was a farmer with three little pigs. He did not have enough food to take care of his pigs so he sent them away to take care of themselves. The first little pig was walking on the road
--
Title: The Emperors' New Suit
Text: Many years ago, there lived an emperor who cared too much about his looks. He liked to wear the most fancy suits and parade around his kingdom. He had a new coat for evry hour of the day and often liked to show off his outfits
--
Title: The Fearsome Princess And A Cowardly Dragon
Text: 


In [16]:
# len(prompt)

In [17]:
# examples = [random_story_1[0], random_story_2[0]]
# titles = [story_title_1, story_title_2]
# story_title = 'The Dragon and prince penguin' # should be None if there is 
# MAX_TOKENS = 1000

In [18]:
# prompt = generate_prompt_for_story(header, examples, titles=titles, story_title=story_title)

In [19]:
# Sample candidate story beginnings for the current prompt with the shared settings.
# Later cells use the top-ranked row (iloc[0]) as the start of the story.
beg_result = generate(
    prompt,
    max_tokens=MAX_TOKENS,
    stop_sequences=STOP_SEQUENCES,
    temperature=TEMPERATURE,
    model=MODEL, 
    p=MIN_P,
    frequency_penalty=FREQ_PENALTY,
    presence_penalty=PRESENCE_PENALTY
)

[0m


In [20]:
# beg_result.iloc[0]['generation']

In [21]:
# Show each candidate beginning after the title line; texts over 500 words are skipped.
print_result(beg_result, parameters, keys_to_use, max_words=500)

--------------------------------------------------
likelihood: -76.6981995286
Title: The Fearsome Princess And A Cowardly Dragon
Text: Once upon a time, there was a princess who was very beautiful and kindhearted. She had many suitors but she didn't like any of them because they were all unkind and selfish. One day while taking her walk through the forest she came across a dragon who was sleeping under one of the trees near by...
--
--------------------------------------------------
likelihood: -95.30794127800003
Title: The Fearsome Princess And A Cowardly Dragon
Text: There was once a princess in a far-away land who feared nothing. She had been raised by her father, the king, who had taught her that fear could not be controlled by anyone, not even him. "It is something that must come from within," he said. "Only then can you truly conquer your fears."
--
--------------------------------------------------
likelihood: -127.49046446100003
Title: The Fearsome Princess And A Cowardly Drago

<h2>Generating continuation - strategy 1: Structured prompt</h2>

In [22]:
# Take the second segment (index 1) of each example story and keep a single
# example; each example is then reshaped into a "previous part -> continuation" pair.
example_continuations = get_segment_of_stories(example_stories, 1)
example_continuations = example_continuations[:1]
for i, cont in enumerate(example_continuations):
    # example_beginnings was built from the same stories in the same order,
    # so index i refers to the same story.
    cont['Previous Part'] = example_beginnings[i]['text']
    cont['Continuation'] = cont['text']
    del cont['text']

keys_to_use = ['title', 'Previous Part', 'Continuation']
header = 'Exercise: Generate the continuation of the story for children based on the title and previous part given.'
# The best generated beginning becomes the "Previous Part" of the new entry.
# It is stripped so the field has the same single-line shape as the example
# (the raw generation carries a leading space and a trailing newline).
parameters = [title, beg_result.iloc[0]['generation'].replace(STOP_SEQUENCES[0], '').strip()]


# Pass the separator explicitly so the prompt stays in sync with the stop
# sequence given to generate(), as the other prompt cells do.
prompt = generate_prompt_for_story(example_continuations, keys_to_use, header, parameters, stop_token=STOP_SEQUENCES[0])
print(prompt)

Exercise: Generate the continuation of the story for children based on the title and previous part given.

Title: The Ugly Duckling
Previous Part: Once upon a time, there was a duck. She lived in the forest. One day she laid some eggs
Continuation:  There was one very big egg among the eggs she laid. After warming the eggs carefully, the duck waited for the eggs to hatch. As she watched, three of her eggs cracked and three beautiful ducklings hatched
--
Title: The Fearsome Princess And A Cowardly Dragon
Previous Part:  Once upon a time, there was a princess who was very beautiful and kindhearted. She had many suitors but she didn't like any of them because they were all unkind and selfish. One day while taking her walk through the forest she came across a dragon who was sleeping under one of the trees near by...
Continuation: 


In [23]:
# Sample candidate continuations for the structured (few-shot) prompt.
cont_result = generate(
    prompt,
    max_tokens=MAX_TOKENS,
    stop_sequences=STOP_SEQUENCES,
    temperature=TEMPERATURE,
    model=MODEL,
    p=MIN_P,
    frequency_penalty=FREQ_PENALTY,
    presence_penalty=PRESENCE_PENALTY
)

[0m


In [24]:
# Show each candidate continuation after the title and previous part; texts over 500 words are skipped.
print_result(cont_result, parameters, keys_to_use, max_words=500)

--------------------------------------------------
likelihood: -64.97160166
Title: The Fearsome Princess And A Cowardly Dragon
Previous Part:  Once upon a time, there was a princess who was very beautiful and kindhearted. She had many suitors but she didn't like any of them because they were all unkind and selfish. One day while taking her walk through the forest she came across a dragon who was sleeping under one of the trees near by...
Continuation:  The dragon told her that he would do anything for her if only he could be sure that no harm would come to him. The princess agreed to keep him safe from harm if he promised not to eat any humans or destroy their homes or crops...
--
--------------------------------------------------
likelihood: -118.45756135750001
Title: The Fearsome Princess And A Cowardly Dragon
Previous Part:  Once upon a time, there was a princess who was very beautiful and kindhearted. She had many suitors but she didn't like any of them because they were all unkind

<h2>Generating continuation - strategy 2: No examples</h2>

In [25]:
# Zero-shot prompt: only the story title and the best generated beginning,
# with no header and no example stories.
prompt = f'Story title: {parameters[0]}\n'
prompt += beg_result.iloc[0]['generation'].replace(STOP_SEQUENCES[0], '')
print(prompt)

Story title: The Fearsome Princess And A Cowardly Dragon
 Once upon a time, there was a princess who was very beautiful and kindhearted. She had many suitors but she didn't like any of them because they were all unkind and selfish. One day while taking her walk through the forest she came across a dragon who was sleeping under one of the trees near by...



In [26]:
# Sample continuations for the zero-shot prompt with the same settings as
# strategy 1, so the two strategies can be compared.
cont_result_2 = generate(
    prompt,
    max_tokens=MAX_TOKENS,
    stop_sequences=STOP_SEQUENCES,
    temperature=TEMPERATURE,
    model=MODEL,
    p=MIN_P,
    frequency_penalty=FREQ_PENALTY,
    presence_penalty=PRESENCE_PENALTY
)

[0m


In [27]:
# print_result() is not used here because this prompt has no key/value fields;
# each generation is simply printed after the raw prompt.
for i, row in cont_result_2.iterrows():
    print('-'*50)
    print(prompt + row['generation'])

--------------------------------------------------
Story title: The Fearsome Princess And A Cowardly Dragon
 Once upon a time, there was a princess who was very beautiful and kindhearted. She had many suitors but she didn't like any of them because they were all unkind and selfish. One day while taking her walk through the forest she came across a dragon who was sleeping under one of the trees near by...


==Cast==
*Heng Ee - The Fearsome Princess (Princess Margaretha)
*Ker Ying - The Cowardly Dragon (Dragon) 
*Mazlan Ahmad - The King (Sultan) 
*Mahmood Ali - The Prime Minister (Tunku Besar) 
*Omar Rojik - The Royal Horseman (Pegawai Kerana Diraja) 
*Fadzil Ali - The Royal Horseman (Pegawai Kerana Diraja) 
*Dullah Hashim - The Royal Horseman (Pegawai Kerana Diraja) 
*Yusoff Latiff - The Royal Horseman (Pegawai Kerana Diraja) 
*Salleh Kamil - The Royal Horseman (Pegawai Kerana Diraja) 
*Suhaimi Kamat - The Royal Horseman (Pegawai Kerana Diraja) 
*Ani Hamidah - The Royal Horseman (Pegawa

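<strong>The first approach (structured prompt with examples) works better: without examples the model drifts into unrelated text such as the cast list above, so we continue with strategy 1.</strong>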

<h2>Generating ending</h2>

In [28]:
# NOTE: the example uses the first two fragments of a story as "Previous Part"
# and its last fragment as "Story Ending".
# It would probably be better to use the last three fragments, or a summary of
# the story so far followed by the last fragment.
# Possible preprocessing: add "THE END" / "lived happily ever after" to all examples.
example_endings = get_segment_of_stories(example_stories, -1)
example_endings = example_endings[:1]
for i, ending in enumerate(example_endings):
    # Pair each ending with the continuation example at the same position
    # (index i rather than a fixed 0), so the pairing stays correct if more
    # than one example is kept.
    ending['Previous Part'] = example_continuations[i]['Previous Part'] + example_continuations[i]['Continuation']
    ending['Story Ending'] = ending['text']

    del ending['text']

keys_to_use = ['title', 'Previous Part', 'Story Ending']
header = 'Exercise: Finish the story for children based on the title and previous part given.'

# NOTE: the whole story generated so far is used here. That may not be possible
# in the application because the number of tokens is limited.
# TODO: use only the last fragment(s) generated so far, or a summary of everything generated so far.
# Each piece is stripped and the pieces are joined with a single space, so the
# "Previous Part" field stays on one line like the example (the raw generations
# end with a newline, which used to split the field over two lines).
generated_so_far = ' '.join(
    result.iloc[0]['generation'].replace(STOP_SEQUENCES[0], '').strip()
    for result in (beg_result, cont_result)
)
parameters = [title, generated_so_far]


# Pass the separator explicitly so the prompt stays in sync with the stop
# sequence given to generate().
prompt = generate_prompt_for_story(example_endings, keys_to_use, header, parameters, stop_token=STOP_SEQUENCES[0])
print(prompt)

Exercise: Finish the story for children based on the title and previous part given.

Title: The Ugly Duckling
Previous Part: Once upon a time, there was a duck. She lived in the forest. One day she laid some eggs There was one very big egg among the eggs she laid. After warming the eggs carefully, the duck waited for the eggs to hatch. As she watched, three of her eggs cracked and three beautiful ducklings hatched
Story Ending:  The ugly duckling now heard the swans say that he was the most beautiful among them. Now he was no more the ugly duckling. He lived with his new family and they lived happily ever after
--
Title: The Fearsome Princess And A Cowardly Dragon
Previous Part:  Once upon a time, there was a princess who was very beautiful and kindhearted. She had many suitors but she didn't like any of them because they were all unkind and selfish. One day while taking her walk through the forest she came across a dragon who was sleeping under one of the trees near by...
  The dragon

In [29]:
# Sample candidate endings for the current prompt with the shared settings.
end_result = generate(
    prompt,
    max_tokens=MAX_TOKENS,
    stop_sequences=STOP_SEQUENCES,
    temperature=TEMPERATURE,
    model=MODEL,
    p=MIN_P,
    frequency_penalty=FREQ_PENALTY,
    presence_penalty=PRESENCE_PENALTY
)

[0m


In [30]:
# Show each candidate ending after the full story so far. Unlike the earlier
# print cells, max_words is left at its default of 1500 here.
print_result(end_result, parameters, keys_to_use)

--------------------------------------------------
likelihood: -11.689428327300003
Title: The Fearsome Princess And A Cowardly Dragon
Previous Part:  Once upon a time, there was a princess who was very beautiful and kindhearted. She had many suitors but she didn't like any of them because they were all unkind and selfish. One day while taking her walk through the forest she came across a dragon who was sleeping under one of the trees near by...
  The dragon told her that he would do anything for her if only he could be sure that no harm would come to him. The princess agreed to keep him safe from harm if he promised not to eat any humans or destroy their homes or crops...
Story Ending:  She married the dragon and they lived happily ever after!
--
--------------------------------------------------
likelihood: -45.0548506916
Title: The Fearsome Princess And A Cowardly Dragon
Previous Part:  Once upon a time, there was a princess who was very beautiful and kindhearted. She had many suitor

<h1>Old</h1>

In [31]:
# print(result.info())

In [32]:
# result.iloc[0]['generation']

In [33]:
# print(result['generation'][0])

In [34]:
# test_string = """
# Once upon a time, there was a farmer with three little pigs. He did not have enough food to take care of his pigs so he sent them away to take care of themselves. The first little pig was walking on the road. Suddenly he saw a man with some straws. He could  build a house with the straws, he said to himself and asked the man to give him his straws. The man was kind so he gave the first little pig his straws. The pig used the straws to build a straw house and danced around. Suddenly, a big bad wo
# """

In [35]:
# len(test_string)