In [1]:
import json
import numpy as np
import os
import re
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.util import Cm, Pt, Inches
from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
from h2ogpte import H2OGPTE
from mediawikiapi import MediaWikiAPI
from tqdm import tqdm
# Read the h2oGPTe API key from the local file secrets.txt. The `with` block
# closes the handle on exit, so no explicit close() is needed. Surrounding
# whitespace (for example a trailing newline added by an editor) is stripped so
# that it does not become part of the key.
with open('secrets.txt') as f:
    api = f.read().strip()



In [2]:
# Request that drives every later step of the notebook.
user_query = 'Create a presentation on the Pokemon franchise'

# h2oGPTe client, authenticated with the key read from secrets.txt.
client = H2OGPTE(
    api_key=api,
    address="https://h2ogpte.genai.h2o.ai",
)

In [3]:
def trim(lst, retain=30):
    '''
    Return a new list holding at most the first `retain` items of `lst`.

    The input is never modified. Any iterable is accepted (list, tuple,
    generator, ...); the result is always a list.
    '''
    # list() already produces an independent copy, so no separate .copy() is needed.
    return list(lst)[:retain]



def try_and_parse(user_query, function, failed=0, markdown=False, max_attempts=10):
    '''
    Call `function(user_query)` and parse the `.content` of its reply as JSON,
    asking again whenever the reply cannot be parsed.

    Parameters
    ----------
    user_query : passed unchanged to `function` on every attempt.
    function : callable returning an object with a string `.content` attribute
        (usually a reply from an llm).
    failed : number of failed attempts already made; counts towards `max_attempts`.
    markdown : if True, the reply is printed and the JSON value is taken from the
        first fenced code chunk in it; if False, the whole reply must be JSON.
    max_attempts : number of failed attempts tolerated before giving up. At least
        one attempt is always made.

    Returns
    -------
    The decoded JSON value.

    Raises
    ------
    RuntimeError
        When `failed` reaches `max_attempts`; the last parsing error is chained.
    '''
    # First fenced code chunk: an opening fence (optionally tagged, e.g. ```json) at
    # the start of a line, then everything up to the next fence that starts a line.
    pattern = r'^```(?:\w+)?\s*\n(.*?)(?=^```)```'

    # A loop rather than recursion: a model that keeps replying with unparsable
    # text can no longer recurse without bound.
    while True:
        chosen = function(user_query)
        try:
            if not markdown:
                return json.loads(chosen.content)
            print(chosen.content)
            result = re.findall(pattern, chosen.content, re.DOTALL | re.MULTILINE)[0].strip()
            return json.loads(result)
        except Exception as e:
            failed += 1
            print(failed)
            print(e)# CHANGE TO LOGGING STATEMENT
            if failed >= max_attempts:
                raise RuntimeError(
                    f"Could not parse a JSON value from the reply after {failed} failed attempts"
                ) from e


## Step 1. What comes to mind when you think about xyz?

In [4]:
def search(user_query):
    '''Ask the LLM which Wikipedia queries fit `user_query` and return its reply.'''
    # The prompt lines keep their original indentation: it is part of the string.
    return client.answer_question(
        question=user_query,
        system_prompt="""You are an assistant whose task is to perform Wikipedia searches on a specific topic.\
    The user is interested to create a presentation about a topic of interest.\
    Reply with at least one corresponding Wikipedia query as an array in JSON format.\
    Only reply with the JSON array and nothing else. Here are some examples.
    Example 1.
    User: Create a presentation on the book, Baby Rudin.
    Assistant: ["Real analysis", "Mathematical Analysis"]

    Example 2.
    User: I want to create a ppt about Milk.
    Assistant: ["Milk", "Plant Milk", "Goat's Milk", "Cow Milk", "Almond Milk"]
    """,
        llm='mistralai/Mixtral-8x7B-Instruct-v0.1'
    )


# The whole reply is expected to be JSON (no code chunk), hence markdown stays False.
searched = try_and_parse(user_query, search)
searched

['Barbie',
 'Oppenheimer',
 'Barbie vs Oppenheimer',
 'Pop Culture and Nuclear Physics',
 'Feminism and Science',
 'Cultural Mashups']

## Step 2. Search Wikipedia

In [5]:
import random
wiki = MediaWikiAPI()

# Up to five search hits per suggested query.
hits_per_query = [wiki.search(cat, results=5) for cat in searched]

# Merging into a set drops titles that were returned by more than one query.
articles = list(set().union(*hits_per_query))
random.shuffle(articles) # random shuffle



In [8]:
# Trim the article list first, so that one-sentence summaries are requested only
# for the articles that are kept (previously every article was summarised and the
# surplus then thrown away). Positions still line up with `articles`, because
# trim() keeps the head of the list.
snippet_ = [
    wiki.summary(title, auto_suggest=False, sentences=1) for title in trim(articles)
]

# Numbered copy shown to the LLM; `snippet_` keeps the plain summaries.
snippet = [f"{index}. {summary}" for index, summary in enumerate(snippet_)]
snippet_text = "\n\n".join(snippet)
print(snippet_text)

0. The Atomic Age, also known as the Atomic Era, is the period of history following the detonation of the first nuclear weapon, The Gadget at the Trinity test in New Mexico, on 16 July 1945, during World War II. Although nuclear chain reactions had been hypothesized in 1933 and the first artificial self-sustaining nuclear chain reaction (Chicago Pile-1) had taken place in December 1942, the Trinity test and the ensuing bombings of Hiroshima and Nagasaki that ended World War II represented the first large-scale use of nuclear technology and ushered in profound changes in sociopolitical thinking and the course of technological development.

1. Feminist science fiction is a subgenre of science fiction (abbreviated "SF") focused on such feminist themes as: gender inequality, sexuality, race, economics, reproduction, and environment.

2. Katherine Vissering Oppenheimer (née Puening; August 8, 1910 – October 27, 1972) was a German American biologist, botanist, and a member of the Communist P

## Step 3. "Brainstorm" and filter Wikipedia searches for useful ones
Chain-of-thought prompting
https://www.promptingguide.ai/techniques/cot

In [9]:
def choose_topics(user_query):
    '''
    Ask the LLM to judge every numbered Wikipedia snippet and to finish with a
    code chunk holding a JSON array of the useful entry numbers. Returns the reply.
    '''
    return client.answer_question(
        question=f"""{user_query}
        Referring to the list of wikipedia entries you have been provided, decide on which topics are useful for the presentation. For each entry, explain, in a few words,\
        whether you think an entry is useful or not and why.
        After that, generate a code chunk. Within the code chunk is an array of integers in JSON, and these integers correspond to the topics you think are useful.
        For each line, think about its usefulness/relevance and make some short deductions, then conclude whether it is useful or not useful.
        Please keep strictly to the format in the following example:
        0. - Sugar irrelevant to Jesus Christ, therefore not useful.
        1. - Christianity is about the topic of Jesus Christ, thus Useful.
        2. - Protestants follow Jesus Christ, therefore useful.
        ```
        [1, 2]
        ```
        """,
        system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation.\
        Here are a list of wikipedia entries, starting from the 0th entry, that may or may not be related to the topic at hand:
        {snippet_text}
        """,
        llm='gpt-4-1106-preview' # only instance of gpt4 usage. need this for the big brain.
    )


# markdown=True: the JSON array is read from the code chunk at the end of the reply.
topics = try_and_parse(user_query, choose_topics, markdown=True)
print(topics)

0. - The Atomic Age is related to the historical context of Oppenheimer but not directly to the "Barbenheimer" phenomenon, therefore not useful.
1. - Feminist science fiction does not relate to the "Barbenheimer" phenomenon, therefore not useful.
2. - Katherine Vissering Oppenheimer is related to J. Robert Oppenheimer but not directly to the "Barbenheimer" phenomenon, therefore not useful.
3. - Barbie's film history is relevant to the "Barbenheimer" phenomenon as it pertains to Barbie, therefore useful.
4. - 2023 in film is relevant as it may contain information about the films involved in the "Barbenheimer" phenomenon, therefore useful.
5. - The National Museum of Nuclear Science & History is related to nuclear science and not directly to the "Barbenheimer" phenomenon, therefore not useful.
6. - Oppenheimer film is directly related to the "Barbenheimer" phenomenon as it pertains to Oppenheimer, therefore useful.
7. - Barbenheimer is the exact topic of the presentation, therefore usefu

In [10]:
# Titles in their shuffled order; the integers in `topics` are used as indices into this list.
articles

['Atomic Age',
 'Feminist science fiction',
 'Katherine Oppenheimer',
 'List of Barbie films',
 '2023 in film',
 'National Museum of Nuclear Science & History',
 'Oppenheimer (film)',
 'Barbenheimer',
 'The Monkey King (2023 film)',
 'Klaus Barbie',
 'Lists of Armenians',
 'Mashup (culture)',
 'Mashup (video)',
 'List of civilian nuclear accidents',
 'Feminist movements and ideologies',
 'Frank Oppenheimer',
 'Cultural appropriation',
 'Barbie',
 'Physics outreach',
 'Mashup (music)',
 'J. Robert Oppenheimer',
 'Liberal feminism',
 'Barbie (film)',
 'Difference feminism',
 'Feminism',
 '28th Satellite Awards']

In [11]:
# `topics` comes straight from an LLM reply, so keep only plain integers that
# index an existing snippet. Anything else would raise IndexError or, for
# negative values, silently select the wrong article.
valid_topics = [i for i in topics if type(i) is int and 0 <= i < len(snippet_)]
if len(valid_topics) != len(topics):
    print(f"Ignoring {len(topics) - len(valid_topics)} invalid topic indices in {topics}")

chosen_articles = [articles[i] for i in valid_topics]
chosen_snippets = [snippet_[i] for i in valid_topics]
chosen_articles

['List of Barbie films',
 '2023 in film',
 'Oppenheimer (film)',
 'Barbenheimer',
 'Barbie',
 'Barbie (film)']

In [12]:
# Look each page up once and read both the text and the image list from the same
# page object, instead of calling wiki.page() twice per article.
chosen_pages = [wiki.page(title, auto_suggest=False) for title in chosen_articles]
chosen_full_articles = [page.content for page in chosen_pages]
chosen_articles_images = [page.images for page in chosen_pages]


In [13]:

    
# now its time to store them for RAG
import os


# New collection for the chosen articles.
collection_id = client.create_collection(
    description='wikipedia articles for presentation',
    name='Articles',
)

# Article title -> full article text.
pages = {title: text for title, text in zip(chosen_articles, chosen_full_articles)}



## Step 4: Store useful ideas in VectorDB (h2oai)

In [14]:
# Write every article to ./articles/<sanitised title>.txt, upload the file, then
# ingest all uploads into the collection in a single job.
os.makedirs("./articles", exist_ok=True)  # open() below fails if the folder is missing

to_ingest = []
for title, content in pages.items():
    # Raw string: '\W' is a regex class, not a valid string escape.
    title = re.sub(r'[\W_]+', '', title)
    name = f"./articles/{title}.txt"
    # `with` closes each handle even if the write or the upload raises.
    with open(name, "w", encoding="utf-8") as f:
        f.write(content)
    # The upload is given a handle opened in binary mode.
    with open(name, 'rb') as f:
        to_ingest.append(client.upload(name, f))
    print(f"{name} fed!")

client.ingest_uploads(collection_id, to_ingest)

./articles/ListofBarbiefilms.txt fed!
./articles/2023infilm.txt fed!
./articles/Oppenheimerfilm.txt fed!
./articles/Barbenheimer.txt fed!
./articles/Barbie.txt fed!
./articles/Barbiefilm.txt fed!


Job(id='882b5492-ef6b-46f7-a650-be3046b0a2a5', passed=1.0, failed=0.0, progress=1.0, completed=True, canceled=False, date=datetime.datetime(2024, 2, 27, 15, 19, 8, tzinfo=TzInfo(UTC)), kind=<JobKind.IngestUploadsJob: 'IngestUploadsJob'>, statuses=[JobStatus(id='11b9f1688e07496ea1271685b4a62909', status='Indexing done.'), JobStatus(id='1e8c47ea9f974ab5915bf20ce35f2a0c', status='Indexing done.'), JobStatus(id='8e6b14e3d9c745168a28821c795ac686', status='Indexing done.'), JobStatus(id='6de0bd52efd94606898ea390a1b4b809', status='Indexing done.')], errors=[], last_update_date=datetime.datetime(2024, 2, 27, 15, 20, 3, tzinfo=TzInfo(UTC)), duration='55s')

In [15]:
# One-sentence summaries of the chosen articles; this list is interpolated into the planning prompts.
chosen_snippets

['Since 2001, Barbie, a fashion doll manufactured by American toy and entertainment company Mattel, has starred or featured in 42 CGI or computer-animated feature films and streaming television films which since then has become a core component of an eponymous media franchise.',
 '2023 in film is an overview of events, including award ceremonies, festivals, a list of country- and genre-specific lists of films released, and notable deaths.',
 'Oppenheimer is a 2023 epic biographical thriller film written, directed, and co-produced by Christopher Nolan, starring Cillian Murphy as J. Robert Oppenheimer, the American theoretical physicist credited with being the "father of the atomic bomb" for his role in the Manhattan Project—the World War II undertaking that developed the first nuclear weapons.',
 'Barbenheimer ( BAR-bən-hy-mər) is a cultural phenomenon which preceded and continues to surround the simultaneous theatrical release of two films, Warner Bros.',
 'Barbie is a fashion doll cre

## Step 5: Plan sections for slide
This ensures the presentation is coherent, with a clear flow/narrative, rather than a collection of disjoint or overlapping generations.
Again, chain-of-thought prompting is used heavily.

In [16]:
def decide_sections(user_query):
    '''
    Ask the LLM to plan the presentation and to finish with a code chunk holding a
    JSON array of slide titles (title slide first). Returns the reply.
    '''
    return client.answer_question(
        question=f"""{user_query}
        Please plan the presentation by doing the following:
        1. Explain how you would design the presentation slides such that the presentation will flow well.\
        Remember that each slide must contain something different, and content should not overlap.
        2. Think of a good title for the presentation.
        3. Create a code chunk. Inside that code chunk, generate a JSON array consisting of appropriate slide titles starting from the first slide to the last slide, \
        remembering your answer to point 2. Include the title slide, which is the title for the presentation.

        Below is an example reply. Please adhere strictly to the format in the example below and remember to output the JSON array in a code chunk:  
        1. I would introduce the Transformers franchise and provide general information about its history to ease my viewers into the subject. \
        Then, I will think about subtopics, such as the Transformers films, Transformers characters and Transformers in comics, using the wikipedia entry summaries\
        I have been provided. 
            * For my first subtopic, the Transformers films, I would create two additional slides to expand on Bumblebee (2018) and Revenge of the Fallen (2009)\
            as these are popular films within the franchise. I will order the films chronologically.
            * For my second subtopic on Transformers characters, I will have a slide on the cast of the film. 
            * For my next subtopic...
        2. I think a good title for this presentation is "Transformers: An Overview".
        3. Here is the json array of slide titles:
        ```json
        [
        "Transformers: An Overview", 
        "Introduction to Transformers", 
        "Transformers in Film",
        "Transformers: Revenge of the Fallen (2009)", 
        "Bumblebee (2018)", 
        "Characters in the Transformers Universe", 
        "Transformers in comics",
        "Conclusion"
        ]
        ```
        """,
        system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation.\
        Below is a list of wikipedia entry summaries that are selected for the presentation.\
        You will be asked to come up with slide titles for the presentation. Each line is a 1-sentence summary of a wikipedia page.\
        The number of slides should depend on the amount of information/wikipedia entry summaries available.
        Summaries:
        {chosen_snippets}
        """,
        llm='mistralai/Mixtral-8x7B-Instruct-v0.1'
    )


# markdown=True: the slide titles are read from the code chunk at the end of the reply.
all_sections = try_and_parse(user_query, decide_sections, markdown=True)

1. For this presentation, I would start with an introductory slide that introduces the concept of "Barbenheimer" and the two films that are the focus of the presentation: Barbie and Oppenheimer. This slide will serve as a title slide and provide context for the rest of the presentation.

Next, I would create a slide that provides more information about the Barbie media franchise, including its history and popularity. This will be followed by a slide about the Barbie film, including its plot, characters, and production.

After that, I would create a slide about the Oppenheimer film, including its plot, characters, and production. This will be followed by a slide that compares and contrasts the two films, highlighting their similarities and differences.

Finally, I would end the presentation with a conclusion slide that summarizes the main points of the presentation and provides some final thoughts on the "Barbenheimer" phenomenon.

2. A good title for this presentation could be "Barbenh

In [17]:
# The first entry is the presentation title (it becomes the title-slide text);
# the remaining entries are the titles of the content slides.
sections = all_sections[1:]

sections

['The Barbie Media Franchise',
 'The Barbie Film',
 'The Oppenheimer Film',
 'Comparing Barbie and Oppenheimer',
 'Conclusion']

In [18]:
# Drop the existing client and build a new one with the same address and key.
# NOTE(review): whether this resets anything on the server side is unconfirmed.
del client
client = H2OGPTE(
    address="https://h2ogpte.genai.h2o.ai",
    api_key=api
) # does this reset client?

# Chat session created against the collection that holds the ingested articles.
chat_session_id = client.create_chat_session(collection_id)
chat_session_id

'c2634f1d-0fa5-43bd-94bd-f0f96edb92a5'

#### Ref for slide types:  
0. title and subtitle 
1. title and content 
2. section header 
3. two content 
4. Comparison 
5. Title only  
6. Blank 
7. Content with caption 
8. Pic with caption 


## Step 6: Generate using RAG
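The LLM first chooses the slide colours, again using chain-of-thought prompting; the text of each slide is then generated with RAG over the ingested articles.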

In [19]:
# Empty 16 x 9 inch deck whose first slide uses layout 0.
prs = Presentation()
prs.slide_width, prs.slide_height = Inches(16), Inches(9)
title_slide = prs.slides.add_slide(prs.slide_layouts[0])


def decide_slide_format(user_query):
    '''
    Ask the LLM for a background colour and a text colour, to be returned in a
    code chunk as a JSON array of two single-key objects. Returns the reply.
    '''
    return client.answer_question(
        question=f"""{user_query} Think of a good background colour, in RGB format,\
        for the slides and a good colour, also in RGB format, for the\
        text. Typically, if the text colour is bright (for example RGB [255, 255, 255] is white), then the background colour should be dark
        (RGB [0, 0, 100] is dark blue). Conversely, if the text colour is dark (for example RGB [0, 0, 0] is black), the background colour should be bright\
        . You are free to choose any text and background colour, \
        as long as you follow these rules. Please do not assign grey-scale colours for the text and background (like RGB [50, 50, 50]), as much as possible.

        Explain clearly why you chose the background and text colours. Then, generate a code chunk. Within the code chunk,\
        provide a JSON array containing two colours. Adhere strictly to the example reply below:
        I chose blue RGB [0, 35, 140] for the background color and light yellow RGB [255, 234, 0] for the font color. The contrast makes it easy to read.\
        Furthermore, the colours blue and yellow are associated with the Pokémon Franchise.
        ```
        [{{"background": [0, 0, 140]}}, {{"text": [255, 234, 0]}}]
        ```
        """,
        system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation.\
        Here are a list of wikipedia entry summaries that are selected for the presentation:
        {chosen_snippets}
        """,
        llm='mistralai/Mixtral-8x7B-Instruct-v0.1'
    )


# markdown=True: the colour array is read from the code chunk in the reply.
format = try_and_parse(user_query, decide_slide_format, markdown=True)

I chose a light purple background color (RGB [153, 102, 255]) and a dark grey text color (RGB [51, 51, 51]). The contrast between the light background and dark text makes it easy to read. Additionally, purple is often associated with creativity and luxury, while grey is a neutral color that allows the content to stand out.

```
[{"background": [153, 102, 255]}, {"text": [51, 51, 51]}]
```
Slide 1: Introduction

* Introduce the topic of the presentation: Barbenheimer (Barbie vs Oppenheimer)
* Explain that the presentation will cover the cultural phenomenon of the simultaneous release of the Barbie and Oppenheimer films, as well as a brief overview of each film.

Slide 2: Barbie

* Provide a summary of the Barbie franchise, including the number of CGI or computer-animated feature films and streaming television films released since 2001.
* Mention the 2023 fantasy comedy film directed by Greta Gerwig.

Slide 3: Oppenheimer

* Provide a summary of the Oppenheimer film, including the main c

In [20]:
# Decoded colour choice; the next cell reads entry 0 as the background and entry 1 as the text colour.
format

[{'background': [153, 102, 255]}, {'text': [51, 51, 51]}]

In [21]:
# The prompt asks for [{"background": [r, g, b]}, {"text": [r, g, b]}], but the
# reply is LLM output: the two entries may arrive swapped or merged into a single
# object. Colours are therefore looked up by key first, and only fall back to the
# original positional reading (first value = background, second = text) when the
# expected key is absent.
colour_entries = [format] if isinstance(format, dict) else list(format)
colours_by_key = {key: value for entry in colour_entries for key, value in entry.items()}
colours_in_order = [value for entry in colour_entries for value in entry.values()]
background = RGBColor(*colours_by_key.get("background", colours_in_order[0]))
font = RGBColor(*colours_by_key.get("text", colours_in_order[1]))

# Solid background fill for the title slide.
fill = title_slide.background.fill
fill.solid()
fill.fore_color.rgb = background

# Title text and its font settings.
title_shape = title_slide.shapes.title
title_shape.text = all_sections[0]
title_paragraph = title_shape.text_frame.paragraphs[0]
title_paragraph.font.color.rgb = font
title_paragraph.font.name = 'Montserrat'
title_paragraph.font.bold = True

# Resize the first shape to 12 x 2 inches and centre it horizontally. The whole
# right-hand side is evaluated before anything is assigned, so the vertical
# position is computed from the shape's height *before* the resize, as in the
# original code.
first_shape = title_slide.shapes[0]
box_width, box_height = Inches(12), Inches(2)
first_shape.left, first_shape.top, first_shape.width, first_shape.height = (
    (prs.slide_width - box_width) // 2,
    (prs.slide_height - first_shape.height) // 2 - Inches(1),
    box_width,
    box_height,
)

In [22]:

# Build one slide per planned section: style the slide, ask the chat session
# (created against the article collection) for the slide text, then lay out the
# title and body.
with client.connect(chat_session_id) as session:

    for section in tqdm(sections):
        # Layout 1 is listed as "title and content" in the slide-type reference.
        slide = prs.slides.add_slide(prs.slide_layouts[1])
        fill = slide.background.fill
        fill.solid()
        fill.fore_color.rgb = background

        
        # Placeholder index 1 receives the generated text further down.
        contents = slide.placeholders[1]
        contents.text_frame.word_wrap = True

        title = slide.shapes.title
        title.text = section
        title.text_frame.paragraphs[0].font.color.rgb = font
        title.text_frame.paragraphs[0].font.size = Pt(32)
        title.text_frame.paragraphs[0].font.name = 'Karla'
       
       
        # The section title is the user message; the full list of section titles is
        # included in the system prompt.
        content = session.query(
            
            message = section,
            system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation. \
            The slides of the presentation are as follows: {sections}
            You are now tasked with generating the content of one slide, which will be provided by the user.
            """,
            pre_prompt_query="You have been provided with the following information, which may be useful in your task.",
            prompt_query="""Decide if the information is relevant, and use it if needed.\
            Generate the content required in the slide provided by the user. You only need to generate the contents of the slide, not the title\
            or anything else. Remember, you are designing one slide, so do not generate long paragraphs. Instead, summarise your most important points\
            in up to 3 sentences each, and do not generate more than 15 lines of text. Between each point, leave a line.

            Here is an example. Please adhere to this example strictly:
            This is the first point I am trying to make. I will explain this first point in the second sentence.
            
            This is the second point I am trying to make. This is the second sentence for the second point. \
            I need a third sentence to fully explain this point. Notice that I will leave a line for the next point.

            This is the third point.
            """,
            llm="mistralai/Mixtral-8x7B-Instruct-v0.1",
            rag_config={
                "rag_type": "rag",
            },
        ).content

        contents.text = content
        
        # Apply the same spacing, size, colour and typeface to every paragraph of the reply.
        for paragraph in contents.text_frame.paragraphs:
            # NOTE(review): bare integer 1, not Pt(1) -- confirm the intended unit.
            paragraph.space_after = 1
            paragraph.space_before = 1
            # paragraph.level = 0
           
            paragraph.font.size = Pt(22)  
            paragraph.font.color.rgb = font
            paragraph.font.name = 'Karla'

        contents.text_frame.auto_size = MSO_AUTO_SIZE.SHAPE_TO_FIT_TEXT
        shapes = slide.shapes
        new_width = Inches(14)
        new_height = Inches(7)
        # Each line sets all four geometry attributes in one tuple assignment, so the
        # current height/top values on the right are read before anything is written.
        # Both shapes become 14 inches wide and horizontally centred; shapes[1] is also
        # given a height of 7 inches.
        # NOTE(review): presumably shapes[0] is the title and shapes[1] the body -- confirm.
        shapes[0].height, shapes[0].width, shapes[0].top, shapes[0].left = shapes[0].height, new_width, shapes[0].top, (prs.slide_width-new_width)//2
        shapes[1].height, shapes[1].width, shapes[1].top, shapes[1].left = new_height, new_width, shapes[1].top, (prs.slide_width-new_width)//2
        
        

# gpt-4-1106-preview


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:40<00:00, 80.16s/it]


## Step 7: Enjoy

In [23]:
# File name: the presentation title with every run of non-word characters or
# underscores replaced by a single "_".
sanitised = re.sub(r'[\W_]+', '_', all_sections[0])

# Create the output folder if needed, so that a missing folder cannot make the
# save fail after the long generation step.
os.makedirs("./presentations", exist_ok=True)
prs.save(f"./presentations/{sanitised}.pptx")


## Appendix: Extra Code that may be useful in the future
```python
# Create a chat session
# chat_session_id = client.create_chat_session(collection_id)

# # Query the collection
# with client.connect(chat_session_id) as session:
#     reply = session.query(
#         'How many paper clips were shipped to Scranton?',
#         llm="gpt-4-0613"
#     )
#     print(reply.content)

#     reply = session.query(
#         'Did David Brent co-sign the contract with Initech?',
#         timeout=60,
#         llm="gpt-4-0613"
#     )
#     print(reply.content)

# # Summarize each document
# documents = client.list_documents_in_collection(collection_id, offset=0, limit=99)
# for doc in documents:
#     summary = client.summarize_document(
#         document_id=doc.id,
#         timeout=60,
#     )
#     print(summary.content)


#client.delete_documents_from_collection
```