In [1]:
import json

from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.util import Cm, Pt, Inches
from pptx.enum.text import MSO_ANCHOR
from h2ogpte import H2OGPTE
from mediawikiapi import MediaWikiAPI
from tqdm import tqdm
# Load the h2oGPTe API key from a local, untracked file so it never appears in the notebook.
# The `with` block closes the file on exit, so no explicit close() is needed.
# strip() removes the trailing newline most editors append, which would otherwise
# become part of the key sent to the server.
with open('secrets.txt') as f:
    api = f.read().strip()



In [2]:
# Free-text request describing the presentation; reused as the question in later prompts.
user_query = 'Create a presentation on the transformers movie franchise'

# Client for the hosted h2oGPTe endpoint, authenticated with the key loaded from secrets.txt.
client = H2OGPTE(address="https://h2ogpte.genai.h2o.ai", api_key=api)

In [3]:
# One-shot system prompt asking the model to turn the user's request into a Wikipedia
# search query. The backslash continuations and leading spaces are part of the prompt
# text sent to the model, so the literal is kept exactly as written.
search_system_prompt = """You are an assistant whose task is to perform Wikipedia searches on a specific topic.\
    The user is interested to create a presentation about a topic of interest. Reply with a corresponding Wikipedia search query.\
    Only reply with the query and nothing else. Here is an example.
    User: Create a presentation on the book, Baby Rudin.
    Assistant: Real analysis
    """

# `search.content` holds the model's reply text and is used as the search query below.
search = client.answer_question(
    question=user_query,
    system_prompt=search_system_prompt,
    llm='mistralai/Mixtral-8x7B-Instruct-v0.1',
)
search

Answer(content='Transformers (franchise)', error='', prompt_raw='<<SYS>>\nYou are an assistant whose task is to perform Wikipedia searches on a specific topic.    The user is interested to create a presentation about a topic of interest. Reply with a corresponding Wikipedia search query.    Only reply with the query and nothing else. Here is an example.\n    User: Create a presentation on the book, Baby Rudin.\n    Assistant: Real analysis\n    \n<</SYS>>\n\n<s> [INST] Create a presentation on the transformers movie franchise [/INST]', llm='mistralai/Mixtral-8x7B-Instruct-v0.1', input_tokens=117, output_tokens=7, origin='answer_question_using_context')

In [4]:
# MediaWiki client used for the search here and for summaries / full pages later on.
wiki = MediaWikiAPI()
# Ask for up to 15 matching article titles for the model-generated query.
# NOTE(review): the topic-selection prompt further down talks about entries "0 to 14",
# which only matches when this call really returns 15 titles.
articles = wiki.search(search.content, results=15)

In [5]:
# Show the candidate article titles returned by the search.
articles

['Transformers',
 'Transformers (film series)',
 'Transformers: Rise of the Beasts',
 'Transformers: The Last Knight',
 'Transformers One (film)',
 'Transformers: War for Cybertron Trilogy',
 'Beast Wars: Transformers',
 'The Transformers (TV series)',
 'Transformers: Age of Extinction',
 'Transformers: The Headmasters',
 'Bumblebee (film)',
 'Optimus Prime',
 'Transformers: Beast Wars',
 'List of Transformers animated series',
 'Transformers: EarthSpark']

In [6]:
# Two-sentence summary of every candidate article, in the same order as `articles`.
# `snippet_` keeps the raw summaries; they are reused later without the numeric label.
snippet_ = [wiki.summary(title, auto_suggest=False, sentences=2) for title in articles]

# Prefix each summary with its position so the model can refer to entries by index.
# Built with enumerate instead of a hand-maintained counter that rewrote the list
# while iterating over it.
snippet = [f"{index}. {summary}" for index, summary in enumerate(snippet_)]

# Single text block (blank line between entries) that gets embedded in the selection prompt.
snippet_text = "\n\n".join(snippet)
print(snippet_text)

0. Transformers is a media franchise produced by American toy company Hasbro and Japanese toy company Takara Tomy. It primarily follows the heroic Autobots and the villainous Decepticons, two alien robot factions at war that can transform into other forms, such as vehicles and animals.

1. Transformers is a series of science fiction action films based on the Transformers franchise of the 1980s. Michael Bay directed the first five live action films: Transformers (2007), Revenge of the Fallen (2009), Dark of the Moon (2011), Age of Extinction (2014), and The Last Knight (2017), and has served as a producer for subsequent films.

2. Transformers: Rise of the Beasts is a 2023 American science fiction action film based on Hasbro's Transformers toy line, and primarily influenced by its Beast Wars sub-franchise. It is the seventh installment in the Transformers film series and serves as  both a standalone sequel to Bumblebee (2018) and a prequel to Transformers (2007).

3. Transformers: The L

In [7]:
def try_and_parse(user_query, function, failed=0, max_failures=10):
    """Call ``function`` repeatedly until its reply parses as JSON.

    Args:
        user_query: Prompt text passed unchanged to ``function`` on every attempt.
        function: Callable that takes ``user_query`` and returns an object whose
            ``content`` attribute holds the reply text to decode.
        failed: Number of failed attempts already counted. Kept so existing
            callers that passed it keep working.
        max_failures: Stop retrying once this many replies in total have failed
            to parse.

    Returns:
        The decoded JSON value of the first reply that parses.

    Raises:
        RuntimeError: If ``max_failures`` replies could not be parsed.
    """
    import logging  # local import keeps this cell self-contained

    logger = logging.getLogger(__name__)

    # Iterative retry: the earlier recursive version dropped the result of the
    # nested call, so any reply that needed a retry came back as None.
    while True:
        chosen = function(user_query)
        try:
            return json.loads(chosen.content)
        except (json.JSONDecodeError, TypeError) as exc:
            # JSONDecodeError: reply is not valid JSON; TypeError: content is not text.
            failed += 1
            logger.warning("Reply could not be parsed as JSON (failure %d of %d)", failed, max_failures)
            if failed >= max_failures:
                raise RuntimeError(
                    f"No parseable JSON reply after {failed} failed attempts"
                ) from exc


In [8]:
def choose_topics(user_query):
    """Ask the model which of the numbered Wikipedia summaries are worth using.

    The system prompt embeds ``snippet_text`` and asks for a JSON array of
    integer labels. The label range and the "all choices" example are derived
    from the number of summaries actually available, so the prompt stays
    correct when the search returned fewer than 15 articles (for 15 articles
    the prompt text is identical to the previously hard-coded one).

    Args:
        user_query: The user's presentation request, sent as the question.

    Returns:
        The answer object returned by ``client.answer_question``.
    """
    last_label = len(snippet) - 1
    all_labels = list(range(len(snippet)))
    return client.answer_question(
        question=user_query,
        system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation.\
        Here are a list of wikipedia entries labelled 0 to {last_label} that may or may not be related to the topic at hand:
        {snippet_text}
        
        Decide on which topics are useful for the presentation\
        and return these topics as an array of integers in JSON. For example, if only choices 0, 2, 4, and 9 are useful, your reply should be: [0, 2, 4, 9].
        If all the choices are useful, then your reply should be: {all_labels}.
        Do not say anything else. 
        """,
        llm='mistralai/Mixtral-8x7B-Instruct-v0.1'
    )


# Retry until the model's reply decodes as JSON; expected to be a list of integer labels.
topics = try_and_parse(user_query, choose_topics)
print(topics)

[0, 1, 2, 3, 4, 5, 8, 10, 11, 14]


In [9]:
# The indices come straight from the model, so keep only integers that address an
# existing article, and drop repeats while preserving order. Without this an
# out-of-range label raises IndexError and a negative one silently selects the
# wrong article.
valid_topics = list(dict.fromkeys(
    idx for idx in topics
    if isinstance(idx, int) and 0 <= idx < len(articles)
))

# Titles and unlabelled summaries of the selected articles, in matching order.
chosen_articles = [articles[idx] for idx in valid_topics]
chosen_snippets = [snippet_[idx] for idx in valid_topics]
chosen_articles

['Transformers',
 'Transformers (film series)',
 'Transformers: Rise of the Beasts',
 'Transformers: The Last Knight',
 'Transformers One (film)',
 'Transformers: War for Cybertron Trilogy',
 'Transformers: Age of Extinction',
 'Bumblebee (film)',
 'Optimus Prime',
 'Transformers: EarthSpark']

In [10]:
# Look each selected page up once and read both attributes from the same object,
# instead of requesting every page twice (once for the text, once for the images).
chosen_pages = [wiki.page(title, auto_suggest=False) for title in chosen_articles]

# Full article text per selected title; written to disk and ingested for RAG later.
chosen_full_articles = [page.content for page in chosen_pages]
# Image references per selected title.
# NOTE(review): not used anywhere else in the visible notebook.
chosen_articles_images = [page.images for page in chosen_pages]
# # Create documents
# # Note: Done for demonstration purposes only (not usually needed)
# with open('dunder_mifflin.txt', 'w') as f:
#     f.write('There were 55 paper clips shipped, 22 to Scranton and 33 to Filmer.')
    
# with open('initech.txt', 'w') as f:
#     f.write('David Brent did not sign any contract with Initech.')
    
# # Upload documents
# # Many file types are supported: text/image/audio documents and archives
# with open('dunder_mifflin.txt', 'rb') as f:
#     dunder_mifflin = client.upload('Dunder Mifflin.txt', f)
    
# with open('initech.txt', 'rb') as f:
#     initech = client.upload('IniTech.txt', f)

# # Ingest documents (Creates previews, chunks and embeddings)
# client.ingest_uploads(collection_id, [dunder_mifflin, initech])


In [11]:

    
# Now it's time to store the articles for RAG
import os


# Map each selected article title to its full text (the two lists are index-aligned).
pages = {
    article_title: article_text
    for article_title, article_text in zip(chosen_articles, chosen_full_articles)
}

# Create the collection that will hold the uploaded articles; the returned id is
# needed for ingestion and for opening a chat session.
collection_id = client.create_collection(
    name='Articles',
    description='wikipedia articles for presentation',
)



In [12]:
import os
import re

# The article files are written under ./articles; create it so a fresh checkout
# does not fail on the first open().
os.makedirs("./articles", exist_ok=True)

to_ingest = []
for title, content in pages.items():
    # Strip every non-alphanumeric character so the title is safe as a file name.
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    safe_title = re.sub(r'[\W_]+', '', title)
    name = f"./articles/{safe_title}.txt"

    # Write the article as UTF-8 text; `with` closes the handle even if the write fails.
    with open(name, "w", encoding="utf-8") as text_file:
        text_file.write(content)

    # Re-open in binary mode for the upload call.
    with open(name, 'rb') as binary_file:
        to_ingest.append(client.upload(name, binary_file))
    print(f"{name} fed!")

# Ingest all uploads into the collection in one job; the job object is the cell output.
client.ingest_uploads(collection_id, to_ingest)

./articles/Transformers.txt fed!
./articles/Transformersfilmseries.txt fed!
./articles/TransformersRiseoftheBeasts.txt fed!
./articles/TransformersTheLastKnight.txt fed!
./articles/TransformersOnefilm.txt fed!
./articles/TransformersWarforCybertronTrilogy.txt fed!
./articles/TransformersAgeofExtinction.txt fed!
./articles/Bumblebeefilm.txt fed!
./articles/OptimusPrime.txt fed!
./articles/TransformersEarthSpark.txt fed!


Job(id='aaab7925-d565-4540-9a7a-51e75da8a321', passed=1.0, failed=0.0, progress=1.0, completed=True, canceled=False, date=datetime.datetime(2024, 2, 26, 12, 24, 59, tzinfo=TzInfo(UTC)), kind=<JobKind.IngestUploadsJob: 'IngestUploadsJob'>, statuses=[JobStatus(id='5284b5fd5ec4479fb8d0cf789be7e18d', status='Indexing done.'), JobStatus(id='05a2a39c58054a818bf4fb83b921e8cd', status='Indexing done.'), JobStatus(id='20d5f684b823432ea55db1d875208ec9', status='Indexing done.'), JobStatus(id='996b15dc29024cf8bfbb2510cccbf0a3', status='Indexing done.'), JobStatus(id='e1f8daeedfd84a0fb47d6e905a57ce0e', status='Indexing done.')], errors=[], last_update_date=datetime.datetime(2024, 2, 26, 12, 25, 22, tzinfo=TzInfo(UTC)), duration='23s')

In [13]:

# Create a chat session
# chat_session_id = client.create_chat_session(collection_id)

# # Query the collection
# with client.connect(chat_session_id) as session:
#     reply = session.query(
#         'How many paper clips were shipped to Scranton?',
#         llm="gpt-4-0613"
#     )
#     print(reply.content)

#     reply = session.query(
#         'Did David Brent co-sign the contract with Initech?',
#         timeout=60,
#         llm="gpt-4-0613"
#     )
#     print(reply.content)

# # Summarize each document
# documents = client.list_documents_in_collection(collection_id, offset=0, limit=99)
# for doc in documents:
#     summary = client.summarize_document(
#         document_id=doc.id,
#         timeout=60,
#     )
#     print(summary.content)


#client.delete_documents_from_collection

In [14]:
# Show the unlabelled summaries of the selected articles (these go into the next prompts).
chosen_snippets

['Transformers is a media franchise produced by American toy company Hasbro and Japanese toy company Takara Tomy. It primarily follows the heroic Autobots and the villainous Decepticons, two alien robot factions at war that can transform into other forms, such as vehicles and animals.',
 'Transformers is a series of science fiction action films based on the Transformers franchise of the 1980s. Michael Bay directed the first five live action films: Transformers (2007), Revenge of the Fallen (2009), Dark of the Moon (2011), Age of Extinction (2014), and The Last Knight (2017), and has served as a producer for subsequent films.',
 "Transformers: Rise of the Beasts is a 2023 American science fiction action film based on Hasbro's Transformers toy line, and primarily influenced by its Beast Wars sub-franchise. It is the seventh installment in the Transformers film series and serves as  both a standalone sequel to Bumblebee (2018) and a prequel to Transformers (2007).",
 "Transformers: The La

In [15]:
def decide_sections(user_query):
    """Ask the model for the ordered list of slide titles.

    The system prompt embeds the selected article summaries (``chosen_snippets``)
    and requests a JSON array of slide titles, excluding the title slide.

    Args:
        user_query: The user's presentation request, sent as the question.

    Returns:
        The answer object returned by ``client.answer_question``.
    """
    return client.answer_question(
        question=user_query,
        system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation.\
        Here are a list of wikipedia entry summaries that are selected for the presentation:
        {chosen_snippets}
        
        Now, plan the presentation by replying a JSON array \
        consisting of slide titles, starting from the first slide to the last slide. Do not include the title slide. Do not say anything else.

        Here is an example reply:
        ["Introduction to Cookies", "Contents Page", "History of cookies", "Types of Cookies", "Health Concerns", "Conclusion"]
        """,
        llm='mistralai/Mixtral-8x7B-Instruct-v0.1'
    )


# Retry until the reply decodes as JSON; expected to be a list of slide-title strings.
sections = try_and_parse(user_query, decide_sections)

In [16]:
# Show the planned slide titles, one content slide per entry.
sections

['Introduction to Transformers',
 'Transformers Media Franchise',
 'Transformers Film Series',
 'Transformers: Rise of the Beasts',
 'Transformers: The Last Knight',
 'Transformers One',
 'Transformers: War for Cybertron Trilogy',
 'Transformers: Age of Extinction',
 'Bumblebee',
 'Transformers Characters: Optimus Prime',
 'Transformers: EarthSpark',
 'Conclusion']

In [17]:
# Open a chat session bound to the article collection; the slide-content queries
# later connect to this session id.
chat_session_id = client.create_chat_session(collection_id)
chat_session_id

'07f3313e-0a29-4744-b718-9fedd7f67fd9'

In [18]:
# Reference note only: meaning of the indices used with `prs.slide_layouts[...]` below
# (0 for the title slide, 1 for the content slides).
# NOTE(review): presumably describes python-pptx's default template — confirm if a
# different template is ever loaded.
""" Ref for slide types:  
0 ->  title and subtitle 
1 ->  title and content 
2 ->  section header 
3 ->  two content 
4 ->  Comparison 
5 ->  Title only  
6 ->  Blank 
7 ->  Content with caption 
8 ->  Pic with caption 
"""

' Ref for slide types:  \n0 ->  title and subtitle \n1 ->  title and content \n2 ->  section header \n3 ->  two content \n4 ->  Comparison \n5 ->  Title only  \n6 ->  Blank \n7 ->  Content with caption \n8 ->  Pic with caption \n'

In [19]:
# Start an empty deck sized 16in x 9in and add the title slide from layout 0.
prs = Presentation()
prs.slide_width = Inches(16)
prs.slide_height = Inches(9)
title_slide = prs.slides.add_slide(prs.slide_layouts[0])


def decide_slide_format(user_query):
    """Ask the model for a deck title plus background and text colours.

    The question asks for a JSON array of the form
    ``[title, {"background": [r, g, b]}, {"text": [r, g, b]}]``; the system
    prompt supplies the selected article summaries as context.

    Args:
        user_query: The user's presentation request, prepended to the question.

    Returns:
        The answer object returned by ``client.answer_question``.
    """
    return client.answer_question(
        question=f"""{user_query} Think of a simple title for this presentation.
        
        Also, think of a good background colour, in RGB format,\
        for the slides and a good colour, also in RGB format, for the\
        text. Typically, if the text colour is bright (for example RGB [255, 255, 255] is white), then the background colour should be dark
        (RGB [0, 0, 100] is dark blue). Conversely, if the text colour is dark (for example RGB [0, 0, 0] is black), the background colour should be bright\
        . You are free to choose any text and background colour, \
        as long as you follow these rules. Please do not assign grey-scale colours for the text and background (like RGB [50, 50, 50]), as much as possible.
        
        Format your reply as a JSON array containing the title and the two colours, following the example below. Do not say anything else.
        Example:
        ["Slide Title", {{"background": [100, 0, 0]}}, {{"text": [255, 255, 255]}}]""",
        system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation.\
        Here are a list of wikipedia entry summaries that are selected for the presentation:
        {chosen_snippets}
        This should give you an idea of what this presentation should be about.
        """,
        llm='mistralai/Mixtral-8x7B-Instruct-v0.1'
    )


# NOTE(review): `format` shadows the Python builtin of the same name; later cells read
# this variable, so it is left as-is here.
format = try_and_parse(user_query, decide_slide_format)

In [20]:
# Show the parsed reply: [title, {'background': [r, g, b]}, {'text': [r, g, b]}].
format

['The Transformers Movie Franchise',
 {'background': [50, 50, 50]},
 {'text': [255, 255, 255]}]

In [21]:
# Each colour entry is a one-item dict; take its first value (the [r, g, b] list)
# regardless of the key the model used.
background_rgb = list(format[1].values())[0]
font_rgb = list(format[2].values())[0]
background = RGBColor(*background_rgb)
font = RGBColor(*font_rgb)

# Paint the title slide with a solid background colour.
fill = title_slide.background.fill
fill.solid()
fill.fore_color.rgb = background

# Set the deck title, then style its first paragraph.
title_shape = title_slide.shapes.title
title_shape.text = format[0]
title_paragraph = title_shape.text_frame.paragraphs[0]
title_paragraph.font.color.rgb = font
title_paragraph.font.name = 'Montserrat'
title_paragraph.font.bold = True

# Reposition the first shape on the slide: 12in wide, horizontally centred, 2in tall.
# The new top is computed from the shape's height *before* it is changed, then
# moved up by one inch; all values are computed first and assigned afterwards.
first_shape = title_slide.shapes[0]
box_width = Inches(12)
new_left = (prs.slide_width - box_width) // 2
new_top = (prs.slide_height - first_shape.height) // 2 - Inches(1)
first_shape.left = new_left
first_shape.top = new_top
first_shape.width = box_width
first_shape.height = Inches(2)

In [22]:

# Width given to the title and body boxes on every content slide.
content_box_width = Inches(14)

# Generate one "title and content" slide per planned section, asking the model
# (with retrieval over the ingested articles) for the body text of each slide.
with client.connect(chat_session_id) as session:

    for section in tqdm(sections):
        slide = prs.slides.add_slide(prs.slide_layouts[1])

        # Same solid background colour as the title slide.
        fill = slide.background.fill
        fill.solid()
        fill.fore_color.rgb = background

        contents = slide.placeholders[1]

        # Slide title: the section name, styled with the chosen text colour.
        title = slide.shapes.title
        title.text = section
        title_paragraph = title.text_frame.paragraphs[0]
        title_paragraph.font.color.rgb = font
        title_paragraph.font.size = Pt(32)
        title_paragraph.font.name = 'Karla'

        # The prompt strings below are sent to the model verbatim, including the
        # whitespace introduced by the line continuations.
        content = session.query(
            message=section,
            system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation. \
            The slides of the presentation are as follows: {sections}
            You are now tasked with generating the content of one slide, which will be provided by the user.
            """,
            pre_prompt_query="You have been provided with the following information, which may be useful in your task.",
            prompt_query="Decide if the information is relevant, and use it if needed. \
            Generate the content required in the slide provided by the user. You only need to generate the contents of the slide, not the title\
            or anything else. Do not provide more than 300 words of information. Generate around 4 paragraphs of text, where each paragraph has around 4 sentences.",
            llm="mistralai/Mixtral-8x7B-Instruct-v0.1",
            rag_config={
                "rag_type": "rag",
            },
        ).content

        contents.text = content

        # Style every run of the generated body text.
        for paragraph in contents.text_frame.paragraphs:
            # NOTE(review): a bare 1 is not wrapped in Pt(...); confirm the intended
            # spacing unit against the python-pptx documentation.
            paragraph.space_after = 1
            paragraph.space_before = 0
            for run in paragraph.runs:
                run.font.size = Pt(16)
                run.font.color.rgb = font
                run.font.name = 'Karla'

        shapes = slide.shapes
        # Centre the 14in boxes on the slide. The previous offset was derived from a
        # 12in width, which pushed the 14in boxes flush against the right slide edge.
        centred_left = (prs.slide_width - content_box_width) // 2
        for shape in (shapes[0], shapes[1]):
            # Read height/top before writing anything and write them back explicitly,
            # as the original did.
            # NOTE(review): presumably this pins the values inherited from the layout
            # once the placeholder gets its own geometry — confirm before removing.
            current_height, current_top = shape.height, shape.top
            shape.height = current_height
            shape.width = content_box_width
            shape.top = current_top
            shape.left = centred_left

# gpt-4-1106-preview


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [04:57<00:00, 24.81s/it]


In [23]:
# Turn the generated title into a file-system-friendly name: every run of
# non-alphanumeric characters (and underscores) collapses to a single underscore.
sanitised = re.sub(r'[\W_]+', '_', format[0])

# Write the finished deck next to the notebook.
output_path = f"./{sanitised}.pptx"
prs.save(output_path)

In [24]:
# Scratch check: horizontal space left over (in EMU) when a 12in box sits on the slide.
(prs.slide_width) - Inches(12)
# Disabled experiment: mirror the body placeholder's left text margin on the right.
#slide.placeholders[1].text_frame.margin_right = slide.placeholders[1].text_frame.margin_left

3657600

In [25]:
#client.delete_documents_from_collection(collection_id)