In [1]:
import json
import numpy as np
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.util import Cm, Pt, Inches
from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
from h2ogpte import H2OGPTE
from mediawikiapi import MediaWikiAPI
from tqdm import tqdm
with open('secrets.txt') as f:
    api = f.read()
    f.close()



In [2]:
client = H2OGPTE(
    address="https://h2ogpte.genai.h2o.ai",
    api_key=api
)

user_query = 'Create a presentation on chocolate bars'

In [3]:
def try_and_parse(user_query, function, failed=0,):
    chosen = function(user_query)
    try:
        topics = json.loads(chosen.content)
        return topics
    except:
        failed+=1
        print(failed) # CHANGE TO LOGGING STATEMENT
        try_and_parse(user_query, function, failed=failed)


In [4]:
search = lambda user_query: client.answer_question(
    question=user_query,
    system_prompt="""You are an assistant whose task is to perform Wikipedia searches on a specific topic.\
    The user is interested to create a presentation about a topic of interest.\
    Reply with at least one corresponding Wikipedia query as an array in JSON format.\
    Only reply with the JSON array and nothing else. Here are some examples.
    Example 1.
    User: Create a presentation on the book, Baby Rudin.
    Assistant: ["Real analysis", "Mathematical Analysis"]

    Example 2.
    User: I want to create a ppt about Milk.
    Assistant: ["Milk", "Plant Milk", "Animal Milk", "Almond Milk"]
    """,
    llm='mistralai/Mixtral-8x7B-Instruct-v0.1' # i like this model
)
searched = try_and_parse(user_query, search)
searched

['Chocolate bar',
 'History of chocolate',
 'Types of chocolate bars',
 'Chocolate bar ingredients',
 'Chocolate bar manufacturing process',
 'Chocolate bar brands']

In [5]:
wiki = MediaWikiAPI()

articles = list(
    set(
        np.array(
            [wiki.search(cat, results=5) for cat in searched]
        ).flatten()
    )
)

# remove duplicates with set(list())

In [6]:
snippet = list(map(lambda x: wiki.summary(x, auto_suggest=False, sentences=1), articles))
snippet_ = snippet.copy()
i = 0
for string in snippet:
    string = f"{i}. {string}"
    snippet[i] = string
    i+=1
snippet_text = "\n\n".join(snippet)
print(snippet_text)

0. Ruby chocolate is a style or distinct variety of chocolate that is pink or purple in colour.

1. Milk chocolate is a form of solid chocolate containing cocoa, sugar and milk.

2. The history of chocolate in Spain is part of the culinary history of Spain as understood since the 16th century, when the colonisation of the Americas began and the cocoa plant was discovered in regions of Mesoamerica, until the present.

3. Aero is an aerated chocolate bar manufactured by the Vevey-based company Nestlé.

4. This is a list of chocolate bar brands, in alphabetical order, including discontinued brands.

5. The history of chocolate dates back over 5,000 years.

6. A chocolate bar is a confection containing chocolate, which may also contain layerings or mixtures that include nuts, fruit, caramel, nougat, and wafers.

7. Bounty is a coconut-filled, chocolate-enrobed candy bar manufactured by Mars, Incorporated, introduced in 1951 in the United Kingdom and Canada.

8. Gianduia or gianduja (Italia

In [7]:
choose_topics = lambda user_query: client.answer_question(
        question=user_query,
        system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation.\
        Here are a list of wikipedia entries, starting from the 0th entry, that may or may not be related to the topic at hand:
        {snippet_text}
        
        Decide on which topics are useful for the presentation\
        and return these topics as an array of integers in JSON. For example, if only choices 0, 2, 4, and 9 are useful, your reply should be: [0, 2, 4, 9].
        If all the choices are useful, and there are 6 entries (from 0 to 5) then your reply should be: [0, 1, 2, 3, 4, 5].
        Do not say anything else. 
        """,
        llm='mistralai/Mixtral-8x7B-Instruct-v0.1' 
    )

topics = try_and_parse(user_query, choose_topics)
print(topics)

[0, 1, 2, 6, 11, 12, 14, 15, 17, 18, 19]


In [8]:
articles

['Ruby chocolate',
 'Milk chocolate',
 'History of chocolate in Spain',
 'Aero (chocolate bar)',
 'List of chocolate bar brands',
 'History of chocolate',
 'Chocolate bar',
 'Bounty (chocolate bar)',
 'Gianduja (chocolate)',
 'Chocolate',
 'Hot chocolate',
 'Galaxy (chocolate bar)',
 'Dark chocolate',
 'Types of chocolate',
 'Hershey bar',
 'Milky Way (chocolate bar)',
 'Chocolate chip cookie',
 'Dove (chocolate brand)',
 'Flake (chocolate bar)',
 'White chocolate']

In [9]:
chosen_articles = [articles[i] for i in topics]
chosen_snippets = [snippet_[i] for i in topics]
chosen_articles

['Ruby chocolate',
 'Milk chocolate',
 'History of chocolate in Spain',
 'Chocolate bar',
 'Galaxy (chocolate bar)',
 'Dark chocolate',
 'Hershey bar',
 'Milky Way (chocolate bar)',
 'Dove (chocolate brand)',
 'Flake (chocolate bar)',
 'White chocolate']

In [10]:
chosen_full_articles = list(map(lambda x: wiki.page(x, auto_suggest=False).content, chosen_articles))
chosen_articles_images = list(map(lambda x: wiki.page(x, auto_suggest=False).images, chosen_articles))
# # Create documents
# # Note: Done for demonstration purposes only (not usually needed)
# with open('dunder_mifflin.txt', 'w') as f:
#     f.write('There were 55 paper clips shipped, 22 to Scranton and 33 to Filmer.')
    
# with open('initech.txt', 'w') as f:
#     f.write('David Brent did not sign any contract with Initech.')
    
# # Upload documents
# # Many file types are supported: text/image/audio documents and archives
# with open('dunder_mifflin.txt', 'rb') as f:
#     dunder_mifflin = client.upload('Dunder Mifflin.txt', f)
    
# with open('initech.txt', 'rb') as f:
#     initech = client.upload('IniTech.txt', f)

# # Ingest documents (Creates previews, chunks and embeddings)
# client.ingest_uploads(collection_id, [dunder_mifflin, initech])


In [11]:

    
# now its time to store them for RAG
import os


collection_id = client.create_collection(
    name='Articles',
    description='wikipedia articles for presentation',
)

pages = dict(zip(chosen_articles, chosen_full_articles))



In [12]:
import re
to_ingest = []
for title, content in pages.items():
    title = re.sub('[\W_]+', '', title)
    name = f"./articles/{title}.txt"
    f = open(name, "w+", encoding="utf-8")
    f.write(content)
    f.close() # dont know why i gotta do this, i think it has to be in binary
    f = open(name, 'rb')
    to_ingest.append(client.upload(name, f))
    print(f"{name} fed!")
    f.close() 

client.ingest_uploads(collection_id, to_ingest)  

./articles/Rubychocolate.txt fed!
./articles/Milkchocolate.txt fed!
./articles/HistoryofchocolateinSpain.txt fed!
./articles/Chocolatebar.txt fed!
./articles/Galaxychocolatebar.txt fed!
./articles/Darkchocolate.txt fed!
./articles/Hersheybar.txt fed!
./articles/MilkyWaychocolatebar.txt fed!
./articles/Dovechocolatebrand.txt fed!
./articles/Flakechocolatebar.txt fed!
./articles/Whitechocolate.txt fed!


Job(id='0e3b795d-ee0a-412f-9f83-426b8b8378ab', passed=1.0, failed=0.0, progress=1.0, completed=True, canceled=False, date=datetime.datetime(2024, 2, 26, 16, 48, 22, tzinfo=TzInfo(UTC)), kind=<JobKind.IngestUploadsJob: 'IngestUploadsJob'>, statuses=[JobStatus(id='5284b5fd5ec4479fb8d0cf789be7e18d', status='Indexing done.'), JobStatus(id='05a2a39c58054a818bf4fb83b921e8cd', status='Indexing done.'), JobStatus(id='20d5f684b823432ea55db1d875208ec9', status='Indexing done.'), JobStatus(id='996b15dc29024cf8bfbb2510cccbf0a3', status='Indexing done.'), JobStatus(id='e1f8daeedfd84a0fb47d6e905a57ce0e', status='Indexing done.')], errors=[], last_update_date=datetime.datetime(2024, 2, 26, 16, 48, 38, tzinfo=TzInfo(UTC)), duration='16s')

In [13]:

# Create a chat session
# chat_session_id = client.create_chat_session(collection_id)

# # Query the collection
# with client.connect(chat_session_id) as session:
#     reply = session.query(
#         'How many paper clips were shipped to Scranton?',
#         llm="gpt-4-0613"
#     )
#     print(reply.content)

#     reply = session.query(
#         'Did David Brent co-sign the contract with Initech?',
#         timeout=60,
#         llm="gpt-4-0613"
#     )
#     print(reply.content)

# # Summarize each document
# documents = client.list_documents_in_collection(collection_id, offset=0, limit=99)
# for doc in documents:
#     summary = client.summarize_document(
#         document_id=doc.id,
#         timeout=60,
#     )
#     print(summary.content)


#client.delete_documents_from_collection

In [14]:
chosen_snippets

['Ruby chocolate is a style or distinct variety of chocolate that is pink or purple in colour.',
 'Milk chocolate is a form of solid chocolate containing cocoa, sugar and milk.',
 'The history of chocolate in Spain is part of the culinary history of Spain as understood since the 16th century, when the colonisation of the Americas began and the cocoa plant was discovered in regions of Mesoamerica, until the present.',
 'A chocolate bar is a confection containing chocolate, which may also contain layerings or mixtures that include nuts, fruit, caramel, nougat, and wafers.',
 'Galaxy is a chocolate bar, made and marketed by Mars Inc., and first manufactured in the United Kingdom in the 1960s.',
 'Dark chocolate is a form of chocolate containing only cocoa solids, cocoa butter and sugar.',
 "The Hershey's Milk Chocolate Bar (commonly called the Hershey's Bar, or more simply the Hershey Bar) is a flagship chocolate bar manufactured by The Hershey Company.",
 'Milky Way is a brand of chocola

In [None]:
decide_sections = lambda user_query: client.answer_question(
        question=user_query,
        system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation.\
        Here are a list of wikipedia entry summaries that are selected for the presentation:
        {chosen_snippets}
        
        Now, plan the presentation by replying a JSON array \
        consisting of slide titles, starting from the first slide to the last slide. Do not include the title slide. Do not say anything else.

        Here is an example reply:
        ["Introduction to Cookies", "History of cookies", "Types of Cookies", "Health Concerns", "Conclusion"]
        """,
        llm='mistralai/Mixtral-8x7B-Instruct-v0.1' 
    )

sections = try_and_parse(user_query, decide_sections)

In [None]:
sections

In [None]:
chat_session_id = client.create_chat_session(collection_id)
chat_session_id

In [None]:
""" Ref for slide types:  
0 ->  title and subtitle 
1 ->  title and content 
2 ->  section header 
3 ->  two content 
4 ->  Comparison 
5 ->  Title only  
6 ->  Blank 
7 ->  Content with caption 
8 ->  Pic with caption 
"""

In [None]:
prs = Presentation()
prs.slide_width = Inches(16)
prs.slide_height = Inches(9)
title_slide = prs.slides.add_slide(prs.slide_layouts[0]) 
decide_slide_format = lambda user_query: client.answer_question(
        question=f"""{user_query} Think of a simple title for this presentation.
        
        Also, think of a good background colour, in RGB format,\
        for the slides and a good colour, also in RGB format, for the\
        text. Typically, if the text colour is bright (for example RGB [255, 255, 255] is white), then the background colour should be dark
        (RGB [0, 0, 100] is dark blue). Conversely, if the text colour is dark (for example RGB [0, 0, 0] is black), the background colour should be bright\
        . You are free to choose any text and background colour, \
        as long as you follow these rules. Please do not assign grey-scale colours for the text and background (like RGB [50, 50, 50]), as much as possible.
        
        Format your reply as a JSON array containing the title and the two colours, following the example below. Do not say anything else.
        Example:
        ["Slide Title", {{"background": [100, 0, 0]}}, {{"text": [255, 255, 255]}}]""",
    
        system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation.\
        Here are a list of wikipedia entry summaries that are selected for the presentation:
        {chosen_snippets}
        This should give you an idea of what this presentation should be about.
        """,
        llm='mistralai/Mixtral-8x7B-Instruct-v0.1' 
)

format = try_and_parse(user_query, decide_slide_format)

In [None]:
format

In [None]:
background = RGBColor(*tuple(list(format[1].values())[0])) 
font = RGBColor(*tuple(list(format[2].values())[0])) 
fill = title_slide.background.fill
fill.solid()
fill.fore_color.rgb = background


title_slide.shapes.title.text = format[0]
title_slide.shapes.title.text_frame.paragraphs[0].font.color.rgb =  font
title_slide.shapes.title.text_frame.paragraphs[0].font.name = 'Montserrat'
title_slide.shapes.title.text_frame.paragraphs[0].font.bold = True

first_shape =  title_slide.shapes[0]
first_shape.left, first_shape.top, first_shape.width, first_shape.height = (prs.slide_width - Inches(12))//2, \
(prs.slide_height-first_shape.height)//2 - Inches(1),\
Inches(12),\
Inches(2)

In [None]:

with client.connect(chat_session_id) as session:

    for section in tqdm(sections):
        slide = prs.slides.add_slide(prs.slide_layouts[1])
        fill = slide.background.fill
        fill.solid()
        fill.fore_color.rgb = background

        
        contents = slide.placeholders[1]

        title = slide.shapes.title
        title.text = section
        title.text_frame.paragraphs[0].font.color.rgb = font
        title.text_frame.paragraphs[0].font.size = Pt(32)
        title.text_frame.paragraphs[0].font.name = 'Karla'
       
       
        content = session.query(
            
            message = section,
            system_prompt=f"""You are an assistant whose task is to help a user in creating a presentation. \
            The slides of the presentation are as follows: {sections}
            You are now tasked with generating the content of one slide, which will be provided by the user.
            """,
            pre_prompt_query="You have been provided with the following information, which may be useful in your task.",
            prompt_query="Decide if the information is relevant, and use it if needed. \
            Generate the content required in the slide provided by the user. You only need to generate the contents of the slide, not the title\
            or anything else. Generate a maximum of 3 paragraphs of text. Keep to a word limit of 250 words. \
            Do not use numbered lists.",
            llm="mistralai/Mixtral-8x7B-Instruct-v0.1",
            rag_config={
                "rag_type": "rag",
            },
        ).content

        contents.text = content
        
        for paragraph in contents.text_frame.paragraphs:
            paragraph.space_after = 1
            paragraph.space_before = 0
            for run in paragraph.runs:
                run.font.size = Pt(18)  
                run.font.color.rgb = font
                run.font.name = 'Karla'

        shapes = slide.shapes
        new_width = Inches(14)
        new_height = Inches(7)
        shapes[0].height, shapes[0].width, shapes[0].top, shapes[0].left = shapes[0].height, new_width, shapes[0].top, (prs.slide_width-new_width)//2
        shapes[1].height, shapes[1].width, shapes[1].top, shapes[1].left = new_height, new_width, shapes[1].top, (prs.slide_width-new_width)//2
        
        

# gpt-4-1106-preview


In [None]:
sanitised = re.sub(r'[\W_]+', '_', format[0])
prs.save(f"./presentations/{sanitised}.pptx")

In [None]:
(prs.slide_width) - Inches(12)
#slide.placeholders[1].text_frame.margin_right = slide.placeholders[1].text_frame.margin_left

In [None]:
#client.delete_documents_from_collection(collection_id)