# DocuPresenter

## Setup



### Install

In [None]:
%%capture

# Environment setup for this notebook. %%capture hides the (long) installer output.
# NOTE(review): only googletrans is pinned; the other packages install whatever version is
# current, so a later re-run may behave differently — consider pinning them.
# NOTE(review): the `markdown` package imported by the PDF cell is not installed here;
# presumably it is preinstalled in the Colab runtime — confirm.

# Updating the apt-get package list
!apt-get update

# Installing wkhtmltopdf for converting HTML to PDF
# NOTE(review): the visible cells render the PDF with WeasyPrint, not wkhtmltopdf —
# confirm whether this system package is still needed.
!apt-get install -y wkhtmltopdf

# Installing Python packages
!pip install -U google-generativeai  # Generative AI functionalities from Google
!pip install googletrans==4.0.0-rc1  # Translation tasks
!pip install --upgrade google-api-python-client  # Interacting with Google APIs
!pip install nltk  # Natural language processing tasks
!pip install PyPDF2  # Handling and manipulating PDF files
!pip install weasyprint CairoSVG  # HTML to PDF conversion and vector graphics handling
!pip install python-docx  # Working with DOCX files


### Import

In [None]:
# Import statements for the notebook

# Google Generative AI
import google.generativeai as palm

# Data handling and processing
import pandas as pd
import numpy as np

# Natural Language Processing
import nltk
from nltk.tokenize import sent_tokenize

# Translation
import googletrans
from googletrans import Translator

# Regular expressions
import re

# PDF handling
import PyPDF2

# Working with DOCX files
from docx import Document

# Provides a way of using operating system dependent functionality like reading or writing to a file system
import os

# Used for manipulating text, such as formatting long strings to a specific width or indenting
import textwrap



### API Key

In [None]:
# Never store the API key in the notebook: anyone who can read the file (or its
# version history) can use it. The key is taken from the GOOGLE_API_KEY environment
# variable when set, otherwise it is requested interactively without being echoed.
# NOTE: a key that was previously committed in this cell should be revoked and rotated.
from getpass import getpass

palm_api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Google Generative AI API key: ")
palm.configure(api_key=palm_api_key)

### Choosing a Model

In [None]:
# Collect every model that can produce text embeddings and use the first one.
models = [m for m in palm.list_models() if 'embedText' in m.supported_generation_methods]

# Fail with an explicit message instead of a bare IndexError when the API key
# has no access to an embedding-capable model.
if not models:
    raise RuntimeError("No model supporting 'embedText' is available for this API key.")

model = models[0]

## Retrieve data from Drive

In [None]:
# Mount Google Drive to Google Colab.
# The drive is exposed under /content/gdrive; the document folder read below and the
# PDF written at the end of the notebook both live under that mount point.
from google.colab import drive
# force_remount=True presumably re-mounts even if the drive is already mounted — confirm.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
folder_path = "/content/gdrive/MyDrive/Test1"

# Maximum number of characters kept per document before it is embedded.
MAX_TEXT_CHARS = 9900

# File extensions (lower-case) this notebook knows how to read.
SUPPORTED_EXTENSIONS = (".pdf", ".docx")


def load_documents(folder_path, max_chars=MAX_TEXT_CHARS):
    """Read the PDF and DOCX files in ``folder_path`` into a ``{filename: text}`` dict.

    Args:
        folder_path: Directory to scan (not recursive).
        max_chars: Each document's text is truncated to this many characters.

    Returns:
        A dict mapping file name to extracted text. Sub-folders, files with an
        unsupported extension and documents without any extractable text are left
        out, so no empty string is ever handed to the embedding step. An empty dict
        is returned (after printing a message) when the folder does not exist.
    """
    documents = {}

    if not os.path.isdir(folder_path):
        print(f"Folder {folder_path} does not exist!")
        return documents

    # Sorted so the resulting dataframe has the same row order on every run.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Lower-cased so "REPORT.PDF" is handled like "report.pdf".
        extension = os.path.splitext(filename)[1].lower()

        if not os.path.isfile(file_path) or extension not in SUPPORTED_EXTENSIONS:
            continue

        if extension == ".pdf":
            with open(file_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                page_texts = [page.extract_text() for page in pdf_reader.pages]
            # Pages without text are dropped; a space separates the pages.
            text = "".join(page_text + " " for page_text in page_texts if page_text)
        else:
            doc = Document(file_path)
            # A space separates the paragraphs.
            text = "".join(para.text + " " for para in doc.paragraphs)

        if not text.strip():
            print(f"Skipping {filename}: no extractable text.")
            continue

        # Slicing already copes with texts shorter than max_chars.
        documents[filename] = text[:max_chars]

    return documents


documents_dict = load_documents(folder_path)

# Convert the dictionary to a dataframe
df = pd.DataFrame(list(documents_dict.items()), columns=['Filename', 'Text'])

df


Unnamed: 0,Filename,Text
0,AI And Machine Learning shorten.pdf,AI And Machine Learning\nArtificial Intelligen...
1,How Management Teams Can Have a Good Fight.pdf,This document is authorized for use only in Ro...


### Embedding

In [None]:
# Compute one embedding per document and store it in a new 'Embeddings' column.
def embed_fn(text):
  """Return the embedding vector the selected model produces for ``text``."""
  response = palm.generate_embeddings(model=model, text=text)
  return response['embedding']

df['Embeddings'] = df['Text'].apply(embed_fn)
df

Unnamed: 0,Filename,Text,Embeddings
0,AI And Machine Learning shorten.pdf,AI And Machine Learning\nArtificial Intelligen...,"[-0.020140609, -0.06332572, -0.0026104725, 0.0..."
1,How Management Teams Can Have a Good Fight.pdf,This document is authorized for use only in Ro...,"[-0.022857517, -0.06090713, 0.021298328, 0.016..."


## Query: PaLM

### Topic & Age Selection

In [None]:
# Presentation topic: used as the retrieval query, inserted into the prompt,
# and included in the name of the generated PDF.
topic = "AI and Machine Learning"

In [None]:
# Age of the target audience, kept as a string because it is only ever
# interpolated into the prompt and into the output file name.
age = "23"

### Prompt

In [None]:
def find_best_passage(topic, dataframe):
  """
  Return the 'Text' of the row whose stored embedding has the largest dot
  product with the embedding of ``topic``.

  ``dataframe`` must provide an 'Embeddings' column and a 'Text' column.
  """
  topic_vector = palm.generate_embeddings(model=model, text=topic)['embedding']
  document_matrix = np.stack(dataframe['Embeddings'])
  similarity = np.dot(document_matrix, topic_vector)
  best_row = np.argmax(similarity)  # position of the highest-scoring document
  return dataframe.iloc[best_row]['Text']

passage = find_best_passage(topic, df)

def make_prompt(topic, age, relevant_passage):
    """Build the text prompt that asks the model for a markdown presentation.

    The passage is flattened to a single line and stripped of single and double
    quotes before it is placed between the quotes of the PASSAGE field.
    """
    cleaned_passage = relevant_passage
    for unwanted, substitute in (("'", ""), ('"', ""), ("\n", " ")):
        cleaned_passage = cleaned_passage.replace(unwanted, substitute)

    template = textwrap.dedent("""\
        You are a helpful and informative bot that creates presentations using text from the reference passage included below.
        I am a teacher for a group of '{age}'-year-old students, please output markdown scripts.
        If the passage is irrelevant to the presentation, you may ignore it.
        Topic: '{topic}'
        PASSAGE: '{relevant_passage}'

        ANSWER:
    """)

    return template.format(topic=topic, age=age, relevant_passage=cleaned_passage)


# Assemble the prompt from the chosen topic, the audience age and the retrieved
# passage, then print it so it can be checked before it is sent to the model.
prompt = make_prompt(topic, age, passage)
print(prompt)

You are a helpful and informative bot that creates presentations using text from the reference passage included below.
I am a teacher for a group of '23'-year-old students, please output markdown scripts.
If the passage is irrelevant to the presentation, you may ignore it.
Topic: 'AI and Machine Learning'
PASSAGE: 'AI And Machine Learning Artificial Intelligence for the Real World Don’t start with moon shots.   by Thomas H. Davenport  and Rajeev Ronanki    In 2013, the MD Anderson  Cancer Center launched a “moon shot”  project: diagnose and recommend treatment plans for certain  forms of cancer using IBM’s Watson cognitive system. But in 2017,  the project was put on hold after costs topped $62  million—and the  system had yet to be used on patients. At the same time, the cancer center’s IT group was experimenting with using cognitive  technologies to do much less ambitious jobs, such as making hotel  and restaurant recommendations for patients’ families,  determining which patients ne

In [None]:
# Collect every model that can generate text and use the first one.
text_models = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods]

# Fail with an explicit message instead of a bare IndexError when the API key
# has no access to a text-generation model.
if not text_models:
    raise RuntimeError("No model supporting 'generateText' is available for this API key.")

text_model = text_models[0]

temperature = 0.5
answer = palm.generate_text(prompt=prompt,
                            model=text_model,
                            temperature=temperature,
                            max_output_tokens=1000)

llm_output = answer.result

# `result` is None when the service returns no candidate (for example when the
# response was filtered). Stop here rather than passing None on to the
# translation and PDF steps, where the failure would be much harder to trace.
if llm_output is None:
    raise RuntimeError(
        "The model returned no text for this prompt "
        f"(filters: {getattr(answer, 'filters', None)})."
    )

## Translation

In [None]:
def translate_text(text, dest_language):
    """Translate ``text`` into ``dest_language`` using googletrans.

    This is best effort: if the translation raises, the error is printed and
    the original ``text`` is returned unchanged.
    """
    translator = Translator()
    try:
        translated = translator.translate(text, dest=dest_language).text
    except Exception as error:
        print(f"Error during translation: {error}")
        translated = text  # fall back to the untranslated input
    return translated


In [None]:
# Target language: passed to translate_text as the destination language and
# included in the name of the generated PDF.
# NOTE(review): presumably googletrans accepts a language name as well as a code — confirm.
lang = "English"

In [None]:
# Translate the generated script into the target language. translate_text returns
# the untranslated text if the translation fails, so this name is always usable below.
translated_llm_output = translate_text(llm_output, lang)

translated_llm_output

'```\n# AI and Machine Learning\n\n## Introduction\n\nArtificial intelligence (AI) and machine learning are powerful technologies that are rapidly changing the world. They are being used to solve a wide range of problems, from improving healthcare to detecting fraud.\n\nIn this presentation, we will discuss the basics of AI and machine learning. We will also explore how these technologies are being used in the real world.\n\n## What is AI?\n\nAI is the ability of a machine to simulate human intelligence. This includes the ability to learn, reason, and solve problems.\n\nMachine learning is a subfield of AI that focuses on developing algorithms that can learn from data. These algorithms can be used to perform a variety of tasks, such as predicting customer behavior or identifying fraud.\n\n## How are AI and machine learning being used in the real world?\n\nAI and machine learning are being used in a wide variety of industries, including healthcare, finance, and retail.\n\nIn healthcare,

## Convert .md to pdf and save to Gdrive

In [None]:
from weasyprint import HTML
import markdown

# Remove a Markdown code fence wrapped around the whole answer (``` or ```markdown).
# Only complete fence markers at the very start/end are removed, so inline code that
# happens to begin or end the text keeps its backticks.
translated_llm_output = translated_llm_output.strip()
translated_llm_output = re.sub(r"^```[\w-]*[ \t]*\n?", "", translated_llm_output)
translated_llm_output = re.sub(r"\n?```$", "", translated_llm_output).strip()

# Split the content into slides: every level-2 heading ("## ") starts a new slide.
slides = re.split(r'\n## ', translated_llm_output)

# Normalise each slide so that it starts with its bare title text. The split removes
# the "## " of every slide but the first, which may still carry its own "# " / "## ";
# stripping those markers avoids a heading such as "# # Title" further down.
slides = [re.sub(r'^#{1,6}[ \t]+', '', slide.strip()) for slide in slides]
slides = [slide for slide in slides if slide]

# Make the text of every remaining heading (levels 2-6) bold.
slides = [
    re.sub(r'^(#{2,6}) (.*)', r'\1 **\2**', slide, flags=re.MULTILINE)
    for slide in slides
]

# Convert each slide to HTML, promoting its title line to a level-1 heading.
html_slides = [
    markdown.markdown(f"# {slide}", extensions=['markdown.extensions.extra'])
    for slide in slides
]

# Page breaks are inserted between the rendered slides. Putting this <div> into the
# Markdown in front of the title would separate the "# " marker from the title and
# produce an empty <h1> followed by the title as a plain paragraph.
PAGE_BREAK = '<div style="page-break-before: always;"></div>'
slides_html = f"\n{PAGE_BREAK}\n".join(html_slides)

# HTML and CSS for the presentation-like format.
# Braces are doubled because the template goes through str.format().
# NOTE(review): the `section` rule matches nothing, since no <section> element is emitted.
presentation_html = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Presentation</title>
    <style>
        @page {{
            size: A4 landscape;
            margin: 0mm;
        }}
        body {{
            font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
            margin: 0;
            padding: 0;
            display: block;
        }}
        section {{
            width: 80%;
            max-width: 1280px;
            margin: 1cm auto;
            page-break-after: always;
            page-break-inside: avoid;
            display: block;
        }}
        h1, h2, h3, h4 {{
            text-align: center;
            margin-top: 0.5cm;
            font-weight: bold;
        }}
        p, li {{
            font-size: 24px;
            line-height: 1.5;
            text-align: left;
            margin-left: 10%;
            margin-right: 10%;
        }}
        ul, ol {{
            padding-left: 20px;
        }}
    </style>
</head>
<body>
    {}
</body>
</html>
""".format(slides_html)

# Set the output file name. Path separators in the topic/language/age would be read
# as directories, so they are replaced with underscores.
output_file_stem = re.sub(r'[\\/]+', '_', f"{topic}_{lang}_{age}")
output_file_name = f"{output_file_stem}.pdf"

# Set the path to save the PDF file (modify as needed)
pdf_file_path = f"/content/gdrive/MyDrive/Test/{output_file_name}"

# Create the target folder if needed; writing into a missing folder would fail.
os.makedirs(os.path.dirname(pdf_file_path), exist_ok=True)

# Generate the PDF from the HTML string and save it to the specified path
HTML(string=presentation_html).write_pdf(pdf_file_path)

print(f"The presentation PDF has been created and saved to {pdf_file_path}.")


The presentation PDF has been created and saved to /content/gdrive/MyDrive/Test/AI and Machine Learning_English_23.pdf.
