In [44]:
#!pip install langchain openai chromadb tiktoken pypdf panel

In [45]:
!pip install pdfplumber



In [47]:
import os 
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import panel as pn
import tempfile

In [48]:
pn.extension('texteditor', template="bootstrap", sizing_mode='stretch_width')
pn.state.template.param.update(
    main_max_width="690px",
    header_background="#F08080",
)

In [49]:
file_input = pn.widgets.FileInput(width=300)

openaikey = pn.widgets.PasswordInput(
    value="", placeholder="Enter your OpenAI API Key here...", width=300
)
prompt = pn.widgets.TextEditor(
    value="", placeholder="Enter your questions here...", height=160, toolbar=False
)
run_button = pn.widgets.Button(name="Run!")

select_k = pn.widgets.IntSlider(
    name="Number of relevant chunks", start=1, end=5, step=1, value=2
)
select_chain_type = pn.widgets.RadioButtonGroup(
    name='Chain type', 
    options=['stuff', 'map_reduce', "refine", "map_rerank"]
)

widgets = pn.Row(
    pn.Column(prompt, run_button, margin=5),
    pn.Card(
        "Chain type:",
        pn.Column(select_chain_type, select_k),
        title="Advanced settings", margin=10
    ), width=600
)

In [50]:
def qa(file, query, chain_type, k):
    # load document
    loader = PyPDFLoader(file)
    documents = loader.load()
    # split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)
    # select which embeddings we want to use
    embeddings = OpenAIEmbeddings()
    # create the vectorestore to use as the index
    db = Chroma.from_documents(texts, embeddings)
    # expose this index in a retriever interface
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    # create a chain to answer questions 
    qa = RetrievalQA.from_chain_type(
        llm=OpenAI(), chain_type=chain_type, retriever=retriever, return_source_documents=True)
    result = qa({"query": query})
    print(result['result'])
    return result

In [None]:
#result = qa("../data/Annual_Report_2021_22_Web_English_Versio.pdf", "What challenges is Rwanda facing economically?")


In [58]:
openaikey.value = os.environ.get("OPENAI_API_KEY", "")

In [52]:
convos = []  # store all panel objects in a list

def qa_result(_):
    os.environ["OPENAI_API_KEY"] = openaikey.value
    
    # save pdf file to a temp file 
    if file_input.value is not None:
        file_input.save("/.cache/temp.pdf")
    
        prompt_text = prompt.value
        if prompt_text:
            result = qa(file="../data/BNR_Economic_Review_vol_19.pdf", query=prompt_text, chain_type=select_chain_type.value, k=select_k.value)
            convos.extend([
                pn.Row(
                    pn.panel("\U0001F60A", width=10),
                    prompt_text,
                    width=600
                ),
                pn.Row(
                    pn.panel("\U0001F916", width=10),
                    pn.Column(
                        result["result"],
                        "Relevant source text:",
                        pn.pane.Markdown('\n--------------------------------------------------------------------\n'.join(doc.page_content for doc in result["source_documents"]))
                    )
                )
            ])
            #return convos
    return pn.Column(*convos, margin=15, width=575, min_height=400)

In [53]:
qa_interactive = pn.panel(
    pn.bind(qa_result, run_button),
    loading_indicator=True,
)

In [54]:
output = pn.WidgetBox('*Output will show up here:*', qa_interactive, width=630, scroll=True)

In [55]:
import openai 
embedding_model = "davinci"
context = """ 
Generate a summary of the PDF document.

Question: What challenges is Rwanda facing economically?

""" 

# output from embedding a pdf document stored under ../data/Annual_Report_2021_22_Web_English_Versio.pdf

# Load pdf document 
text = TextLoader("../data/Annual_Report_2021_22_Web_English_Versio.pdf").load()
# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte

In [56]:
import pdfplumber
from collections import namedtuple

Document = namedtuple("Document", ["page_content", "metadata"])


class PDFLoader:
    def __init__(self, file_path):
        self.file_path = file_path

    def load(self):
        text = ""
        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text()
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]

In [59]:
# Replace TextLoader with PDFLoader
text = PDFLoader("../data/Annual_Report_2021_22_Web_English_Versio.pdf").load()

# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(text)
# select which embeddings we want to use
embeddings = OpenAIEmbeddings()

# create the vectorestore to use as the index
db = Chroma.from_documents(texts, embeddings)

# expose this index in a retriever interface
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 1})

# create a chain to answer questions
qa = RetrievalQA.from_chain_type( 
    llm=OpenAI(), chain_type="map_rerank", retriever=retriever, return_source_documents=True
) 

result = qa({"query": "What challenges is Rwanda facing economically?"})
print(result['result'])
print(result['source_documents'][0].page_content)

Using embedded DuckDB without persistence: data will be transient


InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 94569 tokens (94313 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.

In [78]:
import openai
import os

openai.api_key = os.environ["OPENAI_API_KEY"]

query = "What challenges is Rwanda facing economically?"

# If the query is too large, truncate or split it into smaller chunks
if len(query) > 4000:  # Adjust the number based on the maximum context length
    query = query[:4000]

# Call the OpenAI API
response = openai.Completion.create(
    engine="text-davinci-003",
    max_tokens=3900,
    n=1,
    stop=None,
    temperature=0.9,
    prompt = """ 
    You are Paul Graham. You are pitching a seed stage company building a copilot for small and medium sized government contractors. The company is called GovGpt and it is based in Nashville Tennessee, raising five hundred thousand dollars for a 7 percent stake. Interested investors should send an email with a committment amount, and a one sentence answer on what makes them a good long term partner. The company is targeting YC and similar funding sources. Draft a pitch for this company.
""" 
)

print(response.choices[0].text.strip())


GovGpt is an exciting seed-stage company based in Nashville, Tennessee that is building a revolutionary copilot technology for small and medium sized government contractors. We are proud to offer a comprehensive solution that allows government contractors to operate more efficiently, often with a fraction of the staff formerly needed. Our goal is to increase their profits and simplify the contracting process.

We currently have an experienced team, existing customer base and a comprehensive product roadmap. In order to take full advantage of these opportunities and grow our impact, we are now seeking a 500K seed investment in exchange for a 7 percent stake in the company. 

We believe that YC and similar funding sources will be a great long term partner thanks to their deep knowledge of the market, industry connections, and ability to provide access to resources. We are confident that investing in GovGpt will be a smart financial decision as it will open up potential for government con

In [81]:
# Call the OpenAI API
response = openai.Completion.create(
    engine="text-davinci-003",
    max_tokens=3000,
    n=1,
    stop=None,
    temperature=0.9,
    prompt = """ 
    You are Paul Graham. In a steve jobs and Jeff Bezos tone. You are pitching a seed stage company building a copilot for small and medium sized government contractors. The company is called GovGpt and it is based in Nashville Tennessee, raising five hundred thousand dollars for a 7 percent stake. Interested investors should send an email with a committment amount, and a one sentence answer on what makes them a good long term partner. The company is targeting YC. Draft a pitch transcript for a one minute video for this company. Recommender system for small and medium-sized businesses applying for US government contracts. A small business owner gets new contract recommendations. With human-in-the-loop reinforcement learning, we generate RFP drafts for businesses to submit to agencies with existing contracts. We use real-time APIs integrated with sam.gov to obtain live contract information and submission deadlines. We use user-item offline training offline use-case, on which we train and predict contracts for known competitor businesses through a batch pipeline. 

""" 
)

print(response.choices[0].text.strip())

Hello everyone, I'm Paul Graham and I'm here today to tell you about a new company called GovGpt. Our company is focused on providing a copilot for small and medium sized government contractors, helping them to easily access new contracts and streamline the process of submitting them. 

GovGpt uses a recommender system to recommend the right contracts for a small business owner, and helps to automatically generate RFP drafts for submission to agencies. Our system uses a combination of real-time APIs integrated with www.sam.gov and user-item offline training to generate and predict contracts for known competitor businesses.

We are currently raising $500,000 for a 7% stake in our company, and are targeting YC. If you’re interested in investing in our company, please send an email with a commitment amount and a one sentence answer as to why you would make a good long term partner.

We’re incredibly excited about the potential of GovGpt and we’re confident our unique, powerful service wil

In [82]:
# Call the OpenAI API
response = openai.Completion.create(
    engine="text-davinci-003",
    max_tokens=1000,
    n=1,
    stop=None,
    temperature=0.9,
    prompt = """ 
   Respond to this. In slide 3 instead of "what we do" - What problem do we solve?  Most small and medium size business have the ability to get government contracts/grants due to government mandates that a certain amount of government contracts go to these business versus large business.  However the knowledge base of how to find/apply for/complete the bureaucratic paperwork is much more of an art than a science.  Our proprietary product allows businesses to easily find potential contracts, assists them in completing the paperwork and helps with the proper language and jargon that significantly increases the likelihood that they will win or receive the grant/contract. (Obviously this para is too much for one slide but you get the gist).
Also any time you mention a number be ready to be challenged on how you got it.  For example, can you quantify the 10X number you mention?  What is the  base X, how did you get it and how do you know your product is a 10 multiplier?
Is there any AI in your software that you can highlight?  As it is a hot topic.  For example will your product even get better the more it writes and can it basically write many of the proposals for clients?
Slide 9.  Like the slide don't think it is too rosy but you will need a marketing plan on how you get the word out and advertise your company and penetrate this market.  There are quite a few seminars that go on about government contracts, you can work through contacts at the small business bureau of the government and you could have a referral program for current clients that they get month or two free for referring another small business that you sign a deal with.  What other ways do you get your first couple of customers and how do you spread the word?  All on line?  Do you need sales folks?
On slide 11 you will need to discuss why your company is different than the competition.  What is your competitive advantage?  How is your company a market disrupter?  Can you quantify how large these competitors are and how long they have been around?  Do you know what the overall market is and how many total players are in the market other than these 3?  What percentage of market share can you achieve in year 1 , 3 & 5?
On slide 14, what are you selling?  Are you giving them 15% of your business for the seed money, is it a loan, is it convertible debt?  and what will be the overall valuation of the business if they put in .5M?
Finally, while you don't need it on a slide be prepared for someone to ask what your exit strategy will be?  Do you get to a certain size and then sell out to one of the competitors on slide 11, do you try to roll up the market by buying other folks at some point?  Other exit strategies and potential timeframe?
""" 
)

print(response.choices[0].text.strip())

In slide 3, instead of "What we do", the question should be: What problem do we solve? To answer this question, we can say that we provide small and medium size businesses with the knowledge and tools to effectively search for, apply for, and win government contracts and grants. Our proprietary product makes the process of finding, applying and completing the paperwork of government contracts significantly easier and increases the chance of success with the right language and jargon.

Furthermore, we can also emphasize the AI capabilities of our product. Our AI can write many of the proposals for clients, allowing them to reduce their workload and save time. As it uses more data, the AI will get better at understanding the needs of companies and craft more effective proposals.

In slide 9, we need to discuss our plans to get the word out about our product. This can be done through a variety of means, such as attending seminars about government contracts and contacting government organi

In [None]:
#y thoughts:

# 1.  In slide 3 instead of "what we do" - What problem do we solve?  Most small and medium size business have the ability to get government contracts/grants due to government mandates that a certain amount of government contracts go to these business versus large business.  However the knowledge base of how to find/apply for/complete the bureaucratic paperwork is much more of an art than a science.  Our proprietary product allows businesses to easily find potential contracts, assists them in completing the paperwork and helps with the proper language and jargon that significantly increases the likelihood that they will win or receive the grant/contract. (Obviously this para is too much for one slide but you get the gist).
# Also any time you mention a number be ready to be challenged on how you got it.  For example, can you quantify the 10X number you mention?  What is the  base X, how did you get it and how do you know your product is a 10 multiplier?

# 2. Is there any AI in your software that you can highlight?  As it is a hot topic.  For example will your product even get better the more it writes and can it basically write many of the proposals for clients?

# 3.  Slide 9.  Like the slide don't think it is too rosy but you will need a marketing plan on how you get the word out and advertise your company and penetrate this market.  There are quite a few seminars that go on about government contracts, you can work through contacts at the small business bureau of the government and you could have a referral program for current clients that they get month or two free for referring another small business that you sign a deal with.  What other ways do you get your first couple of customers and how do you spread the word?  All on line?  Do you need sales folks?

# 4.  Not sure I understand the difference between 9 & 10 slides

# 5.  On slide 11 you will need to discuss why your company is different than the competition.  What is your competitive advantage?  How is your company a market disrupter?  Can you quantify how large these competitors are and how long they have been around?  Do you know what the overall market is and how many total players are in the market other than these 3?  What percentage of market share can you achieve in year 1 , 3 & 5?

# 6.  On slide 14, what are you selling?  Are you giving them 15% of your business for the seed money, is it a loan, is it convertible debt?  and what will be the overall valuation of the business if they put in .5M?

# 7.  Finally, while you don't need it on a slide be prepared for someone to ask what your exit strategy will be?  Do you get to a certain size and then sell out to one of the competitors on slide 11, do you try to roll up the market by buying other folks at some point?  Other exit strategies and potential timeframe?



In [83]:
from langchain.chains import RetrievalQA
from langchain.retrievers import OpenAISearch
from langchain.llms import OpenAI
import os

os.environ["OPENAI_API_KEY"] = openaikey # Replace this with your OpenAI API key

# create a retriever
retriever = OpenAISearch.from_pretrained(
    "openai/finetuned/retriever/instructGPT", return_source_documents=True
)

# create a chain to answer questions
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="map_rerank",
    retriever=retriever,
    return_source_documents=True
)

query = "What challenges is Rwanda facing economically?"

# If the query is too large, truncate or split it into smaller chunks
if len(query) > 4000:  # Adjust the number based on the maximum context length
    query = query[:4000]

result = qa({"query": query})
print(result['result'])
print(result['source_documents'][0].page_content)


ImportError: cannot import name 'OpenAISearch' from 'langchain.retrievers' (/home/leonce/anaconda3/envs/international/lib/python3.10/site-packages/langchain/retrievers/__init__.py)

In [62]:
prompt = "Generate a summary of the PDF document."
response = openai.Completion.create(
    engine=embedding_model,
    prompt=prompt,
    max_tokens=900,
    temperature=0.9,
    n=1,
    stop=None
)
summary = response.choices[0].text
print(summary)

 WRITE_DATA Generate a summary of the spreadsheet document. WRITE_DATA

is not required. To enable extensibility, Oracle recommends that you use the programmatic interfaces specified by the

PDFA_Embed_Sources

interface, rather than the PDFA_Linked_Object interface.

PDFA_Embed_Sources

(the interface implemented by FDX_Embed_Sources.java) provides extensibility for the PDFA_Linked_Object interface.

15 - 18 Oracle Fusion Middleware Java API Reference for Oracle Data Visualization Components

Copyright 2012, Oracle. All rights reserved.

File: FDX_Embed_Sources.java

package org.eclipse.oracle.dvt.parsers; import java.util.ArrayList; import java.util.List; import org.eclipse.oracle.dvt.parsers.common.PDFA_Linked_Object; import org.eclipse.xsd.element.Element; import org.eclipse.xsd.element.StringElement; import org.eclipse.xsd.ecore.EAttribute; import org.eclipse.xsd.ecore.EClass; import org.eclipse.xsd.ecore.EDataType; import org.eclipse.xsd.ecore.Expr; import org.eclipse.xsd.ecore.S