In [1]:
# pip install langchain --upgrade
# Version: 0.0.164

# !pip install pypdf

In [2]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [2]:
with open("./data/data.txt", "r", encoding="utf-8") as file:
    data = file.readlines()

## Other options for loaders 
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [3]:
data

['Mike Taber: This is Startups for the Rest of Us: Episode 1.\n',
 '\n',
 '[music]\n',
 '\n',
 'Mike: Welcome to Startups For the Rest of Us, the podcast that helps developers be awesome at launching software products, whether you have built your first product or are just thinking about.  I’m Mike.\n',
 '\n',
 'Rob Walling: And I’m Rob.\n',
 '\n',
 'Mike: And we’re here to share our experiences to help you avoid the same mistakes we’ve made.  So Rob, exactly why are we here again?\n',
 'Rob: I have no idea!  We’ve decided to do some crazy thing and record a podcast; something that neither of us have ever done.\n',
 '\n',
 'Mike: Well, that’s true, but I think that the important thing that we are really here for is to share the things that we’ve done in the past and, most importantly, share the mistakes that we’ve made over the past several years in building out our product portfolios and building up our companies.\n',
 '\n',
 'Rob: Yeah, and I think along with the mistakes come the suc

In [4]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
#print (f'There are {len(data[30].page_content)} characters in your document')

You have 8797 document(s) in your data


### Chunk your data up into smaller documents

In [5]:
# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.
# This is optional, test out on your own data.

def merge_lines(lines, char_threshold=500):
    chunks = []
    buffer = ""
    for line in lines:
        if len(buffer) + len(line) < char_threshold:
            buffer += line + " "
        else:
            chunks.append(buffer.strip())
            buffer = line + " "
    if buffer:
        chunks.append(buffer.strip())
    return chunks

texts = merge_lines(data, 2000)

In [6]:
print (f'Now you have {len(texts)} documents')

Now you have 691 documents


### Create embeddings of your documents to get ready for semantic search

In [7]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [8]:
# Fill in keys or use environment variable
OPENAI_API_KEY = ''

PINECONE_API_KEY = ''
PINECONE_API_ENV = ''

In [9]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [10]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "" # put in the name of your pinecone index here

In [11]:
docsearch = Pinecone.from_texts([t for t in texts], embeddings, index_name=index_name)

In [12]:
query = "example question"
docs = docsearch.similarity_search(query)

In [13]:
# Here's an example of the first document that was returned
print(docs)

[Document(page_content='[25:57] The weird part about it is you can say, OK, well just price your Kindle book at $9.99. But then that totally undercuts all your other sales channels. If you’re selling a hardback or a paperback, or even a PDF EPUB version like I do, if you’re selling those for 24 bucks direct, then you charge $9.99, you’re going to totally just undercut those sales. So it’s really an interesting paradigm. I can see why publishers are having kind of a conniption over it.\n \n [26:22] [music]\n \n [26:26] Ilya: Hey, guys. My name is Ilya. I’m a student and I’m a software developer who’s trying to launch several products. In one of your podcasts you’ve mentioned that to get your website started from zero to X number of visitors, you said that you need to create a landing page and just basically collect email addresses.\n \n [26:45] This is my main concern. When the product is ready, how do I get people to use it? Can you possibly provide some more tips on the subject? Thank

### Query those docs to get your answer back

In [25]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [26]:
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [38]:
query = "example question"
docs = docsearch.similarity_search(query)

In [30]:
chain.run(input_documents=docs, question=query)

' Dr K suggests that you take the yogic approach of preventing infection by filtering your perceptions, create a gap between what your actual sensory perception is and what your reaction to it is. He also suggests that you control what colonizes your mind and restrict as much as you can the sensory inputs. Finally, he suggests that you pause and reflect after achieving something, asking yourself questions about the build up, the experience, and the contentment afterwards.'