## Documents Q&A

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Load settings from a .env file into the process environment before anything
# else runs; later cells read PINECONE_API_KEY from os.environ.
# override=True is passed so values from the file take precedence over
# variables already set (per python-dotenv's documented behaviour).
load_dotenv(find_dotenv(), override=True)

True

#### Load your documents

In [2]:
def load_document(file):
    """Load a PDF or DOCX document with the matching LangChain loader.

    Args:
        file: Path of the document to load. The notebook also passes a URL
            here to load a PDF from the web.

    Returns:
        The result of the selected loader's ``load()`` call, or ``None`` when
        the file extension is neither ``.pdf`` nor ``.docx``.
    """
    _, extension = os.path.splitext(file)
    # Compare case-insensitively so 'REPORT.PDF' is treated like 'report.pdf'.
    extension = extension.lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    else:
        # Stop here: without a loader there is nothing to call load() on.
        # (Falling through used to raise UnboundLocalError.)
        print('File type not supported!')
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()

def load_from_wiki(query, lang='en', load_max_docs=1):
    """Load Wikipedia content for a query through LangChain's WikipediaLoader.

    Args:
        query: Search text passed to the loader.
        lang: Language code passed to the loader (defaults to ``'en'``).
        load_max_docs: Upper bound on documents, passed to the loader.

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    # load must be *called*; returning the bare attribute hands the caller a
    # bound method instead of the loaded documents.
    return loader.load()

#### Create text chunks

In [8]:
def chunk_data(data, chunk_size, chunk_overlap=0):
    """Split loaded documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split, as returned by a loader's ``load()``.
        chunk_size: ``chunk_size`` setting forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` setting forwarded to the splitter.
            Defaults to 0, the value that was previously hard-coded.

    Returns:
        The result of the splitter's ``split_documents(data)`` call.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

#### Embedding and Uploading to Pinecone

In [45]:
def insert_or_fetch_embeddings(index_name, chunks=None):
    """Return a vector store for ``index_name``, creating the index if needed.

    If an index with this name is already listed by Pinecone, a vector store
    is built from the existing index and ``chunks`` is ignored. Otherwise a
    new serverless index is created and ``chunks`` are embedded and uploaded.

    Args:
        index_name: Name of the Pinecone index to reuse or create.
        chunks: Documents to embed when the index has to be created. Required
            in that case; may be omitted when the index already exists.

    Returns:
        The LangChain ``Pinecone`` vector store bound to the index.

    Raises:
        ValueError: If the index does not exist and ``chunks`` is ``None``.
    """
    import pinecone
    from pinecone import ServerlessSpec
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    pc = pinecone.Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

    # any() stops at the first matching name instead of scanning every index.
    index_exists = any(index['name'] == index_name for index in pc.list_indexes())
    if index_exists:
        return Pinecone.from_existing_index(index_name, embeddings)

    # Validate before create_index so a missing argument does not leave an
    # empty index behind.
    if chunks is None:
        raise ValueError(
            f"Index '{index_name}' does not exist; pass chunks to create and populate it."
        )

    # NOTE(review): dimension=1536 presumably matches the vector size of the
    # default OpenAIEmbeddings model; confirm if the embedding model changes.
    pc.create_index(
        index_name,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )
    return Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [27]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every listed index.

    Args:
        index_name: Name of the index to delete. The default value ``'all'``
            deletes every index returned by ``list_indexes()``.
    """
    import pinecone
    client = pinecone.Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

    # Single-index case first; everything below handles the 'all' sentinel.
    if index_name != 'all':
        client.delete_index(index_name)
        return

    for entry in client.list_indexes():
        client.delete_index(entry['name'])

## Running Q&A

In [9]:
# Load the paper directly from its URL; load_document selects the PDF loader
# from the '.pdf' suffix of the path.
data = load_document('https://arxiv.org/pdf/1706.03762.pdf')

Loading https://arxiv.org/pdf/1706.03762.pdf


In [10]:
# Uncomment to inspect the loaded document: page count, first page text, and its metadata
# print(f'There is a total of {len(data)} pages.')
# print(data[0].page_content)
# print(data[0].metadata)

In [11]:
# Split the loaded document into chunks using a chunk_size of 256.
chunks = chunk_data(data, 256)

In [14]:
#chunks[0].page_content

In [37]:
# Destructive: called with no argument, this deletes every index listed for
# the configured PINECONE_API_KEY. Pass an index name to delete just one.
delete_pinecone_index()

In [47]:
# Reuse the 'attention' index if it already exists; otherwise create it and
# upload the embeddings of `chunks`.
# The returned vector store is not assigned to a variable here; it is only
# displayed as the cell's output.
index_name = 'attention'
insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

<langchain_community.vectorstores.pinecone.Pinecone at 0x14d614790>