# Step 1: Install All the Required Packages

In [55]:
!pip install langchain
!pip install pypdf
!pip install unstructured
!pip install sentence_transformers
!pip install pinecone-client
!pip install huggingface_hub







# Step 2: Import All the Required Libraries

In [1]:
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os

# Step 3: Load the Data

In [2]:
# Point the PDF loader at the source document; the path is relative to the working directory.
loader = PyPDFLoader('Nike_Inc_Info.pdf')

In [3]:
# Read the PDF through the loader; `data` is the resulting list of Document objects.
data = loader.load()

In [4]:
# Preview the first two loaded Document objects.
data[:2]

[Document(page_content='Contents\nNike, Inc.[note 1] (stylized as NIKE) is an American athletic footwear and apparel corporation headquartered near\nBeaverton, Oregon, United States. It is the world\'s largest supplier of athletic shoes and apparel and a major\nmanufacturer of sports equipment, with revenue in excess of US$46 billion in its fiscal year 2022.\nThe company was founded on January 25, 1964, as "Blue Ribbon Sports", by Bill Bowerman and Phil Knight, and\nofficially became Nike, Inc. on May 30, 1971. The company takes its name from Nike, the Greek goddess of victory.\nNike markets its products under its own brand, as well as Nike Golf, Nike Pro, Nike+, Air Jordan, Nike Blazers, Air Force\n1, Nike Dunk, Air Max, Foamposite, Nike Skateboarding, Nike CR7, and subsidiaries including Air Jordan and Converse\n(brand). Nike also owned Bauer Hockey from 1995 to 2008, and previously owned Cole Haan, Umbro, and Hurley\nInternational. In addition to manufacturing sportswear and equipme

# Step 4: Split the Text into Chunks

In [5]:
# Splitter settings: chunks of at most 500 with an overlap of 10 between neighbours
# (units are characters, assuming the splitter's default length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=10,
)

In [6]:
# Break the loaded documents into chunks; `docs` is the list that gets embedded and uploaded in Step 8.
docs=text_splitter.split_documents(data)

In [7]:
# Number of chunks produced by the splitter.
len(docs)

109

In [8]:
# Inspect the first chunk, including its source/page metadata.
docs[0]

Document(page_content='Contents\nNike, Inc.[note 1] (stylized as NIKE) is an American athletic footwear and apparel corporation headquartered near\nBeaverton, Oregon, United States. It is the world\'s largest supplier of athletic shoes and apparel and a major\nmanufacturer of sports equipment, with revenue in excess of US$46 billion in its fiscal year 2022.\nThe company was founded on January 25, 1964, as "Blue Ribbon Sports", by Bill Bowerman and Phil Knight, and', metadata={'source': 'Nike_Inc_Info.pdf', 'page': 0})

# Step 5: Setup the Environment

In [9]:
from getpass import getpass


def _secret_from_env(name):
    """Return the secret held in environment variable `name`.

    If the variable is unset or empty, prompt for it once with hidden input and
    store it in `os.environ` so later cells and libraries can read it.
    """
    value = os.environ.get(name)
    if not value:
        # getpass hides the typed value, so the secret never lands in the notebook or its output.
        value = getpass(f"{name}: ")
        os.environ[name] = value
    return value


# Credentials are never written into the notebook: they come from the environment
# or from an interactive prompt.
# The Hugging Face token only needs to be present in the environment (the LLM cell
# below does not pass it explicitly), so the return value is not kept.
_secret_from_env("HUGGINGFACEHUB_API_TOKEN")
PINECONE_API_KEY = _secret_from_env("PINECONE_API_KEY")
# The Pinecone environment name is not a secret, so a default is acceptable here.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

# Step 6: Download the Embeddings

In [10]:
# Sentence-transformers model used to embed both the chunks and the queries.
# NOTE(review): the Pinecone index dimension must match this model's output size — confirm.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Step 7: Initialize Pinecone

In [11]:
# initialize pinecone
# Connect the Pinecone client.
# The API key is listed at app.pinecone.io; the environment name is shown next to it in the console.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index to use — replace with your own index name.
index_name = "hk"

# Step 8: Create Embeddings for Each Text Chunk

In [12]:
# Embed every chunk and upload it to the Pinecone index.
# from_documents sends each chunk's metadata (source file, page number) along with its text;
# passing only `page_content` to from_texts discarded that metadata, so search hits
# could not be traced back to a page.
# NOTE(review): every run of this cell uploads the chunks again — skip it once the index is populated.
docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [13]:
# Attach to the existing index without uploading anything. Use this cell instead of the
# upload cell above when the index has already been populated.
docsearch = Pinecone.from_existing_index(index_name, embeddings)

# Step 9: Similarity Search

In [14]:
# Example query used to exercise the vector search.
query='Nike World Headquarters'

In [15]:
# Retrieve the chunks most similar to the query.
# NOTE(review): this rebinds `docs`, which until now held the full chunk list from Step 4.
# Re-running the Step 8 upload cell after this one would upload these search hits
# instead of the chunks — consider a distinct name for search results.
docs=docsearch.similarity_search(query)

In [16]:
# Show the retrieved documents (at most the first four).
docs[:4]

[Document(page_content='Figure14: Nike World Headquarters near Beaverton, Oregon\nControversies\nNike has contracted with more than 700 shops around the world and has offices located in 45 countries outside the\nUnited States. Most of the factories are located in Asia, including Indonesia, China, Taiwan, India, Thailand, Vietnam,\nPakistan, Philippines, and Malaysia. Nike is hesitant to disclose information about the contract companies it works with.'),
 Document(page_content="Figure11: Nike Vaporfly cut in half to show the different layers that make up the base of the shoe. The dark grey line shows the carbon fiber plate.\nFigure12: Nike cleat\nFigure13: Nike Elite no-show socks with cushioned sole\nHeadquarters\nNike's world headquarters are surrounded by the city of Beaverton but are within unincorporated Washington County.\nThe city attempted to forcibly annex Nike's headquarters, which led to a lawsuit by Nike, and lobbying by the company"),
 Document(page_content='1990, Nike move

# Import the Libraries for Question Answering

In [17]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
from langchain.chains.question_answering import load_qa_chain

In [18]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# An LLM that uses this callback manager must also be created with verbose enabled.
# NOTE(review): no cell shown here passes `callback_manager` to an LLM (LlamaCpp is
# imported but never instantiated) — confirm whether it is still needed.
     

# Step 10: Query the Docs to get the Answer Back (Hugging Face Model)

In [19]:
from langchain.llms import HuggingFaceHub   

In [48]:
# LLM accessed through the Hugging Face Hub wrapper.
# Authentication presumably comes from HUGGINGFACEHUB_API_TOKEN set in Step 5 — confirm.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    model_kwargs={"temperature": 1, "max_length": 1025},
)

In [49]:
# Build the question-answering chain around the LLM.
# NOTE(review): chain_type="stuff" presumably places all retrieved documents into a single prompt — confirm.
chain=load_qa_chain(llm, chain_type="stuff")

In [53]:
# Retrieve the chunks most similar to the question, then let the chain answer from them.
# The final expression is the answer string that the cell displays.
query='read this question with attention and tell where is Nike World Headquarters ? , also tell me in  which figure it is persent '
docs=docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

'Figure14'

In [54]:
# Same retrieve-then-answer flow as the previous cell, with a different question.
query='who is phil knight and any figure where we can find him'
docs=docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

'Phil Knight is the founder and former CEO of Nike.'