Import Libraries

In [4]:
from pinecone import Pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

In [5]:
from dotenv import load_dotenv

# Load environment variables via python-dotenv (presumably from a local .env file);
# later cells read OPENAI_API_KEY and PINECONE_API_KEY from the environment.
load_dotenv()

True

In [6]:
import os

Read the PDF

In [7]:
def read_doc(dir):
    """Load the PDF documents of a directory through ``PyPDFDirectoryLoader``.

    Args:
        dir: Directory path passed straight to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call (the loaded documents).
    """
    return PyPDFDirectoryLoader(dir).load()

In [8]:
# Load everything under the relative 'documents/' folder with read_doc.
doc = read_doc('documents/')
# Bare expression: displays the full list of loaded Document objects as cell output.
doc

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-11-04T19:51:24+08:00', 'author': 'Ritika Chopra and Gagan Deep Sharma', 'keywords': 'artificial intelligence; neural networks; training algorithm; NVivo; stock market forecast', 'moddate': '2021-11-04T12:56:51+01:00', 'subject': 'The stock market is characterized by extreme fluctuations, non-linearity, and shifts in internal and external environmental variables. Artificial intelligence (AI) techniques can detect such non-linearity, resulting in much-improved forecast results. This paper reviews 148 studies utilizing neural and hybrid-neuro techniques to predict stock markets, categorized based on 43 auto-coded themes obtained using NVivo 12 software. We group the surveyed articles based on two major categories, namely, study characteristics and model characteristics, where ‘study characteristics’ are further categorized as the stock market covered, input data, and nature of the stu

In [9]:
# Number of loaded Document objects (presumably one per PDF page — confirm against the loader).
len(doc)

34

Split the Documents into Chunks

In [10]:
def chunk_data(docs, chunk_size = 800, chunk_overlap=100):
    """Split documents into smaller pieces with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split; passed to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 800).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap`` (default 100).

    Returns:
        The result of ``split_documents(docs)`` (the chunked documents).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [11]:
# Split the loaded documents with chunk_data's defaults (chunk_size=800, chunk_overlap=100).
documents = chunk_data(docs=doc)
# Bare expression: displays every resulting chunk as cell output.
documents

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-11-04T19:51:24+08:00', 'author': 'Ritika Chopra and Gagan Deep Sharma', 'keywords': 'artificial intelligence; neural networks; training algorithm; NVivo; stock market forecast', 'moddate': '2021-11-04T12:56:51+01:00', 'subject': 'The stock market is characterized by extreme fluctuations, non-linearity, and shifts in internal and external environmental variables. Artificial intelligence (AI) techniques can detect such non-linearity, resulting in much-improved forecast results. This paper reviews 148 studies utilizing neural and hybrid-neuro techniques to predict stock markets, categorized based on 43 auto-coded themes obtained using NVivo 12 software. We group the surveyed articles based on two major categories, namely, study characteristics and model characteristics, where ‘study characteristics’ are further categorized as the stock market covered, input data, and nature of the stu

In [12]:
# Number of chunks produced by chunk_data.
len(documents)

238

Embedding technique

In [13]:
# Create the OpenAI embeddings client. Indexing os.environ raises KeyError
# if OPENAI_API_KEY is not set, so a missing key fails here rather than later.
embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
# Bare expression: displays the client's configuration repr as cell output.
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x00000145D87BC970>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x00000145D87EA950>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [14]:
# Sanity check: embed a short string and inspect the length of the returned vector.
# NOTE(review): the Pinecone index presumably must be created with this same dimension — confirm.
vectors = embeddings.embed_query("Hello world")
len(vectors)

1536

Vector Search DB in Pinecone

In [15]:
# Pinecone client built from the PINECONE_API_KEY environment variable
# (the lookup yields None when the variable is unset).
pc = Pinecone(
    api_key=os.environ.get('PINECONE_API_KEY'),
)

In [16]:
# Name of the Pinecone index that receives the vectors.
# NOTE(review): assumes this index already exists and matches the embedding dimension — confirm.
index_name = "langchain-vectors"

# Embed and store the *chunked* documents (`documents`, produced by chunk_data),
# not the unsplit `doc` list: indexing the raw pages would leave the chunking
# step unused and embed whole pages as single vectors.
# NOTE(review): re-running this cell presumably upserts the same chunks again — verify before re-executing.
index = PineconeVectorStore.from_documents(documents, embeddings, index_name=index_name)

Cosine Similarity Search and QA Chain Setup

In [17]:
def retrieve_query(query, k=5):
    """Run a similarity search against the module-level ``index`` vector store.

    Args:
        query: Query passed to ``index.similarity_search``.
        k: Forwarded as ``k`` to the search (default 5).

    Returns:
        Whatever ``index.similarity_search(query, k=k)`` returns.
    """
    return index.similarity_search(query, k=k)

In [21]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import ChatOpenAI

In [22]:
# Chat model used to generate the final answers.
llm = ChatOpenAI(model_name='gpt-4o', temperature=0.6)
# Question-answering chain over supplied documents.
# NOTE(review): the "stuff" chain type presumably places all input documents into one prompt — confirm in the LangChain docs.
chain = load_qa_chain(llm, chain_type="stuff")


Search answers

In [17]:
def retrieve_answers(query):
    """Answer a question from documents retrieved out of the vector store.

    Retrieves matching documents with ``retrieve_query``, prints them, and
    passes them together with the question to the module-level QA ``chain``.

    Args:
        query: The question; used both for retrieval and as the chain's
            ``question`` input.

    Returns:
        The chain's answer, taken from the ``"output_text"`` entry of its result.
    """
    doc_search = retrieve_query(query)
    # Show the retrieved context so the answer can be checked against its sources.
    print(doc_search)

    # `Chain.run` is deprecated; `invoke` takes all inputs as a single dict and
    # returns a dict of outputs, with the answer stored under "output_text".
    result = chain.invoke({"input_documents": doc_search, "question": query})
    return result["output_text"]

In [19]:
# Example end-to-end run: retrieve_answers prints the retrieved documents,
# then the returned answer is printed below them.
user_query = "What are the different models that were used in the stock market?"
answer = retrieve_answers(user_query)
print(answer)

[Document(id='a85071b3-2e6c-4a16-b383-c89a9942e4d6', metadata={'author': 'Ritika Chopra and Gagan Deep Sharma', 'creationdate': '2021-11-04T19:51:24+08:00', 'creator': 'LaTeX with hyperref', 'keywords': 'artificial intelligence; neural networks; training algorithm; NVivo; stock market forecast', 'moddate': '2021-11-04T12:56:51+01:00', 'page': 26.0, 'page_label': '27', 'producer': 'pdfTeX-1.40.21', 'source': 'documents\\jrfm-14-00526.pdf', 'subject': 'The stock market is characterized by extreme fluctuations, non-linearity, and shifts in internal and external environmental variables. Artificial intelligence (AI) techniques can detect such non-linearity, resulting in much-improved forecast results. This paper reviews 148 studies utilizing neural and hybrid-neuro techniques to predict stock markets, categorized based on 43 auto-coded themes obtained using NVivo 12 software. We group the surveyed articles based on two major categories, namely, study characteristics and model characteristic