In [5]:
import os
import pickle
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import FAISS
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate

In [6]:
# Load the article URLs that form the knowledge base for question answering.
article_urls = [
    "https://finance.yahoo.com/news/nvidia-stock-falls-as-new-us-chip-rules-threaten-business-in-china-133336983.html",
    "https://www.thestar.com.my/business/business-news/2023/10/20/asian-shares-plumb-11-mth-lows-on-surging-us-yields-middle-east-worries",
    "https://www.theguardian.com/australia-news/2023/sep/05/mango-prices-higher-summer-warm-winter-queensland",
]
loader = UnstructuredURLLoader(urls=article_urls)

# Fetch every URL and parse its content into LangChain documents.
data = loader.load()

# NOTE(review): UnstructuredURLLoader is understood to skip URLs that fail to
# download or parse by default instead of raising, so an empty result has to be
# detected here rather than surfacing later as an obscure indexing error.
if not data:
    raise ValueError(
        "No documents were loaded from the configured URLs; "
        "check network access and the URL list."
    )


# Recursive character splitter:
# 1) Improved clarity: breaks large text into smaller chunks, isolating key
#    information and making it easier to understand and retrieve details.
# 2) Enhanced efficiency: smaller chunks are processed faster and can be
#    handled in parallel, speeding up analysis.
# 3) Model compatibility: keeps each chunk within the input size limits of the
#    embedding model and the LLM.
doc_split = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],  # separators, tried in this order
    chunk_size=1000,  # maximum size of each chunk, measured by length_function
    chunk_overlap=100,  # overlap between neighbouring chunks to preserve context
    length_function=len,  # measure chunk size in characters
)

docs = doc_split.split_documents(data)

# An index cannot be built from zero chunks (e.g. pages that parsed to empty
# text), so stop with a clear message instead.
if not docs:
    raise ValueError(
        "Splitting the loaded pages produced no text chunks; nothing to index."
    )


# Embed each chunk as a dense vector with a pre-trained sentence-transformer
# model; tokenisation is handled internally by the model.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Facebook AI Similarity Search (FAISS): a vector index for similarity search.
# The name `vectorindex_openai` is kept because the save cell references it;
# no OpenAI model is involved here.
vectorindex_openai = FAISS.from_documents(docs, embeddings)


  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [12]:
# Persist the freshly built vector index to disk.
# Re-running this cell overwrites any previously saved index file.
file_path = "vector_index.pkl"
with open(file_path, mode="wb") as index_file:
    pickle.dump(vectorindex_openai, index_file)

In [10]:
# Load the persisted vector index back from the pickle file.
file_path = "vector_index.pkl"

# Fail loudly when the file is missing: silently skipping the load would leave
# `vectorIndex` undefined (or stale from an earlier kernel state) for the cells
# that build the retrieval chain.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"{file_path!r} not found - run the cell that saves the vector index first."
    )

# SECURITY: pickle.load can execute arbitrary code while deserialising. Only
# load index files that this notebook wrote itself, never untrusted ones.
with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

  return torch.load(io.BytesIO(b))


In [13]:
# Prompt for the final answer: {summaries} is filled with the formatted
# retrieved chunks and {question} with the user's query.
prompt_template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). If you don't know the answer, just say that you don't know, don't try to make up an answer.
{summaries}
QUESTION: {question}
SOURCES:
FINAL ANSWER:
"""

# Layout used to render each retrieved chunk before it is placed in {summaries}.
doc_prompt_template = """
Content: {page_content}
Source: {source}
"""

PROMPT = PromptTemplate(
    input_variables=["summaries", "question"],
    template=prompt_template,
)
DOC_PROMPT = PromptTemplate(
    input_variables=["page_content", "source"],
    template=doc_prompt_template,
)

# Callback handler passed to the LLM so generated text is streamed to stdout.
callbacks = [StreamingStdOutCallbackHandler()]

# Placeholder: point this at a local model file whose name ends with ".gguf"
# so that it can be run locally.
local_path = r"path\to\model"
llm = GPT4All(model=local_path, callbacks=callbacks, verbose=True)

# Wire the custom prompts into a "stuff" chain that answers from the chunks
# returned by the FAISS retriever and also returns the source documents.
chain_type_kwargs = {"prompt": PROMPT, "document_prompt": DOC_PROMPT}
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorIndex.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
    verbose=True,
)
chain



RetrievalQAWithSourcesChain(verbose=True, combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['question', 'summaries'], template='Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n{summaries}\nQUESTION: {question}\nSOURCES:\nFINAL ANSWER:\n'), llm=GPT4All(verbose=True, callbacks=[<langchain_core.callbacks.streaming_stdout.StreamingStdOutCallbackHandler object at 0x0000021C0A816670>], model='C:\\Users\\Kah Han\\Documents\\Data Science\\GPT4ALL\\q4_0-orca-mini-3b.gguf', client=<gpt4all.gpt4all.GPT4All object at 0x0000021C43344CD0>)), document_prompt=PromptTemplate(input_variables=['page_content', 'source'], template='\nContent: {page_content}\nSource: {source}\n'), document_variable_name='summaries'), return_source_documents=True, retriever=VectorStoreRetriever(tags=['FAISS

In [15]:
# Ask the chain a question and show where the top retrieved chunk came from.
query = "How is europe and china doing?"
answer = chain(dict(question=query), return_only_outputs=True)

# Only the first returned source document is reported; its metadata carries
# the source it was loaded from.
top_document = answer["source_documents"][0]
print(top_document.metadata["source"])



[1m> Entering new RetrievalQAWithSourcesChain chain...[0m
As of now, Europe has been experiencing a slowdown in economic growth due to various factors such as the war in Ukraine, high energy costs, and supply chain disruptions. However, Germany's economy remains strong despite these challenges. China is facing various issues such as its zero-COVID policy, which has led to lockdowns and other restrictions that have hurt its economy. Additionally, there are concerns about rising debt levels and a potential slowdown in growth.
[1m> Finished chain.[0m
https://www.thestar.com.my/business/business-news/2023/10/20/asian-shares-plumb-11-mth-lows-on-surging-us-yields-middle-east-worries
