## Collecting text

In [1]:
# %pip install --q unstructured langchain
# %pip install --q "unstructured[all-docs]"

In [2]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [4]:
# # pip install googlesearch-python
# !pip install bs4

Defaulting to user installation because normal site-packages is not writeable
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [14]:
# from googlesearch import search
import requests
from bs4 import BeautifulSoup
# from googlesearch import search
import re
def scrape_search_results(url, timeout=10):
    """Fetch a single web page and return the text of its paragraphs.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A one-element list whose only item is the text of every ``<p>``
        element on the page, joined with single spaces. The list wrapper is
        kept so callers can keep iterating over the result.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    print(url)

    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping (and later indexing) an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only paragraph elements are collected; other tags are ignored.
    text = ' '.join(p.get_text() for p in soup.find_all('p'))

    return [text]

# Example usage: scrape one article and print the text that was collected.
search_url = 'https://en.wikipedia.org/wiki/Harry_Potter'
results = scrape_search_results(search_url)
# The position of each result is never shown, so iterate over the items directly.
for result in results:
    print(f"Result:\n {result}")

https://en.wikipedia.org/wiki/Harry_Potter
Result:
 
 Harry Potter is a series of seven fantasy novels written by British author J. K. Rowling. The novels chronicle the lives of a young wizard, Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. The main story arc concerns Harry's conflict with Lord Voldemort, a dark wizard who intends to become immortal, overthrow the wizard governing body known as the Ministry of Magic, and subjugate all wizards and Muggles (non-magical people).
 The series was originally published in English by Bloomsbury in the United Kingdom and Scholastic Press in the United States.  A series of many genres, including fantasy, drama, coming-of-age fiction, and the British school story (which includes elements of mystery, thriller, adventure, horror, and romance), the world of Harry Potter explores numerous themes and includes many cultural meanings and references.[1] Major themes 

In [15]:
# Alias the scraped texts as `data`; a later cell rebinds `data` to Document wrappers.
data = results

In [16]:
class Document:
    """Minimal container exposing ``page_content`` and ``metadata`` attributes."""

    def __init__(self, page_content, metadata=None):
        """Store the content and its metadata.

        Args:
            page_content: The content to hold (stored as given).
            metadata: Optional metadata. Any falsy value, including ``None``
                and an empty dict, is replaced by a new empty dict.
        """
        self.page_content = page_content
        # A fresh dict per instance avoids sharing one default between objects.
        self.metadata = metadata if metadata else {}

In [17]:
# Wrap each scraped text in a Document so it exposes page_content / metadata.
data = list(map(Document, results))

In [None]:
# Show only the beginning of the scraped text; the full article would flood the cell output.
data[0].page_content[:500]

## Vector Embeddings

In [None]:
# !ollama pull nomic-embed-text

In [None]:
# !ollama list

In [None]:
# %pip install --q chromadb
# %pip install --q langchain-text-splitters

In [4]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [18]:
# Split the scraped documents into overlapping chunks before embedding.
# Both values are lengths as measured by the splitter's length function
# (characters with the default `len`) -- tune them here rather than inline.
CHUNK_SIZE = 7500
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(data)

In [None]:
# Embed the chunks with the Ollama "nomic-embed-text" model and load them
# into a Chroma collection named "local-rag".
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="local-rag",
)

OllamaEmbeddings:   0%|                                                                          | 0/9 [00:00<?, ?it/s]

## Retrieval

In [None]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
# Chat model served through Ollama; `local_model` is the model name passed to it.
local_model = "llama2"
llm = ChatOllama(
    model=local_model,
)

In [None]:
# Prompt asking the model to rewrite the user's question in five alternative ways.
# The leading spaces on the continuation lines are part of the template text.
query_template_text = """You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}"""

QUERY_PROMPT = PromptTemplate(
    template=query_template_text,
    input_variables=["question"],
)

In [None]:
# Multi-query retriever: built from the vector store's retriever, the chat
# model, and QUERY_PROMPT (which asks for rephrasings of the question).
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

# Answering prompt: the model is told to rely only on the supplied context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [None]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one plain context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: retrieve -> fill the prompt -> ask the model -> plain string.
# The retriever returns a list of document objects; formatting them first keeps
# their Python repr (and metadata) out of the prompt's {context} slot.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Read a question typed by the user and run it through the chain.
user_question = input("")
chain.invoke(user_question)

In [None]:
# NOTE(review): this question does not look related to the Harry Potter page
# scraped earlier in the notebook -- confirm it is intentional.
sample_question = "What are the 5 pillars of global cooperation?"
chain.invoke(sample_question)

In [None]:
# Clean up: delete the collection behind `vector_db` (created above with
# collection_name="local-rag").
# NOTE(review): presumably this removes only that one collection rather than
# every collection in the database -- confirm against the Chroma wrapper docs.
vector_db.delete_collection()