In [1]:
# imports for langchain and Chroma and plotly

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [39]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI


In [40]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# from langchain_chroma import Chroma
from langchain.vectorstores import FAISS
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [3]:

# Request headers sent with every page fetch; the User-Agent value is a
# desktop Chrome-on-Windows browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A scraped web page: its URL, its title and its visible body text.

    Attributes:
    - url (str): The URL that was fetched.
    - title (str): Text of the page's <title>, or "No title found" when absent or empty.
    - text (str): Body text with script/style/img/input elements removed;
      an empty string when the page has no <body>.
    """

    def __init__(self, url, timeout=30):
        """
        Create this Website object from the given url using the BeautifulSoup library

        Parameters:
        - url (str): The URL of the page to fetch.
        - timeout (float): Seconds to wait for the server before giving up (default: 30).

        Raises:
        - requests.RequestException: On connection failure, timeout, or an HTTP error status.
        """
        self.url = url
        # Without a timeout a stalled server would block the kernel indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of scraping the error page as content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # <title> may be missing, or present without a single string child.
        has_title_text = soup.title is not None and soup.title.string
        self.title = soup.title.string if has_title_text else "No title found"

        body = soup.body
        if body is None:
            # Nothing to clean or extract (e.g. a non-HTML response).
            self.text = ""
            return

        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = body.get_text(separator="\n", strip=True)

In [4]:
# Notebook-wide configuration.

# Directory name for a persisted vector store; in the cells shown here it is
# only mentioned in a commented-out Chroma call.
db_name = "vector_db"

# Chat model used for every LLM call below (chosen for its low cost).
MODEL = "gpt-4o-mini"

In [5]:
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Keep an OPENAI_API_KEY that is already set; otherwise install a placeholder.
# NOTE(review): the placeholder is not a usable key, so API calls made with it
# are expected to be rejected - set the real key in .env.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [9]:
import os
import requests
import numpy as np
from bs4 import BeautifulSoup
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Request headers used by fetch_and_parse; same value as the `headers`
# defined in the earlier Website cell.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    ),
}

def fetch_and_parse(url: str, timeout: float = 30) -> str:
    """
    Fetch the webpage content at `url` and return a cleaned string of text.

    Parameters:
    - url (str): The URL of the webpage to fetch.
    - timeout (float): Seconds to wait for the server before giving up (default: 30).

    Returns:
    - str: Text of the page's <body> with script/style/img/input elements
      removed, one text fragment per line. Empty string if the page has no <body>.

    Raises:
    - requests.RequestException: On connection failure, timeout, or an HTTP error status.
    """
    # Without a timeout a stalled server would hang the caller indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than indexing an error page as context.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.body
    if body is None:
        # No <body> (e.g. a non-HTML response): nothing to extract.
        return ""

    for irrelevant in body(["script", "style", "img", "input"]):
        irrelevant.decompose()

    return body.get_text(separator="\n", strip=True)

def split_text_into_documents(text: str, chunk_size: int = 1000, overlap: int = 100):
    """
    Break one long string into overlapping chunks wrapped as Documents.

    Parameters:
    - text (str): The long text to split.
    - chunk_size (int): Chunk size handed to the splitter (default: 1000).
    - overlap (int): Overlap between consecutive chunks handed to the splitter (default: 100).

    Returns:
    - list: The Documents produced by RecursiveCharacterTextSplitter.split_documents.
    """
    # The splitter works on Documents, so wrap the raw string in a single one.
    whole_text = Document(page_content=text)
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_documents([whole_text])


In [12]:
# Scrape the Wikipedia "Artificial intelligence" article (network call) ...
text = fetch_and_parse("https://en.wikipedia.org/wiki/Artificial_intelligence")
# ... and split it into chunks; `docs` is reused by later cells.
docs = split_text_into_documents(text)

In [14]:
def calculate_word_stats(texts):
    """
    Calculate and display average word and character statistics for a list of documents.

    Documents whose content is empty or whitespace-only are skipped and do not
    count towards the averages.

    Parameters:
    - texts (list): A list of Document objects, where each Document contains a `page_content` attribute.

    Returns:
    - None: Prints the average word and character counts per document.
    """
    # Keep only the documents that actually contain text.
    contents = [doc.page_content for doc in texts]
    non_blank = [content for content in contents if content.strip()]
    doc_count = len(non_blank)

    # Guard on the number of *counted* documents: a non-empty list made up
    # solely of blank documents must not divide by zero.
    if doc_count:
        avg_words = sum(len(content.split()) for content in non_blank) / doc_count
        avg_characters = sum(len(content) for content in non_blank) / doc_count
    else:
        avg_words = 0
        avg_characters = 0

    print(f"Average words per document: {avg_words}")
    print(f"Average characters per document: {avg_characters}")


In [19]:
# Print the average chunk size (words and characters) for the chunks in `docs`.
calculate_word_stats(docs)

Average words per document: 155.04032258064515
Average characters per document: 956.9153225806451


In [25]:
# Inspection cell: show the class of the first chunk.
type(docs[0])

langchain_core.documents.base.Document

In [26]:
from rank_bm25 import BM25Okapi
from langchain_core.runnables import RunnablePassthrough

class BM25Retriever:
    """
    A class to implement BM25-based document retrieval.

    Attributes:
    - documents (list): A list of Document objects.
    - corpus (list): A list of strings representing the document contents.
    - tokenized_corpus (list): A list of tokenized documents (lists of lower-cased words).
    - bm25 (BM25Okapi | None): The BM25 index built from the tokenized corpus,
      or None when there are no documents to index.
    """

    def __init__(self, documents):
        """
        Initialize the BM25 retriever with the given documents.

        Parameters:
        - documents (list): A list of Document objects.
        """
        self.documents = documents
        self.corpus = [doc.page_content for doc in documents]

        self.tokenized_corpus = [self._tokenize(text) for text in self.corpus]

        # BM25Okapi derives an average document length from the corpus, which
        # cannot be computed for an empty one - so skip building the index and
        # let `retrieve` return no results instead of failing here.
        self.bm25 = BM25Okapi(self.tokenized_corpus) if self.tokenized_corpus else None

    @staticmethod
    def _tokenize(text):
        """Lower-case and whitespace-split `text` so corpus and query tokens compare case-insensitively."""
        return text.lower().split()

    def retrieve(self, query, k=5):
        """
        Retrieve the top `k` most relevant documents for a given query.

        Parameters:
        - query (str): The input query as a string.
        - k (int): The number of top documents to return (default is 5).

        Returns:
        - list: A list of the top `k` relevant documents as strings; empty when
          nothing was indexed or `k` is not positive.
        """
        if self.bm25 is None or k <= 0:
            return []

        # Tokenize the query exactly like the corpus, otherwise terms that
        # differ only in case ("ai" vs "AI") would never match.
        tokenized_query = self._tokenize(query)

        return self.bm25.get_top_n(tokenized_query, self.corpus, n=k)


In [28]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

def build_chroma(documents: list[Document]) -> Chroma:
    """
    Build a Chroma vector store using OpenAI embeddings
    and add the documents to it.

    Each call creates its own uniquely named collection, so stores built by
    separate calls never share contents.

    Parameters:
    - documents (list[Document]): A list of Document objects to add to the vector store.

    Returns:
    - Chroma: The Chroma vector store containing the embedded documents
      (empty when `documents` is empty).
    """
    import uuid  # stdlib; imported locally so this cell stays self-contained

    embeddings = OpenAIEmbeddings()

    # Without an explicit name every store uses Chroma's fixed default
    # collection name, so repeated calls in one process can end up appending to
    # the same collection (stale and duplicated chunks across requests).
    # NOTE(review): confirm against the installed chromadb/langchain versions.
    vector_store = Chroma(
        collection_name=f"rag-{uuid.uuid4().hex}",
        embedding_function=embeddings,
    )

    # Nothing to embed for an empty list; skip the call rather than hand the
    # backend an empty batch.
    if documents:
        vector_store.add_documents(documents)

    return vector_store


In [33]:
from langchain.schema import Document

# NOTE(review): a class with this same name is defined again in a later cell;
# when the notebook runs top to bottom, that later definition replaces this one.
class EnsembleRetriever:
    """
    Merges results from Chroma similarity search and BM25 lexical search.
    """

    def __init__(self, chroma_store, bm25_retriever):
        """
        Initialize the EnsembleRetriever with Chroma and BM25 retrievers.

        Parameters:
        - chroma_store: The Chroma vector store for semantic retrieval.
        - bm25_retriever: The BM25 retriever for lexical retrieval.
        """
        self.bm25_retriever = bm25_retriever
        self.chroma_store = chroma_store

    def get_relevant_documents(self, query: str, k: int = 5):
        """
        Retrieve relevant documents by combining results from Chroma and BM25.

        The two ranked result lists are interleaved (Chroma first at each rank),
        exact-duplicate contents are dropped, and the first `k` survivors are
        returned - so both retrievers contribute to the final list.

        Parameters:
        - query (str): The input search query.
        - k (int): The number of top unique documents to return (default: 5).

        Returns:
        - list[Document]: A list of unique relevant documents.
        """
        chroma_docs = self.chroma_store.similarity_search(query, k=k)
        bm25_docs = self.bm25_retriever.retrieve(query, k=k)

        # Alternate between the two rankings. Plain concatenation would let the
        # Chroma hits occupy every one of the k slots before any BM25 hit is
        # even looked at.
        interleaved = []
        for rank in range(max(len(chroma_docs), len(bm25_docs))):
            if rank < len(chroma_docs):
                interleaved.append(chroma_docs[rank])
            if rank < len(bm25_docs):
                interleaved.append(bm25_docs[rank])

        seen = set()
        unique_docs = []
        for item in interleaved:
            # The BM25 retriever yields raw strings; wrap them so every result
            # is a Document.
            doc = Document(page_content=item) if isinstance(item, str) else item

            # Compare the full content: a short prefix would merge distinct
            # chunks that merely start with the same text.
            if doc.page_content in seen:
                continue
            seen.add(doc.page_content)
            unique_docs.append(doc)

        return unique_docs[:k]


In [36]:
from langchain.schema import Document

# NOTE(review): this redefines the EnsembleRetriever class from the previous
# cell; being the later definition, it is the one used by the cells below.
class EnsembleRetriever:
    """
    Merges results from Chroma similarity search and BM25 lexical search.
    """

    def __init__(self, chroma_store, bm25_retriever):
        """
        Initialize the EnsembleRetriever with Chroma and BM25 retrievers.

        Parameters:
        - chroma_store: The Chroma vector store for semantic retrieval.
        - bm25_retriever: The BM25 retriever for lexical retrieval.
        """
        self.bm25_retriever = bm25_retriever
        self.chroma_store = chroma_store

    def get_relevant_documents(self, query: str, k: int = 5):
        """
        Retrieve relevant documents by combining results from Chroma and BM25.

        The two ranked result lists are interleaved (Chroma first at each rank),
        exact-duplicate contents are dropped, and the first `k` survivors are
        returned - so both retrievers contribute to the final list.

        Parameters:
        - query (str): The input search query.
        - k (int): The number of top unique documents to return (default: 5).

        Returns:
        - list[Document]: A list of unique relevant documents.
        """
        chroma_docs = self.chroma_store.similarity_search(query, k=k)
        bm25_docs = self.bm25_retriever.retrieve(query, k=k)

        # Alternate between the two rankings. Plain concatenation would let the
        # Chroma hits occupy every one of the k slots before any BM25 hit is
        # even looked at.
        interleaved = []
        for rank in range(max(len(chroma_docs), len(bm25_docs))):
            if rank < len(chroma_docs):
                interleaved.append(chroma_docs[rank])
            if rank < len(bm25_docs):
                interleaved.append(bm25_docs[rank])

        seen = set()
        unique_docs = []
        for item in interleaved:
            # The BM25 retriever yields raw strings; wrap them so every result
            # is a Document.
            doc = Document(page_content=item) if isinstance(item, str) else item

            # Compare the full content: a short prefix would merge distinct
            # chunks that merely start with the same text.
            if doc.page_content in seen:
                continue
            seen.add(doc.page_content)
            unique_docs.append(doc)

        return unique_docs[:k]


In [43]:
# Embed every chunk in `docs` with OpenAI embeddings and index the vectors in a
# FAISS vector store. (An earlier version of this cell built a Chroma store
# persisted under `db_name`; FAISS is what is used now.)
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embedding=embeddings)

# Vector count and dimensionality, read from the underlying FAISS index.
total_vectors, dimensions = vectorstore.index.ntotal, vectorstore.index.d

print(f"There are {total_vectors} vectors with {dimensions:,} dimensions in the vector store")

There are 248 vectors with 1,536 dimensions in the vector store


In [45]:
# Chat model for the conversation; MODEL comes from the configuration cell.
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Conversation memory, stored under the 'chat_history' key as message objects.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Retriever view over the vector store built above.
retriever = vectorstore.as_retriever()

# Tie the pieces together: chat model + retriever + memory.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [46]:
# Ask the retrieval chain one question and print the answer text.
query = "Can you describe AI"
# The chain is invoked with the question under the "question" key ...
result = conversation_chain.invoke({"question":query})
# ... and the reply is read from the "answer" key of the result.
print(result["answer"])

Artificial intelligence (AI) is a field of research in computer science that focuses on creating systems and software that enable machines to perceive their environment, learn from data, and take actions to achieve defined goals. It encompasses various approaches, including machine learning, deep learning, symbolic reasoning, and more. AI can be applied in numerous areas, such as natural language processing, computer vision, robotics, and many others. High-profile applications of AI include web search engines, recommendation systems, virtual assistants, autonomous vehicles, and generative tools. The goal of AI research is to develop intelligent agents that can perform tasks that typically require human intelligence.


In [57]:
from openai import OpenAI


In [58]:
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Pages to scrape for the hybrid (Chroma + BM25) retriever.
example_urls = [
    "https://en.wikipedia.org/wiki/Artificial_intelligence",
    "https://en.wikipedia.org/wiki/Machine_learning",
]

# Step 1: scrape every URL and collect its chunks.
all_docs = []
for url in example_urls:
    print(f"Scraping content from: {url}")
    all_docs.extend(split_text_into_documents(fetch_and_parse(url)))

# Step 2: build both indexes over the same chunks. These two names must exist
# at module level before the ensemble is created - on a fresh kernel the cell
# previously failed with NameError because nothing defined them.
chroma_store = build_chroma(all_docs)
bm25_retriever = BM25Retriever(all_docs)

# Create the EnsembleRetriever instance
ensemble_retriever = EnsembleRetriever(chroma_store=chroma_store, bm25_retriever=bm25_retriever)


Scraping content from: https://en.wikipedia.org/wiki/Artificial_intelligence
Scraping content from: https://en.wikipedia.org/wiki/Machine_learning


In [85]:
# ... other imports ...

from openai import OpenAI
# Module-level OpenAI client, created with no explicit arguments; it is the
# client get_ai_response uses for its chat completion call.
openai = OpenAI()
# System prompt for get_ai_response. Written as one parenthesised literal with
# explicit sentence breaks: the previous `+=` concatenation ran the sentences
# together ("...assistant Give..." and "...detailAlways...").
system_message = (
    "You are a helpful assistant. "
    "Give proper descriptions as required by the user, explain it in detail. "
    "Always be accurate. If you don't know the answer, say so."
)

def get_ai_response(user_prompt, example_urls):
    """
    Gets an AI response to a user prompt, incorporating relevant context.

    The given URLs are scraped and chunked, the chunks are indexed in both a
    Chroma store and a BM25 retriever, and the top results of the combined
    retriever are sent to the chat model together with the prompt. The indexes
    are rebuilt on every call.

    Parameters:
    - user_prompt (str): The input prompt from the user.
    - example_urls (list[str]): URLs whose page text is used as retrieval context.

    Returns:
    - str: The AI's response text.
    """
    all_docs = []

    # Step 1: fetch each URL, split its text into chunks, and collect them.
    for url in example_urls:
        print(f"Scraping content from: {url}")
        raw_text = fetch_and_parse(url)
        splits = split_text_into_documents(raw_text)
        all_docs.extend(splits)

    if all_docs:
        # Step 2: build the semantic (Chroma) and lexical (BM25) indexes and
        # query them through the ensemble.
        chroma_store = build_chroma(all_docs)
        bm25_retriever = BM25Retriever(all_docs)
        ensemble_retriever = EnsembleRetriever(chroma_store=chroma_store, bm25_retriever=bm25_retriever)
        relevant_docs = ensemble_retriever.get_relevant_documents(user_prompt, k=3)
    else:
        # No URLs, or pages without text: there is nothing to index, so answer
        # without retrieved context instead of building indexes over nothing.
        relevant_docs = []

    context = "\n".join(doc.page_content for doc in relevant_docs)

    # Step 3: ask the chat model, passing the retrieved context with the prompt.
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": f"Context:\n{context}\n\nUser's prompt: {user_prompt}"}
    ]

    response = openai.chat.completions.create(model=MODEL, messages=messages)

    reply = response.choices[0].message.content

    return reply


In [86]:
def gradio_interface(urls, user_prompt):
    """
    Gradio callback: parse the comma-separated URL box and answer the prompt.

    Parameters:
    - urls (str): Comma-separated URLs; surrounding whitespace and empty
      entries (e.g. from a trailing comma) are ignored.
    - user_prompt (str): The user's question.

    Returns:
    - The value returned by get_ai_response for the prompt and parsed URLs.
    """
    # Strip each entry and drop blanks so "a, b," yields ["a", "b"] rather
    # than passing " b" or "" on to the scraper.
    urls_list = [part.strip() for part in urls.split(",") if part.strip()]
    response = get_ai_response(user_prompt, urls_list)
    return response

# `gr` is used below but no cell shown in this notebook imports it; import it
# here so the cell also runs on a fresh kernel.
import gradio as gr

# Two text inputs (URLs, prompt) wired to gradio_interface; plain-text output.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Enter URLs (comma-separated)"),
        gr.Textbox(label="User Prompt")
    ],
    outputs="text",
    title="AI Assistant with RAG handling for URL Context and BM25 Retriever + Chroma store",
    description="Enter a list of URLs to retrieve context, along with your prompt."
)

# Start the local web UI.
iface.launch()

* Running on local URL:  http://127.0.0.1:7869

To create a public link, set `share=True` in `launch()`.




Scraping content from: https://python.langchain.com/api_reference/langchain/chains/langchain.chains.llm.LLMChain.html
