**DEVELOPING A CHAT ASSISTANT USING RETRIEVAL AUGMENTED GENERATION (RAG)**

In [1]:
# Install relevant libraries
!pip install chromadb==0.5.5 langchain-chroma==0.1.2 langchain==0.2.11 langchain-community==0.2.10 langchain-text-splitters==0.2.2 langchain-groq==0.1.6 transformers==4.43.2 sentence-transformers==3.0.1 unstructured==0.15.0 unstructured[pdf]==0.15.0 gradio pydantic-settings

Collecting chromadb==0.5.5
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-chroma==0.1.2
  Downloading langchain_chroma-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain==0.2.11
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community==0.2.10
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-text-splitters==0.2.2
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langchain-groq==0.1.6
  Downloading langchain_groq-0.1.6-py3-none-any.whl.metadata (2.8 kB)
Collecting transformers==4.43.2
  Downloading transformers-4.43.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers==3.0.1
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting unstructured=

In [1]:
!pip install gradio



In [2]:
# Import relevant libraries

import time
import textwrap
import gradio as gr

from langchain.document_loaders import UnstructuredFileLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.chains import ConversationalRetrievalChain

from config import settings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Create a variable for your api key
groq_api_key = settings.groq_api_key

In [4]:
# Installing Poppler-utils, which includes tools like pdfinfo, pdftotext, and pdfimages.
# This package is required for handling PDF files in the pdf2image library.
# The 'pdfinfo' utility is specifically needed to retrieve page counts and metadata from PDF files.
# Make sure to run this command in environments where Poppler is not already installed (e.g., Google Colab).
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Fetched 186 kB in 0s (1,768 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 123620 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.5_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.5) ...
Setting up poppler-utils (22.02.0-2ubuntu0.5) ...
Processing triggers for man-db (2.10.2-1) ...


In [5]:
from google.colab import files
import os

# You will be prompted to select and upload files from your local machine
uploaded = files.upload()

Saving ai and the future of humans.pdf to ai and the future of humans.pdf
Saving Artificial Intelligence Accelerates Human Learning_ Discussion Data Analytics ( PDFDrive ).pdf to Artificial Intelligence Accelerates Human Learning_ Discussion Data Analytics ( PDFDrive ).pdf
Saving artificial intelligence.pdf to artificial intelligence.pdf
Saving future of A.I intelligence.pdf to future of A.I intelligence.pdf
Saving future of ai.pdf to future of ai.pdf
Saving Introduction to Artificial Intelligence ( PDFDrive ).pdf to Introduction to Artificial Intelligence ( PDFDrive ).pdf
Saving preparing for future of ai.pdf to preparing for future of ai.pdf
Saving Stahl_2021_Artificial_intelligence_for_human_f.pdf to Stahl_2021_Artificial_intelligence_for_human_f.pdf
Saving the future of AI.pdf to the future of AI.pdf


In [6]:
# List all files uploaded
for file_name in uploaded.keys():
    print(f"Uploaded file: {file_name}")

Uploaded file: ai and the future of humans.pdf
Uploaded file: Artificial Intelligence Accelerates Human Learning_ Discussion Data Analytics ( PDFDrive ).pdf
Uploaded file: artificial intelligence.pdf
Uploaded file: future of A.I intelligence.pdf
Uploaded file: future of ai.pdf
Uploaded file: Introduction to Artificial Intelligence ( PDFDrive ).pdf
Uploaded file: preparing for future of ai.pdf
Uploaded file: Stahl_2021_Artificial_intelligence_for_human_f.pdf
Uploaded file: the future of AI.pdf


In [7]:
# Create a variable for the file paths in the /content/ directory (where files are uploaded in Colab)
file_path = [
    "/content/Introduction to Artificial Intelligence ( PDFDrive ).pdf",
    "/content/Artificial Intelligence Accelerates Human Learning_ Discussion Data Analytics ( PDFDrive ).pdf",
    "/content/Stahl_2021_Artificial_intelligence_for_human_f.pdf",
    "/content/ai and the future of humans.pdf",
    "/content/preparing for future of ai.pdf",
    "/content/artificial intelligence.pdf",
    "/content/future of ai.pdf",
    "/content/future of A.I intelligence.pdf",
    "/content/the future of AI.pdf"
]

# List to store the documents
documents = []

# Step 3: Iterate through each PDF file, check if the file exists, and load the document
for path in file_path:
    if os.path.exists(path):  # Check if the file exists in the /content/ directory
        try:
            loader = UnstructuredFileLoader(path)  # Load the file using your PDF loader (replace with your actual loader function if needed)
            doc = loader.load()
            documents.append(doc)
            print(f"Loaded: {path}")
        except Exception as e:
            print(f"Failed to load {path}: {e}")
    else:
        print(f"File not found: {path}")




Loaded: /content/Introduction to Artificial Intelligence ( PDFDrive ).pdf
Loaded: /content/Artificial Intelligence Accelerates Human Learning_ Discussion Data Analytics ( PDFDrive ).pdf
Loaded: /content/Stahl_2021_Artificial_intelligence_for_human_f.pdf
Loaded: /content/ai and the future of humans.pdf
Loaded: /content/preparing for future of ai.pdf
Loaded: /content/artificial intelligence.pdf
Loaded: /content/future of ai.pdf
Loaded: /content/future of A.I intelligence.pdf
Loaded: /content/the future of AI.pdf


In [8]:
documents

[[Document(metadata={'source': '/content/Introduction to Artificial Intelligence ( PDFDrive ).pdf'}, page_content='Undergraduate Topics in Computer Science\n\nWolfgang Ertel\n\nIntroduction to Artificial Intelligence\n\nSecond Edition\n\nUndergraduate Topics in Computer Science\n\nSeries editor Ian Mackie\n\nAdvisory Board Samson Abramsky, University of Oxford, Oxford, UK Karin Breitman, Pontiﬁcal Catholic University of Rio de Janeiro, Rio de Janeiro, Brazil Chris Hankin, Imperial College London, London, UK Dexter Kozen, Cornell University, Ithaca, USA Andrew Pitts, University of Cambridge, Cambridge, UK Hanne Riis Nielson, Technical University of Denmark, Kongens Lyngby, Denmark Steven Skiena, Stony Brook University, Stony Brook, USA Iain Stewart, University of Durham, Durham, UK\n\nUndergraduate Topics in Computer Science (UTiCS) delivers high-quality instruc- tional content for undergraduates studying in all areas of computing and information science. From core foundational and theo

In [9]:
# Split contents into text chunks
text_splitter = CharacterTextSplitter(
    chunk_size=1700,
    chunk_overlap=200
)

In [10]:
# Gather all text chunks into a list
texts = []
for doc in documents:
    text_chunks = text_splitter.split_documents(doc)
    texts.extend(text_chunks)



In [11]:
# Get the first 5 text chunks
print(texts[0])
print("-----------------------------------------------------------------------")
print("-----------------------------------------------------------------------")
print(texts[1])

page_content='Undergraduate Topics in Computer Science

Wolfgang Ertel

Introduction to Artificial Intelligence

Second Edition

Undergraduate Topics in Computer Science

Series editor Ian Mackie

Advisory Board Samson Abramsky, University of Oxford, Oxford, UK Karin Breitman, Pontiﬁcal Catholic University of Rio de Janeiro, Rio de Janeiro, Brazil Chris Hankin, Imperial College London, London, UK Dexter Kozen, Cornell University, Ithaca, USA Andrew Pitts, University of Cambridge, Cambridge, UK Hanne Riis Nielson, Technical University of Denmark, Kongens Lyngby, Denmark Steven Skiena, Stony Brook University, Stony Brook, USA Iain Stewart, University of Durham, Durham, UK

Undergraduate Topics in Computer Science (UTiCS) delivers high-quality instruc- tional content for undergraduates studying in all areas of computing and information science. From core foundational and theoretical material to ﬁnal-year topics and applications, UTiCS books take a fresh, concise, and modern approach and a

In [12]:
# Instantiate embedding
embedding = HuggingFaceEmbeddings()

# Create variable for vector database folder
persist_directory = "/content/chroma_db"

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
# Create variable for vector database
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory
)

In [14]:
# Create retriever to retrieve information from our vector database
retriever = vectordb.as_retriever()

In [15]:
# Create a llm from groq
llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0.5,
    groq_api_key=groq_api_key
)

In [16]:
# Create a conversational chain
conv_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

In [17]:
# Invoke the conversational chain to ask our question and get a response
question = "What is ai?"
response = conv_chain.invoke({"question": question, "chat_history": []})
print(f"Answer: {response['answer']}")
print(f"Source Document: {response['source_documents']}")

Answer: According to the provided context, there is no single definition of Artificial Intelligence (AI) that is universally accepted by practitioners. However, some common definitions include:

* A computerized system that exhibits behavior that is commonly thought of as requiring intelligence.
* A system capable of rationally solving complex problems or taking appropriate actions to achieve its goals in whatever real-world circumstances it encounters.
* A set of technologies that try to imitate or augment human intelligence.

Experts also agree that AI is a broad field that includes various subfields, such as machine learning, natural language processing, and neural networks. The goal of AI is to develop machines that behave as though they were intelligent, as defined by John McCarthy, one of the pioneers of AI, in 1955.
Source Document: [Document(metadata={'source': '/content/preparing for future of ai.pdf'}, page_content='Simultaneously, industry has been increasing its investment 

In [21]:
import time
import random

# Function to search and recall from memory
def recall_from_memory(memory, query):
    for entry in memory:
        if query.lower() in entry["question"].lower():
            return f"You mentioned this before: {entry['question']}\nResponse: {entry['response']}"

    # Provide feedback if nothing is found
    return f"I don't recall anything specific about '{query}'. Could you please provide more details or rephrase your question?"

# Create function to process the user question with memory and history
def process_question(user_question, history, memory=None):
    try:
        start_time = time.time()

        if history is None:
            history = []

        if memory is None:
            memory = []

        # Prepare chat_history in the format expected by conv_chain
        chat_history = [(h[0], h[1].split("\n\nResponse time:")[0]) for h in history]

        # Debug print
        print(f"Processing question: {user_question}")
        print(f"Chat history: {chat_history}")

        # Check if the user is asking for a recall of past conversations
        if "recall" in user_question.lower() or "remember" in user_question.lower():
            recall_query = user_question.replace("recall", "").replace("remember", "").strip()
            if recall_query:
                response = recall_from_memory(memory, recall_query)
            else:
                response = "Please specify what you'd like me to recall."

        # Custom response for ownership/founder-related questions
        elif "founder" in user_question.lower() or "builder" in user_question.lower() or "owner" in user_question.lower():
            response = (
                "This chat assistant was built and is maintained by Joel Tamakloe, a data scientist and AI enthusiast. "
                "Joel's background includes extensive experience in building AI-powered applications and solving real-world problems using data. "
                "This assistant was created to make information about artificial intelligence more accessible and to assist users in exploring AI concepts interactively. "
                "It is powered by advanced AI models like LLaMA and uses cutting-edge tools such as Hugging Face for embedding and Chroma for vector storage. "
                "Currently, it's in the testing phase with a focus on AI-related topics, aiming to improve its capabilities and expand into educational and business applications in the future."
            )
        # Custom response for personal questions
        elif "who are you" in user_question.lower() or "about you" in user_question.lower() or "yourself" in user_question.lower():
            response = (
                "I am an AI-powered chat assistant designed to assist users with exploring and learning about artificial intelligence and related topics. "
                "My purpose is to provide an intuitive way for users to interact with AI and gain insights on topics related to artificial intelligence. "
                "I use advanced tools and technologies like the LLaMA model, a powerful large language model, to process natural language queries, and Chroma, a vector database management system, to efficiently store and retrieve information. "
                "In the future, I plan to expand my abilities to cover more topics, improve response accuracy, and possibly integrate video and voice interaction for a more dynamic user experience. "
                "Ask me anything about A.I. I am happy to help! 😊"
            )
        # Custom response for greetings
        elif "hello" in user_question.lower() or "hi" in user_question.lower():
            response = "Hello! How can I assist you today?"

        # Invoke conv_chain with both the question and chat_history for all other questions
        else:
            response = conv_chain.invoke({"question": user_question, "chat_history": chat_history})

            if isinstance(response, dict) and 'answer' in response:
                response = response['answer']

            # Add a conversational prompt occasionally
            if random.random() < 0.3:  # 30% chance to add small talk
                response += random.choice([
                    "By the way, feel free to ask me anything else or even chat casually!",
                    "Let me know if you need more information or if you'd like to discuss something else! 😊",
                    "Is there anything else you're curious about? I'm happy to help!",
                    "If you have more questions or just want to chat, I'm here!",
                    "Don't hesitate to ask me anything. I'm here to assist you!",
                    "Got any other interesting topics in mind? Let's chat!",
                    "Feel free to share more or ask anything else that's on your mind!"
                ])

        # Measure the response time
        end_time = time.time()
        response_time = f"Response time: {end_time - start_time:.2f} seconds."

        # Combine the response and the response time
        full_response = f"{response}\n\n{response_time}"

        # Update the history
        history.append((user_question, full_response))

        # Debug print
        print(f"Processed successfully. Response: {full_response}")

        return history, memory, full_response

    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        print(error_message)
        return history, memory, error_message


In [22]:
# Setup the Gradio interface
iface = gr.Interface(
    fn=process_question,
    inputs=[
        gr.Textbox(lines=2, placeholder="Type your question here..."),
        gr.State()
    ],
    outputs=[
        gr.Chatbot(),
        gr.State(),
        gr.Textbox(label="Latest Answer")
    ],
    title="Adika Chat Assistant",
    description="Ask any question about Artificial Intelligence."
)

# Launch the interface
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://6441e236269fe0e0d8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


