## Requirements

In [None]:
# %pip install pypdf
# %pip install pandas
# %pip install chromadb
# %pip install langchain
# %pip install "unstructured[all-docs]"
# %pip install python-dotenv
# %pip install google-generativeai
# %pip install langchain-google-genai 
# %pip install selenium
# %pip install "astrapy>=0.5.3"

## Imports

In [87]:
import os
from pathlib import Path as p
from dotenv import load_dotenv

import warnings
from pprint import pprint
from IPython.display import display

import textwrap
import pandas as pd

import firebase_admin
import google.generativeai as genai
from firebase_admin import credentials

from astrapy.db import AstraDB as db

from langchain.schema import Document
from langchain import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.chains import LLMChain
from langchain.document_loaders import SeleniumURLLoader, TextLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.conversation.memory import ConversationSummaryMemory

from langchain_community.vectorstores import AstraDB
# from langchain_community.vectorstores import MatchingEngine
from langchain_community.chat_message_histories.firestore import FirestoreChatMessageHistory

# Suppress all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices from the
# libraries imported above are hidden as well — confirm that is intended.
warnings.filterwarnings("ignore")

## Parameters

In [48]:
# Load environment variables from the local dotenv file.
load_dotenv('./keys/.env')

# Location of the service-account key file used to build the Firebase credentials.
creds_path = "./keys/serviceAccountKey.json"

# Source URLs for the knowledge base; commented-out entries are currently excluded.
data_list = [
    "https://borgenproject.org/tag/homelessness-in-india/",
    "https://www.noblenonprofit.org/homelessness-in-india-an-unsolved-problem/",
    # "https://caufsociety.com/homelessness-in-india/",
    # "https://en.wikipedia.org/wiki/Homelessness_in_India",
    # "https://www.hlrn.org.in/homelessness",
    # "https://blog.ipleaders.in/prevention-homelessness-problem-treated-legally/",
    # "https://borgenproject.org/homelessness-in-india/",
    # "https://www.aljazeera.com/gallery/2023/1/3/photos-new-delhis-homeless-shiver-in-biting-cold",
    # "https://www.quora.com/In-India-if-a-person-suddenly-becomes-homeless-what-are-some-tips-you-will-give-him",
    # "https://leaglesamiksha.com/2022/06/19/homelessness-in-india-causes-and-remedies/",
    # "https://www.orfonline.org/expert-speak/sheltering-the-urban-homeless"
]

## Auth

In [None]:
# Configure the google.generativeai client with the key held in GOOGLE_API_KEY.
# os.environ.get yields None when the variable is not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

In [None]:
# Build the Firebase credentials from the service-account key file.
cred = credentials.Certificate(creds_path)

# initialize_app raises ValueError when the default app already exists, which
# happens every time this cell is re-run in a live kernel. get_app raises
# ValueError only when the default app has not been created yet, so initialise
# exactly once and reuse the existing app otherwise.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

In [71]:
# Astra DB connection settings, each read once from the environment.
ASTRA_DB_ID = os.environ.get('ASTRA_DB_ID')
ASTRA_DB_REGION = os.environ.get('ASTRA_DB_REGION')
ASTRA_DB_APPLICATION_TOKEN = os.environ.get('ASTRA_DB_APPLICATION_TOKEN')
ASTRA_DB_KEYSPACE = os.environ.get('ASTRA_DB_KEYSPACE')

# Fail early with a readable message: without this check an unset variable
# would silently yield an endpoint such as "https://None-None.apps...".
_missing_astra_settings = [
    name
    for name, value in (
        ('ASTRA_DB_ID', ASTRA_DB_ID),
        ('ASTRA_DB_REGION', ASTRA_DB_REGION),
        ('ASTRA_DB_APPLICATION_TOKEN', ASTRA_DB_APPLICATION_TOKEN),
    )
    if not value
]
if _missing_astra_settings:
    raise EnvironmentError(
        "Missing Astra DB environment variables: " + ", ".join(_missing_astra_settings)
    )

# The endpoint is derived from the database id and region read above.
ASTRA_DB_API_ENDPOINT = f"https://{ASTRA_DB_ID}-{ASTRA_DB_REGION}.apps.astra.datastax.com"

COLLECTION_NAME = "HESTIA_chatbot_dev"

# astrapy client bound to the endpoint and application token.
astra_db = db(
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_DB_API_ENDPOINT
)

## Configure LLM

In [25]:
# Chat model shared by the summary memory and the QA chain.
model = ChatGoogleGenerativeAI(temperature=0.3, model="gemini-pro")

In [27]:
# Embedding model handed to the vector stores below.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

In [28]:
# Firestore-backed chat history for a single user/session pair.
message_history = FirestoreChatMessageHistory(
    user_id="yolo",
    session_id="yolo",
    collection_name="Chat_message",
)

## Loading Data

In [29]:
# Build a Selenium-based loader over the configured URLs.
data_loader = SeleniumURLLoader(urls=data_list)
# Load the pages; each loaded item's `page_content` is read in the RAG section.
pages = data_loader.load()

## Prompt Design - In Context

In [31]:
# Prompt for the QA chain. It takes the conversation summary (`history`), the
# retrieved text (`context`) and the user's `question`.
# textwrap.dedent strips the source-code indentation so it is not sent to the
# model as part of the prompt; the leading backslash avoids an empty first line.
prompt = PromptTemplate(
    input_variables=[
        "history",
        "context",
        "question",
    ],
    template=textwrap.dedent(
        """\
        Answer the question as precisely as possible using the provided context.
        If the answer is not contained in the context, also refer to the summary of the
        past conversation. If it cannot be found in either, say "answer not available in context".

        History: {history}

        Context: {context}

        Question: {question}
        """
    ),
)

In [32]:
# Summary memory wired to `message_history`: its content is exposed under the
# prompt's `history` variable and the user turn is read from `question`.
summary_memory = ConversationSummaryMemory(
    chat_memory=message_history,
    llm=model,
    input_key="question",
    memory_key="history",
)

In [41]:
# Chain that fills `prompt` and sends it to `model`, with `summary_memory` attached.
conversation = LLMChain(
    prompt=prompt,
    llm=model,
    memory=summary_memory,
    verbose=False,
)

## RAG pipeline

In [99]:
# Splitter producing chunks of at most 5000 characters with no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=5000,
)

# Concatenate the content of every loaded page, separated by blank lines,
# then cut the combined text into chunks.
context = "\n\n".join([str(page.page_content) for page in pages])
texts = text_splitter.split_text(context)

In [100]:
# Wrap each text chunk in a Document so it can be added to the vector store.
text = [Document(page_content=chunk) for chunk in texts]

In [96]:
# Build a Chroma index over the chunks and expose it as a retriever.
# NOTE(review): no later live cell visible here reads `vector_index` (only a
# commented-out cell does) — confirm whether this cell is still needed.
vector_index = Chroma.from_texts(texts, embeddings).as_retriever()

In [97]:
# Astra DB vector store for the chatbot collection.
# NOTE(review): the namespace is hard-coded here while ASTRA_DB_KEYSPACE is read
# from the environment in the auth cell and not used — confirm which should apply.
vector_store = AstraDB(
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    namespace="hestia",
    collection_name=COLLECTION_NAME,
    embedding=embeddings,
)

In [101]:
# Insert every chunk in a single call so `inserted_ids` holds the ids of all
# documents; a per-document loop would overwrite it on each pass and keep only
# the ids of the last insert. An empty `text` skips the call and yields [].
# NOTE(review): re-running this cell presumably stores the same chunks again —
# confirm whether duplicates in the collection are acceptable.
inserted_ids = vector_store.add_documents(text) if text else []

In [None]:
# vector_index = MatchingEngine.from_components(
#     texts=texts,
#     project_id="<my_project_id>",
#     region="<my_region>",
#     gcs_bucket_uri="<my_gcs_bucket>",
#     index_id="<my_matching_engine_index_id>",
#     endpoint_id="<my_matching_engine_endpoint_id>",
#     embedding=embeddings
# )
# question = 'tell me about homelessness in india?'
# docs = vector_index.get_relevant_documents(question)

In [102]:
# Sample user question for the retrieval step.
question = 'tell me about homelessness in india?'

# Fetch the three stored chunks most similar to the question.
docs = vector_store.similarity_search(
    question,
    k=3,
)

In [37]:
# Show the retrieved documents as the cell output.
docs

[Document(page_content="India is the second most unequal country in the world, with 55 percent of income going to the top 10 percent of its population. Since India’s population increased exponentially, many cities ran out of space to contain the growing population. According to the Homeless World Cup, there were about 1.8 million homeless individuals living in India as of 2019. Over half of this population was living in urban areas, such as slums on the edge of cities. Unfortunately, the majority of the homeless often experience displacement through government-endorsed city beautification programs or by natural disasters. Due to their lack of resources, those who are homeless and poor struggle to recover from these events.\n\nConditions\n\nMost homeless people in India live on the streets of cities, under bridges, on highways or in any place they can seek refuge. Some maintain a nomadic lifestyle, where they roam around to find the best area for themselves and/or their families to live

In [38]:
# Plain-text content of each retrieved document, in retrieval order.
docs_str = [str(doc.page_content) for doc in docs]

In [39]:
# The prompt template interpolates `context` as text, so hand over the retrieved
# chunks as one string separated by blank lines. Passing the list itself would
# embed its Python repr (brackets, quotes, escaped newlines) in the prompt.
stuff_answer = conversation.predict(
    context="\n\n".join(docs_str),
    question=question,
)

In [None]:
# stuff_answer = conversation(
#     {"context": docs_str, "question": question}
# )

In [40]:
# Pretty-print the chain's answer.
pprint(stuff_answer)

('        In India, there are about 1.8 million homeless individuals living on '
 'the streets, under bridges, on highways or in any place they can seek '
 'refuge.')


In [None]:
# stuff_answer = conversation(
#     {"context": docs_str, "question": question}
# )