## Requirements

In [1]:
# %pip install pypdf
# %pip install pandas
# %pip install chromadb
# %pip install langchain
# %pip install "unstructured[all-docs]"
# %pip install python-dotenv
# %pip install google-generativeai
# %pip install langchain-google-genai 
# %pip install selenium
# %pip install "astrapy>=0.5.3"

## Imports

In [1]:
import os
from dotenv import load_dotenv

import warnings
from pprint import pprint

import textwrap
import pandas as pd

import firebase_admin
import google.generativeai as genai
from firebase_admin import credentials

from langchain import PromptTemplate
from langchain.schema import Document
from langchain.chains import LLMChain
# from langchain.vectorstores import Chroma
from langchain.document_loaders import SeleniumURLLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.conversation.memory import ConversationSummaryMemory

from langchain_community.vectorstores import AstraDB
# from langchain_community.vectorstores import MatchingEngine
from langchain_community.chat_message_histories.firestore import FirestoreChatMessageHistory

# warnings.filterwarnings("ignore")

## Parameters

In [2]:
# Load environment variables from the local dotenv file; later cells read the
# Google API key and the Astra DB settings from os.environ.
load_dotenv("./keys/.env")

# Service-account key file handed to firebase_admin's credentials.Certificate
# in the Auth section.
creds_path = "./keys/serviceAccountKey.json"

# URLs passed to SeleniumURLLoader in the "Loading Data" section.
# NOTE(review): every entry below is commented out, so this list is currently
# empty — re-enable the sources that should be (re)ingested.
data_list = [
    # "https://borgenproject.org/tag/homelessness-in-india/",
    # "https://www.noblenonprofit.org/homelessness-in-india-an-unsolved-problem/",
    # "https://caufsociety.com/homelessness-in-india/",
    # "https://en.wikipedia.org/wiki/Homelessness_in_India",
    # "https://www.hlrn.org.in/homelessness",
    # "https://blog.ipleaders.in/prevention-homelessness-problem-treated-legally/",
    # "https://borgenproject.org/homelessness-in-india/",
    # "https://www.aljazeera.com/gallery/2023/1/3/photos-new-delhis-homeless-shiver-in-biting-cold",
#     "https://www.quora.com/In-India-if-a-person-suddenly-becomes-homeless-what-are-some-tips-you-will-give-him",
#     "https://leaglesamiksha.com/2022/06/19/homelessness-in-india-causes-and-remedies/",
#     "https://www.orfonline.org/expert-speak/sheltering-the-urban-homeless"
]

## Auth

In [3]:
# Configure the google-generativeai SDK with the API key taken from the
# environment (None is passed through when the variable is not set).
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

In [4]:
# Initialise the default Firebase app from the service-account key.
# firebase_admin.initialize_app() raises ValueError when the default app
# already exists, which makes this cell fail when it is re-run in a live
# kernel. Look the app up first and only create it when it is missing.
cred = credentials.Certificate(creds_path)
try:
    firebase_admin.get_app()
except ValueError:
    # No default app registered yet in this process.
    firebase_admin.initialize_app(cred)

<firebase_admin.App at 0x7f74bb367f10>

In [5]:
# Astra DB connection settings, each read once from the environment.
ASTRA_DB_ID = os.environ.get('ASTRA_DB_ID')
ASTRA_DB_REGION = os.environ.get('ASTRA_DB_REGION')
ASTRA_DB_APPLICATION_TOKEN = os.environ.get('ASTRA_DB_APPLICATION_TOKEN')
ASTRA_DB_KEYSPACE = os.environ.get('ASTRA_DB_KEYSPACE')

# API endpoint built from the database id and region defined above, so the
# URL and the named constants cannot drift apart.
ASTRA_DB_API_ENDPOINT = f"https://{ASTRA_DB_ID}-{ASTRA_DB_REGION}.apps.astra.datastax.com"

# Name of the collection the vector store reads from and writes to.
COLLECTION_NAME = "HESTIA_chatbot_dev"

## Configure LLM

In [6]:
# Gemini chat model; the same instance is handed to the summary memory and to
# the LLM chain further down.
model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)  # type: ignore

In [7]:
# Embedding model passed to the AstraDB vector store.
embeddings = GoogleGenerativeAIEmbeddings(  # type: ignore
    model="models/embedding-001",
)

In [40]:
# Firestore-backed chat history; it is passed as `chat_memory` to the
# conversation summary memory below.
# NOTE(review): session_id and user_id are the literal placeholder "string" —
# confirm whether real per-session / per-user identifiers belong here.
message_history = FirestoreChatMessageHistory(
    collection_name="Chat_message",
    session_id="string",
    user_id="string",
)

## Loading Data

In [9]:
# Build a Selenium-based loader over the URLs in `data_list` and load them;
# `pages` is the input of the splitting step in the RAG pipeline section.
data_loader = SeleniumURLLoader(urls=data_list)
pages = data_loader.load()

KeyboardInterrupt: 

## Prompt Design - In Context

In [41]:
# Prompt for the answering chain.
#   history  - filled in by the chain's memory (memory_key="history")
#   context  - retrieved document text supplied by the caller
#   question - the user's query supplied by the caller
# The instruction defines two fallback replies and forbids mentioning the
# context/history in the answer.
prompt = PromptTemplate(
    input_variables=[
        "history",
        "context",
        "question",
    ],
    template=(
        """Act as a guide who answers queries regarding homelessness. Use the history and context to provide well-informed answers.
        If you do not know the answer and it is not covered by the context or the history, reply "I am not aware of it yet".
        If the requested data is not in the context, reply "contact your support team for more answers".
        Do not mention the context or the history in your reply (for example, do not say "the result is not in the context or history").

        History:{history}\n
        Context:{context}\n
        Question:{question}\n
        """
    )
)

In [42]:
# Summary memory driven by `model` and backed by `message_history`.
# It is exposed to the prompt under the "history" key, and "question" is
# declared as the chain input the memory should track.
summary_memory = ConversationSummaryMemory(
    llm=model,
    chat_memory=message_history,
    memory_key="history",
    input_key="question",
)

In [43]:
# Chain that renders `prompt`, sends it to `model`, and uses `summary_memory`
# to supply the {history} variable.
conversation = LLMChain(
    llm=model,
    prompt=prompt,
    memory=summary_memory,
    verbose=False,
)

## RAG pipeline

In [13]:
# Splitter configured for chunks of up to 5000 characters with no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=0,
)
# Concatenate the text of every loaded page (blank line between pages) ...
context = "\n\n".join(str(page.page_content) for page in pages)
# ... and cut the combined text into chunks.
texts = text_splitter.split_text(context)

NameError: name 'pages' is not defined

In [None]:
# Wrap each text chunk in a Document so it can be added to the vector store.
text = [Document(page_content=chunk) for chunk in texts]

In [None]:
# vector_index = Chroma.from_texts(texts, embeddings).as_retriever()

In [44]:
# Astra DB vector store over COLLECTION_NAME, embedding with `embeddings`.
# NOTE(review): the namespace is hard-coded to 'hestia' while the
# ASTRA_DB_KEYSPACE setting read from the environment is not used here —
# confirm which one is intended.
vector_store = AstraDB(
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    namespace="hestia",
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

In [None]:
# Add all chunk documents in one call instead of issuing one request per
# document. `inserted_ids` holds the ids returned for every stored chunk
# (a per-document loop would keep only the result of its last iteration).
inserted_ids = vector_store.add_documents(text)

In [None]:
# vector_index = MatchingEngine.from_components(
#     texts=texts,
#     project_id="<my_project_id>",
#     region="<my_region>",
#     gcs_bucket_uri="<my_gcs_bucket>",
#     index_id="<my_matching_engine_index_id>",
#     endpoint_id="<my_matching_engine_endpoint_id>",
#     embedding=embeddings
# )
# question = 'tell me about homelessness in india?'
# docs = vector_index.get_relevant_documents(question)

In [54]:
# Query text used both for retrieval and as the chain's "question" input.
question = 'Homelessness in India'
# Retrieve the 4 stored chunks most similar to the question.
docs = vector_store.similarity_search(question, k=4)

In [55]:
# Display the retrieved documents for inspection.
docs

[Document(page_content='Test Mode\n\nNotifications'),
 Document(page_content='www.outlookindia.com\n\nChecking if the site connection is secure\n\nwww.outlookindia.com needs to review the security of your connection before proceeding.\n\nConnection is secure\n\nProceeding...\n\nEnable JavaScript and cookies to continue\n\nRay ID: \n\nPerformance & security by \n\nCloudflare'),
 Document(page_content="Why do People Become Homeless?\n\nWhy is Homelessness a Problem?\n\nTiny Homes for the Homeless\n\nWhat is it Like to Be Homeless?\n\nRECENT\n\nWhere Can A Homeless Woman Go For Help?\n\nWho Funds Homeless Shelters?\n\nWhat To Do If You Will Soon Become Homeless\n\nWhat to Do When Homeless\n\nWhat To Do With Belongings When Homeless\n\nWho Has The Largest Homeless Population?\n\nCATEGORIES\n\nBeing Homeless\n\nFeatured\n\nHelping the Homeless\n\nHomelessness by Country and Continent\n\nHomelessness in the United States\n\nResources for the Homeless\n\nSOCIETY\n\nAbout\n\nContact\n\nPrivacy

In [56]:
# Keep only the text of each retrieved document; this list is what the chain
# call below receives as its context.
docs_str = [str(doc.page_content) for doc in docs]

In [57]:
# Display the extracted chunk texts for inspection.
docs_str

['Test Mode\n\nNotifications',
 'www.outlookindia.com\n\nChecking if the site connection is secure\n\nwww.outlookindia.com needs to review the security of your connection before proceeding.\n\nConnection is secure\n\nProceeding...\n\nEnable JavaScript and cookies to continue\n\nRay ID: \n\nPerformance & security by \n\nCloudflare',
 "Why do People Become Homeless?\n\nWhy is Homelessness a Problem?\n\nTiny Homes for the Homeless\n\nWhat is it Like to Be Homeless?\n\nRECENT\n\nWhere Can A Homeless Woman Go For Help?\n\nWho Funds Homeless Shelters?\n\nWhat To Do If You Will Soon Become Homeless\n\nWhat to Do When Homeless\n\nWhat To Do With Belongings When Homeless\n\nWho Has The Largest Homeless Population?\n\nCATEGORIES\n\nBeing Homeless\n\nFeatured\n\nHelping the Homeless\n\nHomelessness by Country and Continent\n\nHomelessness in the United States\n\nResources for the Homeless\n\nSOCIETY\n\nAbout\n\nContact\n\nPrivacy Policy\n\nDonate to Ending Homelessness\n\nTerms of Service\n\nImag

In [49]:
# stuff_answer = conversation.predict(
#     context=docs_str,
#     question=question,
# )

In [50]:
# Run the chain. The prompt template formats `context` as a string, so a
# Python list would be rendered as its repr (brackets, quotes, escaped "\n").
# Join the retrieved chunks into plain text before passing them in.
# {history} is not supplied here; it comes from the chain's memory.
stuff_answer = conversation(
    {"context": "\n\n".join(docs_str), "question": question}
)

In [51]:
# Pretty-print the generated answer, stored under the 'text' key of the chain output.
pprint(stuff_answer['text'])

('Homelessness in India is a major social issue, with a significant population '
 'living in census houses rather than regular residences. The United Nations '
 'Economic and Social Council defines homelessness as the lack of security of '
 'tenure, affordability, access to services, and cultural adequacy, as well as '
 'protection from forced eviction and displacement. Causes of homelessness in '
 'India include urbanization, lack of affordable housing, unemployment, and '
 'changes in industry. Policymakers attribute substance use, mental illness, '
 'relationship failures, and domestic abuse as the main causes of '
 'homelessness, placing responsibility and blame directly on the homeless. '
 'Street children in India face abuse, maltreatment, and lack of access to '
 'schools and healthcare. Efforts to assist the homeless include '
 'non-governmental and governmental services, such as shelters, food banks, '
 'and counseling.')
