## Requirements

In [1]:
# %pip install pypdf
# %pip install pandas
# %pip install chromadb
# %pip install langchain
# %pip install "unstructured[all-docs]"
# %pip install python-dotenv
# %pip install google-generativeai
# %pip install langchain-google-genai 
# %pip install selenium
# %pip install "astrapy>=0.5.3"

## Imports

In [2]:
import os
from dotenv import load_dotenv

import warnings
from pprint import pprint

import textwrap
import pandas as pd

import firebase_admin
import google.generativeai as genai
from firebase_admin import credentials

from langchain import PromptTemplate
from langchain.schema import Document
from langchain.chains import LLMChain
# from langchain.vectorstores import Chroma
from langchain.document_loaders import SeleniumURLLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.conversation.memory import ConversationSummaryMemory

from langchain_community.vectorstores import AstraDB
# from langchain_community.vectorstores import MatchingEngine
from langchain_community.chat_message_histories.firestore import FirestoreChatMessageHistory

warnings.filterwarnings("ignore")

## Parameters

In [3]:
# Service-account key file used to build the Firebase credentials.
creds_path = "./keys/serviceAccountKey.json"

# Pull environment variables in from the local dotenv file.
load_dotenv("./keys/.env")

# Source URLs handed to the URL loader. Every entry is currently commented
# out, so the list is empty.
data_list = [
    # "https://borgenproject.org/tag/homelessness-in-india/",
    # "https://www.noblenonprofit.org/homelessness-in-india-an-unsolved-problem/",
    # "https://caufsociety.com/homelessness-in-india/",
    # "https://en.wikipedia.org/wiki/Homelessness_in_India",
    # "https://www.hlrn.org.in/homelessness",
    # "https://blog.ipleaders.in/prevention-homelessness-problem-treated-legally/",
    # "https://borgenproject.org/homelessness-in-india/",
    # "https://www.aljazeera.com/gallery/2023/1/3/photos-new-delhis-homeless-shiver-in-biting-cold",
    # "https://www.quora.com/In-India-if-a-person-suddenly-becomes-homeless-what-are-some-tips-you-will-give-him",
    # "https://leaglesamiksha.com/2022/06/19/homelessness-in-india-causes-and-remedies/",
    # "https://www.orfonline.org/expert-speak/sheltering-the-urban-homeless"
]

## Auth

In [4]:
# Read the Gemini API key from the environment and hand it to the SDK.
google_api_key = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=google_api_key)

In [5]:
# Build credentials from the service-account key and make sure the default
# Firebase app exists. initialize_app() raises ValueError when the default app
# has already been created, so re-running this cell in the same kernel used to
# fail; reuse the existing app instead.
cred = credentials.Certificate(creds_path)
try:
    firebase_app = firebase_admin.get_app()
except ValueError:
    # get_app() raises ValueError when no default app has been initialised yet.
    firebase_app = firebase_admin.initialize_app(cred)
firebase_app

<firebase_admin.App at 0x7fb89072e1d0>

In [6]:
# Astra DB connection settings, each read once from the environment.
# A variable that is not set yields None (os.environ.get default).
ASTRA_DB_ID = os.environ.get('ASTRA_DB_ID')
ASTRA_DB_REGION = os.environ.get('ASTRA_DB_REGION')
ASTRA_DB_APPLICATION_TOKEN = os.environ.get('ASTRA_DB_APPLICATION_TOKEN')
ASTRA_DB_KEYSPACE = os.environ.get('ASTRA_DB_KEYSPACE')

# The endpoint is derived from the values above rather than re-reading the
# environment, so the database ID and region have a single source of truth.
ASTRA_DB_API_ENDPOINT = f"https://{ASTRA_DB_ID}-{ASTRA_DB_REGION}.apps.astra.datastax.com"

# Name of the vector-store collection used by this notebook.
COLLECTION_NAME = "HESTIA_chatbot_dev"

## Configure LLM

In [7]:
# Chat model used both for answering and for summarising the conversation.
llm_settings = {
    "model": "gemini-pro",
    "temperature": 0.3,
}
model = ChatGoogleGenerativeAI(**llm_settings)

In [8]:
# Embedding model passed to the vector store.
embedding_model_name = "models/embedding-001"
embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)

In [9]:
# Chat message history object; it is later plugged into the summary memory
# as its chat_memory.
history_settings = dict(
    collection_name='Chat_message',
    session_id='yolo',
    user_id='yolo',
)
message_history = FirestoreChatMessageHistory(**history_settings)

## Loading Data

In [10]:
# Load every URL listed in data_list; `pages` holds whatever the loader returns.
data_loader = SeleniumURLLoader(
    urls=data_list,
)
pages = data_loader.load()

## Prompt Design - In Context

In [None]:
# Prompt for the question-answering chain. It receives three variables:
#   history  - filled in by the chain's memory (memory_key="history")
#   context  - the retrieved document text
#   question - the user's question
# The instruction sentence was garbled ("refreter", run-on clause); it now
# states plainly that both the context and the conversation summary may be
# used, and when to give the fallback answer.
prompt = PromptTemplate(
    input_variables=[
      "history",
      "context", 
      "question",

    ],
    template=(
        """Answer the question as precisely as possible using the provided context and the
           summary of the past conversation (History). If the answer is not contained in
           either of them, say "answer not available in context":\n

        History:{history}\n
        Context:{context}\n
        Question:{question}\n
        """
    )
)

In [None]:
# Conversation memory: exposed to the prompt under "history", keyed on the
# "question" input, and backed by message_history.
memory_settings = {
    "llm": model,
    "memory_key": "history",
    "input_key": "question",
    "chat_memory": message_history,
}
summary_memory = ConversationSummaryMemory(**memory_settings)

In [None]:
# Chain that ties the prompt, the chat model and the summary memory together.
conversation = LLMChain(
    prompt=prompt,
    llm=model,
    memory=summary_memory,
    verbose=False,
)

## RAG pipeline

In [None]:
# Merge the content of all loaded pages into one string (blank line between
# pages), then cut it into chunks of at most 5000 characters with no overlap.
page_bodies = [str(page.page_content) for page in pages]
context = "\n\n".join(page_bodies)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=0)
texts = text_splitter.split_text(context)

In [None]:
# Wrap each text chunk in a Document so it can be added to the vector store.
text = []
for chunk in texts:
    text.append(Document(page_content=chunk))

In [None]:
# vector_index = Chroma.from_texts(texts, embeddings).as_retriever()

In [None]:
# Vector store backed by the Astra DB collection configured above.
# The keyspace used to be hard-coded to 'hestia', which silently ignored the
# ASTRA_DB_KEYSPACE setting loaded from the environment. Use the configured
# value and fall back to 'hestia' when it is not set, so existing setups that
# relied on the hard-coded name keep working.
vector_store = AstraDB(
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE or 'hestia',
)

In [None]:
# Add the documents to the vector store one at a time, collecting every
# returned ID. Previously `inserted_ids` was overwritten on each iteration
# (only the last document's IDs survived) and was left undefined when `text`
# was empty.
inserted_ids = []
for doc in text:
    inserted_ids.extend(vector_store.add_documents([doc]))

In [None]:
# vector_index = MatchingEngine.from_components(
#     texts=texts,
#     project_id="<my_project_id>",
#     region="<my_region>",
#     gcs_bucket_uri="<my_gcs_bucket>",
#     index_id="<my_matching_engine_index_id>",
#     endpoint_id="<my_matching_engine_endpoint_id>",
#     embedding=embeddings
# )
# question = 'tell me about homelessness in india?'
# docs = vector_index.get_relevant_documents(question)

In [None]:
# Fetch the three stored documents most similar to the question.
top_k = 3
question = 'tell me about homelessness in india?'
docs = vector_store.similarity_search(question, k=top_k)

In [None]:
# Display the documents returned by the similarity search.
docs

[Document(page_content='Posts\n\nGlobal Poverty, Homeslessness\n\nHomeless in India: Causes and Aid Available\n\nHomelessness in India is on the rise. Many people are turning to the streets as a place to sleep and find income by performing hard labor. Children living on the street are also becoming very common for many reasons, including abuse and family abandonment.\n\nCauses\n\nFactors that contribute to homelessness include impairment, a shortage of housing affordability, irregular or long-term unemployment and shifts in business. Policymakers state that the cause of homelessness is substance addiction, mental illness, relationship failures and domestic abuse. Prime Minister Modi has set a goal to eradicate homelessness by 2022, but the government has made no progress thus far.\n\nStreet Children\n\nAs a result of urbanization, poverty and other factors, children end up on the streets. In India, there are more than 400,000 children living on the street. According to UNICEF, there ar

In [None]:
# Keep only the text of each retrieved document, as plain strings.
docs_str = []
for doc in docs:
    docs_str.append(str(doc.page_content))

In [None]:
# Display the list of page-content strings extracted from `docs`.
docs_str

['Posts\n\nGlobal Poverty, Homeslessness\n\nHomeless in India: Causes and Aid Available\n\nHomelessness in India is on the rise. Many people are turning to the streets as a place to sleep and find income by performing hard labor. Children living on the street are also becoming very common for many reasons, including abuse and family abandonment.\n\nCauses\n\nFactors that contribute to homelessness include impairment, a shortage of housing affordability, irregular or long-term unemployment and shifts in business. Policymakers state that the cause of homelessness is substance addiction, mental illness, relationship failures and domestic abuse. Prime Minister Modi has set a goal to eradicate homelessness by 2022, but the government has made no progress thus far.\n\nStreet Children\n\nAs a result of urbanization, poverty and other factors, children end up on the streets. In India, there are more than 400,000 children living on the street. According to UNICEF, there are four categories of s

In [None]:
# stuff_answer = conversation.predict(
#     context=docs_str,
#     question=question,
# )

In [None]:
# Run the chain on the retrieved context and the question.
# `docs_str` is a list; passing it directly put the list's repr (brackets,
# quotes, escaped "\n") into the prompt. Join the documents into one plain-text
# block so the model sees the actual text, separated by blank lines.
stuff_answer = conversation(
    {"context": "\n\n".join(docs_str), "question": question}
)

In [None]:
# Pretty-print the generated answer stored under the chain's 'text' key.
answer_text = stuff_answer['text']
pprint(answer_text)

('- Homelessness in India is on the rise due to factors like impairment, '
 'housing shortage, unemployment, and substance addiction.\n'
 '- The homeless population in India is estimated to be around 1.77 million, '
 'with a majority being men between the ages of 18 and 60.\n'
 '- Homelessness is particularly prevalent in urban areas, with an estimated '
 '85% of homeless people living in cities.\n'
 '- The number of homeless people who are mentally ill or have substance abuse '
 'problems is also on the rise.\n'
 '- Homelessness during the rainy season increases due to flooding and the '
 'closure of homeless shelters, leading to increased difficulties and health '
 'risks for the homeless population.\n'
 '- Organizations like Aashray Adhikar Abhiyan, URJA Trust, and Salaam Baalak '
 'Trust are working to provide aid and support to homeless people in India.')
