In [1]:
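# First-time users: install the packages this notebook imports before running it.

# !pip install langchain
# !pip install langchain-community
# !pip install langchain-openai
# !pip install openai
# !pip install python-dotenv
# !pip install chromadb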

In [2]:
# Environment setup: base imports and the OpenAI API key, which is read from a .env file.
# The key must be configured before running this cell; README.md has the detailed instructions.

import sys
import os
import openai

# Add the directory two levels up (relative path '../..') to the module search path.
sys.path.append('../..')

from dotenv import find_dotenv, load_dotenv

# Read the local .env file; the return value is bound to `_` because it is not needed.
_ = load_dotenv(find_dotenv())

# Raises KeyError if OPENAI_API_KEY is not present in the environment at this point.
openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
# Load the WhatsApp chat history as LangChain documents and split it into chunks.

from langchain_community.document_loaders import WhatsAppChatLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = WhatsAppChatLoader("data/chat_history.txt")

# Load the chat export exactly once and keep the returned list of documents.
# (A second, discarded loader.load() call would only read the file again.)
docs = loader.load()

# Split the documents into multiple chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(docs)

# Number of chunks produced; displayed as the cell output.
len(splits)

10

In [5]:
# Create embeddings from the document chunks and keep them in a persistent Chroma vector database for reuse.
# The cell is safe to re-run: chunks are embedded and added only while the persisted collection is empty,
# so repeated executions (e.g. Restart & Run All) do not insert duplicate entries.
# NOTE: if the chat history file changes, delete the persist directory to rebuild the database from scratch.
# The printed number of entries in the collection should match the number of splits above.

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

embedding = OpenAIEmbeddings()
persist_directory = 'data/chroma_db/'

# Open the vector store backed by the persist directory.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory
)

# Populate the store only on the first run; later runs reuse what is already persisted.
if vectordb._collection.count() == 0:
    vectordb.add_documents(documents=splits)
    vectordb.persist()

print(vectordb._collection.count())

10


In [6]:
# Ask a question and get the relevant documents for it from the vector database

question = "What is discussed about AI Tax?"
docs = vectordb.similarity_search(question, k=5)

# Fail with a clear message instead of an IndexError when nothing was retrieved.
assert docs, "similarity_search returned no documents; check that the vector database is populated"

# Text of the first returned document; displayed as the cell output.
docs[0].page_content

'Manoj Joshi IITB on 4/14/24, 01:11: Think of this as a massive class action settlement of\n\nManoj Joshi IITB on 4/14/24, 01:34: Introducing such a tax on the makers of "Big Tech AI" will also give them a chance to consider if at all they want to invest in AI after all, as I suspect some of them may decide not to invest in AI at the same pace, once they realize it\'s not exclusively lucrative for them to operate under these new fiscal conditions. That will eliminate any false sense of a "heated market" and reduce the concern for a ballooning economy operating on hype rather than intrinsic value. One example of that is about the impact of GDPR on American companies who have sometimes made a conscious choice about not investing in reckless proliferation of software/apps in EU with little regard for data privacy and ethics. The net result is that the EU was compensated due to enforcement of GDPR on American big tech.\n\nManoj Joshi IITB on 4/14/24, 01:53: My points may make it seem like 

In [8]:
# Answer the question with a LangChain RetrievalQA chain that draws its context from the vector database

from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model used to generate the answer.
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(temperature=0, model_name=llm_name)

# Build the chain from the chat model and a retriever over the vector database.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

# Run the chain on the question defined earlier; the answer text is under the "result" key.
result = qa_chain.invoke({"query": question})
result["result"]

'In the conversation, there is a proposal to introduce a special tax on "Big Tech AI" companies. The purpose of this tax would be to make these companies consider their investments in AI more carefully and to ensure that they contribute to society by paying taxes if they want to keep a larger portion of the wealth generated by AI. The idea is not to hinder AI progress but to promote balanced growth and ensure that companies do not replace human workers with AI without contributing to society through taxes. The discussion also touches on the need for a new form of economics if there is a full-blown AI takeover in the future.'