# Data and Data Science Chatbot

### Load the dotenv IPython extension

In [1]:
%load_ext dotenv
%dotenv

In [4]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from langchain_core.output_parsers import StrOutputParser

### Load the .docx, split it on Markdown headers, clean up whitespace, split into chunks of up to 500 characters at sentence ends, and initialize embeddings

In [5]:
# Read the course notes from the Word document.
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

# First pass: cut the text at "#" and "##" Markdown headers, labelling the two
# header levels "Course Title" and "Lecture Title".
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Course Title"),
        ("##", "Lecture Title"),
    ]
)
# Only the first loaded document is split.
pages_md_split = md_splitter.split_text(pages[0].page_content)

# Collapse every run of whitespace (including newlines) into a single space.
for section in pages_md_split:
    section.page_content = " ".join(section.page_content.split())

# Second pass: break each section at "." into chunks with a target size of
# 500 characters and a 50-character overlap.
char_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=500,
    chunk_overlap=50,
)
pages_char_split = char_splitter.split_documents(pages_md_split)

# Embedding model used when the chunks are indexed.
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")

### Build a Chroma vector store from the embedded document chunks

In [6]:
# Give every chunk a deterministic, position-based id. Without explicit ids each
# run stores the chunks under fresh random ids, so re-running this cell (or the
# whole notebook) keeps appending duplicate chunks to the persisted collection.
# With stable ids, a re-run writes to the same records instead.
# NOTE(review): a directory already populated by earlier runs still holds the
# old randomly-keyed copies; delete "./intro-to-ds-lectures" once to start clean.
chunk_ids = [f"intro-to-ds-chunk-{position}" for position in range(len(pages_char_split))]

# Embed the chunks and persist the collection to disk.
vectorstore = Chroma.from_documents(documents = pages_char_split,
                                    embedding = embedding,
                                    ids = chunk_ids,
                                    persist_directory = "./intro-to-ds-lectures")

### Load the existing Chroma vector store

In [7]:
# Re-open the collection persisted under ./intro-to-ds-lectures. The embedding
# model is the same one named when the store was built.
vectorstore = Chroma(
    persist_directory="./intro-to-ds-lectures",
    embedding_function=OpenAIEmbeddings(model="text-embedding-ada-002"),
)

  vectorstore = Chroma(persist_directory = "./intro-to-ds-lectures",


### Configure the vector store as an MMR retriever

In [8]:
# Expose the vector store as a retriever that uses MMR search and returns
# 3 chunks per query, with lambda_mult set to 0.7.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 3, "lambda_mult": 0.7},
)

### Define a prompt template for context-based Q&A

In [10]:
# Prompt text with two placeholders, {question} and {context}, that are filled
# in when the chain runs. It also asks the model to end its answer with a
# "Resources:" line naming the source lecture(s).
TEMPLATE = """
Answer the following question:
{question}

To answer the question, use only the following context:
{context}

At the end of the response, specify the name of the lecture this context is taken from in the format:
Resources: *Lecture Title*
where *Lecture Title* should be substituted with the title of all resource lectures.
"""

# Turn the raw string into a prompt template object.
prompt_template = PromptTemplate.from_template(TEMPLATE)

### Initialize the GPT-4 chat interface

In [11]:
# GPT-4 chat model with a fixed seed and max_tokens set to 250.
chat = ChatOpenAI(
    model_name="gpt-4",
    seed=365,
    max_tokens=250,
)

In [12]:
# Sample user query that is later passed to chain.invoke.
question = "What software do data scientists use?"

### Assemble the retrieval-QA pipeline

In [13]:
# The incoming question is fanned out to two branches: the retriever fetches
# the context, and the passthrough forwards the question text unchanged.
# Both results fill the prompt, which is sent to the chat model; the reply is
# then parsed into a plain string.
chain = (
    RunnableParallel(context=retriever, question=RunnablePassthrough())
    | prompt_template
    | chat
    | StrOutputParser()
)

### Chatbot Answer

In [14]:
# Run the whole pipeline on the sample question; as the cell's last expression,
# the returned answer string is displayed as the cell output.
chain.invoke(question)

'Data scientists use a variety of software and programming languages. R and Python are two of the most popular tools due to their ability to manipulate data and their integration within multiple data and data science software platforms. They can also solve a wide range of business and data-related problems. In addition to these, Hadoop, a software framework, is used to address the complexity of big data and its computational intensity. Furthermore, software like Power BI, SaS, Qlik, and Tableau are used for business intelligence visualizations.\n\nResources: Programming Languages & Software Employed in Data Science - All the Tools You Need'