In [95]:
# Need to understand what's going in and out of each step?
# this part is a lot of debuggers and stuff like that
import os
import openai
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv()) # read the local .env file
openai.api_key = os.environ['OPENAI_API_KEY'] # this is now the API key to use

testEnvString = os.environ['TEST_ENV'] # test to see if .env is working
print(testEnvString)

llm_model = 'gpt-4o'

hello there!


In [96]:
# How can we make sure that what's happening is useful? How can we make sure our performance is working well?
# What if we use CHAINS THEMSELVES to evaluate them well?
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch
from langchain_openai import OpenAIEmbeddings

In [97]:
file = './assets/TopicsList.csv'
loader = CSVLoader(file_path=file)

data = loader.load()
print(data)


[Document(metadata={'source': './assets/TopicsList.csv', 'row': 0}, page_content='\ufeffGeneral Topic: Excel Data Types\nTestable Topics: How numbers are formatted in Excel, logical values in Excel, text in Excel'), Document(metadata={'source': './assets/TopicsList.csv', 'row': 1}, page_content='\ufeffGeneral Topic: IF FUNCTION\nTestable Topics: Questions around using the Excel IF Function'), Document(metadata={'source': './assets/TopicsList.csv', 'row': 2}, page_content='\ufeffGeneral Topic: Cell References\nTestable Topics: Questions around absolute referencing, relative referencing, mixed referencing'), Document(metadata={'source': './assets/TopicsList.csv', 'row': 3}, page_content='\ufeffGeneral Topic: MAX Function\nTestable Topics: Questions around the MAX function in Excel'), Document(metadata={'source': './assets/TopicsList.csv', 'row': 4}, page_content='\ufeffGeneral Topic: MIN Function\nTestable Topics: Questions around the MIN function in Excel'), Document(metadata={'source':

In [98]:
embeddings = OpenAIEmbeddings()

index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embeddings
).from_loaders([loader])

llm = ChatOpenAI(temperature = 0.0, model=llm_model)
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=index.vectorstore.as_retriever(), 
    verbose=True,
    chain_type_kwargs = {
        "document_separator": "<<<<>>>>>"
    }
)

# Later on, we can come up with a few VALUES that could be used to evaluate it against

In [99]:
# coming up with test datapoints
# Hard-coded examples
examples = [
    {
        "query": "Is the CONCATENATE function tested as part of this list of general topics?",
        "answer": "Yes"
    },
    {
        "query": "What kind of R functions are tested as part of Data Wrangling?",
        "answer": "dyplr, magrittr functions like select, summarise, filter, mutate"
    },
    {'query': 'What are the three specific testable topics under the general topic of "Excel Data Types" as mentioned in the document?', 
     'answer': 'The three specific testable topics under the general topic of "Excel Data Types" are: '
    },
    {'query': 'What is the general topic and the specific testable topics listed in the document?', 'answer': 'The general topic is the IF FUNCTION, and the specific testable topics are questions around using the Excel IF Function.'},
    
    
]

In [100]:
# ^ These hard coded examples are NOT scalable
# LLMs themselves can be used to automate them
# A chain example, is the QAgeneration chain. creates Q + A pair with LLMs

In [101]:
from langchain.evaluation.qa import QAGenerateChain

example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model))

new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in data[:5]]
)

print(new_examples)

# for new in new_examples:
#     examples += new['qa_pairs']

# how can we evaluate it? well, we can try running an example
qa.run(examples[0]["query"])

# It's usually not enough, to look at these parts of the chain. we have util here called langchain.debug



[{'qa_pairs': {'query': 'List the three testable topics related to Excel Data Types as mentioned in the document.', 'answer': 'The three testable topics related to Excel Data Types are:'}}, {'qa_pairs': {'query': 'What general topic does the document focus on, and what are the specific testable topics mentioned?', 'answer': 'The general topic the document focuses on is the Excel IF Function. The specific testable topics mentioned are questions around using the Excel IF Function.'}}, {'qa_pairs': {'query': 'What are the three types of cell referencing mentioned in the document, and what is one example of each type?', 'answer': 'The three types of cell referencing mentioned are absolute referencing, relative referencing, and mixed referencing. An example of absolute referencing is $A$1, an example of relative referencing is A1, and an example of mixed referencing is $A1 or A$1.'}}, {'qa_pairs': {'query': 'What is the general topic covered in the document, and what specific type of questi

'Yes, the CONCATENATE function is tested as part of the list of general topics. Specifically, it is included under the topic "How to use the CONCATENATE function to generate dynamic strings."'

In [102]:
import langchain
langchain.debug = True

# we rerun now. now this shows what debugging is like
qa.run(examples[0]["query"])

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Is the CONCATENATE function tested as part of this list of general topics?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Is the CONCATENATE function tested as part of this list of general topics?",
  "context": "﻿General Topic: CONCATENATE Function\nTestable Topics: How to use the CONCATENATE function to generate dynamic strings<<<<>>>>>﻿General Topic: AND, OR and NOT Function\nTestable Topics: Questions around the usage and evaluation of AND, OR and NOT functions<<<<>>>>>﻿General Topic: MATCH Function\nTestable Topics: Questions around how to use the MATCH function.<<<<>>>>>﻿General Topic: INDEX Function\nTestable Topics: Questions around how to use t

'Yes, the CONCATENATE function is tested as part of the list of general topics. Specifically, it falls under the general topic "CONCATENATE Function" with the testable topic "How to use the CONCATENATE function to generate dynamic strings."'

In [103]:
# turn off debug
langchain.debug = False

# How can we debug single inputs? how can we evaluate those? one way is manually, run chain and see what might be going on
# Can we get a language model to do it?
print(examples)

[{'query': 'Is the CONCATENATE function tested as part of this list of general topics?', 'answer': 'Yes'}, {'query': 'What kind of R functions are tested as part of Data Wrangling?', 'answer': 'dyplr, magrittr functions like select, summarise, filter, mutate'}, {'query': 'What are the three specific testable topics under the general topic of "Excel Data Types" as mentioned in the document?', 'answer': 'The three specific testable topics under the general topic of "Excel Data Types" are: '}, {'query': 'What is the general topic and the specific testable topics listed in the document?', 'answer': 'The general topic is the IF FUNCTION, and the specific testable topics are questions around using the Excel IF Function.'}]


LLM assisted evaluation

In [104]:
predictions = []
for example in examples:
    result = qa(example['query'])
    predictions.append({"query": example['query'], "result": result['result']})

from langchain.evaluation.qa import QAEvalChain

llm = ChatOpenAI(temperature=0, model=llm_model)
eval_chain = QAEvalChain.from_llm(llm)

# Prepare the data for evaluation
eval_data = [
    {
        "query": ex['query'],
        "answer": ex['answer'],  # Assuming your examples have an 'answer' key
        "result": pred['result']
    }
    for ex, pred in zip(examples, predictions)
]

graded_outputs = eval_chain.evaluate(examples, predictions)

print(graded_outputs[0].keys())
print(predictions)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
dict_keys(['results'])
[{'query': 'Is the CONCATENATE function tested as part of this list of general topics?', 'result': 'Yes, the CONCATENATE function is tested as part of the list of general topics. Specifically, it falls under the general topic "CONCATENATE Function" with the testable topic "How to use the CONCATENATE function to generate dynamic strings."'}, {'query': 'What kind of R functions are tested as part of Data Wrangling?', 'result': 'As part of Data Wrangling in R, the functions from the `dplyr` and `magrittr` packages are commonly tested. These functions include:\n\n1. **`select`**: Used to select specific columns from a dataframe.\n2. **`filter`**: Used to filter rows based on certa

In [105]:
# Print out and see what the output is like
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Predicted Answer: " + predictions[i]['result'])
    print("Predicted Grade: " + graded_outputs[i]['results'])
    print()

Example 0:
Question: Is the CONCATENATE function tested as part of this list of general topics?
Predicted Answer: Yes, the CONCATENATE function is tested as part of the list of general topics. Specifically, it falls under the general topic "CONCATENATE Function" with the testable topic "How to use the CONCATENATE function to generate dynamic strings."
Predicted Grade: CORRECT

Example 1:
Question: What kind of R functions are tested as part of Data Wrangling?
Predicted Answer: As part of Data Wrangling in R, the functions from the `dplyr` and `magrittr` packages are commonly tested. These functions include:

1. **`select`**: Used to select specific columns from a dataframe.
2. **`filter`**: Used to filter rows based on certain conditions.
3. **`mutate`**: Used to create new columns or modify existing ones.
4. **`summarise` (or `summarize`)**: Used to create summary statistics of data.
5. **`arrange`**: Used to sort data by one or more columns.
6. **`group_by`**: Used to group data by o

In [None]:
graded_outputs[0]