## Setting global variables (OPENAI_API_KEY)

In [21]:
!pip install docx2txt



In [22]:

import os
import logging
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.schema import Document

from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core import Settings


# Timestamped, level-tagged log lines for everything logged in this notebook.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# Copy the key from API_KEY into OPENAI_API_KEY.
# os.environ only accepts strings, so assigning os.getenv(...) directly raises a
# TypeError when API_KEY is unset. Fall back to an already-exported
# OPENAI_API_KEY and emit a readable warning when neither variable is present.
_api_key = os.getenv('API_KEY') or os.getenv('OPENAI_API_KEY')
if _api_key:
    os.environ['OPENAI_API_KEY'] = _api_key
else:
    logger.warning(
        "Neither API_KEY nor OPENAI_API_KEY is set; requests to OpenAI will fail until one is exported."
    )

# Root directory under which the per-subdirectory vector indexes are persisted.
VECTOR_STORAGE_DIR = "./vector"
# Root directory that holds the source documents to index.
DATA_STORAGE_DIR = "./data"

# Make sure the data directory exists so later directory walks do not fail.
os.makedirs(DATA_STORAGE_DIR, exist_ok=True)


In [23]:
def get_file_names_in_directory(directory_path):
    """Recursively collect every file below ``directory_path``.

    Returns:
        A ``(file_paths, names)`` tuple of two parallel lists: the path of each
        file (directory joined with the file name) and the bare file name.
    """
    discovered = [
        (os.path.join(current_dir, entry), entry)
        for current_dir, _subdirs, entries in os.walk(directory_path)
        for entry in entries
    ]
    file_paths = [full_path for full_path, _ in discovered]
    names = [bare_name for _, bare_name in discovered]
    return file_paths, names

# Walk the data directory and echo each file's path followed by its bare name
file_paths, file_names = get_file_names_in_directory(DATA_STORAGE_DIR)
for file_path, file_name in zip(file_paths, file_names):
    print(file_path, file_name, sep="\n")
    


./data/Tutor training handbook/Tutor training handbook.pdf
Tutor training handbook.pdf
./data/Pronouns and Prepositions/(Pronouns) Potencia - Lesson 3 - Nikita Goyal.docx
(Pronouns) Potencia - Lesson 3 - Nikita Goyal.docx
./data/Pronouns and Prepositions/(Pronouns_Prepositions) Class 6 - Amber Adelman.pptx
(Pronouns_Prepositions) Class 6 - Amber Adelman.pptx
./data/Pronouns and Prepositions/(Pronouns_Food) Class 5  - Amber Adelman.pdf
(Pronouns_Food) Class 5  - Amber Adelman.pdf
./data/Pronouns and Prepositions/(Prepositions_Verbs) Lesson 2 - Leila - Ashley Cornwell.pptx
(Prepositions_Verbs) Lesson 2 - Leila - Ashley Cornwell.pptx
./data/Pronouns and Prepositions/(Prepositions) - Paridhi Rathi.pptx
(Prepositions) - Paridhi Rathi.pptx
./data/Pronouns and Prepositions/(Preposition_Possession) Lesson #2 - Adriana Da Gama Henriques.pptx
(Preposition_Possession) Lesson #2 - Adriana Da Gama Henriques.pptx
./data/Pronouns and Prepositions/(Prepositions_Verbs) Lesson 3 - Leila - Ashley Cornwel

## Utility functions for data preprocessing
- load_any_documents
- load_pptx_text
- extract_text_from_pdf

In [None]:
!pip install pymupdf

In [39]:
from pptx import Presentation
import fitz
def load_pptx_text(file_path):
    """Given a pptx file path, extract all the text from it.

    Args:
        file_path: Path of the .pptx file to open.

    Returns:
        A list of strings, one per non-empty paragraph, in slide/shape order.
        Shapes without a text frame are skipped.
    """
    pres = Presentation(file_path)
    pptx_text = []
    for slide in pres.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                # A paragraph is split into several runs wherever formatting
                # changes. Emitting each run separately made callers that join
                # with "\n" break sentences mid-line, so glue the runs back
                # together and emit one entry per paragraph.
                paragraph_text = "".join(run.text for run in paragraph.runs)
                if paragraph_text:
                    pptx_text.append(paragraph_text)

    return pptx_text

def extract_text_from_pdf(file_path):
    """Extract text from a PDF file using PyMuPDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines. If reading fails, the
        error is printed and whatever text was collected before the failure
        (possibly an empty string) is returned.
    """
    text = []
    try:
        # The context manager closes the document even when a page fails to
        # extract; an explicit close() after the loop is skipped on error.
        with fitz.open(file_path) as pdf_document:
            for page in pdf_document:
                text.append(page.get_text())
    except Exception as e:
        # Best-effort: report the problem and fall through with partial text.
        print(f"Error reading PDF file {file_path}: {e}")
    return "\n".join(text)

def load_any_documents(dirpath:str):
    """Load every file that sits directly inside ``dirpath`` as documents.

    .pptx and .pdf files (extension matched case-insensitively) are converted
    to text by ``load_pptx_text`` / ``extract_text_from_pdf`` and wrapped in a
    ``Document`` whose ``doc_id`` is the file name. Every other file is handed
    to ``SimpleDirectoryReader``.

    Args:
        dirpath: Directory whose files should be loaded. Sub-directories are
            not descended into.

    Returns:
        A list of loaded documents, ordered by file name.
    """
    documents = []
    # os.listdir order is arbitrary; sort so the document order is reproducible.
    for file_name in sorted(os.listdir(dirpath)):
        file_path = os.path.join(dirpath, file_name)

        # os.listdir also returns sub-directories, which are not readable as
        # files; skip anything that is not a regular file.
        if not os.path.isfile(file_path):
            continue

        lowered_name = file_name.lower()
        if lowered_name.endswith(".pptx"):
            # handle pptx file
            pptx_text = load_pptx_text(file_path)
            documents.append(Document(text="\n".join(pptx_text), doc_id=file_name))
        elif lowered_name.endswith(".pdf"):
            pdf_text = extract_text_from_pdf(file_path)
            documents.append(Document(text=pdf_text, doc_id=file_name))
        else:
            documents += (SimpleDirectoryReader(input_files=[file_path]).load_data())

    return documents


# documents = load_any_documents("./data/Verbs")
# for doc in documents:
#     print(f"{type(doc)}")

## Loading Data 
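- Read every file from the designated directory with `load_any_documents` (which falls back to SimpleDirectoryReader for formats it does not handle itself); it returns a list of Document objects.
- docx, pptx, and pdf file formats are all supported.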

## Storage Context
- represents the location for both **storing** and **loading** the index data
- In this demo, each subdirectory is represented as an index
  - We could explore other alternatives. 

```
./data/Verbs/
├── Category1/
│   ├── SubcategoryA/
│   │   ├── file1.txt
│   │   └── file2.txt
│   └── SubcategoryB/
│       └── file3.txt
└── Category2/
    └── file4.txt

./vector/Verbs/
├── Category1/
│   ├── SubcategoryA/       # Indexes for SubcategoryA files are stored here
│   └── SubcategoryB/       # Indexes for SubcategoryB files are stored here
└── Category2/              # Indexes for Category2 files are stored here

## Indexing
- generates embedding for each document, using designated embedding model. 
- VectorStoreIndex.from_documents: generate index from document nodes. 

In [48]:


# Parallel lists: index_list[i] is the index whose topic is description_list[i].
index_list = []
description_list = []
# Every directory under DATA_STORAGE_DIR that directly contains files gets its own index
for dirpath, _, file_names in os.walk(DATA_STORAGE_DIR):
    if file_names:
        relative_path = os.path.relpath(dirpath, DATA_STORAGE_DIR)
        storage_dir = os.path.join(VECTOR_STORAGE_DIR, relative_path)
        os.makedirs(storage_dir, exist_ok=True)
        # The directory name doubles as the index description. basename/normpath
        # is used instead of split('/') so this also works with other path separators.
        description = os.path.basename(os.path.normpath(dirpath))

        try:
            storage_context = StorageContext.from_defaults(persist_dir=storage_dir)
            index = load_index_from_storage(storage_context)
            print(description)
            logger.info(f"Loaded existing index for subdirectory {relative_path}.")
        except FileNotFoundError:
            logger.info(f"Index for {relative_path} not found. Creating a new index.")

            # load all document in current subdirectory
            documents = load_any_documents(dirpath)
            index = VectorStoreIndex.from_documents(documents)
            index.storage_context.persist(storage_dir)
        except Exception as e:
            logger.error(f"Error processing subdirectory {relative_path}: {e}")
            continue

        # Record index and description together in both branches. Previously a
        # freshly built index was appended without its description, so the two
        # lists went out of step and a later zip() paired them incorrectly.
        index_list.append(index)
        description_list.append(description)
        

2024-12-04 00:44:04,421 [INFO] Loading all indices.
2024-12-04 00:44:04,425 [INFO] Loaded existing index for subdirectory Tutor training handbook.


Tutor training handbook


2024-12-04 00:44:05,391 [INFO] Loading all indices.
2024-12-04 00:44:05,395 [INFO] Loaded existing index for subdirectory Pronouns and Prepositions.


Pronouns and Prepositions


2024-12-04 00:44:06,380 [INFO] Loading all indices.
2024-12-04 00:44:06,383 [INFO] Loaded existing index for subdirectory Verbs.


Verbs


### Query Engine
- In the previous code block, we converted all the files in the Verbs example directory into an index (embeddings).
- Now we can retrieve the relevant chunks using the query engine.
- A query engine in the context of information retrieval and language models is a component that processes queries by searching through a collection of data (like a set of documents or a database) and returning the most relevant information. 

### Retriever
- Retrievers are responsible for fetching the most relevant context given a user query (or chat message).

### RouterQueryEngine
- Routers are modules that take in a user query and a set of "choices" (defined by metadata), and return one or more selected choices.



In [65]:
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector
from llama_index.core.selectors import (
    PydanticMultiSelector,
    PydanticSingleSelector,
)
def pretty_print_nodes_with_scores(nodes):
    """Print the text, score and source metadata of each scored node.

    Each item in ``nodes`` must expose ``.node`` (with ``.text`` and a
    ``.metadata`` mapping) and ``.score``. A missing "source" prints as
    ``None``; a missing "page" prints as ``N/A``. Entries are separated by a
    ``---`` rule.
    """
    for scored_entry in nodes:
        inner, relevance = scored_entry.node, scored_entry.score

        # Content first, then its score, then where it came from.
        print("Content:", inner.text)
        print("Relevance Score:", relevance)
        print("Source:", inner.metadata.get("source"))
        print("Page:", inner.metadata.get("page", "N/A"))
        print("\n---\n")
    
# 1. query_engine
# One query-engine tool per index; the router picks a tool by its description.
# (Per-index chat engines via index.as_chat_engine() are left disabled.)
chat_engines = []
query_engine_tools = []
for description, index in zip(description_list, index_list):
    tool = QueryEngineTool.from_defaults(
        query_engine=index.as_query_engine(similarity_top_k=3),
        description=f"used to handle queries related to {description}.",
    )
    query_engine_tools.append(tool)

# Route every question to a single tool, then run one sample query through it.
router_selector = PydanticSingleSelector.from_defaults()
query_engine = RouterQueryEngine(
    selector=router_selector,
    query_engine_tools=query_engine_tools,
)
response = query_engine.query("What tools are recommended for virtual meetings?")
print(str(response))
# print(response)

# 2. retriever
# retriever = index_list[0].as_retriever()
# nodes = retriever.retrieve("Who contributed to the handbook?")

# pretty_print_nodes_with_scores(nodes[:3])

# 3. RouterQueryEngine: if more than one query engine is created, we can try this out.



2024-12-04 01:24:41,641 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:24:41,643 [INFO] Selecting query engine 0: The Tutor training handbook may contain information on tools recommended for virtual meetings..
2024-12-04 01:24:41,855 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:24:42,241 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Zoom or Google Meet are recommended for virtual meetings.


In [16]:
!pip install requests beautifulsoup4



## Test cases for QA

In [67]:
import pandas as pd

# Hand-written (question, reference answer) pairs used to evaluate the RAG pipeline.
_QA_PAIRS = (
    ("How does Potencia facilitate communication between tutors and learners?",
     "Through scheduled meetings and shared platforms."),
    ("What tools are recommended for virtual meetings?",
     "Zoom, Google Meet, and consistent links."),
    ("How can tutors help learners overcome challenges with technology?",
     "Provide guidance and resources."),
    ("Why is consistency in meeting links important for online sessions?",
     "It reduces confusion and ensures reliability."),
    ("What questions should tutors ask themselves after each session for self-reflection?",
     "What went well, and what can I improve?"),
    ("How can reflecting with learners after class improve teaching effectiveness?",
     "It provides feedback and builds rapport."),
    ("What factors should tutors consider when deciding on the next topic to teach?",
     "Learner goals and prior progress."),
    ("What is the recommended structure for a tutoring session?",
     "Warm-up, main activity, wrap-up."),
    ("How should a session be wrapped up effectively?",
     "Summarize and discuss next steps."),
    ("What is the tutoring session policy regarding session logging and cancellations?",
     "Log sessions and inform in advance about cancellations."),
    ("What informal assessments are recommended during a class session?",
     "Observation and on-the-spot questions."),
    ("How can quizzes and reading activities help assess a learner's knowledge?",
     "They evaluate comprehension and retention."),
    ("What forms of support does Potencia offer to tutors during the semester?",
     "Workshops, feedback sessions, and resources."),
)

# Same records as dictionaries, one per question.
data = [{"Question": question, "Answer": answer} for question, answer in _QA_PAIRS]

# Two-column frame: "Question" and "Answer".
df = pd.DataFrame(data)

# Persist the reference set without the row index.
output_file = "Tutor_Questions_and_Answers.csv"
df.to_csv(path_or_buf=output_file, index=False)



In [71]:
# Re-load the saved reference questions and ask each one through the router query engine.
df = pd.read_csv(output_file)
questions = df['Question']
rag_responses = []
for question in questions:
    print(question)
    # Keep the answer text rather than the response object so the column holds plain strings.
    rag_responses.append(str(query_engine.query(question)))

df['RAG_Answer'] = rag_responses
# index=False matches how the file was first written. Writing the row index
# adds an "Unnamed: 0" column the next time this file is read back.
df.to_csv(output_file, index=False)

How does Potencia facilitate communication between tutors and learners?


2024-12-04 01:52:27,118 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:27,123 [INFO] Selecting query engine 0: Potencia facilitates communication between tutors and learners by providing guidance on Tutor training handbook, which can help tutors improve their communication skills..
2024-12-04 01:52:27,517 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:28,386 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


What tools are recommended for virtual meetings?


2024-12-04 01:52:28,848 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:28,851 [INFO] Selecting query engine 0: The Tutor training handbook may contain information on recommended tools for virtual meetings..
2024-12-04 01:52:29,212 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:29,609 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


How can tutors help learners overcome challenges with technology?


2024-12-04 01:52:30,087 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:30,090 [INFO] Selecting query engine 0: The Tutor training handbook may provide guidance on how tutors can help learners overcome challenges with technology..
2024-12-04 01:52:30,631 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:33,979 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Why is consistency in meeting links important for online sessions?


2024-12-04 01:52:34,465 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:34,467 [INFO] Selecting query engine 0: The choice related to Tutor training handbook may provide guidelines and best practices for maintaining consistency in meeting links for online sessions..
2024-12-04 01:52:34,942 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:35,717 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


What questions should tutors ask themselves after each session for self-reflection?


2024-12-04 01:52:36,227 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:36,229 [INFO] Selecting query engine 0: The Tutor training handbook may contain guidelines and suggestions for tutors on self-reflection questions after each session..
2024-12-04 01:52:36,752 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:37,388 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


How can reflecting with learners after class improve teaching effectiveness?


2024-12-04 01:52:37,973 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:37,976 [INFO] Selecting query engine 0: Reflecting with learners after class can be a part of Tutor training handbook, which may provide guidance on effective reflection strategies for teachers to improve teaching effectiveness..
2024-12-04 01:52:38,418 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:39,549 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


What factors should tutors consider when deciding on the next topic to teach?


2024-12-04 01:52:39,989 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:39,992 [INFO] Selecting query engine 0: The Tutor training handbook may provide guidelines and considerations for tutors when deciding on the next topic to teach..
2024-12-04 01:52:40,327 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:40,942 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


What is the recommended structure for a tutoring session?


2024-12-04 01:52:41,757 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:41,760 [INFO] Selecting query engine 0: The Tutor training handbook would likely contain information on recommended structures for tutoring sessions..
2024-12-04 01:52:42,227 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:43,598 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


How should a session be wrapped up effectively?


2024-12-04 01:52:44,111 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:44,114 [INFO] Selecting query engine 2: Verbs are often used to indicate actions, which can be related to wrapping up a session effectively..
2024-12-04 01:52:44,476 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:45,080 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


What is the tutoring session policy regarding session logging and cancellations?


2024-12-04 01:52:45,652 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:45,654 [INFO] Selecting query engine 0: The Tutor training handbook is likely to contain policies and guidelines related to tutoring sessions, including session logging and cancellations..
2024-12-04 01:52:46,137 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:46,732 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


What informal assessments are recommended during a class session?


2024-12-04 01:52:47,188 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:47,191 [INFO] Selecting query engine 0: The Tutor training handbook may provide guidelines on informal assessments recommended during a class session..
2024-12-04 01:52:47,393 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:47,829 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


How can quizzes and reading activities help assess a learner's knowledge?


2024-12-04 01:52:48,417 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:48,420 [INFO] Selecting query engine 0: The Tutor training handbook may provide guidance on how quizzes and reading activities can be used to assess a learner's knowledge..
2024-12-04 01:52:48,724 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:49,729 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


What forms of support does Potencia offer to tutors during the semester?


2024-12-04 01:52:50,361 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-04 01:52:50,364 [INFO] Selecting query engine 0: The choice related to handling queries related to Tutor training handbook seems most relevant to the question about the forms of support Potencia offers to tutors during the semester..
2024-12-04 01:52:50,666 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-04 01:52:51,795 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


## ChatEngine seems to perform much better than RouterQueryEngine
Chat engine response: 

Here is a full lesson plan on verbs with specific practice examples:

1. Introduction to Irregular Verb Tenses:
   - Provide examples of irregular verbs in present and past tenses, such as "am/was," "write/wrote," "draw/drew," "do/did," "make/made," "meet/met," "pay/paid," "send/sent," "sleep/slept," "stand/stood," "read/read," "cut/cut," "buy/bought," and "see/saw."
   
2. Practice Activity:
   - Engage students in creating sentences using both the present and past forms of irregular verbs. For example, transform "I pay my bills every month" into "I paid the bills last month."
   
3. Future Tense:
   - Introduce future tense examples like "I am going to/I will" for "am/was," "I will write" for "write/wrote," and so on. Have students practice creating sentences using the future tense forms of irregular verbs.
   
4. Conclusion and Homework:
   - Review the irregular verb tenses covered in the lesson.
   - Assign homework that involves creating sentences using both past and future tense forms of irregular verbs.

Practice Examples:
1. Write sentences using both the present and past forms of irregular verbs, such as "I am a worker" and "I was a student."
2. Create sentences with irregular verb tenses like "I draw a flower" and "I drew a house."
3. Practice using past tense words like "Yesterday I made chicken" and "Yesterday, I stood in line at the store."
4. Form sentences with irregular verbs in present and past forms, such as "I send mail" and "I sent a letter."
5. Utilize phrases with irregular verbs like "I see you right now" and "I saw my mom yesterday."

Feel free to incorporate these practice examples into your lesson plan to enhance student understanding of irregular verb tenses.
