## Setting global variables (OPENAI_API_KEY)

In [66]:
!pip install docx2txt



In [None]:

import os
import logging
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.schema import Document

from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core import Settings


# Source documents are read from DATA_STORAGE_DIR; persisted vector indexes
# are written under VECTOR_STORAGE_DIR.
DATA_STORAGE_DIR = "./data"
VECTOR_STORAGE_DIR = "./vector"

# Create the data directory up front so later directory walks do not fail
# on a fresh checkout.
os.makedirs(DATA_STORAGE_DIR, exist_ok=True)

# Placeholder credential exported through the process environment.
# NOTE(review): presumably consumed by the OpenAI client -- avoid committing
# a real key in this cell.
os.environ["OPENAI_API_KEY"] = "..."

# INFO-level logging with a timestamp and level prefix on every record.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


In [68]:
def get_file_names_in_directory(directory_path):
    """Recursively collect every file found under ``directory_path``.

    Returns a ``(file_paths, names)`` tuple of two parallel lists: the path
    of each file (its containing directory joined with its name) and the
    bare file name, both in ``os.walk`` order.
    """
    located = [
        (os.path.join(folder, entry), entry)
        for folder, _, entries in os.walk(directory_path)
        for entry in entries
    ]
    full_paths = [path for path, _ in located]
    base_names = [name for _, name in located]
    return full_paths, base_names

# Walk the data directory and echo each file's path followed by its name,
# one value per line.
file_paths, file_names = get_file_names_in_directory(DATA_STORAGE_DIR)
for file_path, file_name in zip(file_paths, file_names):
    print(file_path, file_name, sep="\n")
    


./data/Verbs/(Verbs Practice) Potencia Worksheet 1_Elaine - Shilpi Dey.docx
(Verbs Practice) Potencia Worksheet 1_Elaine - Shilpi Dey.docx
./data/Verbs/(Verbs) 5 Nitieixan & Deinielle - Jakob Lattanzi.pptx
(Verbs) 5 Nitieixan & Deinielle - Jakob Lattanzi.pptx
./data/Verbs/(Verbs) 2 To Be - Lara Creyghton.pptx
(Verbs) 2 To Be - Lara Creyghton.pptx
./data/Verbs/(Verbs) English Class #3 - Danielle Coan.pptx
(Verbs) English Class #3 - Danielle Coan.pptx
./data/Verbs/(Verbs) - Paridhi Rathi.pptx
(Verbs) - Paridhi Rathi.pptx
./data/Verbs/(Verbs) Lesson on irregular verbs - Dana.docx
(Verbs) Lesson on irregular verbs - Dana.docx
./data/Verbs/(Verbs_Citizenship Practice) Potencia Tutoring (1) - Nicole Page.docx
(Verbs_Citizenship Practice) Potencia Tutoring (1) - Nicole Page.docx
./data/Verbs/(Verbs) 10 Lesson with Nitieixan - Jakob Lattanzi.pptx
(Verbs) 10 Lesson with Nitieixan - Jakob Lattanzi.pptx
./data/Verbs/(Verb Tenses) Lesson #4   - Adriana Da Gama Henriques.pptx
(Verb Tenses) Lesson #

## Utility functions for data preprocessing
- load_any_documents
- load_pptx_text

In [None]:
from pptx import Presentation

def load_pptx_text(file_path):
    """Return the text of every run in the .pptx file at ``file_path``.

    Slides, shapes, paragraphs and runs are visited in document order.
    Shapes without a text frame are skipped. The result is a flat list with
    one string per run.
    """
    deck = Presentation(file_path)
    return [
        run.text
        for slide in deck.slides
        for shape in slide.shapes
        if shape.has_text_frame
        for paragraph in shape.text_frame.paragraphs
        for run in paragraph.runs
    ]

def load_any_documents(dirpath: str):
    """Load every file that sits directly inside ``dirpath`` as documents.

    ``.pptx`` files (extension matched case-insensitively) are read with
    ``load_pptx_text`` and wrapped in a single ``Document`` whose text is
    the extracted strings joined by newlines and whose ``doc_id`` is the
    file name. Every other file is delegated to ``SimpleDirectoryReader``,
    which may return several documents per file.

    Subdirectories are skipped: this function is not recursive, and
    ``SimpleDirectoryReader(input_files=...)`` expects file paths.

    Args:
        dirpath: Directory whose immediate files should be loaded.

    Returns:
        A list of the loaded document objects, ordered by file name.

    Raises:
        OSError: If ``dirpath`` cannot be listed (e.g. it does not exist).
    """
    documents = []
    # os.listdir order is arbitrary; sort so the document order is stable.
    for file_name in sorted(os.listdir(dirpath)):
        file_path = os.path.join(dirpath, file_name)

        # os.listdir also yields subdirectories, which are not loadable files.
        if not os.path.isfile(file_path):
            continue

        if file_name.lower().endswith(".pptx"):
            # Slide decks go through the local python-pptx based extractor.
            pptx_text = load_pptx_text(file_path)
            documents.append(Document(text="\n".join(pptx_text), doc_id=file_name))
        else:
            documents.extend(SimpleDirectoryReader(input_files=[file_path]).load_data())

    return documents


# documents = load_any_documents("./data/Verbs")
# for doc in documents:
#     print(f"{type(doc)}")

<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.ImageDocument'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class 'llama_index.core.schema.Document'>
<class

## Loading Data 
- Read from the designated directory using SimpleDirectoryReader, which returns a list of Document objects.
- docx, ppt, pdf file formats are all supported

## Storage Context
- represents the location for both **storing** and **loading** the index data
- In this demo, each subdirectory is represented as an index
  - We could explore other alternatives. 

```
./data/Verbs/
├── Category1/
│   ├── SubcategoryA/
│   │   ├── file1.txt
│   │   └── file2.txt
│   └── SubcategoryB/
│       └── file3.txt
└── Category2/
    └── file4.txt

./vector/Verbs/
├── Category1/
│   ├── SubcategoryA/       # Indexes for SubcategoryA files are stored here
│   └── SubcategoryB/       # Indexes for SubcategoryB files are stored here
└── Category2/              # Indexes for Category2 files are stored here
```

## Indexing
- generates embedding for each document, using designated embedding model. 
- VectorStoreIndex.from_documents: generate index from document nodes. 

In [78]:

# Collects one index per data subdirectory, in os.walk order.
index_list = []

# Every directory under DATA_STORAGE_DIR that directly contains files gets
# its own index, persisted at the mirrored path under VECTOR_STORAGE_DIR.
for dirpath, _, file_names in os.walk(DATA_STORAGE_DIR):
    if not file_names:
        # A directory with no files of its own has nothing to index.
        continue

    relative_path = os.path.relpath(dirpath, DATA_STORAGE_DIR)
    storage_dir = os.path.join(VECTOR_STORAGE_DIR, relative_path)
    os.makedirs(storage_dir, exist_ok=True)

    try:
        # Reuse an index that an earlier run persisted for this directory.
        storage_context = StorageContext.from_defaults(persist_dir=storage_dir)
        index = load_index_from_storage(storage_context)
        index_list.append(index)
        logger.info(f"Loaded existing index for subdirectory {relative_path}.")
    except FileNotFoundError:
        # Nothing persisted yet: build the index from this directory's
        # files and persist it for the next run.
        # NOTE: an exception raised inside this handler is not caught by the
        # `except Exception` clause below; it propagates and stops the loop.
        logger.info(f"Index for {relative_path} not found. Creating a new index.")
        documents = load_any_documents(dirpath)
        index = VectorStoreIndex.from_documents(documents)
        index.storage_context.persist(storage_dir)
        index_list.append(index)
    except Exception as e:
        # Any other failure while loading is logged and this directory is
        # skipped.
        logger.error(f"Error processing subdirectory {relative_path}: {e}")
        

2024-11-08 22:44:31,522 [INFO] Loading all indices.
2024-11-08 22:44:31,534 [INFO] Loaded existing index for subdirectory Verbs.


### Query Engine
- In the previous code block, we converted all the files in the Verbs example directory into an index (embeddings).
- Now we can retrieve the relevant blocks using the query engine. 
- A query engine in the context of information retrieval and language models is a component that processes queries by searching through a collection of data (like a set of documents or a database) and returning the most relevant information. 

### Retriever
- Retrievers are responsible for fetching the most relevant context given a user query (or chat message).

### RouterQueryEngine
- Routers are modules that take in a user query and a set of "choices" (defined by metadata), and return one or more selected choices.



In [None]:
def _first_metadata_value(metadata, keys, default=None):
    """Return the value of the first key in ``keys`` present in ``metadata``, else ``default``."""
    for key in keys:
        if key in metadata:
            return metadata[key]
    return default


def pretty_print_nodes_with_scores(nodes):
    """Print the content, score and provenance of each retrieved node.

    Args:
        nodes: Iterable of objects exposing ``.node`` (with ``.text`` and a
            ``.metadata`` dict) and ``.score``.

    For every entry this prints the node text, its relevance score, a source
    and a page, followed by a ``---`` separator. The source is taken from
    the ``source`` metadata key, falling back to ``file_name`` and then
    ``file_path``; the page is taken from ``page``, falling back to
    ``page_label``. When none of the keys is present the source prints as
    ``None`` and the page as ``N/A``.
    """
    for node_with_score in nodes:
        node = node_with_score.node
        score = node_with_score.score
        metadata = node.metadata

        print("Content:", node.text)
        print("Relevance Score:", score)
        # "source"/"page" are kept as the preferred keys for backward
        # compatibility; the fallbacks are the keys LlamaIndex file readers
        # attach, so file-loaded nodes no longer print None / N/A.
        print("Source:", _first_metadata_value(metadata, ("source", "file_name", "file_path")))
        print("Page:", _first_metadata_value(metadata, ("page", "page_label"), "N/A"))
        print("\n---\n")
    
# 1. query_engine
# One query engine per index, each configured with similarity_top_k=3.
# With several indexes there will be several engines.
query_engine = [
    index.as_query_engine(similarity_top_k=3)
    for index in index_list
]

# response = query_engine[0].query("Help me generate a lesson plan for verb past tense")

# print(response)

# 2. retriever
# retriever = index_list[0].as_retriever()
# nodes = retriever.retrieve("What are verb past tense?")

# pretty_print_nodes_with_scores(nodes[:1])

# 3. RouterQueryEngine: if more than one query engine is created, we can try this out.


2024-11-08 23:53:41,049 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-08 23:53:43,222 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generate a lesson plan for verb past tense by starting with a warm-up activity where students reflect on their previous week's goals and performance. Introduce a list of irregular verbs in the present/past tenses and have students practice writing sentences using the past tense forms of these verbs. Include a grammar topic discussion on irregular verb tenses, focusing on examples like "I write" to "I wrote" and "I see" to "I saw." Incorporate a writing and reading aloud activity using an excerpt from a text to practice reading comprehension and advanced tenses. Conclude the lesson with a citizenship test activity where students write down answers to questions related to the President, Vice President, Cabinet positions, and branches of government. Assign homework that includes adding new words to a vocabulary list, practicing speaking in English, and writing out citizenship questions and answers daily.
