# Create Llama Indices from the Grey Literature corpus

References  
[Getting Started With LlamaIndex](https://betterprogramming.pub/getting-started-with-llamaindex-169bbf475a94)  

## Import Relevant libraries and set API key

In [1]:
from pathlib import Path
from llama_index import download_loader

In [2]:
import os

In [16]:
import openai
# Set the OpenAI key on the openai client and export it through the
# environment (presumably for libraries such as llama_index that read
# OPENAI_API_KEY — confirm).
# NOTE(review): replace the placeholder locally; do not commit a real key.
openai.api_key = '<OpenAI key>'
os.environ["OPENAI_API_KEY"] = '<OpenAI key>'

In [42]:
## 1. Settings to load PDF documents
# Fetch the PyMuPDFReader loader class via download_loader and instantiate it.
PyMuPDFReader = download_loader("PyMuPDFReader")
loader = PyMuPDFReader()

# Documents for each corpus are loaded further below with
# load_pdf_files_in_directory(), which uses this `loader`.

# 2. Parse the docs into nodes
# Load requirements
from llama_index.node_parser import SimpleNodeParser
parser = SimpleNodeParser()

# Template to run for a specific documents list:
# nodes = parser.get_nodes_from_documents(documents)

# 3. Build an index
from llama_index import GPTVectorStoreIndex

# Template to run for a specific list of nodes:
# index = GPTVectorStoreIndex(nodes)

# 4. Store the index
# Template to persist the index; alter persist_dir as needed:
# index.storage_context.persist(persist_dir="index")

## Custom Functions 
Functions to:  
    (a) generate a list of pdfs from a directory for investigation  
    (b) load all pdfs from a given directory into a documents list

In [31]:
def get_list_of_pdfs_in_directory(directory_filepath):
    """Return the names of all ``.pdf`` files found under a directory.

    The directory tree is walked recursively with ``os.walk``. Only the bare
    file names (not their paths) are collected, in the order the walk yields
    them. The suffix check is case-sensitive.
    """
    return [
        filename
        for _root, _dirs, filenames in os.walk(directory_filepath)
        for filename in filenames
        if filename.endswith(".pdf")
    ]

In [29]:
def load_pdf_files_in_directory(directory_filepath, pdf_loader=None):
    """Load every ``.pdf`` file under a directory into one documents list.

    Args:
        directory_filepath: Root directory; walked recursively with ``os.walk``.
        pdf_loader: Optional object exposing
            ``load(file_path=..., metadata=...)`` that returns the documents
            for one file. Defaults to the notebook-level ``loader``.

    Returns:
        A flat list containing the documents returned by the loader for each
        PDF found, in walk order.
    """
    if pdf_loader is None:
        pdf_loader = loader

    # Initiate empty documents list
    documents_list = []

    # Walk directory to find the pdf files
    for root, _dirs, files in os.walk(directory_filepath):
        for file in files:
            # if pdf, load to documents
            if file.endswith(".pdf"):
                # Join with `root` (the directory currently being walked), not
                # the top-level directory: os.walk descends into
                # subdirectories, and files found there would otherwise
                # resolve to a wrong path.
                filepath = os.path.join(root, file)
                # extend() avoids re-copying the accumulated list per file.
                documents_list.extend(
                    pdf_loader.load(file_path=filepath, metadata=True)
                )

    return documents_list

## Generate Llama-index for Hand Coded files

In [35]:
# Set location of file directory (the relative path is resolved to an
# absolute one against the current working directory)
hand_coded_filefolder = os.path.abspath("../azure/code/data/raw/Gray_Literature_Round_1/Hand-Coded")

In [36]:
# Investigate files: collect the names of the PDFs found under the folder
hand_coded_files = get_list_of_pdfs_in_directory(hand_coded_filefolder)

In [37]:
# Preview the first five file names
hand_coded_files[0:5]

['ABC_2000_VSWSGuidebook.pdf',
 'ABC_2013_Validating Your Certification Exam.pdf',
 'ABC_2018_Guide Collect ClassI.pdf',
 'ABC_2018_Guide Collect ClassII.pdf',
 'ABC_2018_Guide Distr ClassII.pdf']

In [None]:
# 1. Load pdfs: read every PDF under the folder into a single documents list
hand_coded_documents = load_pdf_files_in_directory(hand_coded_filefolder)

In [12]:
# 2. Parse the docs into nodes with the SimpleNodeParser instance (`parser`)
hand_coded_nodes = parser.get_nodes_from_documents(hand_coded_documents)

In [17]:
# 3. Build an index (GPTVectorStoreIndex) over the parsed nodes
# NOTE(review): presumably calls the OpenAI API to embed the nodes, so the
# API key must be set first — confirm
hand_coded_index = GPTVectorStoreIndex(hand_coded_nodes)

In [18]:
# 4. Store the index: persist it to the "indices/index-hand-coded" directory
hand_coded_index.storage_context.persist(persist_dir="indices/index-hand-coded")

## Generate Llama-index for non-Hand Coded files

In [20]:
# Set location of file directory (the relative path is resolved to an
# absolute one against the current working directory)
not_coded_folder = os.path.abspath("../azure/code/data/raw/Gray_Literature_Round_1/Not-Hand-Coded")

In [32]:
# Investigate files: collect the names of the PDFs found under the folder
not_coded_pdfs = get_list_of_pdfs_in_directory(not_coded_folder)

In [33]:
# Preview the first five file names
not_coded_pdfs[0:5]

['ABC_2008_SampleBPAT.pdf',
 'ABC_2008_SampleVSWS.pdf',
 'ABC_2008_WaterTreatmentOperatorCertificationApplication031716.pdf',
 'ABC_2009_ExamEquivalencyChart0709.pdf',
 'ABC_2009_WastewaterTestingServiceExamEquivalencyChart0709.pdf']

In [30]:
# 1. Load pdfs: read every PDF under the folder into a single documents list
not_coded_documents = load_pdf_files_in_directory(not_coded_folder)

In [39]:
# 2. Parse the docs into nodes with the SimpleNodeParser instance (`parser`)
not_coded_nodes = parser.get_nodes_from_documents(not_coded_documents)

In [40]:
# 3. Build an index (GPTVectorStoreIndex) over the parsed nodes
# NOTE(review): presumably calls the OpenAI API to embed the nodes, so the
# API key must be set first — confirm
not_coded_index = GPTVectorStoreIndex(not_coded_nodes)

In [41]:
# 4. Store the index: persist it to the "indices/index-not-coded" directory
not_coded_index.storage_context.persist(persist_dir="indices/index-not-coded")