See https://github.com/techleadhd/chatgpt-retrieval/blob/main/chatgpt.py and the video
[Using ChatGPT with YOUR OWN Data. This is magical. (LangChain OpenAI API)](https://youtu.be/9AXP7tCI9PI?si=Vblh5hzNao4LO_GF) from [TechLead](https://www.youtube.com/@TechLead).

In [6]:
%%time
from langchain.chains import ConversationalRetrievalChain
# In the __init__.py of langchain/libs/langchain/langchain/indexes/
from langchain.indexes import VectorstoreIndexCreator
# This code for DirectoryLoader is in
# langchain/libs/community/langchain_community/document_loaders/directory.py
from langchain_community.document_loaders import DirectoryLoader

CPU times: user 5 µs, sys: 5 µs, total: 10 µs
Wall time: 11.7 µs


In [2]:
from pathlib import Path
import sys

# Use the directory two levels above the current working directory as the
# project's "base" directory, so that project modules can be imported from it.
notebook_directory_ancestor = Path.cwd().resolve().parent.parent
print(notebook_directory_ancestor)
core_code_directory = notebook_directory_ancestor.joinpath("CoreCode/")

# Note whether each candidate directory is already present on sys.path, then
# report both findings; a later cell reads the first flag.
is_core_code_directory_in_sys_path = any(
    str(core_code_directory) == entry for entry in sys.path)
is_notebook_directory_ancestor_in_sys_path = any(
    str(notebook_directory_ancestor) == entry for entry in sys.path)
print("Is CoreCode directory in sys.path?", is_core_code_directory_in_sys_path)
print("Is notebook directory's ancestor in sys.path?", is_notebook_directory_ancestor_in_sys_path)

/InServiceOfX
Is CoreCode directory in sys.path? False
Is notebook directory's ancestor in sys.path? False


In [3]:
# Check sys.path at execution time instead of relying on the flag computed in an
# earlier cell, so re-running this cell never appends a duplicate entry.
if str(core_code_directory) not in sys.path:
    sys.path.append(str(core_code_directory))

In [17]:
from CoreCode.FileIO.get_filepaths import get_filepaths
from CoreCode.Utilities.LoadConfigurationFile import LoadConfigurationFile

In [5]:
# Load the project configuration; the keys printed in the next cell show that it
# provides 'BASE_DATA_PATH'.
configuration = LoadConfigurationFile.load_configuration_file()

In [6]:
# Show the available configuration keys, then build the Finances data directory
# path under BASE_DATA_PATH and check that it exists on disk.
print(configuration.keys())
finance_path = configuration['BASE_DATA_PATH'] / "Public" / "Finances/"
print(finance_path)
print(finance_path.exists())

dict_keys(['BASE_DATA_PATH'])
/Data/Public/Finances
True


By default, [DirectoryLoader](https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory) uses the `UnstructuredFileLoader` class (see the `loader_cls` printed below).

In [7]:
# Create a directory loader rooted at finance_path, with recursion, a progress
# bar and multithreading switched on.
dir_loader = DirectoryLoader(path=finance_path, recursive=True, show_progress=True, use_multithreading=True)
# Inspect the loader's effective settings: root path, glob pattern, exclusions
# and the per-file loader class.
print(dir_loader.path)
print(dir_loader.glob)
print(dir_loader.exclude)
print(dir_loader.loader_cls)

/Data/Public/Finances
**/[!.]*
()
<class 'langchain_community.document_loaders.unstructured.UnstructuredFileLoader'>


In [8]:
# Create an index creator whose vector store is given "chroma_persist" as its
# persist_directory.
# NOTE: this raised the ValidationError recorded below because OPENAI_API_KEY
# was not available when the cell ran.
vector_store_index = VectorstoreIndexCreator(vectorstore_kwargs={"persist_directory":"chroma_persist"})

  warn_deprecated(


ValidationError: 1 validation error for OpenAIEmbeddings
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)

In [15]:
from CoreCode.Utilities.LoadEnvironmentFile import load_environment_file
import os
# Presumably loads API keys (e.g. OPENAI_API_KEY) into the process environment —
# confirm against load_environment_file's implementation.
load_environment_file()

See `langchain/libs/langchain/langchain/indexes/vectorstore.py`: the default factory for the `embedding` field is `OpenAIEmbeddings`, which is why the OpenAI API key is required.

In [9]:
# Place the vector store's persistence directory directly under BASE_DATA_PATH.
chroma_persist_location = configuration['BASE_DATA_PATH'] / "chroma_persist"
print(chroma_persist_location)
print(type(chroma_persist_location))
# The Path is converted to str explicitly before being passed as persist_directory.
vector_store_index = VectorstoreIndexCreator(vectorstore_kwargs={"persist_directory":str(chroma_persist_location)})

/Data/chroma_persist
<class 'pathlib.PosixPath'>


ValidationError: 1 validation error for OpenAIEmbeddings
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)

In [10]:
%time
index = vector_store_index.from_loaders([dir_loader,])

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.58 µs


NameError: name 'vector_store_index' is not defined

[Retrieval, Document loaders, PDF](https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf)

In [11]:
from langchain_community.document_loaders import UnstructuredPDFLoader

In [12]:
%%time
# Collect the ".pdf" file paths found for finance_path and preview the first four.
paths = get_filepaths(finance_path, suffix=".pdf")
print(paths[:4])
# Keep one loader per PDF alongside whatever that loader returned.
loaders = []
datas = []
for path in paths:
    loader = UnstructuredPDFLoader(path)
    loaders.append(loader)
    # load() returns a list (see the printed type of datas[0] below).
    data = loader.load()
    datas.append(data)
    
print(type(loaders[0]))
print(type(datas[0]))

[PosixPath('/Data/Public/Finances/llc-10.pdf'), PosixPath('/Data/Public/Finances/llc-12-112021.pdf'), PosixPath('/Data/Public/Finances/llc-2.pdf'), PosixPath('/Data/Public/Finances/llc-2-na.pdf')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


<class 'langchain_community.document_loaders.pdf.UnstructuredPDFLoader'>
<class 'list'>


In [13]:
from langchain_community.document_loaders import UnstructuredFileLoader

In [14]:
# Try handing the whole list of PDF paths to a single UnstructuredFileLoader.
loader = UnstructuredFileLoader(paths)
print(type(loader))
# NOTE: in the recorded run load() failed with the cv2 ImportError shown below,
# so `docs` was never bound and the final print did not execute.
docs = loader.load()
print(type(docs))

<class 'langchain_community.document_loaders.unstructured.UnstructuredFileLoader'>


ImportError: cannot import name 'getPerspectiveTransform' from 'cv2' (unknown location)

#### On Interface Control Documents

In [44]:
# Build the InterfaceControlDocuments directory path under BASE_DATA_PATH, check
# that it exists, then gather its ".pdf" file paths and preview the first three.
print(configuration.keys())
icds_path = configuration['BASE_DATA_PATH'] / "Public" / "InterfaceControlDocuments/"
print(icds_path)
print(icds_path.exists())
icd_paths = get_filepaths(icds_path, suffix=".pdf")
print(len(icd_paths))
print(icd_paths[:3])

dict_keys(['BASE_DATA_PATH'])
/Data/Public/InterfaceControlDocuments
True
7
[PosixPath('/Data/Public/InterfaceControlDocuments/Rocket-4-Payload-Users-Guide-v1.1-November-22.pdf'), PosixPath('/Data/Public/InterfaceControlDocuments/icd.pdf'), PosixPath('/Data/Public/InterfaceControlDocuments/1030-mNms-RW-1.01-RW-0.03-ICD.pdf')]


In [45]:
from langchain_community.document_loaders import PDFPlumberLoader

In [9]:
# Load a single ICD PDF with PDFPlumberLoader and inspect what comes back.
print(icd_paths[4])
# The Path is converted to str before being handed to the loader.
example_loader = PDFPlumberLoader(str(icd_paths[4]))
print(type(example_loader))
example_data = example_loader.load()
# The recorded output below shows a list of 47 Document objects for this file.
print(type(example_data))
print(len(example_data))
print(type(example_data[0]))

/Data/Public/InterfaceControlDocuments/RL-ICD-RW4-2.0.pdf
<class 'langchain_community.document_loaders.pdf.PDFPlumberLoader'>
<class 'list'>
47
<class 'langchain_core.documents.base.Document'>


In [16]:
#print(dir(example_loader))
# help() writes the documentation itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
help(example_loader.load_and_split)

Help on method load_and_split in module langchain_community.document_loaders.base:

load_and_split(text_splitter: 'Optional[TextSplitter]' = None) -> 'List[Document]' method of langchain_community.document_loaders.pdf.PDFPlumberLoader instance
    Load Documents and split into chunks. Chunks are returned as Documents.
    
    Do not override this method. It should be considered to be deprecated!
    
    Args:
        text_splitter: TextSplitter instance to use for splitting documents.
          Defaults to RecursiveCharacterTextSplitter.
    
    Returns:
        List of Documents.

None


47
<class 'langchain_core.documents.base.Document'>


In [18]:
# List the attribute names available on the first loaded Document.
print(dir(example_data[0]))

['Config', '__abstractmethods__', '__annotations__', '__class__', '__class_vars__', '__config__', '__custom_root_type__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__exclude_fields__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_validators__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__include_fields__', '__init__', '__init_subclass__', '__iter__', '__json_encoder__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__post_root_validators__', '__pre_root_validators__', '__pretty__', '__private_attributes__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__', '__rich_repr__', '__schema_cache__', '__setattr__', '__setstate__', '__signature__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__try_update_forward_refs__', '__validators__', '_abc_impl', '_calculate_keys', '_copy_and_set_values', '_decompose_class', '_enforce_dict_if_root', '_get_value', '_init_private_attribute

In [11]:
# Same inspection as for the ICD directory, but for the "Queue" directory under
# BASE_DATA_PATH / "Public": check it exists and gather its ".pdf" file paths.
print(configuration.keys())
queue_path = configuration['BASE_DATA_PATH'] / "Public" / "Queue/"
print(queue_path)
print(queue_path.exists())
queue_paths = get_filepaths(queue_path, suffix=".pdf")
print(len(queue_paths))
print(queue_paths[:3])

dict_keys(['BASE_DATA_PATH'])
/Data/Public/Queue
True
7
[PosixPath('/Data/Public/Queue/Rocket-4-Payload-Users-Guide-v1.1-November-22.pdf'), PosixPath('/Data/Public/Queue/icd.pdf'), PosixPath('/Data/Public/Queue/1030-mNms-RW-1.01-RW-0.03-ICD.pdf')]


In [46]:
# Rebind chroma_persist_location to a directory inside the
# InterfaceControlDocuments folder, and create a fresh index creator that uses it.
chroma_persist_location = configuration['BASE_DATA_PATH'] / "Public/" / "InterfaceControlDocuments/" / "chroma_persist"
print(chroma_persist_location)
print(type(chroma_persist_location))
# The Path is converted to str explicitly before being passed as persist_directory.
vector_store_index = VectorstoreIndexCreator(vectorstore_kwargs={"persist_directory":str(chroma_persist_location)})

/Data/Public/InterfaceControlDocuments/chroma_persist
<class 'pathlib.PosixPath'>


In [13]:
%%time
# Build a vector store index from the documents of the single example ICD.
index = vector_store_index.from_documents(example_data)

VectorStoreIndexWrapper(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x70a5b9439d80>)

In [47]:
%%time
# Build one index wrapper per ICD PDF: load the file with PDFPlumberLoader and
# pass the resulting documents to from_documents.
# NOTE(review): every call uses the same persist_directory, so presumably all
# PDFs end up in one shared store (and re-running adds them again) — confirm.
icd_indices = []
for document_path in icd_paths:
    icd_indices.append(vector_store_index.from_documents(PDFPlumberLoader(str(document_path)).load()))

CPU times: user 29 s, sys: 176 ms, total: 29.2 s
Wall time: 36.9 s


#### Reobtain vector store

In [48]:
from langchain.vectorstores import Chroma

In [49]:
from langchain.embeddings import OpenAIEmbeddings

In [50]:
# Reopen the Chroma store from the persistence directory used above, with
# OpenAIEmbeddings as the embedding function.
vectorstore_from_persistence = Chroma(persist_directory=str(chroma_persist_location),
                                     embedding_function=OpenAIEmbeddings())

In [43]:
# List the attribute and method names available on the reopened Chroma store.
print(dir(vectorstore_from_persistence))

['_Chroma__query_collection', '_LANGCHAIN_DEFAULT_COLLECTION_NAME', '__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_asimilarity_search_with_relevance_scores', '_client', '_client_settings', '_collection', '_cosine_relevance_score_fn', '_embedding_function', '_euclidean_relevance_score_fn', '_get_retriever_tags', '_max_inner_product_relevance_score_fn', '_persist_directory', '_select_relevance_score_fn', '_similarity_search_with_relevance_scores', 'aadd_documents', 'aadd_texts', 'add_documents', 'add_images', 'add_texts', 'adelete', 'afrom_documents', 'afrom_texts', 'amax_marginal_relevance_search', 'amax_marginal_relevance_sea

In [51]:
# Wrap the reopened vector store in a retriever for use by the QA chain below.
retriever = vectorstore_from_persistence.as_retriever()
print(type(retriever))

<class 'langchain_core.vectorstores.VectorStoreRetriever'>


In [39]:
# print(dir(retriever))
# help() writes the documentation itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
help(retriever.add_documents)

Help on method add_documents in module langchain_core.vectorstores:

add_documents(documents: 'List[Document]', **kwargs: 'Any') -> 'List[str]' method of langchain_core.vectorstores.VectorStoreRetriever instance
    Add documents to vectorstore.

None


In [52]:
from langchain.chains import RetrievalQA

In [53]:
# Build a RetrievalQA chain (chain_type="stuff") that answers using `retriever`.
# NOTE: `llm` is created in the ChatAnthropic cell that appears further down in
# this file (In [21]); that cell must have been run before this one.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [55]:
query="Falcon countdown duration for a Falcon 9?"
# Run the chain through invoke(); calling the chain object directly (qa(...))
# goes through the deprecated Chain.__call__ entry point.
result = qa.invoke({"query": query})
print(result['result'])

 Based on the information provided, Falcon launch vehicles are designed to support a countdown duration as short as one hour. The passage states:

"Falcon launch vehicles are designed to support a countdown duration as short as one hour. Early in the countdown, the vehicle performs LOX, RP-1 and pressurant loading, and it executes a series of vehicle and range checkouts."

So a Falcon 9 countdown can be as short as one hour according to the context. It does not provide a specific or exact countdown duration for a Falcon 9, but says the countdown for Falcon launch vehicles in general can be as short as one hour.


In [56]:
query="What is the first-stage flight time duration or i.e. how long it lasts?"
# Run the chain through invoke(); calling the chain object directly (qa(...))
# goes through the deprecated Chain.__call__ entry point.
result = qa.invoke({"query": query})
print(result['result'])

 Based on the information provided, the passage states that first-stage powered flight lasts approximately three minutes:

"First-stage powered flight lasts approximately three minutes, with commanded shutdown of the nine first-stage engines based on remaining propellant levels."

So the duration of the first-stage flight is approximately three minutes. The passage does not provide an exact or more precise duration.


In [57]:
query="What is the SpaceX Falcon fairing diameters and sizes?"
# Run the chain through invoke(); calling the chain object directly (qa(...))
# goes through the deprecated Chain.__call__ entry point.
result = qa.invoke({"query": query})
print(result['result'])

 Based on the context provided, here are the key details about SpaceX Falcon fairing sizes:

- The standard fairing is 5.2 m (17.2 ft) in outer diameter and 13.2 m (43.5 ft) high overall.

- SpaceX also offers an extended fairing as a non-standard service. The extended fairing has the same 5.2 m diameter but is taller at 18.7 m (61.25 ft) overall height.

- There are two standard mechanical interfaces offered - a 1,575 mm (62.01 in) diameter bolted interface and a 2,624 mm (103.307 in) bolted interface. 

- The payload static envelope inside the fairing accounts for payload dynamic deflections and is smaller than the full inner dimensions of the fairing. Specific dimensions are shown in the figures referenced in the context.

So in summary - the key outer dimensions are 5.2 m diameter for the standard fairing, with options for extended height and different interface diameters. The payload capacity inside has smaller static envelope dimensions due to dynamics. Let me know if you need an

In [20]:
# deprecated
# from langchain.llms import Anthropic
from langchain_community.chat_models import ChatAnthropic

In [21]:
# Instantiate the Anthropic chat model, reading the API key from the
# CLAUDE_API_KEY environment variable (a KeyError is raised if it is unset).
llm = ChatAnthropic(anthropic_api_key=os.environ["CLAUDE_API_KEY"])
print(type(llm))

<class 'langchain_community.chat_models.anthropic.ChatAnthropic'>


['AI_PROMPT',
 'Config',
 'HUMAN_PROMPT',
 'InputType',
 'OutputType',
 '__abstractmethods__',
 '__annotations__',
 '__call__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__config__',
 '__custom_root_type__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__exclude_fields__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_validators__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__include_fields__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__json_encoder__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__orig_bases__',
 '__parameters__',
 '__post_root_validators__',
 '__pre_root_validators__',
 '__pretty__',
 '__private_attributes__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_args__',
 '__repr_name__',
 '__repr_str__',
 '__rich_repr__',
 '__ror__',
 '__schema_cache__',
 '__setattr__',
 '__setstate__',
 '__signature__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '_

##### User State
Consider User State version

In [58]:
import json
from typing import List
from pydantic import BaseModel, Field

In [59]:
class QuestionAndAnswer(BaseModel):
    """A question together with the answer that was accepted for it.

    Used as the schema passed to the memory functions created later in this
    notebook.
    """
    # `description` (singular) is the keyword Field uses for a field's schema
    # description; the previous `descriptions=` spelling was treated as an
    # arbitrary extra keyword, so the text never became the field description.
    question: str = Field(default=None, description="Question that was asked")
    answer: str = Field(default=None, description="Answer that was accepted")

In [60]:
# Evaluated before langmem_client had been assigned in this session, hence the
# NameError recorded below.
langmem_client

NameError: name 'langmem_client' is not defined

In [61]:
from langmem import AsyncClient, Client

In [62]:
# Create the asynchronous LangMem client, constructed with no arguments.
langmem_client = AsyncClient()

In [63]:
# Create two memory functions from the QuestionAndAnswer schema, one per target
# type ("user_state" and "user_append_state"). The top-level `await` relies on
# the notebook running these cells inside an event loop.
qanda_as_user_state = await langmem_client.create_memory_function(QuestionAndAnswer, target_type="user_state")
qanda_as_user_append_state = await langmem_client.create_memory_function(QuestionAndAnswer, target_type="user_append_state")
