In [None]:
import getpass
import os


def _ensure_secret(name: str) -> None:
    """Make sure the environment variable ``name`` holds a non-empty value.

    If the variable is missing or empty, the value is requested interactively
    via ``getpass`` (input is not echoed) and stored in ``os.environ``.
    An already-set value is left untouched, so keys exported in the shell
    or loaded earlier are reused without prompting.
    """
    if not os.environ.get(name):
        os.environ[name] = getpass.getpass(f"{name}: ")


# Secrets are never written into the notebook; they come from the
# environment or from an interactive prompt.
_ensure_secret("OPENAI_API_KEY")

# LangSmith tracing: the service URL belongs in LANGCHAIN_ENDPOINT.
# (It was previously written into LANGCHAIN_API_KEY by mistake.)
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
_ensure_secret("LANGCHAIN_API_KEY")


## Data Loaders and Splitters

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain.text_splitter import CharacterTextSplitter

# Install notes, kept as an inert string literal (it is never executed).
"""
pip install unstructured
pip install "unstructured[pdf]"
pip install "unstructured[docx]"
"""

# Chat model that streams its output to stdout through the callback handler.
chat = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0.1,
    tiktoken_model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Splitter that breaks text on newlines, with chunk size 600 and overlap 100.
splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=100)

# Source document, located relative to the notebook's working directory.
loader = UnstructuredFileLoader("./study_file.docx")

# Bare last expression: the notebook displays the loaded-and-split result.
loader.load_and_split(text_splitter=splitter)


## Vector Store

In [None]:
from langchain.embeddings import OpenAIEmbeddings

embedder = OpenAIEmbeddings()

# Embed four short sample strings in a single call.
vector = embedder.embed_documents(["hi", "how", "are", "you? my name is Jay!"])

# Length of the third returned item (index 2), shown as the cell output.
len(vector[2])

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
# Install notes, kept as an inert string literal (it is never executed).
"""
pip install chromadb
pip install chroma --upgrade
"""

# On-disk byte store used as the backing cache for embeddings.
cache_dir = LocalFileStore("./cache/")

# Chat model that streams its output to stdout through the callback handler.
chat = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0.1,
    tiktoken_model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Splitter built via the tiktoken-encoder constructor; splits on newlines
# with chunk size 600 and overlap 100.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)

# Load the source document and split it into chunks.
loader = UnstructuredFileLoader("./study_file.docx")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embeddings with the local file store defined above.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Build a Chroma vector store from the chunks and run one similarity query.
vectorstore = Chroma.from_documents(docs, cached_embeddings)
result = vectorstore.similarity_search("where does winston live")

# Bare last expression: the notebook displays the search result.
result

## RetrievalQA

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

"""

"""

# On-disk byte store used as the backing cache for embeddings.
cache_dir = LocalFileStore("./cache/")

# Chat model that streams its output to stdout through the callback handler.
chat = ChatOpenAI(
    # model="gpt-4.1-nano",  # disabled: no explicit model is passed in this cell
    temperature=0.1,
    tiktoken_model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Splitter built via the tiktoken-encoder constructor; splits on newlines
# with chunk size 600 and overlap 100.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)

# Load the source document and split it into chunks.
loader = UnstructuredFileLoader("./study_file.docx")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embeddings with the local file store defined above.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# This cell indexes the chunks with FAISS.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Question-answering chain over the vector store's retriever.
# Other chain_type values noted by the author: refine, map_reduce, map_rerank.
chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

# Alternative query kept for reference: chain.run("where does winston live")
chain.run("Describe Victory Mansions")