In [None]:
import os

# Credentials and LangSmith tracing configuration for this notebook.
# Fill in the two key placeholders locally; never commit real values.
os.environ["OPENAI_API_KEY"] = ""
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# The LangSmith URL belongs in LANGCHAIN_ENDPOINT; LANGCHAIN_API_KEY holds
# only the key itself.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = ""


## Data Loaders and Splitters

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain.text_splitter import CharacterTextSplitter

"""
pip install unstructured
pip install "unstructured[pdf]"
pip install "unstructured[docx]"
"""
# Chat model configured for streaming, with a stdout callback handler attached.
# The tokenizer name is given explicitly via tiktoken_model_name.
chat = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0.1,
    tiktoken_model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Newline-separated splitting with chunk_size 600 and chunk_overlap 100.
splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=100)

# Source document, read relative to the notebook's working directory.
loader = UnstructuredFileLoader("./study_file.docx")

# Last expression of the cell: the split documents are shown as the cell output.
loader.load_and_split(text_splitter=splitter)


## Vector Store

In [None]:
from langchain.embeddings import OpenAIEmbeddings

embedder = OpenAIEmbeddings()

# Embed four short strings in one call; `vector` holds one embedding per string.
vector = embedder.embed_documents(["hi", "how", "are", "you? my name is Jay!"])

# Length of the third embedding, shown as the cell output.
len(vector[2])

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
"""
pip install chromadb
pip install chromadb --upgrade
"""

# Local file store under ./cache/ that backs the embedding cache.
cache_dir = LocalFileStore("./cache/")

chat = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0.1,
    tiktoken_model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Splitter built via the tiktoken-encoder constructor.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)

loader = UnstructuredFileLoader("./study_file.docx")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder with the cache store created above.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Index the chunks in Chroma, then run one similarity query against the index.
vectorstore = Chroma.from_documents(docs, cached_embeddings)
result = vectorstore.similarity_search("where does winston live")

# Last expression of the cell: display the matching documents.
result

## RetrievalQA

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

"""

"""

# Local file store under ./cache/ that backs the embedding cache.
cache_dir = LocalFileStore("./cache/")

# No `model` argument is passed in this cell (the "gpt-4.1-nano" override is
# left disabled), so ChatOpenAI uses its own default model.
chat = ChatOpenAI(
    temperature=0.1,
    tiktoken_model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Splitter built via the tiktoken-encoder constructor.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)

loader = UnstructuredFileLoader("./study_file.docx")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder with the cache store created above.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# This cell indexes the chunks in FAISS.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Off-the-shelf retrieval QA chain using the "stuff" chain type.
chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="stuff",  # alternatives: refine, map_reduce, map_rerank
    retriever=vectorstore.as_retriever(),
)

# Alternative query: chain.run("where does winston live")
chain.run("Describe Victory Mansions")

## Stuff LCEL Chain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

"""

"""

# Local file store under ./cache/ that backs the embedding cache.
cache_dir = LocalFileStore("./cache/")

# No `model` argument is passed in this cell (the "gpt-4.1-nano" override is
# left disabled), so ChatOpenAI uses its own default model.
chat = ChatOpenAI(
    temperature=0.1,
    tiktoken_model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("./study_file.docx")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}",
        ),
        ("human", "{question}"),
    ]
)


def format_docs(documents):
    """Join the text of the retrieved documents into a single context string.

    Without this step the retriever's list of Document objects would be
    interpolated into the prompt as its Python repr (metadata and escaped
    newlines included) instead of as plain text.
    """
    return "\n\n".join(document.page_content for document in documents)


# The input question is sent both to the retriever (whose documents are then
# flattened to text for {context}) and, unchanged, to {question}.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | chat
)

chain.invoke("Describe Victory Mansions")

## Map Reduce LCEL Chain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

"""

"""

# Local file store under ./cache/ that backs the embedding cache.
cache_dir = LocalFileStore("./cache/")

# No `model` argument is passed in this cell (the "gpt-4.1-nano" override is
# left disabled), so ChatOpenAI uses its own default model.
chat = ChatOpenAI(
    temperature=0.1,
    tiktoken_model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Splitter built via the tiktoken-encoder constructor.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)

loader = UnstructuredFileLoader("./study_file.docx")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder with the cache store created above.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

vectorstore = FAISS.from_documents(docs, cached_embeddings)
retriever = vectorstore.as_retriever()

# "Map" step prompt: asks the model to quote the relevant text from a single
# document portion, or return an empty string when nothing is relevant.
map_doc_prompt = ChatPromptTemplate.from_messages([
    ("system", """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """),
    ("human", "{question}"),
])

# Prompt piped into the chat model; invoked once per document by map_docs.
map_doc_chain = map_doc_prompt | chat

def map_docs(inputs):
    """Run the per-document chain over each document and merge the answers.

    Args:
        inputs: Mapping with a "documents" entry (an iterable of objects
            exposing ``page_content``) and a "question" entry.

    Returns:
        The ``content`` of every ``map_doc_chain`` response, in document
        order, joined by blank lines. An empty iterable yields "".
    """
    documents = inputs["documents"]
    question = inputs["question"]
    extracts = [
        map_doc_chain.invoke(
            {"context": document.page_content, "question": question}
        ).content
        for document in documents
    ]
    return "\n\n".join(extracts)

# Map stage: retrieve documents for the question, then hand both to map_docs.
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# Reduce stage prompt: answer from the text produced by the map stage.
final_prompt = ChatPromptTemplate.from_messages([
    ("system", """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """),
    ("human", "{question}"),
])

# Full pipeline: the map stage fills {context}; the raw question fills {question}.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | chat
)

chain.invoke("where does Winston go to work?")