In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [None]:
# Folder holding the source documents to index. The path is relative, so it
# resolves against the current working directory of the kernel.
data_dir = "./Big Star Collectibles"

In [None]:
# Read every file in `data_dir`, split it into overlapping chunks, and collect
# the chunks as LangChain `Document`s tagged with their source title and index.

# The splitter's settings do not depend on the file, so build it once instead
# of once per loop iteration.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=128, chunk_overlap=32,  # this is the critical line
)

# os.listdir() returns entries in arbitrary order; sorting keeps the chunk
# order (and therefore the vector index built from it) reproducible.
files = sorted(os.listdir(data_dir))
file_texts = []
for file in files:
    file_path = os.path.join(data_dir, file)
    if not os.path.isfile(file_path):
        # Sub-directories cannot be opened as text; skip them instead of crashing.
        continue
    # Explicit encoding so decoding does not depend on the platform default.
    with open(file_path, encoding="utf-8") as f:
        file_text = f.read()
    # splitext() strips only the final extension, so "about.us.txt" keeps the
    # title "about.us" rather than being truncated to "about".
    doc_title = os.path.splitext(file)[0]
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(
            Document(
                page_content=chunked_text,
                # Metadata lets each chunk be traced back to its source document.
                metadata={"doc_title": doc_title, "chunk_num": i},
            )
        )

In [None]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
# Construct the embedding model that will turn text chunks into vectors.
# No model_name is passed, so the library's default model is used.
embeddings = HuggingFaceEmbeddings()

In [None]:
# Embed every chunk in `file_texts` and index the vectors in a FAISS store.
vector_store = FAISS.from_documents(documents=file_texts, embedding=embeddings)

In [None]:
# Expose the vector store through the retriever interface so it can be used
# as a step in a chain. No search options are passed, so defaults apply.
retriever = vector_store.as_retriever()

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

In [None]:
from langchain_openai import OpenAI
# OpenAI LLM wrapper built with the library's defaults (no model or sampling
# options are passed here).
llm = OpenAI()

In [None]:
from langchain.prompts import ChatPromptTemplate
# Prompt for the RAG chain. `{question}` receives the user's query and
# `{context}` receives the retrieved material. The text is written as adjacent
# string literals so each line of the prompt (including its trailing space)
# is visible; the resulting string is unchanged.
template = (
    "You are a helpful assistant. Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:"
)
prompt = ChatPromptTemplate.from_template(template=template)

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
def format_docs(docs):
    """Render retrieved chunks as plain text for the prompt's ``{context}`` slot.

    Without this step the retriever's list of ``Document`` objects is placed
    into the prompt as its Python repr. Each chunk is instead emitted as a
    short header built from its ``doc_title`` / ``chunk_num`` metadata,
    followed by the chunk text; chunks are separated by a blank line.

    Args:
        docs: Iterable of retrieved documents exposing ``page_content`` and
            ``metadata``.

    Returns:
        A single string containing all chunks, or ``""`` when nothing was retrieved.
    """
    return "\n\n".join(
        f"[{doc.metadata.get('doc_title', 'unknown')} - chunk {doc.metadata.get('chunk_num', '?')}]\n"
        f"{doc.page_content}"
        for doc in docs
    )


# RAG pipeline: the input question is sent both to the retriever (whose hits
# are formatted into text) and, unchanged, to the prompt's `question` slot;
# the filled prompt goes to the LLM and the reply is parsed to a plain string.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the whole chain on a sample question; the string is the chain's only input.
response = chain.invoke("When did Big Star Collectibles Launch?")

In [None]:
# Bare last expression so the notebook displays the generated answer.
response