In [None]:
%pip install -U langchain-community langchain_chroma pypdf gigachat -q

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains import ConversationChain
from langchain.chat_models.gigachat import GigaChat
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [None]:
import os

# Pull the GigaChat credential from the environment so it never appears in the
# notebook itself. The lookup yields None when the variable is not set.
giga_key = os.environ.get("GIGACHAT_API_KEY")

In [None]:
# Open the PDF and let the loader return it as a list of pre-split documents.
loader = PyPDFLoader("Machine_Learning_System_Design.pdf")
pages = loader.load_and_split()

# Preview only the first document rather than dumping the whole list.
pages[0:1]

[Document(metadata={'source': 'Machine_Learning_System_Design.pdf', 'page': 1}, page_content='MEAP Edition\nManning Early Access Program\nMachine Learning System Design\nWith end-to-end examples\nVersion 12\nCopyright 2024 Manning Publications\nFor more information on this and other Manning titles go to manning.com.\n© Manning Publications Co. To comment go to liveBook\nLicensed to Valerii Babushkin <venheads@yandex.ru>')]

In [None]:
# First-pass chunking of the loaded pages (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
documents = text_splitter.split_documents(pages)

In [None]:
# Chat model client; it is the generation step of the RAG chain built later.
# NOTE(review): verify_ssl_certs=False is passed below — confirm that skipping
# certificate verification is intentional for this environment.
giga = GigaChat(
    credentials=giga_key,
    model="GigaChat-Pro",
    timeout=30,
    verify_ssl_certs=False,
)

  giga = GigaChat(credentials=giga_key,


In [None]:
# Tunables for the embedding / chunking stage.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # model id given to HuggingFaceEmbeddings
CHUNK_SIZE = 1_000  # forwarded as chunk_size to the splitter
CHUNK_OVERLAP_SCALE = 0.1  # overlap expressed as a fraction of CHUNK_SIZE

In [None]:
# Splitter driven by the CHUNK_SIZE / CHUNK_OVERLAP_SCALE constants.
# chunk_overlap is an integer count, so the scaled value is cast explicitly
# instead of handing over the float that CHUNK_SIZE * CHUNK_OVERLAP_SCALE yields.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],  # paragraph breaks first, then line breaks, then spaces
    chunk_size=CHUNK_SIZE,
    chunk_overlap=int(CHUNK_SIZE * CHUNK_OVERLAP_SCALE),
)

In [None]:
# Chunk the loader output with the configured splitter. Feeding it `documents`
# (already cut to chunk_size=1000 by the earlier text_splitter) leaves this
# splitter nothing to do, so CHUNK_SIZE / CHUNK_OVERLAP_SCALE would never apply.
splits = splitter.split_documents(pages)

In [None]:
# Embedding model used to vectorize the chunks; normalize_embeddings=True is
# forwarded through encode_kwargs.
embedding_model = HuggingFaceEmbeddings(
    encode_kwargs={"normalize_embeddings": True},
    multi_process=True,
    model_name=EMBEDDING_MODEL_NAME,
)

  embedding_model = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Embed the chunks with embedding_model and index them in a Chroma store.
vectorstore = Chroma.from_documents(
    embedding=embedding_model,
    documents=splits,
)

In [None]:
# Wrap the vector store as a retriever; no arguments are passed, so the
# store's default retrieval settings apply.
retriever = vectorstore.as_retriever()

In [None]:
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line
        (two newline characters). An empty iterable yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

In [None]:
# Fetch the prompt published under "rlm/rag-prompt" through the hub module.
# NOTE(review): presumably this needs network access at run time — confirm.
prompt = hub.pull("rlm/rag-prompt")



In [None]:
# RAG pipeline: the incoming question is routed both to the retriever (whose
# hits are flattened by format_docs into "context") and, unchanged, to
# "question"; the filled prompt goes to the chat model and the reply is
# parsed by StrOutputParser.
rag_chain = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
} | prompt | giga | StrOutputParser()

In [None]:
# Run the chain on a sample request and show the generated text.
question = "Write design document for a sales forecasting ML system"
answer = rag_chain.invoke(question)
print(answer)

A design document for a sales forecasting ML system should outline the steps involved in preparing the data for modeling, including data cleaning, feature engineering, and data normalization. Additionally, it should specify the types of models that will be used, such as baseline methods (moving average, exponential smoothing) and machine learning algorithms (decision trees, random forests, etc.). Finally, the document should address potential risks and considerations, such as the impact of different forecast accuracy levels and the need for scalability and robustness against unexpected events.


# Save vector store

In [4]:
%pip install -U langchain-community pypdf faiss-gpu -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

class DocumentEmbedder:
    """Chunk a PDF and index the chunks in a FAISS vector store.

    The instance holds the PDF location, the chunking parameters and a
    HuggingFaceEmbeddings model; the methods load/split the document, build
    the FAISS index, and persist it.
    """

    def __init__(self, file_path, chunk_size=1000, chunk_overlap=200, embedding_model_name="sentence-transformers/all-mpnet-base-v2"):
        """Store the configuration and construct the embedding model.

        Args:
            file_path: Location of the PDF handed to PyPDFLoader.
            chunk_size: ``chunk_size`` forwarded to the text splitter.
            chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.
            embedding_model_name: Model name given to HuggingFaceEmbeddings.

        Raises:
            ValueError: If ``chunk_overlap`` is larger than ``chunk_size``.
        """
        # Fail fast: the text splitter rejects this combination anyway, but only
        # after the embedding model below has already been constructed.
        if chunk_overlap > chunk_size:
            raise ValueError(
                f"chunk_overlap ({chunk_overlap}) must not exceed chunk_size ({chunk_size})"
            )

        self.file_path = file_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            multi_process=True,
            encode_kwargs={"normalize_embeddings": True}
        )

    def load_and_split(self):
        """Load the PDF and split it with the configured chunk parameters.

        Returns:
            The documents produced by ``RecursiveCharacterTextSplitter.split_documents``.
        """
        loader = PyPDFLoader(self.file_path)
        # Use load() rather than the loader's own load_and_split(): the latter
        # chunks with a default splitter first, so the text would be chunked
        # twice and the configured chunk_size / chunk_overlap would only act
        # on pre-cut pieces.
        pages = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap
        )
        return text_splitter.split_documents(pages)

    def create_embeddings(self):
        """Build a FAISS vector store from the split documents.

        Returns:
            The store returned by ``FAISS.from_documents`` for the chunks from
            ``load_and_split`` and this instance's embedding model.
        """
        documents = self.load_and_split()
        return FAISS.from_documents(documents, self.embedding_model)

    def save_embeddings(self, vectorstore, path):
        """Persist ``vectorstore`` by calling its ``save_local(path)``.

        Args:
            vectorstore: Object exposing ``save_local`` (e.g. the store from
                ``create_embeddings``).
            path: Destination passed through to ``save_local``.
        """
        vectorstore.save_local(path)

def main(pdf_path="/content/Machine_Learning_System_Design.pdf", vectorstore_path="vectorstore"):
    """Embed a PDF with DocumentEmbedder and save the resulting vector store.

    Both locations were previously hard-coded; they are now parameters whose
    defaults reproduce the old behaviour, so ``main()`` still works unchanged.

    Args:
        pdf_path: PDF to embed.
        vectorstore_path: Destination passed to ``save_embeddings``.
    """
    embedder = DocumentEmbedder(pdf_path)

    print("Creating embeddings...")
    vectorstore = embedder.create_embeddings()

    print("Saving embeddings...")
    embedder.save_embeddings(vectorstore, vectorstore_path)

    print(f"Embeddings saved to {vectorstore_path}")


# Execute the embed-and-save pipeline when this cell runs.
main()

Creating embeddings...
Saving embeddings...
Embeddings saved to vectorstore
