In [13]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader

In [14]:
# Relative directory handed to the PDF directory loader.
DATA_PATH = "Data"

In [15]:
# Chunking policy: chunk_size=1000 and chunk_overlap=100, both measured with
# len (i.e. in characters); separators are treated as literal strings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100, length_function=len, is_separator_regex=False
)

# Load the PDFs found under DATA_PATH into LangChain Document objects.
loader = PyPDFDirectoryLoader(DATA_PATH)
raw_documents = loader.load()

In [16]:
# Preview the first three chunks; the split result itself is not stored.
text_splitter.split_documents(raw_documents)[:3]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-03-06T02:04:27+00:00', 'author': 'Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas Müller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, Dustin Podell, Tim Dockhorn, Zion English, Kyle Lacey, Alex Goodwin, Yannik Marek, Robin Rombach', 'keywords': 'Machine Learning, ICML', 'moddate': '2024-03-06T02:04:27+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': 'Proceedings of the International Conference on Machine Learning 2024', 'title': 'Scaling Rectified Flow Transformers for High-Resolution Image Synthesis', 'trapped': '/False', 'source': 'Data\\2403.03206v1 (2).pdf', 'total_pages': 28, 'page': 0, 'page_label': '1'}, page_content='Scaling Rectified Flow Transformers for High-Resolution Image Synthesis\nPatrick Esser * Sumith Kulal Andreas Blattmann Rahim Entezari J

In [7]:
from dotenv import load_dotenv

# Directory of the on-disk FAISS index that a later cell loads.
FAISS_PATH = r"faiss_index"

# Copy variables from a local .env file (if present) into the environment.
load_dotenv()

api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    # Fail fast: without this check a missing key is passed on as None and
    # only surfaces later as a hard-to-read authentication error.
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it or add it to a .env file "
        "before creating the Gemini clients."
    )

# Embedding client; the same model must be used to build and to query the index.
embeddings_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=api_key,
)

# Chat model client used for generation.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.5,
    google_api_key=api_key,
)

In [8]:
# Example query to test the LLM; only the reply text (.content) is displayed.
llm.invoke("What is the capital of France?").content

'The capital of France is **Paris**.'

In [9]:
# Example query to test the embeddings; displays the returned list of floats.
embeddings_model.embed_query("What is the capital of France?")

[0.03737647458910942,
 -0.08005280047655106,
 0.0094971414655447,
 -0.054259900003671646,
 0.0401969812810421,
 -0.014677672646939754,
 -0.003037121379747987,
 -0.027584392577409744,
 0.01653120666742325,
 0.05325475335121155,
 0.02014581300318241,
 0.0065550850704312325,
 0.007738979533314705,
 0.01040461752563715,
 0.04651215299963951,
 -0.042970314621925354,
 -0.0028412053361535072,
 0.010426606982946396,
 0.016972925513982773,
 -0.027879023924469948,
 -0.0009504062472842634,
 0.0009890897199511528,
 -0.05100862681865692,
 0.012239427305758,
 0.005591430701315403,
 -0.06119317188858986,
 0.04015737771987915,
 -0.061326831579208374,
 -0.015330948866903782,
 0.035843804478645325,
 -0.08919006586074829,
 0.021800706163048744,
 -0.014829910360276699,
 -0.029963767156004906,
 0.025640642270445824,
 -0.04249396175146103,
 0.03838176280260086,
 0.023290880024433136,
 0.019195951521396637,
 0.046537891030311584,
 -0.028027748689055443,
 0.0122469961643219,
 -0.031047120690345764,
 -0.036568

In [10]:
# SECURITY: allow_dangerous_deserialization=True lets load_local unpickle the
# stored index metadata, and unpickling can execute arbitrary code. Only point
# FAISS_PATH at an index you built yourself or otherwise trust.
loaded_db = FAISS.load_local(FAISS_PATH, embeddings_model, allow_dangerous_deserialization=True)

query = "What is MMDit?"
docs = loaded_db.similarity_search(query)

# Show the top hit; guard the empty case so this cell does not die with an
# IndexError when the search returns nothing.
if docs:
    print(docs[0].page_content)
else:
    print(f"No documents matched the query: {query!r}")

MM-DiT on videos. To this end we start from the pretrained
image weights and additionally use a 2x temporal patching.
We follow Blattmann et al. (2023b) and feed data to the
pretrained model by collapsing the temporal into the batch
axis. In each attention layer we rearrange the representation
in the visual stream and add a full attention over all spatio-
temporal tokens after the spatial attention operation before
the final feedforward layer. Our video models are trained for
140k steps with a batch size of 512 on videos comprising
16 frames with 2562 pixels. We report validation losses on
the Kinetics dataset (Carreira & Zisserman, 2018) every 5k
steps. Note that our reported FLOPs for video training in
Figure 8 are only FLOPs from video training and do not
include the FLOPs from image pretraining.
For both the image and video domains, we observe a smooth
decrease in the validation loss when increasing model size
and training steps. We find the validation loss to be highly


In [11]:
# Number of documents returned by the similarity search above.
len(docs)

4

In [1]:
from langchain_core.prompts import PromptTemplate


In [2]:
# Single-variable prompt built from a format string; {topic} is filled later.
prompt_template = PromptTemplate.from_template("Tell me a joke about {topic}")

# invoke() takes the variables as a mapping and returns a prompt value object.
prompt_template.invoke(dict(topic="cats"))

StringPromptValue(text='Tell me a joke about cats')

In [3]:
# format() takes the variables as keyword arguments and returns the rendered string.
prompt_template.format(topic="dogs")  

'Tell me a joke about dogs'

In [4]:
from langchain_core.prompts import ChatPromptTemplate

In [10]:
# Prior turns as (role, text) pairs; they fill the template's placeholder slot.
conversation = [
    ("human", "Hi!"),
    ("ai", "How can I assist you today?"),
    ("human", "Can you make me an ice cream sundae?"),
    ("ai", "No."),
]

# A fixed system message followed by a slot for the "conversation" variable.
template = ChatPromptTemplate(
    [
        ("system", "You are a helpful AI bot."),
        ("placeholder", "{conversation}"),
    ]
)

# Render the template with the current history.
prompt_value = template.invoke({"conversation": conversation})


In [11]:
# Flatten the rendered messages into one "Role: text" string for inspection.
prompt_value.to_string()

'System: You are a helpful AI bot.\nHuman: Hi!\nAI: How can I assist you today?\nHuman: Can you make me an ice cream sundae?\nAI: No.'

In [12]:
# Add the follow-up turn only if it is not already the latest one, so that
# re-running this cell does not keep appending duplicates to `conversation`.
follow_up = ("human", "Can you make me a pizza?")
if not conversation or conversation[-1] != follow_up:
    conversation.append(follow_up)

# Re-render the template with the extended history.
prompt_value = template.invoke({"conversation": conversation})

In [13]:
# Flatten the re-rendered messages into one string; it now ends with the new human turn.
prompt_value.to_string()

'System: You are a helpful AI bot.\nHuman: Hi!\nAI: How can I assist you today?\nHuman: Can you make me an ice cream sundae?\nAI: No.\nHuman: Can you make me a pizza?'