In [23]:
import os
import getpass

# Optionally pull settings from a local .env file. python-dotenv is treated as
# an optional dependency, so only a missing package is tolerated here.
try:
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    pass

# Prompt for the API key only when the environment does not already define it.
if "LANGSMITH_API_KEY" not in os.environ:
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass(
        prompt="Enter your LangSmith API key (optional): "
    )

# The prompt advertises the key as optional, so tracing is switched on only
# when a non-empty key is actually available; otherwise it is explicitly
# disabled instead of being enabled without credentials.
os.environ["LANGSMITH_TRACING"] = (
    "true" if os.environ["LANGSMITH_API_KEY"].strip() else "false"
)

# Prompt for the project name when it is not set; an empty answer falls back
# to the literal project name "default".
if "LANGSMITH_PROJECT" not in os.environ:
    os.environ["LANGSMITH_PROJECT"] = (
        getpass.getpass(
            prompt='Enter your LangSmith Project Name (default = "default"): '
        )
        or "default"
    )

## Concepts

This guide focuses on retrieval of text data. We will cover the following concepts:

- Documents and document loaders;
- Text splitters;
- Embeddings;
- Vector stores and retrievers.


In [1]:
from langchain_core.documents import Document

# Two small in-memory sample documents; each gets its own metadata dict with
# the same "source" tag.
documents = [
    Document(page_content=text, metadata={"source": "mammal-pets-doc"})
    for text in (
        "Dogs are great companions, known for their loyalty and friendliness.",
        "Cats are independent pets that often enjoy their own space.",
    )
]

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Path is relative to the notebook's working directory.
file_path = '../nur_amin_sifat_python_ai.pdf'
loader = PyPDFLoader(file_path)

# Load the PDF into a list of Document objects.
docs = loader.load()

# Quick sanity check: number of loaded documents, then a 200-character preview
# and the metadata of the first one.
first_doc = docs[0]
print(len(docs))
print(first_doc.page_content[:200], end="\n\n")
print(first_doc.metadata)

2
Md Nur Amin Sifat
/envel⌢pemdnuraminsifat380@gmail.com|♂phone+8801758809212|/mediumnastech.medium.com|/linkedinlinkedin.com/in/nur-amin-sifat
/githubgithub.com/GenesisBlock3301|/codeleetcode.com/Genes

{'source': './nur_amin_sifat_python_ai.pdf', 'page': 0, 'page_label': ''}


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings: chunk_size=1000, chunk_overlap=200, and add_start_index
# so every chunk's metadata carries a `start_index` entry.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

all_splits = text_splitter.split_documents(docs)

# Print every chunk with its position, followed by a blank line.
for idx, split in enumerate(all_splits):
    print(f"{idx}:-> {split}\n")

0:-> page_content='Md Nur Amin Sifat
/envel⌢pemdnuraminsifat380@gmail.com|♂phone+8801758809212|/mediumnastech.medium.com|/linkedinlinkedin.com/in/nur-amin-sifat
/githubgithub.com/GenesisBlock3301|/codeleetcode.com/GenesisBlock3301|/kagglekaggle.com/genesisblock3301
♂¶ap-¶arker-altDhaka, Bangladesh
Professional Summary
Software Engineer with 4 years of experience in Backend, specializing in Python, GoLang, and JavaScript.
Strong expertise in microservices architecture, database optimization, and scalable application development.
Passionate about artificial intelligence. Hands-on experience with machine learning, deep learning, and
real-world model deployment. Enthusiastic about large language models (LLMs) and their applications in NLP,
automation, and generative AI. Proven ability to deliver high-quality solutions within tight deadlines.
Research Interests
Machine Learning · Deep Learning · NLP · Computer Vision · Drone · Robotics & Perception · LLMs · RAG · Generative
AI
Education' me

In [4]:
# Import from langchain_community (already a dependency of this notebook)
# rather than through the deprecated `langchain.embeddings` re-export.
# NOTE(review): the class itself is also reported as deprecated in favour of
# the separate `langchain_huggingface` package - consider migrating once that
# package is an accepted dependency.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-embedding model used for every vector in this notebook.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed the first two chunks to confirm the model returns vectors of a
# consistent length.
vector_1 = embedding.embed_query(all_splits[0].page_content)
vector_2 = embedding.embed_query(all_splits[1].page_content)

assert len(vector_1) == len(vector_2), "embedding vectors differ in length"

# Show only the first 20 components to keep the output short.
print(vector_1[:20])

  embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


[-0.04704954847693443, -0.07763168215751648, 0.04998462647199631, 0.0042217145673930645, -0.024292338639497757, -0.032140664756298065, -0.0243839044123888, 0.019588157534599304, -0.06551358103752136, 0.010316074825823307, -0.04696815833449364, -0.07748743891716003, 0.0355636365711689, 0.026479754596948624, 0.03369459882378578, 0.08663653582334518, 0.042259469628334045, -0.02866727113723755, -0.07380245625972748, -0.09452342987060547]


In [5]:
from langchain_core.vectorstores import InMemoryVectorStore

# Build an in-memory store on top of the embedding model and index all chunks;
# `ids` holds whatever add_documents returns for the inserted chunks.
vector_store = InMemoryVectorStore(embedding)
ids = vector_store.add_documents(all_splits)

# Retrieve the 3 closest chunks for the query and show the best match.
result = vector_store.similarity_search("ielts", k=3)
print(result[0])

page_content='• Intermediate Machine Learning — Kaggle — Model validation, Feature engineering, Random Forest, XGBoost,
Data leakage handling
• Intro to Machine Learning — Kaggle — Supervised learning, Data Preprocessing, Decision trees, Overfitting,
Model selection
• Cyber Security and Physical Protection at Workplace (Malware) — Brain Station 23 — Malware analysis,
Cyber hygiene, Office system protection
• A Blockchain-Enabled Distributed Advanced Metering Infrastructure Secure Communication (BC-AMI) —
MDPI — Blockchain, IoT security, Secure communication protocols
• Python (Intermediate) — HackerRank — Data structures, Algorithms, Python functions, OOP, File I/O
Languages
• English — Proficient (IELTS Band 6)
• Bangla — Native Speaker' metadata={'source': './nur_amin_sifat_python_ai.pdf', 'page': 1, 'page_label': '', 'start_index': 1515}


In [46]:
# Same query as before, but each hit is returned as a (document, score) pair.
result = vector_store.similarity_search_with_score("ielts", k=3)
doc, score = result[0]

# Print the score and the document, each followed by a blank line.
print("score:", score, end="\n\n")
print(doc, end="\n\n")

score: 0.2895670086786331

page_content='• Intermediate Machine Learning — Kaggle — Model validation, Feature engineering, Random Forest, XGBoost,
Data leakage handling
• Intro to Machine Learning — Kaggle — Supervised learning, Data Preprocessing, Decision trees, Overfitting,
Model selection
• Cyber Security and Physical Protection at Workplace (Malware) — Brain Station 23 — Malware analysis,
Cyber hygiene, Office system protection
• A Blockchain-Enabled Distributed Advanced Metering Infrastructure Secure Communication (BC-AMI) —
MDPI — Blockchain, IoT security, Secure communication protocols
• Python (Intermediate) — HackerRank — Data structures, Algorithms, Python functions, OOP, File I/O
Languages
• English — Proficient (IELTS Band 6)
• Bangla — Native Speaker' metadata={'source': './nur_amin_sifat_python_ai.pdf', 'page': 1, 'page_label': '', 'start_index': 1515}



In [47]:
# Embed the query into its own variable. Rebinding `embedding` to the returned
# vector would replace the model object, so re-running this cell (or any cell
# that calls `embedding.embed_query`) would fail.
# NOTE(review): "ielst" differs from the "ielts" used in the earlier queries -
# confirm whether the misspelling is intentional.
query_vector = embedding.embed_query("ielst")

# Search the store directly with the precomputed vector and show the top hit.
result = vector_store.similarity_search_by_vector(query_vector)
print(result[0])

page_content='• Intermediate Machine Learning — Kaggle — Model validation, Feature engineering, Random Forest, XGBoost,
Data leakage handling
• Intro to Machine Learning — Kaggle — Supervised learning, Data Preprocessing, Decision trees, Overfitting,
Model selection
• Cyber Security and Physical Protection at Workplace (Malware) — Brain Station 23 — Malware analysis,
Cyber hygiene, Office system protection
• A Blockchain-Enabled Distributed Advanced Metering Infrastructure Secure Communication (BC-AMI) —
MDPI — Blockchain, IoT security, Secure communication protocols
• Python (Intermediate) — HackerRank — Data structures, Algorithms, Python functions, OOP, File I/O
Languages
• English — Proficient (IELTS Band 6)
• Bangla — Native Speaker' metadata={'source': './nur_amin_sifat_python_ai.pdf', 'page': 1, 'page_label': '', 'start_index': 1515}
