In [1]:
# Dependencies (uncomment on a fresh environment):
# !pip install langchain pypdf

from langchain_community.document_loaders import PyPDFLoader

# Location of the source PDF, relative to the notebook's working directory.
file_path = "data/365_Data_Science_Courses.pdf"

# Build the loader, then read the whole file into a list of documents.
loader = PyPDFLoader(file_path=file_path)
pages_pdf = loader.load()

# Sanity check: report how many documents the loader produced.
print("Number of documents (pages) in pages_pdf:", len(pages_pdf))


Number of documents (pages) in pages_pdf: 3


In [2]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Flatten the loaded pages into one newline-terminated string so the
# splitter sees the document as a single piece of text.
pages_string = "".join(page.page_content + "\n" for page in pages_pdf)

# Markdown header markers to split on, paired with the metadata key
# under which each matched header is recorded.
headers = [
    ("#", "Course Title"),
    ("##", "Lecture Title"),
]

# Split the combined text at the configured headers.
md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers)
pages_md_split = md_splitter.split_text(pages_string)

# Report how many header-delimited sections were produced.
print("Length of pages_md_split:", len(pages_md_split))


Length of pages_md_split: 4


In [3]:
from langchain.text_splitter import CharacterTextSplitter

# Secondary splitter: break each header-delimited section into smaller
# pieces at sentence boundaries.
char_splitter = CharacterTextSplitter(
    chunk_size=400,    # target maximum chunk length, in characters
    chunk_overlap=0,   # adjacent chunks share no text
    separator=". ",    # cut only where a sentence ends
)

# split_documents() takes the documents from the Markdown split and
# returns the finer-grained list.
pages_char_split = char_splitter.split_documents(pages_md_split)

# Report how many chunks resulted.
print("Length of pages_char_split:", len(pages_char_split))


Length of pages_char_split: 17


In [None]:
# Dependencies (uncomment on a fresh environment):
# !pip install -q langchain langchain-openai numpy

import os
from getpass import getpass

import numpy as np
from langchain_openai import OpenAIEmbeddings

# --- API key ---
# Never hardcode the key in the notebook. Reuse OPENAI_API_KEY when it is
# already set in the environment; otherwise prompt for it without echoing,
# so the secret never ends up in the saved notebook or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embeddings model
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")

# Text of the two chunks to compare
t1 = pages_char_split[1].page_content
t2 = pages_char_split[8].page_content

# Embed each chunk and convert the result to a float NumPy vector
v1 = np.array(embedding.embed_query(t1), dtype=float)
v2 = np.array(embedding.embed_query(t2), dtype=float)

# Scale both vectors to unit length so that their dot product equals
# the cosine similarity
v1n, v2n = v1 / np.linalg.norm(v1), v2 / np.linalg.norm(v2)
sim = float(np.dot(v1n, v2n))

print("Dot product:", sim)
print("Rounded to 2 decimals:", round(sim, 2))


In [5]:
# Dependencies (uncomment on a fresh environment):
# !pip install -q sentence-transformers

import numpy as np
from sentence_transformers import SentenceTransformer

# Local embedding model; the weights are downloaded on first use.
model = SentenceTransformer("all-mpnet-base-v2")

# The same two chunks that were compared with the OpenAI embeddings.
t1 = pages_char_split[1].page_content
t2 = pages_char_split[8].page_content

# Embed each text on its own, requesting NumPy arrays back.
v1, v2 = (model.encode(text, convert_to_numpy=True) for text in (t1, t2))

# Scale both vectors to unit length.
v1n = v1 / np.linalg.norm(v1)
v2n = v2 / np.linalg.norm(v2)

# For unit-length vectors the dot product is the cosine similarity.
sim = float(v1n.dot(v2n))

print("Dot product:", sim)
print("Rounded:", round(sim, 2))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Dot product: 0.2556152939796448
Rounded: 0.26
