In [None]:
from sentence_transformers import SentenceTransformer

# Smoke test: load the MiniLM sentence encoder and embed one string.
model = SentenceTransformer('all-MiniLM-L6-v2')
embedding = model.encode("Test sentence about AI ethics.")

# A single input string yields a single vector; expected shape is (384,)
print(embedding.shape)

In [27]:
from transformers import pipeline, set_seed

# GPT-2 text-generation pipeline, used by the prompt cell that follows
generator = pipeline('text-generation', model='gpt2')

# Fix the RNG seed so sampled completions are repeatable across runs
set_seed(2025)

Device set to use cpu


In [None]:
# Baseline without retrieval: ask GPT-2 to complete a factual statement.
# The original note marks this prompt as "post-training cutoff", i.e. the
# model is expected to make up a name rather than know the answer.
prompt = "I know the name of the 46th President of the United States. His name is"
response = generator(
    prompt,
    max_length=100,          # total token budget, prompt included
    num_return_sequences=5,  # several samples need sampling (or beam search)
    do_sample=True,          # explicit: temperature is only used when sampling
    temperature=0.1,         # low temperature -> near-greedy, low-variety samples
)

# Print every completion that was generated; previously only response[0]
# was shown and the other four sequences were silently discarded.
for rank, candidate in enumerate(response, start=1):
    print(f"[{rank}] {candidate['generated_text']}")

Data processing

In [None]:
# import os
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.document_loaders import TextLoader
# import pickle  # For saving chunks if needed

# # Path to your data folder
# data_dir = "data"  # Assumes /data in your repo root

# # List to hold all chunks
# all_chunks = []

# # Loop over files 1.txt to 10.txt (skips missing ones)
# for i in range(1, 11):  # 1 to 10; use range(1, 16) to cover 1.txt-15.txt
#     file_path = os.path.join(data_dir, f"{i}.txt")
#     if os.path.exists(file_path):
#         print(f"Loading {file_path}...")
#         loader = TextLoader(file_path, encoding="utf-8")  # Handles standard text
#         docs = loader.load()
#         text = " ".join([doc.page_content for doc in docs])  # Just the text

#         # Clean: collapse every run of whitespace (newlines included) into a single space
#         text = ' '.join(text.split())  # Collapses multiples
#         text = text.replace('\n', ' ')  # No-op after the line above; kept only as a safeguard

#         # Chunk this file's text
#         splitter = RecursiveCharacterTextSplitter(
#             chunk_size=500,  # max 500 characters per chunk (length is counted with len() by default, not tokens)
#             chunk_overlap=50  # Overlap for context continuity
#         )
#         file_chunks = splitter.split_text(text)
#         all_chunks.extend(file_chunks)  # Add to total pool
#         print(f"Added {len(file_chunks)} chunks from {i}.txt")
#     else:
#         print(f"Skipping {file_path} (not found)")

# # Final count
# print(f"Total chunks created: {len(all_chunks)}")

# # Optional: Save for later (e.g., Day 2 embedding)
# with open("chunks.pkl", "wb") as f:
#     pickle.dump(all_chunks, f)
# print("Chunks saved to chunks.pkl")

Embedding, Indexing, and Retrieval process

In [1]:
import pickle

# Restore the list of text chunks that the data-processing step saved to chunks.pkl
with open("chunks.pkl", "rb") as chunk_file:
    all_chunks = pickle.load(chunk_file)

# Report how many chunks were loaded
print(f"Loaded {len(all_chunks)} chunks")

Loaded 426 chunks


In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Same MiniLM checkpoint as in the first cell
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every chunk and coerce the result to a float32 ndarray,
# which is the dtype handed to the FAISS index in the next cell
embeddings = np.array(embed_model.encode(all_chunks)).astype('float32')

# Shape is (number of chunks, embedding dimension)
print(f"Embeddings shape: {embeddings.shape}")

  from .autonotebook import tqdm as notebook_tqdm


Embeddings shape: (426, 384)


In [3]:
import faiss

# Width of each embedding vector (second axis of the embeddings array)
dimension = embeddings.shape[1]

# Flat L2 index: exact nearest-neighbour search, fine for a few hundred vectors.
# A flat index has no training step, so the vectors are simply added.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# ntotal should equal the number of chunks embedded above
print(f"Index built with {index.ntotal} vectors")

Index built with 426 vectors


In [4]:
# Sanity check: embed one ad-hoc query and look up its 3 nearest chunks
dummy_query = embed_model.encode(["AI bias in ethics"])
distances, indices = index.search(dummy_query, k=3)

# indices holds one row per query; each entry is a position in all_chunks
print("Top indices:", indices)

Top indices: [[190 212  51]]


In [5]:
# Import only the helper this cell uses rather than a wildcard import,
# so it is obvious where retrieve_chunks comes from and nothing else
# is silently pulled into the notebook namespace.
from retrieval import retrieve_chunks

sample_query = "Joe Biden name is"
results = retrieve_chunks(sample_query, embed_model, index, all_chunks)

# Each result unpacks as (chunk_text, score); print the score with a
# 100-character preview of the chunk.
# NOTE(review): the scores presumably are L2 distances from the IndexFlatL2
# index (lower = closer) — confirm against retrieval.py.
for chunk, score in results:
    print(f"Score: {score:.2f} | Chunk: {chunk[:100]}...")

Score: 0.73 | Chunk: Jr. was born on November 20, 1942, in Scranton, Pennsylvania. In 1953, the Biden family moved from P...
Score: 0.79 | Chunk: Joe Biden (born November 20, 1942, Scranton, Pennsylvania, U.S.) is the 46th president of the United...
Score: 0.79 | Chunk: Their Years in Office Quiz For a full transcript of Biden’s farewell address, click here. Early life...
