In [None]:
# Data structures
# Lists: ordered, mutable collections of items

# Three file-name strings kept in a list; order is preserved.
documents = ["doc1.txt", "doc2.txt", "doc3.txt"]
chunks = []  # Empty list that will hold text chunks

In [None]:
# Dictionaries: key-value pairs (well suited to storing metadata)

# One record with string keys; the values mix types
# (strings, an integer and a list).
document_info = {
    "filename": "article.pdf",
    "page_count": 10,
    "author": "John Doe",
    "chunks": []  # starts empty
}

In [None]:
# Tuples: immutable ordered collections (items cannot be reassigned after creation)

# Three string settings grouped in a fixed order.
api_config = ("https://api.openai.com", "v1", "gpt-4")

In [None]:
#File Handling
#Reading and writing files is essential for RAG:
#File → Read → Process → Store

# Reading text files:
# On a fresh kernel "document.txt" may not exist yet (the cell that writes it
# comes later in this notebook), so fall back to empty content instead of
# stopping a Restart & Run All with a traceback.
try:
    with open("document.txt", "r", encoding="utf-8") as file:
        content = file.read()
except FileNotFoundError:
    print("document.txt not found.")
    content = ""

In [None]:
# Writing to files: mode "w" creates the file, or replaces its contents if it exists.
with open("document.txt", mode="w", encoding="utf-8") as file:
    file.write("Processed content.")

In [None]:
# Appending data to an existing file: mode "a" adds to the end and keeps what is already there.
with open("document.txt", mode="a", encoding="utf-8") as file:
    file.write("\nAdditional content.")

In [3]:
#Functions and Classes
#Functions - Reusable blocks of code

def chunk_text(text, chunk_size):
    """Cut text into consecutive pieces of at most chunk_size characters.

    The pieces do not overlap and are returned in order as a list; the last
    piece may be shorter than chunk_size.
    """
    # Slice windows: text[0:chunk_size], text[chunk_size:2*chunk_size], ...
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

In [4]:
# Split a sample sentence into 10-character chunks and print the resulting list.
# Note: this rebinds the `chunks` name created as an empty list earlier in the notebook.
text = "Hello, I am learning how to split text into chunks."
chunks = chunk_text(text, chunk_size=10)
print(chunks)

['Hello, I a', 'm learning', ' how to sp', 'lit text i', 'nto chunks', '.']


In [None]:
#Classes - Organizing related functionality
class DocumnentProcessor:
    """Load a text file and report simple statistics about its content."""

    def __init__(self, filename):
        """Remember which file to process; nothing is read until load()."""
        # Path of the file that load() will open.
        self.filename = filename
        # Full text of the file; stays empty until load() is called.
        self.content = ""

    def load(self):
        """Read the whole file as UTF-8 text into self.content.

        Raises:
            Whatever open() raises for the stored path, e.g.
            FileNotFoundError when the file does not exist.
        """
        with open(self.filename, "r", encoding="utf-8") as file:
            self.content = file.read()

    def get_word_count(self):
        """Return the number of whitespace-separated words in self.content."""
        return len(self.content.split())


# Correctly spelled alias; the misspelled class name above is kept so that
# existing references to it continue to work.
DocumentProcessor = DocumnentProcessor

In [None]:
#Working with APIs
#RAG systems interact with APIs (like OpenAI):

#Your Code → HTTP Request → API → Response → Your Code

#Basic API interaction pattern:
import requests

def call_api(url, data, timeout=30):
    """POST data as JSON to url and return the decoded JSON response.

    Args:
        url: Endpoint to send the request to.
        data: JSON-serializable payload, sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can wait indefinitely for a response.

    Returns:
        The response body as parsed by response.json().

    Note:
        The HTTP status code is not checked here; an error response whose
        body is JSON is returned like any other.
    """
    response = requests.post(url, json=data, timeout=timeout)
    return response.json()

In [5]:
#List Comprehensions and Generators
#List comprehensions - Concise way to create lists

# Traditional way: start with an empty list and append inside a loop
squares = []
for x in range(10):
    squares.append(x**2)

# List comprehension: builds the same list (squares of 0 through 9) in one expression
squares = [x**2 for x in range(10)]

In [6]:
#Generators - Memory-efficient iteration
def chunk_generator(text, chunk_size):
    """Yield consecutive pieces of text, each at most chunk_size characters.

    Pieces are produced one at a time as the caller iterates, so the full
    list of chunks is never built in memory.
    """
    offsets = range(0, len(text), chunk_size)
    yield from (text[offset:offset + chunk_size] for offset in offsets)

In [None]:
# Using the generator: each chunk is produced only when the loop asks for it.
# Relies on `text` defined in an earlier cell.
for chunk in chunk_generator(text, 10):
    print(chunk)


Hello, I a
m learning
 how to sp
lit text i
nto chunks
.


In [1]:
# Instructor Examples

In [None]:
# Example 1: Reading and Processing a Text File
def read_and_process_file(filename):
    """Read a UTF-8 text file and return its content with basic statistics.

    Args:
        filename: Path of the file to read.

    Returns:
        A dict with the keys "content" (the full text), "lines" (number of
        lines) and "words" (number of whitespace-separated words), or None
        when the file does not exist (a message is printed in that case).
    """
    try:
        with open(filename, "r", encoding="utf-8") as file:
            content = file.read()
    except FileNotFoundError:
        print(f"Filename {filename} not found.")
        return None

    # Count newline-terminated lines, plus a final line that lacks a trailing
    # newline. Splitting on "\n" instead would report one extra (empty) line
    # for files ending in a newline, and one line for an empty file.
    line_count = content.count("\n")
    if content and not content.endswith("\n"):
        line_count += 1

    return {
        "content": content,
        "lines": line_count,
        "words": len(content.split()),
    }
    
# Usage: if "sample.txt" is missing the function returns None,
# so the summary line below is skipped.
result = read_and_process_file("sample.txt")
if result:
    print(f"Lines: {result['lines']}, Words: {result['words']}")

In [None]:
#Example 2: Text Chunking Function
def chunk_text(text, chunk_size=200, overlap=50):
    """
    Split text into overlapping chunks

    Note: this definition replaces the two-argument chunk_text defined
    earlier in the notebook, so calls that omit overlap now use overlap=50.

    Args:
        text: Input text to chunk
        chunk_size: Maximum number of characters per chunk (must be > 0)
        overlap: Number of characters shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size)

    Returns:
        List of chunks in order; an empty list for empty text. The last
        chunk may be shorter than chunk_size.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size. With overlap >= chunk_size the
            window would never move forward and the loop would not end.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be >= 0 and smaller than chunk_size")

    chunks = []
    step = chunk_size - overlap  # how far the window advances each round
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once a chunk reaches the end of the text; continuing would
        # only add a chunk already fully contained in the previous one.
        if end >= len(text):
            break
        start += step

    return chunks

# Usage: build a 3,100-character sample (a 31-character sentence repeated 100 times)
# and split it into 200-character chunks that overlap by 50 characters.
long_text = "This is a very long document..." * 100
chunks = chunk_text(long_text, chunk_size=200, overlap=50)
print(f"Created {len(chunks)} chunks")

In [2]:
# Example 3: Working with Dictionaries for Document Metadata

class Document:
    """Text content of a file together with metadata computed from it."""

    def __init__(self, filename, content):
        """Store the file name and text, and derive the initial metadata."""
        self.filename = filename
        self.content = content
        word_total = len(content.split())
        char_total = len(content)
        # "chunks" starts empty and is filled through add_chunk().
        self.metadata = {
            "word_count": word_total,
            "char_count": char_total,
            "chunks": [],
        }

    def add_chunk(self, chunk_text, chunk_id):
        """Append one chunk record (id, text and len() of the text) to the metadata."""
        entry = {"id": chunk_id, "text": chunk_text, "length": len(chunk_text)}
        self.metadata["chunks"].append(entry)

    def get_summary(self):
        """Return a dict with the file name, word count and number of chunks recorded."""
        summary = {"filename": self.filename}
        summary["words"] = self.metadata["word_count"]
        summary["chunks"] = len(self.metadata["chunks"])
        return summary

# Usage: create a document, register two chunks, then print the summary dict
doc = Document("article.txt", "This is the content of the article...")
doc.add_chunk("First chunk", 1)
doc.add_chunk("Second chunk", 2)
print(doc.get_summary())

{'filename': 'article.txt', 'words': 7, 'chunks': 2}


In [None]:
# Example 4: Simple API Request Pattern
import requests
import json

def make_api_request(url, payload, headers=None, timeout=30):
    """Make a POST request to an API and return the decoded JSON response.

    Args:
        url: Endpoint to send the request to.
        payload: JSON-serializable request body.
        headers: Optional dict of extra headers. They are merged into the
            default Content-Type header; a caller entry with the same key
            replaces the default.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout, requests can wait indefinitely.

    Returns:
        The parsed JSON body, or None when the request fails with a
        requests RequestException (for example a connection error, a
        timeout or an HTTP error status); the failure is printed.
    """
    default_headers = {"Content-Type": "application/json"}
    if headers:
        default_headers.update(headers)

    try:
        response = requests.post(
            url, json=payload, headers=default_headers, timeout=timeout
        )
        response.raise_for_status()  # Raises exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"API request failed: {e}")
        return None

# Usage pattern (you'll use this with OpenAI API later)
# payload = {"prompt": "Hello, world!"}
# result = make_api_request("https://api.example.com/endpoint", payload)