This notebook generates the seed data for the RAG database and the PDF cheatsheet.

In [1]:
import re
from src.utils import save_json, logger, read_json
from src.client import openai_client
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


Extract the paragraphs that could contain equations

In [2]:
# Load the textbook text.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's locale default (assumes the extracted text is UTF-8 -- TODO confirm).
with open("data/RT_textbook.txt", "r", encoding="utf-8") as file:
    content = file.read()

# Paragraphs in the extracted text are delimited by a blank line ("\n\n")
paragraphs = content.split("\n\n")


# Function to extract paragraphs containing equations and their context
def extract_equation_context(paragraphs):
    """Collect every paragraph that appears to contain an equation, with context.

    A paragraph counts as containing an equation when it holds an ``=`` sign
    that has a whitespace character on each side. Every such paragraph is
    returned glued to its two neighbours so the equation keeps its context.

    Args:
        paragraphs: Sequence of paragraph strings, in document order.

    Returns:
        A list with one string per matching paragraph, built as
        previous paragraph, the paragraph itself, next paragraph, separated by
        blank lines. The neighbours are stripped of surrounding whitespace,
        the matching paragraph is kept verbatim. When a neighbour does not
        exist, the placeholder "No previous paragraph" / "No next paragraph"
        is used instead.
    """
    # `search` already scans the whole paragraph, so the bare pattern selects
    # exactly the same paragraphs as ".*\s=\s.*" would, without the quadratic
    # backtracking the surrounding ".*" causes on long lines with no match.
    equation_pattern = re.compile(r"\s=\s")
    last_idx = len(paragraphs) - 1
    extracted_content = []

    for idx, paragraph in enumerate(paragraphs):
        if not equation_pattern.search(paragraph):
            continue

        if idx > 0:
            previous_paragraph = paragraphs[idx - 1].strip()
        else:
            previous_paragraph = "No previous paragraph"

        if idx < last_idx:
            next_paragraph = paragraphs[idx + 1].strip()
        else:
            next_paragraph = "No next paragraph"

        extracted_content.append(
            "\n\n".join((previous_paragraph, paragraph, next_paragraph))
        )

    return extracted_content


# Run the extraction over the whole textbook: one entry per equation-bearing
# paragraph, each bundled with its previous and next paragraph.
all_content_raw = extract_equation_context(paragraphs)

# Persist the raw entries, then report how many were found.
save_json(all_content_raw, fname="generated/extracted_rag.json")
logger.info(f"Extracted {len(all_content_raw)} entries with equations.")

Extracted 467 entries with equations.


Sanitize the output and condense the paragraphs with ChatGPT

In [20]:

# Condense the extracted paragraphs
import os

SYST_PROMPT = """
The following paragraph is extracted from a reliability textbook, containing equations and their context. It was extracted from a pdf,
so some data might not be perfectly organised. I want you to condense the information in this paragraph, to:
- make it more concise
- clear the equations
If you come across an equation, rewrite it in "humanly understandable" language, for example: 
f(x) = x^2 + 2*x + (some formulation of an integral easily understandable)
The paragraph has to have the structure : 
**(Title that summarizes the paragraph)**
(Condensed paragraph)
Your must make the shortest paragraph possible, without losing the main information. Ideally, make it less than 800 symbols.
"""

# The condensed entries are cached on disk so that the work can be done in
# several batches: each execution of this cell resumes where the cache ends.
condensed_path = "generated/condensed_rag.json"

# Start from an empty list when there is no cache yet; without this default
# the first ever run would fail with a NameError on `condensed_data`.
condensed_data = []
if os.path.exists(condensed_path):
    condensed_data = read_json(condensed_path)
first_index = len(condensed_data)

# Maximum number of paragraphs condensed per execution of this cell
added_length = 110
last_index = min(first_index + added_length, len(all_content_raw))

print(f"Starting from index {first_index} to {last_index} (excluded)")

# A cache longer than the extraction means the two files are out of sync
# (e.g. the extraction was re-run and now yields fewer entries).
if first_index > len(all_content_raw):
    print(
        f"Warning: the cache holds {first_index} entries but only "
        f"{len(all_content_raw)} paragraphs were extracted; "
        "the cache may be stale."
    )

try:
    for i in tqdm(range(first_index, last_index), desc="Processing paragraphs"):
        paragraph = all_content_raw[i]
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": SYST_PROMPT},
                {"role": "user", "content": paragraph},
            ],
        )
        condensed_data.append(response.choices[0].message.content)
finally:
    # Save even when an API call fails midway, so the completions already
    # obtained are kept and the next run resumes from the right index.
    save_json(condensed_data, fname=condensed_path)

Starting from index 486 to 467 (excluded)


Processing paragraphs: 0it [00:00, ?it/s]


Add the probability-specific data and the first handmade data

In [21]:
# Start from the LLM-condensed textbook entries, then load the two extra sources.
masterclass = read_json("generated/condensed_rag.json")
data_proba = read_json("data/rag_crafting/probstat_rag.json")["RAG"]
first_data = read_json("data/rag_crafting/general_rag_data.json")

# Only the first key/value pair of each probability record is used:
# the key becomes the bold title and the value the body of the entry.
for record in data_proba:
    title, entry = next(iter(record.items()))
    masterclass.append(f"**{title}**\n{entry}")

# The handmade entries are appended as they are
# (presumably a list of ready-made strings -- verify against the data file).
masterclass.extend(first_data)

save_json(masterclass, fname="data/rag_db_seed.json")
print("final length", len(masterclass))
    

final length 547


Output a PDF from the JSON

In [17]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from textwrap import wrap
import json


def sanitize_text(text):
    """Return ``text`` with every non-ASCII character replaced by ``?``.

    Characters with a code point below 128 are kept unchanged; the length of
    the string is preserved because each other character becomes one ``?``.
    """
    # The "replace" error handler emits a single "?" for each character that
    # cannot be encoded as ASCII; decoding simply turns the bytes back into str.
    ascii_bytes = text.encode("ascii", errors="replace")
    return ascii_bytes.decode("ascii")


def generate_pdf(data, output_file="data/cheatsheet.pdf"):
    """Render a list of "title + paragraph" strings as a plain-text PDF.

    Each entry is sanitized with ``sanitize_text`` and split at its first line
    break: the first line is drawn as a bold title, everything after it as the
    body in the regular font. This matches the ``**title**`` + newline + body
    layout used when the entries are built, and also works when the title is
    followed by a blank line. Text is wrapped to the page width, a new page is
    started whenever the next line would fall into the bottom margin, and a
    vertical gap is left after every entry.

    Args:
        data: Iterable of strings, one per cheatsheet entry.
        output_file: Path of the PDF file to write.
    """
    c = canvas.Canvas(output_file, pagesize=letter)
    width, height = letter

    # Margins
    margin_x = 50
    margin_y = 50
    x, y = margin_x, height - margin_y

    max_width = width - 2 * margin_x  # Text area width
    line_height = 15
    # Wrapping is done on a character count, using ~7pt per character as a
    # rough estimate of the average glyph width.
    max_chars = int(max_width / 7)

    def draw_block(text, font_name, font_size):
        """Draw ``text`` wrapped to the page width, breaking pages as needed."""
        nonlocal y
        c.setFont(font_name, font_size)
        # `wrap` returns no line at all for empty text, so nothing is drawn
        # (and no vertical space is used) for a missing title or body.
        for line in wrap(text, max_chars):
            # Break the page *before* drawing, so that no line is ever placed
            # inside the bottom margin (e.g. right after the inter-entry gap).
            if y < margin_y:
                c.showPage()
                c.setFont(font_name, font_size)  # font is re-applied on each new page
                y = height - margin_y
            c.drawString(x, y, line)
            y -= line_height

    # Loop through strings and add them to the PDF
    for text in data:
        text = sanitize_text(text)  # Clean the text
        # The title is the first line; the remainder (if any) is the body
        title, _, paragraph = text.strip().partition("\n")

        draw_block(title.strip(), "Times-Bold", 14)
        draw_block(paragraph.strip(), "Times-Roman", 12)

        y -= 30  # Add space after each block

    c.save()


# Load the full RAG seed written earlier and render it as the PDF cheatsheet.
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE: `data_proba` is rebound here to the complete seed, not only the
# probability entries; the name is kept as in the original cell.
with open("data/rag_db_seed.json") as seed_file:
    data_proba = json.load(seed_file)

generate_pdf(data_proba)
