This file generates the seed for the rag database (json containing all the elements). Be careful, it takes a lot of time (25min) to run, and requires a text file of the reliability notebook (not commited)

In [47]:
import re
from src.utils import save_json, logger, read_json
from src.client import openai_client
from tqdm import tqdm

Extract the paragraphs that could contain equations

In [48]:
# Load the file content
with open("data/RT_textbook.txt", "r") as file:
    content = file.read()

# Split content into paragraphs separated by "\n\n"
paragraphs = content.split("\n\n")


# Function to extract paragraphs containing equations and their context
def extract_equation_context(paragraphs):
    equation_pattern = re.compile(r".*\s=\s.*")  # Regex to detect equations
    extracted_content = []

    for idx, paragraph in enumerate(paragraphs):
        if equation_pattern.search(paragraph):  # Detect if the paragraph contains " = "
            # Get previous, current, and next paragraphs
            previous_paragraph = (
                paragraphs[idx - 1].strip() if idx > 0 else "No previous paragraph"
            )
            next_paragraph = (
                paragraphs[idx + 1].strip()
                if idx + 1 < len(paragraphs)
                else "No next paragraph"
            )
            extracted_content.append(
                previous_paragraph + "\n\n" + paragraph + "\n\n" + next_paragraph
            )

    return extracted_content


# Extract paragraphs with equations and their context
all_content_raw = extract_equation_context(paragraphs)

# Save results to JSON
save_json(all_content_raw, fname="generated/extracted_rag.json")

logger.info(f"Extracted {len(all_content_raw)} entries with equations.")

Extracted 467 entries with equations.


Sanitize the output and condense the paragraphs with chatgpt

In [49]:
# Condense the extracted paragraphs
import os

SYST_PROMPT = """
The following paragraph is extracted from a reliability textbook, containing equations and their context. It was extracted from a pdf,
so some data might not be perfectly organised. I want you to condense the information in this paragraph, to:
- make it more concise
- clear the equations
If you come across an equation, rewrite it in "humanly understandable" language, for example: 
f(x) = x^2 + 2*x + (some formulation of an integral easily understandable)
The paragraph has to have the structure : 
**(Title that summarizes the paragraph)**
(Condensed paragraph)
Your must make the shortest paragraph possible, without losing the main information. Ideally, make it less than 800 symbols.
"""
first_index = 0
if os.path.exists("generated/condensed_rag.json"):
    condensed_data = read_json("generated/condensed_rag.json")
first_index = len(condensed_data)

added_length = 110

print(
    f"Starting from index {first_index} to {min(first_index + added_length, len(all_content_raw))} (excluded)"
)

for i in tqdm(
    range(first_index, min(first_index + added_length, len(all_content_raw))),
    desc="Processing paragraphs",
):
    paragraph = all_content_raw[i]
    condensed = (
        openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": SYST_PROMPT},
                {"role": "user", "content": paragraph},
            ],
        )
        .choices[0]
        .message.content
    )
    condensed_data.append(condensed)

save_json(condensed_data, fname="generated/condensed_rag.json")

Starting from index 486 to 467 (excluded)


Processing paragraphs: 0it [00:00, ?it/s]


add the data specific to probabilities and the first handmade data

In [50]:
masterclass = read_json("generated/condensed_rag.json")
data_proba = read_json("data/rag_crafting/probstat_rag.json")["RAG"]
data_risk_management = read_json("data/rag_crafting/risk_management_rag.json")
first_data = read_json("data/rag_crafting/general_rag_data.json")
for i in range(len(data_proba)):
    title = list(data_proba[i].keys())[0]
    entry = list(data_proba[i].values())[0]
    new_entry = f"**{title}**\n{entry}"
    masterclass.append(new_entry)

for i in range(len(first_data)):
    new_entry = first_data[i]
    masterclass.append(new_entry)

for i in range(len(data_risk_management)):
    entry = data_risk_management[i]["entry"]
    masterclass.append(entry)

save_json(masterclass, fname="data/rag_db_seed.json")
print("final length", len(masterclass))

final length 553


In [57]:
from src.handler_rag import HandMadeRAG, cosine_similarity
import numpy as np
from tqdm import tqdm


# Load the RAG database
manager = HandMadeRAG(db_path="data/rag_db.json")
seed_list = read_json("data/rag_db_seed.json")

# Define similarity threshold and clustering parameters
THRESHOLD = 0.87
MIN_SAMPLES = 2

# Extract embeddings and calculate cosine similarity matrix
embeddings = [np.array(entry["embedding"]) for entry in manager.db]
similarity_matrix = np.array([
    [cosine_similarity(e1, e2) for e2 in embeddings]
    for e1 in embeddings
])

# Filter pairs with similarity above the threshold
valid_pairs = [
    (i, j) for i in range(len(manager.db)) for j in range(i + 1, len(manager.db))
    if similarity_matrix[i, j] > THRESHOLD
]

# Create clusters based only on valid pairs
clusters = []
visited = set()

for i, j in valid_pairs:
    if manager.db[i]["text"] in visited and manager.db[j]["text"] in visited:
        continue
    cluster = set()
    for x, y in valid_pairs:
        if x == i or x == j or y == i or y == j:
            cluster.add(manager.db[x]["text"])
            cluster.add(manager.db[y]["text"])
    visited.update(cluster)
    clusters.append(list(cluster))

print("Number of clusters: ", len(clusters))
print("Clusters: ", clusters)

merge_prompt = """
These paragraphs seem related. Merge them into the best single paragraph that retains the most relevant information and ensures clear and concise math formulas and explanations. 
If any formulas or sections overlap significantly, include only the best or clearest version. Keep the final output under 1000 characters.
"""

# Interact with GPT for each cluster

for i in tqdm(range(len(clusters)), desc="Processing clusters"):
    cluster = clusters[i]
    if len(cluster) > 1:
        # Combine all texts in the cluster
        combined_text = "\n\n".join(cluster)

        # Ask GPT to refine the cluster
        merged_paragraph = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": merge_prompt},
                {"role": "user", "content": combined_text},
            ],
        ).choices[0].message.content

        for entry in manager.db:
            if entry["text"] in cluster:
                manager.db.remove(entry)
        for entry in seed_list:
            if entry in cluster:
                seed_list.remove(entry)
        # Update the database with the refined paragraph
        manager.db.append({"text": merged_paragraph, "embedding": list(manager._get_embedding(merged_paragraph))})
        seed_list.append(merged_paragraph)

# Push the updated database
print("Final length of db: ", len(manager.db))
print("(check if the seed has same length) : ", len(seed_list))
manager.push_to_db()
save_json(seed_list, fname="data/rag_db_seed.json")


Number of clusters:  40
Clusters:  [["**Average Availability and Key Metrics for Repairable Systems**\n\nA machine with a Mean Time to Failure (MTTF) of 1,000 hours and a Mean Downtime (MDT) of 5 hours has an average availability (A_avg) of approximately 99.5%. This implies about 44 hours of downtime annually. The MTTF equals Mean Uptime (MUT) under perfect repair conditions. For items with constant rates, availability A(t) transitions to a limit A as time approaches infinity, calculated as (MUT)/(MUT + MDT). The approximation for average unavailability is MDT/(λ MDT), where λ is the failure rate. Operational availability (A_OP) is calculated based on planned and unplanned downtimes within a mission period. Additionally, production metrics include deliverability (actual deliveries/planned deliveries) and on-stream availability, assessing the system's operational performance against benchmarks.", '**Availability and Unavailability of Repairable Items**\n\nThe average availability \\( A_

Processing clusters: 100%|██████████| 40/40 [02:11<00:00,  3.28s/it]


Final length of db:  316
(check if the seed has same length) :  545


In [69]:
db_texts = [entry["text"] for entry in manager.db]

while len(seed_list) > len(db_texts):
    for el in seed_list:
        if el not in db_texts:
            seed_list.remove(el)

while len(manager.db) > len(seed_list):
    for el in manager.db:
        if el["text"] not in seed_list:
            manager.db.remove(el)

save_json(seed_list, fname="data/rag_db_seed.json")
manager.push_to_db()

print("Final length of db: ", len(manager.db))
print("(check if the seed has same length) : ", len(seed_list))

Final length of db:  306
(check if the seed has same length) :  306


Output a PDF from the JSON

In [67]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from textwrap import wrap
import json


def sanitize_text(text):
    """Remove unsupported characters from the text."""
    return "".join(
        c if ord(c) < 128 else "?" for c in text
    )  # Replace non-ASCII characters


def generate_pdf(data, output_file="data/cheatsheet.pdf"):
    c = canvas.Canvas(output_file, pagesize=letter)
    width, height = letter

    # Margins
    margin_x = 50
    margin_y = 50
    x, y = margin_x, height - margin_y

    max_width = width - 2 * margin_x  # Text area width
    line_height = 15

    def wrap_text(text, max_chars):
        return "\n".join(wrap(text, max_chars))

    # Loop through strings and add them to the PDF
    for text in data:
        text = sanitize_text(text)  # Clean the text
        # Split title and paragraph
        parts = text.split("\n\n", 1)
        if len(parts) == 2:
            title, paragraph = parts
        else:
            title = parts[0]
            paragraph = ""

        # Add title in bold font
        c.setFont("Times-Bold", 14)
        title_lines = wrap_text(
            title, int(max_width / 7)
        )  # Approx char width in points
        for line in title_lines.split("\n"):
            c.drawString(x, y, line)
            y -= line_height
            if y < margin_y:  # Add a new page if needed
                c.showPage()
                c.setFont("Times-Bold", 14)
                y = height - margin_y

        # Add paragraph in normal font
        c.setFont("Times-Roman", 12)
        paragraph_lines = wrap_text(paragraph, int(max_width / 7))
        for line in paragraph_lines.split("\n"):
            c.drawString(x, y, line)
            y -= line_height
            if y < margin_y:  # Add a new page if needed
                c.showPage()
                c.setFont("Times-Roman", 12)
                y = height - margin_y

        y -= 30  # Add space after each block

    c.save()


data_proba = json.load(open("data/rag_db_seed.json"))

generate_pdf(data_proba)
