In [1]:
pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from src.kaggle_submission import SubmissionBase, test_submission
from src.client import openai_client, qdrant_client
from pydantic import BaseModel
from typing import Literal
from src.python_agent import PythonAgent
from src.handler_rag import QdrantRAG
from src.utils import save_json

# Read the test and train question sets (test first, then train) from the data folder.
test_questions, train_questions = (
    pd.read_csv("data/test.csv"),
    pd.read_csv("data/train.csv"),
)


  from .autonotebook import tqdm as notebook_tqdm


**V1 - Simple Context Improvement**



In [2]:
# System prompt for the V1 baseline: asks the model to answer a reliability-engineering
# MCQ with only the letter of the chosen option, and includes one worked example.
# NOTE(review): "retun" in the example sentence is a typo for "return"; it is left
# untouched because this text is sent to the model verbatim as the system message.
context = """
You are an AI expert in reliability engineering. Your task is to answer multiple-choice questions (MCQs) accurately and concisely. Each question will have exactly one correct answer.

Instructions:
- Read the question and the possible answers.
- Identify the single correct answer based on your expertise in reliability engineering.
- Respond only with the letter of the correct answer (e.g., a, b, c, or d). Do not provide any explanations or additional text.
For example, if you want to say that answer [d] is the right one, you should only retun "d".


Example usage:
Question: Which metric measures the average time between system failures?
a. MTTR
b. MTBF
c. Availability
d. Failure Rate

Expected response:
b
"""


class SimpleContext(SubmissionBase):
    """V1 strategy: one chat completion guided by the ``context`` system prompt."""

    def get_1_answer(self, q):
        """Ask the model to answer one MCQ and return its reply as a normalised letter.

        Args:
            q: Text of the question (including its choices), sent as the user message.

        Returns:
            The model's reply with surrounding whitespace removed and lower-cased,
            or an empty string when the completion carries no text content.
        """
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": context},
                {"role": "user", "content": q},
            ],
        )

        # This completion is free text: the letter may come back padded with
        # whitespace or capitalised, and `content` may be None. Normalise it so the
        # value has the same lower-case form as the Literal["a", "b", "c", "d"]
        # answers returned by the structured-output strategies in this notebook.
        raw_answer = response.choices[0].message.content
        return (raw_answer or "").strip().lower()


v1 = SimpleContext(test_questions, openai_client)

In [4]:
# Evaluate the V1 strategy with the project's test_submission helper.
# NOTE(review): fake_multiple_attempts=True presumably reuses a single prediction for
# every attempt column instead of re-querying the model — confirm in src.kaggle_submission.
test_submission(v1, fake_multiple_attempts=True)

 --> Prediction 1 for question 1 : d <-- 
 --> Prediction 1 for question 2 : b <-- 
 --> Prediction 1 for question 3 : a <-- 
 --> Prediction 1 for question 4 : d <-- 
 --> Prediction 1 for question 5 : a <-- 
 --> Prediction 1 for question 6 : d <-- 
 --> Prediction 1 for question 7 : b <-- 
 --> Prediction 1 for question 8 : b <-- 
 --> Prediction 1 for question 9 : c <-- 
 --> Prediction 1 for question 10 : d <-- 
 --> Prediction 1 for question 11 : d <-- 
 --> Prediction 1 for question 12 : d <-- 
 --> Prediction 1 for question 13 : b <-- 
 --> Prediction 1 for question 14 : b <-- 
 --> Prediction 1 for question 15 : c <-- 
 --> Prediction 1 for question 16 : b <-- 
 --> Prediction 1 for question 17 : d <-- 
 --> Prediction 1 for question 18 : d <-- 
 --> Prediction 1 for question 19 : a <-- 
 --> Prediction 1 for question 20 : c <-- 
 --> Prediction 1 for question 21 : b <-- 
 --> Prediction 1 for question 22 : c <-- 
 --> Prediction 1 for question 23 : a <-- 
 --> Prediction 1 fo

np.float64(0.6)

**Double Prompting**


In [5]:
# Structured-output schema requested from the model on the double-check turn.
# NOTE(review): documented with comments rather than a docstring — pydantic may emit a
# class docstring as the JSON-schema description, which would alter the response_format
# payload; confirm before converting these comments into a docstring.
class FullReasoning(BaseModel):
    # Explanation steps given by the model.
    steps: list[str]
    # Selected option, restricted to the four answer letters.
    final_answer: Literal["a", "b", "c", "d"]


# System prompt for the double-prompting strategy: reuses the V1 `context` prompt.
SYSTEM_PROMPT = context

# Follow-up user message asking the model to re-check the answer it just gave.
DOUBT_PROMPT = """
I have a doubt. Are you totally sure ? Double-check your answer and explain briefly in 2 steps.
"""


class DoublePrompting(SubmissionBase):
    """Two-turn strategy: answer once, then ask the model to double-check itself.

    The first turn is a plain chat completion. The second turn appends
    ``DOUBT_PROMPT`` and requests a ``FullReasoning`` structured reply, whose
    ``final_answer`` is returned. Every message exchanged is accumulated in
    ``messages_to_save`` so that ``get_submission`` can dump the transcript to JSON.
    """

    def __init__(self, *args, **kwargs):
        """Create the per-instance transcript buffer and forward all arguments to the base class."""
        # Must be an instance attribute: a class-level list would be extended in place
        # by get_1_answer and therefore shared by every instance and subclass.
        # Created before the base initialiser runs so it exists whatever that does.
        self.messages_to_save = []
        super().__init__(*args, **kwargs)

    def get_1_answer(self, q):
        """Answer one question using the answer-then-double-check exchange.

        Args:
            q: Text of the question, sent as the first user message.

        Returns:
            The ``final_answer`` letter of the parsed ``FullReasoning`` reply.
        """
        # SYSTEM_PROMPT is looked up in the module namespace at call time, so a later
        # cell that rebinds that name changes the prompt used here.
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": q},
        ]

        # Turn 1: free-text answer at a relatively high temperature.
        first_response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            temperature=0.7,
        )

        messages += [
            {"role": "assistant", "content": first_response.choices[0].message.content},
            {"role": "user", "content": DOUBT_PROMPT},
        ]

        # Turn 2: structured reply, parsed into FullReasoning, at a lower temperature.
        response = openai_client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=messages,
            temperature=0.25,
            response_format=FullReasoning,
        )
        answer = response.choices[0].message.parsed

        # These two entries are only recorded for the saved transcript; they are not
        # sent back to the model. `steps` is stored as the list returned by the parser.
        messages += [
            {"role": "assistant", "content": answer.steps},
            {"role": "assistant", "content": answer.final_answer},
        ]
        self.messages_to_save.extend(messages)
        return answer.final_answer

    def get_submission(self, save_path="generated/submission.csv", fake_multiple_attempts=False):
        """Build the submission and save the transcript collected while doing so.

        Args:
            save_path: Forwarded to ``SubmissionBase.get_submission``.
            fake_multiple_attempts: Forwarded to ``SubmissionBase.get_submission``.

        Returns:
            Whatever ``SubmissionBase.get_submission`` returns.
        """
        # Start from an empty transcript so the saved file only covers this run.
        self.messages_to_save = []
        sub = super().get_submission(save_path, fake_multiple_attempts)
        save_json(self.messages_to_save, "generated/all_messages.json")
        return sub


double_prompting = DoublePrompting(test_questions, openai_client)

In [4]:
# Evaluate the double-prompting strategy with the project's test_submission helper.
test_submission(double_prompting, fake_multiple_attempts=True)

 --> Prediction 1 for question 1 : d
 --> Prediction 1 for question 2 : b
 --> Prediction 1 for question 3 : a
 --> Prediction 1 for question 4 : d
 --> Prediction 1 for question 5 : a
 --> Prediction 1 for question 6 : d
 --> Prediction 1 for question 7 : b
 --> Prediction 1 for question 8 : b
 --> Prediction 1 for question 9 : c
 --> Prediction 1 for question 10 : c
 --> Prediction 1 for question 11 : d
 --> Prediction 1 for question 12 : d
 --> Prediction 1 for question 13 : b
 --> Prediction 1 for question 14 : b
 --> Prediction 1 for question 15 : b
 --> Prediction 1 for question 16 : b
 --> Prediction 1 for question 17 : d
 --> Prediction 1 for question 18 : d
 --> Prediction 1 for question 19 : c
 --> Prediction 1 for question 20 : c
 --> Prediction 1 for question 21 : c
 --> Prediction 1 for question 22 : c
 --> Prediction 1 for question 23 : a
 --> Prediction 1 for question 24 : b
 --> Prediction 1 for question 25 : a
--------------------
Score : 0.6 for model DoublePrompting


np.float64(0.6)

**Multiway prompting**

This time we initially withhold the answer choices from the model. We first let it reason about the question on its own, and only then give it the multiple-choice options so that it can pick the best one.

In [4]:
# System prompt for the multiway strategy: asks for an explained, step-by-step answer.
# NOTE(review): this rebinds the module-level SYSTEM_PROMPT that an earlier cell set to
# `context`; any code that reads the global SYSTEM_PROMPT at call time will see this
# value once this cell has run.
SYSTEM_PROMPT = """You are a reliability expert. You will be asked to answer to several questions based on your knowledge and the definitions you know.
You will need to explain your reasonning and explain the steps that allowed you to choose your answers.
"""


# Structured-output schema for the final selection turn: a single answer letter.
# NOTE(review): documented with comments rather than a docstring — pydantic may emit a
# class docstring as the JSON-schema description, which would alter the response_format
# payload; confirm before converting these comments into a docstring.
class SimpleAnswer(BaseModel):
    # Selected option, restricted to the four answer letters.
    choice: Literal["a", "b", "c", "d"]


def provide_choices(choices):
    """Build the follow-up user message that lists the candidate answers.

    Args:
        choices: The answer options; interpolated into the message as-is.

    Returns:
        The assessment instruction, a newline, then ``choices``.
    """
    instruction = "Based on your previous answer, you should now assess the veracity of each of the following possible answers one by one:"
    return f"{instruction}\n{choices}"


# Final user message of the multiway exchange: asks the model to commit to exactly one
# of the options, even when none of them matches its own answer.
SELECTION_PROMPT = """Now please select the 1 possibility that fits bests the initial question.
It is possible that none of the possible answer seems acceptable to you. In this case, please choose the one that is the closest to your opinion."""


class MultiPrompting(SubmissionBase):
    """Three-turn strategy: reason without the choices, assess each choice, then pick one.

    Turn 1 sends only the question stem. Turn 2 reveals the choices and asks the model
    to assess each of them. Turn 3 asks for a single letter, parsed as ``SimpleAnswer``.
    """

    # Model and sampling temperature shared by all three completions.
    _MODEL = "gpt-4o-mini"
    _TEMPERATURE = 0.61

    def _chat(self, messages):
        """Run one free-text chat completion on ``messages`` and return the reply text."""
        completion = openai_client.chat.completions.create(
            model=self._MODEL,
            messages=messages,
            temperature=self._TEMPERATURE,
        )
        return completion.choices[0].message.content

    def get_1_answer(self, q):
        """Answer one question through the three-turn exchange.

        Args:
            q: Question text containing the stem, the ``[Choices]`` separator, then
                the answer options.

        Returns:
            The ``choice`` letter of the parsed ``SimpleAnswer`` reply.

        Raises:
            ValueError: If ``q`` does not contain the ``[Choices]`` separator.
        """
        # Split on the first separator only, so a later occurrence inside the options
        # does not break the unpacking; a missing separator is reported explicitly.
        question, separator, choices = q.partition("[Choices]")
        if not separator:
            raise ValueError('question text has no "[Choices]" separator')

        # SYSTEM_PROMPT is looked up in the module namespace at call time.
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question},
        ]

        # Turn 1: the model reasons about the stem without seeing the options.
        first_response = self._chat(messages)
        messages += [
            {"role": "assistant", "content": first_response},
            {"role": "user", "content": provide_choices(choices)},
        ]

        # Turn 2: the model assesses each option against its own reasoning.
        chat_opinion = self._chat(messages)
        messages += [
            {"role": "assistant", "content": chat_opinion},
            {"role": "user", "content": SELECTION_PROMPT},
        ]

        # Turn 3: structured reply constrained to a single answer letter.
        selection = openai_client.beta.chat.completions.parse(
            model=self._MODEL,
            messages=messages,
            temperature=self._TEMPERATURE,
            response_format=SimpleAnswer,
        )
        return selection.choices[0].message.parsed.choice


mp = MultiPrompting(test_questions, openai_client)

In [7]:
# Evaluate the multiway-prompting strategy with the project's test_submission helper.
test_submission(mp, fake_multiple_attempts=True)

 --> Prediction 1 for question 1 : d
 --> Prediction 1 for question 2 : d
 --> Prediction 1 for question 3 : a
 --> Prediction 1 for question 4 : d
 --> Prediction 1 for question 5 : a
 --> Prediction 1 for question 6 : d
 --> Prediction 1 for question 7 : b
 --> Prediction 1 for question 8 : c
 --> Prediction 1 for question 9 : c
 --> Prediction 1 for question 10 : d
 --> Prediction 1 for question 11 : d
 --> Prediction 1 for question 12 : a
 --> Prediction 1 for question 13 : a
 --> Prediction 1 for question 14 : b
 --> Prediction 1 for question 15 : b
 --> Prediction 1 for question 16 : a
 --> Prediction 1 for question 17 : d
 --> Prediction 1 for question 18 : d
 --> Prediction 1 for question 19 : c
 --> Prediction 1 for question 20 : b
 --> Prediction 1 for question 21 : a
 --> Prediction 1 for question 22 : a
 --> Prediction 1 for question 23 : a
 --> Prediction 1 for question 24 : c
 --> Prediction 1 for question 25 : a
--------------------
Score : 0.72 for model MultiPrompting


np.float64(0.72)

**Agentic system**

Next, we give the model the ability to write and execute Python scripts.

In [11]:
# Hand the shared OpenAI client to the Python agent, then re-evaluate the
# double-prompting strategy.
# NOTE(review): presumably inject_python patches the client so that later completions
# can have scripts executed — confirm in src.python_agent.
PythonAgent.inject_python(openai_client=openai_client)
test_submission(double_prompting, fake_multiple_attempts=True)

Found a python script to execute.
Executing the following script: 

import scipy.stats as stats

# Given values
mean = 150  # Mean (μ)
std_dev = 20  # Standard deviation (σ)
percentile = 0.10  # 10th percentile

# Find the z-score for the 10th percentile
z_score = stats.norm.ppf(percentile)

# Calculate B10 life
B10_life = mean + z_score * std_dev
print(B10_life)
```
Script executed successfully.
Prompting with the result: 124.36896868910799
 --> Prediction 1 for question 1 : b
 --> Prediction 1 for question 2 : a
 --> Prediction 1 for question 3 : c
Found a python script to execute.
Executing the following script: 

import scipy.stats as stats

# Parameters
n = 20  # sample size
alpha = 0.05

# Degrees of freedom
df = n - 1

# Chi-squared critical values
chi2_lower = stats.chi2.ppf(alpha / 2, df)
chi2_upper = stats.chi2.ppf(1 - alpha / 2, df)

(chi2_lower, chi2_upper)
```
The script returned no output. Trying again
Found a python script to execute.
Executing the following script: 

im

Unnamed: 0,question_id,prediction_1,prediction_2,prediction_3,prediction_4,prediction_5
0,1,b,b,b,b,b
1,2,a,a,a,a,a
2,3,c,c,c,c,c
3,4,a,a,a,a,a
4,5,d,d,d,d,d
5,6,a,a,a,a,a
6,7,d,d,d,d,d
7,8,a,a,a,a,a
8,9,a,a,a,a,a
9,10,c,c,c,c,c


**RAG prompting**

We now augment the knowledge of our model using Retrieval Augmented Generation. We built a database containing specific information about reliability engineering, and will use it to augment our prompts.

In [6]:
from src.utils import init_string

# Retrieval helper built from the extracted RAG file and the shared Qdrant client;
# used by RiskHive.inject_context to look up passages for a question.
rag = QdrantRAG(rag_path="generated/extracted_rag.json", qdrant_client=qdrant_client)

# Number of characters of the retrieved-context block kept when augmenting a question.
MAX_CONTEXT_LENGTH = 1000


class RiskHive(DoublePrompting):
    """Double-prompting strategy whose questions are augmented with retrieved context."""

    def __init__(self, *args, **kwargs):
        """Initialise the parent strategy, set the context budget and print the banner."""
        super().__init__(*args, **kwargs)
        self.augment_max_length = MAX_CONTEXT_LENGTH
        init_string()

    def inject_context(self, q: str):
        """Append up to three retrieved passages to the question text.

        Args:
            q: Question text; everything before the first ``[Choices]`` separator is
                used as the retrieval query (the whole text if the separator is absent).

        Returns:
            ``q`` followed by the ranked passages, truncated to
            ``self.augment_max_length`` characters, or ``q`` unchanged when the
            search returns nothing.
        """
        # Only the stem is needed for retrieval; partition tolerates a missing or
        # repeated separator instead of failing on a two-way unpack.
        question = q.partition("[Choices]")[0]
        search = rag.search(question)

        # Collect at most three hits with the same integer indexing as before, but
        # stop early instead of raising when fewer results come back.
        hits = []
        for position in range(3):
            try:
                hits.append(search[position])
            except IndexError:
                break
        if not hits:
            return q

        ranked = "".join(f"{rank} : {hit}\n" for rank, hit in enumerate(hits, start=1))
        element = f"\nInformation ranked by relevance:\n{ranked}"
        return f"{q}\n\n Here is some information that could help. If there appears to be the correct mathematical formula, you can use it :  ```{element[0 : self.augment_max_length]}...```"

    def get_1_answer(self, q):
        """Answer one question after augmenting it with retrieved context."""
        return super().get_1_answer(self.inject_context(q))


rh = RiskHive(test_questions, openai_client)

100%|██████████| 467/467 [00:40<00:00, 11.56it/s]


||                               ||
||          _________            ||
||         |         |           ||
||         |R I S K  |           ||
||         |  H I V E|           ||
||         |_________|           ||
||                               ||
[STATUS]: All systems up. Ready to analyze some risk!






In [7]:
# Hand the shared OpenAI client to the Python agent, then generate the RiskHive
# submission file.
PythonAgent.inject_python(openai_client=openai_client)
# Disabled alternative kept by the author: evaluate instead of writing a submission.
# test_submission(rh, fake_multiple_attempts=True)
rh.get_submission(save_path="generated/risk_hive_submission.csv", fake_multiple_attempts=True)

--- Python agent initialized ---

Found a python script to execute.
Executing the following script: 

```python
import numpy as np

# Given parameters
mean = 150  # mean life
variance = 400  # variance
std_dev = np.sqrt(variance)  # standard deviation

# To find B10 life, we need to find the 10th percentile of the normal distribution
b10_life = np.percentile(np.random.normal(mean, std_dev, 100000), 10)
print(b10_life)
print(print(b10_life))
```
Script executed successfully.
Prompting with the result: 124.49725094465978
124.49725094465978
None
 --> Prediction 1 for question 1 : b <-- 
 --> Prediction 1 for question 2 : a <-- 
 --> Prediction 1 for question 3 : c <-- 
Found a python script to execute.
Executing the following script: 

```python
import scipy.stats as stats

# Given data
sample_variance = 4  # ohms^2
n = 20  # sample size
alpha = 0.05  # significance level for 95% confidence interval

# Degrees of freedom
df = n - 1

# Chi-squared distribution critical values
chi2_lower = 

Unnamed: 0,question_id,prediction_1,prediction_2,prediction_3,prediction_4,prediction_5
0,1,b,b,b,b,b
1,2,a,a,a,a,a
2,3,c,c,c,c,c
3,4,a,a,a,a,a
4,5,a,a,a,a,a
5,6,b,b,b,b,b
6,7,d,d,d,d,d
7,8,d,d,d,d,d
8,9,b,b,b,b,b
9,10,b,b,b,b,b
