# IMPORTS

In [1]:
import os
import pandas
import json

In [2]:
import sys

# Add the parent directory to the module search path; the `utils.*` imports
# in the next cell presumably rely on it (TODO confirm the project layout).
sys.path.append("./..")

In [3]:
import utils.dspy_templates as dspy_templates
import utils.outputs as outputs

### _ENV vars_
Set the location of your local `.env` file.

In [4]:
import dotenv

# Load variables from the `.env` file one directory up; RESULT_PATH is read
# from the environment later in this notebook.
dotenv.load_dotenv(dotenv_path="../.env")

True

# DATA

### _Golden Dataset_

In [5]:
# Ground-truth parquet file, given relative to this notebook.
GOLDEN_DATASET_PATH = "../dataset/arithmetic_expressions_golden_dataset.parquet"

expressions = pandas.read_parquet(GOLDEN_DATASET_PATH)

# Quick sanity check: (row count, column count) of the golden dataset.
print(expressions.shape)

(100, 3)


### _Model selection among all available results_
A 'model' consists of a **language model** and a **defined prompt**.

In [6]:
# Root folder containing one sub-folder of results per prompt, read from the
# environment (populated by the `.env` file loaded earlier).
RESULT_PATH = os.getenv("RESULT_PATH")
if not RESULT_PATH:
    # Fail fast: with RESULT_PATH unset, os.listdir(None) would silently list
    # the current directory and later path concatenations would raise TypeError.
    raise EnvironmentError(
        "RESULT_PATH is not set; define it in the `.env` file loaded above."
    )

In [7]:
# Show every entry found under RESULT_PATH, in alphabetical order.
for entry_name in sorted(os.listdir(RESULT_PATH)):
    print(entry_name)

original
student-LocalSmolLM135m-teacher-LocalSmolLM135m
student-LocalSmolLM135m-teacher-gpt-4.1
student-gpt-4.1-nano-teacher-gpt-4.1
student-gpt-4.1-nano-teacher-gpt-4.1-nano
student-gpt-4.1-teacher-gpt-4.1


In [8]:
# Prompt whose results are analysed; must match one of the folders listed above.
PROMPT_NAME = "original"

# The trailing "" keeps the final path separator, because a later cell appends
# the result file name directly to AVAILABLE_RESULTS.
AVAILABLE_RESULTS = os.path.join(RESULT_PATH, PROMPT_NAME, "results", "")

# Only `.parquet` files can be loaded as results, so skip anything else and
# print each candidate name without its extension.
PARQUET_SUFFIX = ".parquet"
for file_name in sorted(os.listdir(AVAILABLE_RESULTS)):
    if file_name.endswith(PARQUET_SUFFIX):
        print(file_name[: -len(PARQUET_SUFFIX)])

LocalSmolLM135m
gpt-4.1-nano
gpt-4.1


In [9]:
# Name of the result file to analyse, without its ".parquet" extension
# (one of the names printed by the previous cell).
RESULT_NAME = "LocalSmolLM135m"

In [10]:
# Load the raw LM responses for the selected prompt / language model pair.
results = pandas.read_parquet(AVAILABLE_RESULTS + RESULT_NAME + ".parquet")

n_rows, n_columns = results.shape
print(f"Result DataFrame has {n_rows} rows and {n_columns} columns.")
display(results.head(3))

Result DataFrame has 100 rows and 3 columns.


Unnamed: 0,id,process_error,raw_result
0,0,,```python\n# Define the arithmetic expression\nar
1,1,,```python\n# Define the arithmetic expression\nar
2,2,,```python\ndef arithmetic_expression(arithmetic


# EXPLORATION

### _Errors in LM responses_

In [11]:
# A non-null `process_error` marks a response for which an error was recorded.
has_process_error = results["process_error"].notna()
nb_errors_in_responses = has_process_error.sum()
print(f"Response error count = {nb_errors_in_responses}")

Response error count = 0


In [12]:
# Peek at the first raw response.
results["raw_result"].iloc[0]

'```python\n# Define the arithmetic expression\nar'

### _Merging responses with original golden dataset (ground truth)_

In [13]:
# Join the ground truth with the responses that were processed without error.
# `validate="one_to_one"` makes the merge raise if `id` is duplicated on either
# side, instead of silently multiplying rows and biasing the metrics below.
retrieved_results = pandas.merge(
    expressions.rename(columns={"result": "actual_result"}),
    results.loc[results["process_error"].isna()],
    on="id",
    how="inner",
    validate="one_to_one",
)

print(retrieved_results.shape)
display(retrieved_results.head(n=3))

(100, 5)


Unnamed: 0,id,expression,actual_result,process_error,raw_result
0,0,((3 + 5 * (8 - 4)) / (6 + 2)) * (7 - 2),14.375,,```python\n# Define the arithmetic expression\nar
1,1,(((9 + 3) * (6 - 2)) / 4) + (15 / (5 + 5)),13.5,,```python\n# Define the arithmetic expression\nar
2,2,((8 * (3 + 5)) - (20 / (4 + 1))) * (2 + 3),300.0,,```python\ndef arithmetic_expression(arithmetic


### _Process raw responses_

In [14]:
def get_float_result_or_error(row: pandas.Series):
    """Parse one raw LM response and flag whether parsing failed.

    Args:
        row: A row exposing the raw LM response under the ``"raw_result"`` key.

    Returns:
        A dict with two keys:

        - ``"inferred_result"``: on success, the value returned by
          ``outputs.string_result_to_numeric``; on failure, the last value
          obtained before the error (the raw response, or the extracted string
          when only the numeric conversion failed).
        - ``"inference_error"``: ``False`` on success, ``True`` on failure.
    """
    result = row["raw_result"]
    try:
        result = dspy_templates.get_result_from_dspy_template(
            result=result, final_strip=True
        )
        result = outputs.string_result_to_numeric(result=result)
    except Exception:
        # Deliberately best-effort, but only for ordinary errors: a bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit and make
        # the cell impossible to interrupt.
        return {"inferred_result": result, "inference_error": True}
    return {"inferred_result": result, "inference_error": False}

In [15]:
# Parse every raw response; each returned dict is expanded into two columns.
inference_columns = ["inferred_result", "inference_error"]
retrieved_results[inference_columns] = retrieved_results.apply(
    get_float_result_or_error, axis=1, result_type="expand"
)

# Count parse failures vs. successes, forcing both categories to appear
# (a missing category is shown as 0).
format_error_counts = (
    retrieved_results["inference_error"]
    .rename("Error in output format")
    .value_counts(dropna=False, normalize=False)
)
format_error_summary = (
    format_error_counts.reindex([True, False])
    .fillna(0)
    .rename({True: "Yes", False: "No"})
    .astype(int)
    .to_frame()
    .transpose()
)

display(format_error_summary)

Error in output format,Yes,No
count,100,0


In [16]:
# Keep only the rows whose response could be parsed. Take an explicit copy so
# that later column assignments act on an independent frame rather than on a
# slice of `retrieved_results` (which triggers SettingWithCopyWarning).
correctly_inferred_results = retrieved_results.loc[
    ~retrieved_results["inference_error"]
].copy()

print(correctly_inferred_results.shape)

(0, 7)


In [17]:
# Cast both result columns to float in one non-mutating step. `astype` returns
# a new frame, so nothing is assigned onto a slice of `retrieved_results`.
correctly_inferred_results = correctly_inferred_results.astype(
    {"actual_result": float, "inferred_result": float}
)

In [18]:
# Preview the first rows of the successfully parsed results.
display(correctly_inferred_results.head(3))

Unnamed: 0,id,expression,actual_result,process_error,raw_result,inferred_result,inference_error


### _Metrics_

##### Global metrics from raw output

In [19]:
def accuracy(df: pandas.DataFrame):
    """Return the fraction of rows whose inferred result exactly equals the actual one.

    Args:
        df: Frame with ``"actual_result"`` and ``"inferred_result"`` columns.

    Returns:
        Number of exact matches divided by the number of rows (a fraction, not
        a percentage).
    """
    exact_matches = df["actual_result"].eq(df["inferred_result"])
    return exact_matches.sum() / len(df)


def mape(df: pandas.DataFrame):
    """Return the mean absolute percentage error of the inferred results.

    Args:
        df: Frame with ``"actual_result"`` and ``"inferred_result"`` columns.

    Returns:
        Mean of ``|actual - inferred| / |actual|`` over all rows, as a fraction
        (not multiplied by 100).
    """
    absolute_error = df["actual_result"].sub(df["inferred_result"]).abs()
    relative_error = absolute_error.div(df["actual_result"].abs())
    return relative_error.mean()

In [20]:
# Metrics are only meaningful when at least one response could be parsed.
if len(correctly_inferred_results) == 0:
    print("No correctly inferred results, thus no metrics can be computed.")
else:
    accuracy_pct = accuracy(df=correctly_inferred_results) * 100
    print(f"Accuracy = {accuracy_pct:.1f}%")
    mape_pct = mape(df=correctly_inferred_results) * 100
    print(f"MAPE = {mape_pct:.1f}%")

No correctly inferred results, thus no metrics can be computed.


##### Global metrics using `DSPy`

In [21]:
# Metrics computed with DSPy are stored as one JSON file per result, under
# <RESULT_PATH>/<PROMPT_NAME>/metrics/dspy/.
dspy_metrics_path = os.path.join(
    RESULT_PATH, PROMPT_NAME, "metrics", "dspy", RESULT_NAME + ".json"
)

# Explicit UTF-8 avoids depending on the platform's default locale encoding.
with open(dspy_metrics_path, "r", encoding="utf-8") as json_file:
    dspy_metrics = json.load(json_file)

print(len(dspy_metrics))

2


In [22]:
# Print each DSPy metric as "<name> = <value>%".
for metric_name, metric_value in dspy_metrics.items():
    print(f"{metric_name} = {metric_value}%")

binary_accuracy_metrics = 6.0%
mape_metrics = 63.74%
