## Set up environment

In [19]:
from dotenv import load_dotenv

# Load settings from a .env file into the process environment before any project code runs.
# override=True: per python-dotenv, values from .env replace variables that are already set
# in the environment (TODO confirm this is the intended precedence for this project).
# The return value is bound to `_` so the cell does not echo it as output.
_ = load_dotenv(override=True)

In [20]:
from common.context import TagPredictionContext

# Identifiers used to label this baseline run and its saved outputs.
EXPERIMENT_ID = "2024-10-16-baseline-data-ddrs"
RUN_ID = "on-eval-dataset-v5"

# Tags evaluated in this experiment. Listed alphabetically for readability;
# they are still passed through sorted() when the context is built.
_TAG_NAMES = (
    "boulders",
    "dheqfailure",
    "dircontrol",
    "harddrilling",
    "highrop",
    "holecleaning",
    "lostcirculation",
    "lowrop",
    "packoff",
    "shallowgas",
    "shallowwater",
    "stuckpipe",
    "surfeqfailure",
    "tighthole",
    "wait",
    "wellborebreathing",
    "wellborestability",
    "wellcontrol",
)

# Shared run configuration; later cells read tags_in_scope and with_notags from it.
CONTEXT = TagPredictionContext(
    experiment_id=EXPERIMENT_ID,
    run_id=RUN_ID,
    description="Evaluate performance of the existing solution to set a baseline",
    tags_in_scope=sorted(_TAG_NAMES),
    with_notags=True,
)

## Fetch datasets

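The file "reviewed_distributed_ddr_v3.csv" was added with further corrections made with input from Informatiq. This run loads "reviewed_distributed_ddr_v5.csv"; from version 5 onwards the reviewed-tags column is named `reviewedTags` rather than `Reviewed tags`.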

In [21]:
import pandas as pd
import re
from common.datasets import load_input_dataset

dataset_name = "reviewed_distributed_ddr_v5.csv"

# The ground-truth column was renamed in dataset version 5: files whose name
# carries a version of 5 or higher use "reviewedTags"; older files, and names
# with no recognisable version, fall back to "Reviewed tags".
version_match = re.search(r"v(\d+)", dataset_name)
reviewed_tags_column = (
    "reviewedTags"
    if version_match and int(version_match.group(1)) >= 5
    else "Reviewed tags"
)

# Load the dataset, asking the loader to convert both the predicted ("tags")
# and the reviewed tag columns to sets.
dataset_df = load_input_dataset(
    dataset_name,
    columns_to_convert_to_sets=["tags", reviewed_tags_column],
)

# Record which dataset this run used on the shared context.
CONTEXT.used_datasets = [dataset_name]

## Apply the model

In [22]:
# No model is run in this experiment: the regex-based DDR tagging has already
# been applied to the dataset, and its predictions are in the "tags" column.
# This cell only keeps the relevant columns and expands the predicted and
# reviewed tag sets into per-tag expected__/actual__ columns.
from common.assessment import expand_tags

assessed_df = expand_tags(
    dataset_df.filter(items=["id", "Text", "tags", reviewed_tags_column, "Comments"]),
    tags_in_scope=CONTEXT.tags_in_scope,
    ground_truth_tags_column=reviewed_tags_column,
    predicted_tags_column="tags",
)
assessed_df

Unnamed: 0,id,expected__boulders,actual__boulders,expected__dheqfailure,actual__dheqfailure,expected__dircontrol,actual__dircontrol,expected__harddrilling,actual__harddrilling,expected__highrop,...,expected__tighthole,actual__tighthole,expected__wait,actual__wait,expected__wellborebreathing,actual__wellborebreathing,expected__wellborestability,actual__wellborestability,expected__wellcontrol,actual__wellcontrol
0,a1f86f80-135e-458b-aafc-3af30d2476f2_main_1e78...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,a1f86f80-135e-458b-aafc-3af30d2476f2_main_0c00...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,a1f86f80-135e-458b-aafc-3af30d2476f2_main_0f2c...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,a1f86f80-135e-458b-aafc-3af30d2476f2_main_ff18...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,a1f86f80-135e-458b-aafc-3af30d2476f2_main_821d...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,a1f86f80-135e-458b-aafc-3af30d2476f2_main_aa5e...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
296,a1f86f80-135e-458b-aafc-3af30d2476f2_main_5d39...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
297,a1f86f80-135e-458b-aafc-3af30d2476f2_main_c0a8...,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,False,False,False,False,False
298,a1f86f80-135e-458b-aafc-3af30d2476f2_main_03d2...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
from common.datasets import save_assessed_dataset


# Persist the expanded per-tag frame for this experiment/run; the call's
# return value is displayed as the cell output.
save_assessed_dataset(
    assessed_df,
    experiment_id=EXPERIMENT_ID,
    run_id=RUN_ID,
    context=CONTEXT,
)

2_assessed_datasets\2024-10-16-baseline-data-ddrs-on-eval-dataset-v5


## Evaluate predicted tags

In [24]:
from common.evaluation import TagMatchingEvaluator


# Build the evaluator from the assessed frame, reusing the tag scope and the
# notags setting stored on the shared context so they match the earlier cells.
evaluator = TagMatchingEvaluator(
    tags_in_scope=CONTEXT.tags_in_scope,
    with_notags=CONTEXT.with_notags,
    assessed_df=assessed_df,
)

In [25]:
# Per-DDR evaluation: the displayed frame carries the expected__/actual__ tag columns
# (including the notags pair) plus precision, recall, f1 and true_positives for each row.
evaluator.eval_individual_ddrs()

Unnamed: 0,id,expected__boulders,actual__boulders,expected__dheqfailure,actual__dheqfailure,expected__dircontrol,actual__dircontrol,expected__harddrilling,actual__harddrilling,expected__highrop,...,expected__wellborestability,actual__wellborestability,expected__wellcontrol,actual__wellcontrol,expected__notags,actual__notags,precision,recall,f1,true_positives
0,a1f86f80-135e-458b-aafc-3af30d2476f2_main_1e78...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,True,1.0,1.0,1.0,1
1,a1f86f80-135e-458b-aafc-3af30d2476f2_main_0c00...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,True,1.0,1.0,1.0,1
2,a1f86f80-135e-458b-aafc-3af30d2476f2_main_0f2c...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,True,1.0,1.0,1.0,1
3,a1f86f80-135e-458b-aafc-3af30d2476f2_main_ff18...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,1.0,1.0,1.0,1
4,a1f86f80-135e-458b-aafc-3af30d2476f2_main_821d...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,True,1.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,a1f86f80-135e-458b-aafc-3af30d2476f2_main_aa5e...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,True,1.0,1.0,1.0,1
296,a1f86f80-135e-458b-aafc-3af30d2476f2_main_5d39...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,1.0,1.0,1.0,1
297,a1f86f80-135e-458b-aafc-3af30d2476f2_main_c0a8...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,1.0,1.0,1.0,1
298,a1f86f80-135e-458b-aafc-3af30d2476f2_main_03d2...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,True,1.0,1.0,1.0,1


In [26]:
# Per-tag evaluation: one row per tag with precision, recall, f1, true_positives and the
# counts of positives/negatives in the ground truth.
# NOTE(review): in the saved output, tags with no ground-truth positives show recall 1.0
# and precision of either 0.0 or 1.0 — confirm the evaluator's zero-division convention,
# since these rows presumably feed the "Average per Tag" figures.
evaluator.eval_per_tag()

Unnamed: 0,tag,precision,recall,f1,true_positives,positives_in_ground_truth,negatives_in_ground_truth
0,boulders,0.0,1.0,0.0,0,0,300
1,dheqfailure,0.0,0.0,0.0,0,4,296
2,dircontrol,0.0,1.0,0.0,0,0,300
3,harddrilling,0.5,1.0,0.666667,2,2,298
4,highrop,0.0,1.0,0.0,0,0,300
5,holecleaning,1.0,0.0,0.0,0,2,298
6,lostcirculation,0.777778,0.933333,0.848485,14,15,285
7,lowrop,0.727273,0.8,0.761905,8,10,290
8,packoff,0.8,1.0,0.888889,4,4,296
9,shallowgas,1.0,1.0,1.0,0,0,300


In [27]:
# Summary table with two rows, "Average per DDR" and "Average per Tag", each reporting
# precision, recall and f1.
evaluator.average_metrics()

Unnamed: 0,Type,precision,recall,f1
0,Average per DDR,0.824444,0.818333,0.818667
1,Average per Tag,0.641093,0.819587,0.609585


## Save evaluation report

In [28]:
from common.datasets import save_evaluation_report

# Write the evaluation report for this run, handing over the raw dataset,
# the assessed frame, the evaluator and the shared context.
save_evaluation_report(
    context=CONTEXT,
    experiment_id=EXPERIMENT_ID,
    run_id=RUN_ID,
    dataset_df=dataset_df,
    assessed_df=assessed_df,
    evaluator=evaluator,
)