# File for Prompt Evaluation. 
#### Using Levenshtein Distance, BLEU/ROUGE and XML Validation for Scoring Prompt Candidates

In [26]:
#!pip3 install levenshtein
#!pip3 install nltk
#!pip3 install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (pyproject.toml): started
  Building wheel for rouge-score (pyproject.toml): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=25025 sha256=af642edaa5f42fd45cfca649627be1ec5fb956d8f524ac7f2f39948474331789
  Stored in directory: c:\users\maxim\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef54687

In [4]:
import os


if not os.environ.get("OPENAI_API_KEY"):
    print("OPEN AI API KEY IS MISSING")

# Initializing ChatGPT/Openai
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
    # base_url="...",
    # organization="...",
    # other params...
)

#### Reading Process Descs

In [None]:
file_process_desc_nebentaetigkeiten = open("assets/process_desc_nebentaetigkeiten.txt", encoding="utf8")
file_process_desc_debriefing = open("assets/process_desc_debriefing.txt", encoding="utf8")
file_process_desc_bedarfsermittlung = open("assets/process_desc_bedarfsermittlung.txt", encoding="utf8")
file_input_example_1 = open("assets/example_for_ai_1.drawio", encoding="utf8")
file_input_example_2 = open("assets/example_for_ai_2.drawio", encoding="utf8")

# save process descriptions
process_desc_nebentaetigkeiten = file_process_desc_nebentaetigkeiten.read()
process_desc_debriefing = file_process_desc_debriefing.read()
process_desc_bedarfsermittlung = file_process_desc_bedarfsermittlung.read()
example_input_file_1 = file_input_example_1
example_input_file_2 = file_input_example_2

# close files
file_process_desc_nebentaetigkeiten.close()
file_process_desc_debriefing.close()
file_process_desc_bedarfsermittlung.close()
file_input_example_1.close()
file_input_example_2.close()

#### Prompts for Evaluation

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt Extracting Roles
prompt_extracting_roles = ChatPromptTemplate.from_messages(
    [
        (
            'system',
            'Du bist ein Prozessmanager extrahierst Rollen aus einer Prozessbeschreibung heraus.',
        ),
        ("human", 'Extrahiere aus folgender Prozessbeschreibung alle Beteiligten Rollen und gib diese als Liste zurück. Prozessbeschreibung: "{prozessbeschreibung}"'),
    ]
)

# Prompt Extracting Activities
prompt_extracting_activities = ChatPromptTemplate.from_messages(
    [
        (
            'system',
            'Du bist ein Prozessmanager extrahierst Rollen aus einer Prozessbeschreibung heraus.',
        ),
        ("human", 'Extrahiere aus folgender Prozessbeschreibung alle Beteiligten Rollen und gib diese als Liste zurück. Prozessbeschreibung: "{prozessbeschreibung}"'),
    ]
)

# Prompt creating Model
prompt_creating_model = ChatPromptTemplate.from_messages(
    [
        (
            'system',
            'Du bist ein Prozessmanager extrahierst Rollen aus einer Prozessbeschreibung heraus.',
        ),
        ("human", 'Extrahiere aus folgender Prozessbeschreibung alle Beteiligten Rollen und gib diese als Liste zurück. Prozessbeschreibung: "{prozessbeschreibung}"'),
    ]
)


#### Initialising Test Cases

In [16]:
class TestCaseExtraction:
    def __init__(self, case_name, process_desc, ground_truth, prompt):
        self.case_name = case_name
        self.process_desc = process_desc
        self.ground_truth = ground_truth
        self.prompt = prompt

class TestCaseModelCreation:
    def __init__(self, case_name, process_desc, input_1, input_2, ground_truth, prompt):
        self.case_name = case_name
        self.input_1 = input_1
        self.input_2 = input_2
        self.process_desc = process_desc
        self.ground_truth = ground_truth
        self.prompt = prompt

# Test Cases for Role Extraction
test_case_roles_bedarfsermittlung = [""]
test_case_roles_debriefing = TestCaseExtraction("Debriefing_role_extraction", process_desc_debriefing, ["HR"])
test_case_roles_nebentaetigkeiten = TestCaseExtraction("Nebentaetigkeiten_role_extraction", process_desc_nebentaetigkeiten, ["Mitarbeiter", "HR"])

list_test_cases_role_extraction = [test_case_roles_debriefing, test_case_roles_nebentaetigkeiten]

# Test Cases for Activity Extraction
activities_bedarfsermittlung = [""]

activity_list_debriefing = [""]
test_case_activities_debriefing = TestCaseExtraction("Debriefing_activity_extraction", process_desc_debriefing, activity_list_debriefing)

activity_list_nebentaetigkeiten = [""]
test_case_activities_nebentaetigkeiten = TestCaseExtraction("Nebentaetigkeiten_activity_extraction", process_desc_nebentaetigkeiten, activity_list_nebentaetigkeiten)

list_test_cases_activity_extraction = [test_case_activities_debriefing, test_case_activities_nebentaetigkeiten]

# Test Cases for Model Creation


#### Bleu, Rouge and Meteor Test

In [36]:
import nltk
import nltk.translate.bleu_score

import nltk.translate.meteor_score
from rouge_score import rouge_scorer

# Bleu Score for 2-grams
nltk.translate.bleu_score.sentence_bleu([roles_nebentaetigkeiten], roles_extracted,(0.5,0.5))

# Rouge Score
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Converting List to String for Rouge
string1 = ' '.join(roles_nebentaetigkeiten)
string2 = ' '.join(roles_extracted)

# Rouge Scoring
scores = scorer.score(string1, string2)
for key in scores:
    print(f'{key}: {scores[key]}')
    
print(string1)
print(string2)

# Meteor Score
nltk.translate.meteor_score.meteor_score([roles_nebentaetigkeiten], roles_extracted)

rouge1: Score(precision=1.0, recall=1.0, fmeasure=1.0)
rouge2: Score(precision=1.0, recall=1.0, fmeasure=1.0)
rougeL: Score(precision=1.0, recall=1.0, fmeasure=1.0)
HR Mitarbeiter
HR Mitarbeiter


0.9375

## Evaluating Role Extraction Prompt

In [None]:
import Levenshtein as lev

roles_nebentaetigkeiten.sort()
roles_extracted.sort()

lev.distance(roles_nebentaetigkeiten, roles_extracted)

chain = prompt_extracting_roles | llm
response_role_extraction = chain.invoke({
    "prozessbeschreibung" :
})
