# File for Prompt Evaluation. 
#### Using Levenshtein Distance, BLEU/ROUGE and XML Validation for Scoring Prompt Candidates

In [3]:
#!pip3 install levenshtein
#!pip3 install nltk
#!pip3 install rouge-score
#!pip3 install langchain_openai

In [4]:
import os


if not os.environ.get("OPENAI_API_KEY"):
    print("OPEN AI API KEY IS MISSING")

# Initializing ChatGPT/Openai
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
    # base_url="...",
    # organization="...",
    # other params...
)

#### Reading Process Descs

In [5]:
file_process_desc_nebentaetigkeiten = open("assets/process_desc_nebentaetigkeiten.txt", encoding="utf8")
file_process_desc_debriefing = open("assets/process_desc_debriefing.txt", encoding="utf8")
file_process_desc_bedarfsermittlung = open("assets/process_desc_bedarfsermittlung.txt", encoding="utf8")
file_input_example_1 = open("assets/example_for_ai_1.drawio", encoding="utf8")
file_input_example_2 = open("assets/example_for_ai_2.drawio", encoding="utf8")

# save process descriptions
process_desc_nebentaetigkeiten = file_process_desc_nebentaetigkeiten.read()
process_desc_debriefing = file_process_desc_debriefing.read()
process_desc_bedarfsermittlung = file_process_desc_bedarfsermittlung.read()
example_input_file_1 = file_input_example_1
example_input_file_2 = file_input_example_2

# close files
file_process_desc_nebentaetigkeiten.close()
file_process_desc_debriefing.close()
file_process_desc_bedarfsermittlung.close()
file_input_example_1.close()
file_input_example_2.close()

#### Prompts for Evaluation

In [33]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt Extracting Roles
prompt_extracting_roles = ChatPromptTemplate.from_messages(
    [
        (
            'system',
            'Du bist ein Prozessmanager extrahierst Rollen aus einer Prozessbeschreibung heraus.',
        ),
        ("human", 'Extrahiere aus folgender Prozessbeschreibung alle Beteiligten Rollen und gib diese als nicht Liste in folgendem Format zurück. Format: ["Eintrag1", "Eintrag2", "Eintrage3"],  Prozessbeschreibung: "{prozessbeschreibung}"'),
    ]
)

# Prompt Extracting Activities
prompt_extracting_activities = ChatPromptTemplate.from_messages(
    [
        (
            'system',
            'Du bist ein Prozessmanager extrahierst Rollen aus einer Prozessbeschreibung heraus.',
        ),
        ("human", 'Extrahiere aus folgender Prozessbeschreibung alle Beteiligten Rollen und gib diese als Liste zurück. Prozessbeschreibung: "{prozessbeschreibung}"'),
    ]
)

# Prompt creating Model
prompt_creating_model = ChatPromptTemplate.from_messages(
    [
        (
            'system',
            'Du bist ein Prozessmanager extrahierst Rollen aus einer Prozessbeschreibung heraus.',
        ),
        ("human", 'Extrahiere aus folgender Prozessbeschreibung alle Beteiligten Rollen und gib diese als Liste zurück. Prozessbeschreibung: "{prozessbeschreibung}"'),
    ]
)


#### Initialising Test Cases

In [60]:
class TestCaseExtraction:
    def __init__(self, case_name, process_desc, ground_truth, prompt):
        self.case_name = case_name
        self.process_desc = process_desc
        self.ground_truth = ground_truth
        self.prompt = prompt

class TestCaseModelCreation:
    def __init__(self, case_name, process_desc, input_1, input_2, ground_truth, prompt):
        self.case_name = case_name
        self.input_1 = input_1
        self.input_2 = input_2
        self.process_desc = process_desc
        self.ground_truth = ground_truth
        self.prompt = prompt

# Test Cases for Role Extraction
test_case_roles_bedarfsermittlung = [""]
test_case_roles_debriefing = TestCaseExtraction("Debriefing_role_extraction", process_desc_debriefing, ["HR"], prompt_extracting_roles)
test_case_roles_nebentaetigkeiten = TestCaseExtraction("Nebentaetigkeiten_role_extraction", process_desc_nebentaetigkeiten, ["Mitarbeiter", "HR"], prompt_extracting_roles)

list_test_cases_role_extraction = [test_case_roles_debriefing, test_case_roles_nebentaetigkeiten]

# Test Cases for Activity Extraction
activities_bedarfsermittlung = [""]

activity_list_debriefing = [""]
test_case_activities_debriefing = TestCaseExtraction("Debriefing_activity_extraction", process_desc_debriefing, activity_list_debriefing, prompt_extracting_roles)

activity_list_nebentaetigkeiten = [""]
test_case_activities_nebentaetigkeiten = TestCaseExtraction("Nebentaetigkeiten_activity_extraction", process_desc_nebentaetigkeiten, activity_list_nebentaetigkeiten, prompt_extracting_roles)

list_test_cases_activity_extraction = [test_case_activities_debriefing, test_case_activities_nebentaetigkeiten]

# Test Cases for Model Creation


#### Bleu, Rouge and Meteor Test

In [8]:
import nltk
import nltk.translate.bleu_score

import nltk.translate.meteor_score
from rouge_score import rouge_scorer

# Bleu Score for 2-grams
nltk.translate.bleu_score.sentence_bleu([roles_nebentaetigkeiten], roles_extracted,(1,0,0,0))

# Rouge Score
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Converting List to String for Rouge
string1 = ' '.join(roles_nebentaetigkeiten)
string2 = ' '.join(roles_extracted)

# Rouge Scoring
scores = scorer.score(string1, string2)
for key in scores:
    print(f'{key}: {scores[key]}')
    
print(string1)
print(string2)

# Meteor Score
nltk.translate.meteor_score.meteor_score([roles_nebentaetigkeiten], roles_extracted)

NameError: name 'roles_nebentaetigkeiten' is not defined

## Evaluating Role Extraction Prompt

In [None]:
import Levenshtein as lev
import ast
import pandas as pd

#roles_nebentaetigkeiten = []
#roles_nebentaetigkeiten.sort()
#roles_extracted.sort()

#lev.distance(roles_nebentaetigkeiten, roles_extracted)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)

df = pd.DataFrame(columns=["Case Name", "Bleu Score", "Rouge 1 Score", "Rouge L Score", "Meteor Score", "Prompt"])

# init rouge scorer with only 1 grams and RougeL
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

for case in list_test_cases_role_extraction:

    chain = case.prompt | llm
    response_role_extraction = chain.invoke({
        "prozessbeschreibung" : case.process_desc
    })
    
    # Convertion to string list
    extracted_roles = ast.literal_eval(response_role_extraction.content)
    #print(extracted_roles)
    
    extracted_roles.sort()
    case.ground_truth.sort()
    
    # Calc Bleu Score with only 1 grams for roles
    bleu_score = nltk.translate.bleu_score.sentence_bleu([case.ground_truth], extracted_roles, (1, 0, 0, 0))
    
    # Calc Rouge Score
    scores = scorer.score(str(case.ground_truth), str(extracted_roles))
    rouge1_score = scores["rouge1"].fmeasure
    rougeL_score = scores["rougeL"].fmeasure
    
    

    df.loc[len(df)] =(case.case_name, bleu_score, rouge1_score, rougeL_score, 3, case.prompt)
    
    
    
display(df)


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

Unnamed: 0,Case Name,Bleu Score,Rouge 1 Score,Rouge L Score,Meteor Score,Prompt
0,Debriefing_role_extraction,0.25,0.333333,0.333333,3,"((name, None), (input_variables, [prozessbesch..."
1,Nebentaetigkeiten_role_extraction,0.666667,0.571429,0.571429,3,"((name, None), (input_variables, [prozessbesch..."


In [81]:
print(df)

                           Case Name  Bleu Score
0         Debriefing_role_extraction    0.250000
1  Nebentaetigkeiten_role_extraction    0.666667


In [73]:
print(nltk.translate.bleu_score.sentence_bleu([['HR']], ["HR", "Mitarbeiter/in", "Abteilungen", "GF"], (1, 0, 0, 0)))

0.25


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
