# Event Extraction for WCCP
To run the event extraction for the WCCP dataset, you first need to generate the embeddings and store them at `resources/embeddings.csv`. See `etl/common/analyze_dataset.ipynb` for how to generate the embeddings.

In [None]:
import json
import os
import openai
import pandas as pd
from openai.types.chat import (
    ChatCompletionSystemMessageParam,
    ChatCompletionUserMessageParam,
)

from .helpers import select_dynamic_exmaples, prepare_arrival_condition, save_result_to_cache
from .build_prompt import build_prompt
from common.event_extraction.helpers import load_data_source
from etl.etltools.cache import JsonCache

# --- Static resources used by the extraction loop ---------------------------
# All relative paths below are resolved against the current working directory.

# Embeddings table read from CSV; it is passed to the example selection helper.
embeddings = pd.read_csv("./resources/embeddings.csv")

# Dynamic examples handed to the example selection helper.
# The encoding is set explicitly so decoding does not depend on the platform
# locale (JSON files are UTF-8 by convention).
with open("./resources/dynamic_examples.json", encoding="utf-8") as file:
    dynamic_examples = json.load(file)

# Event type definitions handed to the prompt builder.
with open("../../../common/event_extraction/event_types.json", "r", encoding="utf-8") as file:
    event_types = json.load(file)

# Cache that receives the parsed result of every processed row.
cache = JsonCache("./result_cache.json")

# Columns of the data source that are requested from the loader.
EVENT_EXTRACTION_RELEVANT_COLS = [
    "history-and-ownership",
    "depot-possessor",
    "depot-number",
    "condition-and-repair-record",
    "arrival-condition",
    "arrival-date",
    "exit-date",
]
DATA_SOURCE_FILE_PATH = os.path.join(
    "..",
    "..",
    "..",
    "..",
    "data",
    "wccp",
    "wiesbaden-ccp-property-cards-ocr-export-postprocessed-16-11-23.csv",
)
data_source = load_data_source(DATA_SOURCE_FILE_PATH, EVENT_EXTRACTION_RELEVANT_COLS)

# Executes the prompt for each row in the wccp data source:
#   1. select dynamic examples for the row's free text and build the system message,
#   2. send the row's relevant fields as a JSON user message to the chat model,
#   3. print the answer and token usage, then store the answer in the cache.
for _, row in data_source.iterrows():
    history_text = row["history-and-ownership"]
    # The text is bound to a local first: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError on Python versions before 3.12.
    print(f"Started to parse: {history_text[:50]}...")

    print("Preparing System Message...")
    examples = select_dynamic_exmaples(
        free_text_value_to_parse=history_text,
        embeddings_df=embeddings,
        dynamic_examples=dynamic_examples,
        number_of_examples=2,
    )

    sys_msg = build_prompt(event_types=event_types, examples=examples)

    print("Running prompt...")

    # Payload sent to the model. "arrival-condition" is derived from two columns
    # (arrival condition and condition-and-repair record) by the helper.
    current_value_to_parse = {
        "history-and-ownership": history_text,
        "depot-possessor": row["depot-possessor"],
        "depot-number": row["depot-number"],
        "arrival-condition": prepare_arrival_condition(
            row["arrival-condition"], row["condition-and-repair-record"]
        ),
        "arrival-date": row["arrival-date"],
        "exit-date": row["exit-date"],
    }

    # Run prompt
    user_param = ChatCompletionUserMessageParam(
        role="user", content=json.dumps(current_value_to_parse)
    )
    sys_param = ChatCompletionSystemMessageParam(role="system", content=sys_msg)
    # NOTE(review): confirm that "gpt-4-preview" is a model identifier the API
    # accepts for the account in use.
    response = openai.chat.completions.create(
        model="gpt-4-preview",
        temperature=0,
        messages=[sys_param, user_param],
    )

    # Print results
    parsed_result = response.choices[0].message.content
    print("Received results!")
    print(f"Result: {parsed_result}")
    print(f"Token Usage: {response.usage}")

    # Save result to cache. The helper returns a non-None value on failure;
    # the loop only reports it and carries on with the next row.
    print("Saving to cache..")
    error = save_result_to_cache(cache, current_value_to_parse, parsed_result)
    if error is not None:
        print("There has been an error parsing the result to json:")
        print(error)

    print("\n\n")