In [None]:
import pandas as pd

In [None]:
# Read the tab-separated intranet data file into a DataFrame.
intranet_df = pd.read_csv(
    "data/intranet_data.tsv",
    sep="\t",
    encoding="utf-8",
)

In [None]:
# Flag, for every message, which of the keywords below occur in it
# (case-insensitive substring match), then count the distinct hits per message.
keyword_df = intranet_df.copy()
substrings = ["Maske", "Beschränkungen", "Verbot", "Besuch", "Corona", "Covid", "Infektion", "Maßnahmen", "Pflicht", "Gebot", "Hygiene", "Abstand"]

# Lower-case the messages once instead of once per keyword.
messages_lower = keyword_df["message"].str.lower()
for substring in substrings:
    # regex=False: the keywords are literal text, not regular expressions.
    # na=False: a missing message counts as "no match", so the column stays boolean.
    keyword_df[substring] = messages_lower.str.contains(substring.lower(), regex=False, na=False)

# n_matches refers to the number of matched keywords per message
keyword_df["n_matches"] = keyword_df[substrings].sum(axis=1)

In [None]:
# Count the messages per number of matched keywords, ordered from the highest
# match count down. Accumulating in that order makes each row's cumulative_sum
# the number of messages with at least that many matched keywords.
keyword_count = (
    keyword_df.groupby("n_matches")
    .size()
    .reset_index(name="count")
    .sort_values("n_matches", ascending=False)
)
keyword_count["cumulative_sum"] = keyword_count["count"].cumsum()
keyword_count

In [None]:
import matplotlib.pyplot as plt

# Rows with zero matched keywords are left out of the curve.
messages_with_matches = keyword_count.loc[keyword_count.n_matches > 0]

plt.plot("n_matches", "cumulative_sum", data=messages_with_matches)
plt.xlabel("≥ n distinct keywords in a message")
plt.ylabel("Number of messages")
plt.title("Number of messages vs. Distinct keywords per message")

Setting the threshold of required keywords in a message to ≥3 seems reasonable.

In [None]:
# Number of messages containing each keyword (over all messages), most frequent first.
keyword_frequency = (
    pd.DataFrame(
        {
            "keyword": substrings,
            "frequency": keyword_df[substrings].sum(),
        }
    )
    .reset_index(drop=True)
    .sort_values(by="frequency", ascending=False)
)
keyword_frequency

In [None]:
# Same per-keyword count, restricted to messages with more than two matched keywords.
frequency_in_3_matches = keyword_df.loc[keyword_df.n_matches > 2, substrings].sum(axis=0)
keyword_frequency_3_matches = (
    pd.DataFrame({"keyword": substrings, "frequency": frequency_in_3_matches})
    .reset_index(drop=True)
    .sort_values(by="frequency", ascending=False)
)
keyword_frequency_3_matches

In [None]:
# Put both frequency tables side by side (one row per keyword) as floats.
keyword_factor = (
    keyword_frequency_3_matches
    .rename(columns={"frequency": "frequency_3_matches"})
    .merge(keyword_frequency, on="keyword")
    .astype({"frequency_3_matches": float, "frequency": float})
)

# Share of each keyword's occurrences that fall into messages with 3 or more
# keywords, relative to its occurrences in all messages (rounded to 2 decimals).
keyword_factor["ratio"] = (
    keyword_factor["frequency_3_matches"] / keyword_factor["frequency"]
).round(2)
keyword_factor

In [None]:
# Messages per match count with a running total taken from the highest match
# count downwards (this repeats the cumulative-count computation done earlier
# in the notebook).
keyword_count = (
    keyword_df.groupby("n_matches")
    .size()
    .reset_index(name="count")
    .sort_values("n_matches", ascending=False)
    .assign(cumulative_sum=lambda counts: counts["count"].cumsum())
)
keyword_count

In [None]:
import matplotlib.pyplot as plt

# Cumulative number of messages against the number of distinct keywords matched.
# Rows with zero matched keywords are left out of the curve.
plt.plot(
    "n_matches",
    "cumulative_sum",
    data=keyword_count.loc[keyword_count.n_matches > 0]
)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel("≥ n distinct keywords in a message")
plt.ylabel("Number of messages")
plt.title("Number of messages vs. Distinct keywords per message");

In [None]:
import spacy

# spaCy pipeline for German; its entity recognizer is used by remove_names below.
nlp = spacy.load(name="de_core_news_sm")

def remove_names(text):
    """Return `text` with the person names detected by the spaCy pipeline removed.

    The text is run through the module-level `nlp` pipeline. For every entity
    labelled "PER" or "PERSON", all occurrences of the entity's text are
    replaced with an empty string; surrounding whitespace is left untouched.
    """
    person_labels = {"PER", "PERSON"}
    # Collect the names first, then strip them from the text in detection order.
    detected_names = [ent.text for ent in nlp(text).ents if ent.label_ in person_labels]
    for name in detected_names:
        text = text.replace(name, '')
    return text

In [None]:
# Quick check of remove_names on a sample sentence containing two first names.
text = "Michael und Julia haben heute das Projekt abgeschlossen."
cleaned_text = remove_names(text)
print(cleaned_text)

In [None]:
# Keep only the messages with more than two matched keywords. reset_index()
# without drop keeps the original row labels in an "index" column.
has_enough_matches = keyword_df["n_matches"] > 2
keyword_df_3_or_more_matches = keyword_df.loc[has_enough_matches].reset_index()

# Add a copy of each message with detected person names removed.
keyword_df_3_or_more_matches["message_anonymized"] = (
    keyword_df_3_or_more_matches["message"].map(remove_names)
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
keyword_df_3_or_more_matches.head()

## OpenAI for measure classification
The following code classifies the content of each message as "tightened", "relaxed" or "unclear".

In [None]:
from openai import OpenAI
from os import getenv
import json

# The API key is read from the OPENAI_API_KEY environment variable rather than hard-coded.
client = OpenAI(
    api_key=getenv("OPENAI_API_KEY"),
)
# Helper function to send messages to OpenAI API (ChatGPT model)
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the content of one "user" message.
        model: Name of the chat model to query.

    Returns:
        The content of the first returned choice with every "```" sequence
        removed.
    """
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        messages=messages,
        # Forward the caller's choice; previously the model name was hard-coded
        # here, so the `model` argument had no effect.
        model=model,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content.replace('```', '')

In [None]:
# Few-shot prompt template for the measure classification.
# It is filled in with str.format: the single `{}` on the last line receives
# the message text, while the doubled braces `{{ }}` are escapes that render as
# literal braces in the JSON snippets.
prompt_few_shot = (
    # Task description and the three allowed labels.
    'I will give you a text, which stems from the intranet of a hospital and is most certainly related to antiinfection measures to prevent the spread of airway infections.\n'
    'Your task is to determine whether there is a tightening of the measures, a relaxation or none of both in the text. The valid options are: "tightened", "relaxed", "unclear"\n'
    # Requested answer format: a JSON object with the keys "measure_type" and "evidence".
    'Please return the answer as a JSON of the format {{"measure_type": <label>, "evidence" : <keyphrase> }} without any explanations.\n'
    ' '
    # One worked example: input text and expected output, each wrapped in triple backticks.
    'Here is an example:\n'
    'Input: ```… Die Geschäftsführung und Klinikumsleitung haben folgende Anpassungen der Corona-Verfahrensanweisungen für Mitarbeitenden und Patienten für das Klinikum EvB, Campus Potsdam vorgenommen. Damit fallen auch die letzten Testregelungen für die Patient*innen. Die wichtigsten Änderungen, die ab dem 1. Mai gelten, finden Sie hier im Überblick: • Testungen von Patient*innen entfallen ab dem 1.5.2023:- keine Routine-Testungen mehr bei der Aufnahme- keine Testung mehr bei Notfallpatienten - kein Verlaufsscreening mehr • Testungen erfolgen jedoch weiterhin bei Verdacht nach klinischer Maßgabe.``` Output: ```{{"measure_type": "relaxed", "evidence" : "Damit fallen auch die letzten Testregelungen für die Patient*innen." }}```\n'
    # Slot for the message to classify; the prompt ends right before the expected output.
    'Input: ```{}``` Output: ')

In [None]:
# Classify the first three anonymized messages and collect one record per message.
# The records are gathered in a list and turned into a DataFrame once, instead
# of concatenating onto a DataFrame inside the loop (which also left every row
# with index 0 and an all-NaN "classification" column next to "measure_type").
classification_records = []
for message in keyword_df_3_or_more_matches["message_anonymized"][:3]:
    completion = get_completion(prompt_few_shot.format(message))
    # Raises json.JSONDecodeError if the reply is not valid JSON.
    jsoned = json.loads(completion)
    classification_records.append(
        {
            "message": message,
            "measure_type": jsoned["measure_type"],
            "evidence": jsoned["evidence"],
        }
    )

# Passing `columns` keeps the schema fixed even when no message was classified.
classification_df = pd.DataFrame(
    classification_records,
    columns=["message", "measure_type", "evidence"],
)

In [None]:
classification_df