In [1]:
import os
from datetime import datetime
import json
from collections import Counter
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Informativeness

## Preparation for informativeness calculation

In [None]:
# Count word stems in a corpus text file
def count_words(file_path):
    """Count how often each German word stem occurs in a text file.

    The file is read as ISO-8859-1, lower-cased and tokenised with NLTK.
    German stopwords, non-alphabetic tokens and the literal token "b" are
    dropped; the remaining tokens are reduced to Snowball stems.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        A ``Counter`` mapping stem -> number of occurrences. An empty
        ``Counter`` is returned (after printing a message) when the file is
        missing or any other error occurs, so the return type is always the
        same.
    """
    try:
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            text = file.read()

        # Tokenize the text into words (lowercase for uniformity)
        tokens = word_tokenize(text.lower())

        # Remove stopwords and keep only alphabetic words.
        # NOTE(review): "b" is excluded explicitly; presumably markup residue
        # in the corpus - confirm against the corpus format.
        stop_words = set(stopwords.words('german'))
        token_counts = Counter(
            word for word in tokens
            if word not in stop_words and word.isalpha() and word != "b"
        )

        # Stem each distinct token once and add up the counts per stem.
        # This yields the same totals as stemming every occurrence, but calls
        # the stemmer once per distinct token instead of once per occurrence.
        stemmer = SnowballStemmer("german")
        word_counts = Counter()
        for token, occurrences in token_counts.items():
            word_counts[stemmer.stem(token)] += occurrences

        return word_counts
    except FileNotFoundError:
        print(f"Datei nicht gefunden: {file_path}")
        return Counter()
    except Exception as e:
        print(f"Ein Fehler ist aufgetreten: {e}")
        return Counter()

# Persist an object to disk as a pickle file
def save_as_pickle(data, output_file):
    """Pickle ``data`` into ``output_file`` on a best-effort basis.

    A confirmation line is printed on success. Any exception raised while
    opening or writing the file is caught and reported with ``print`` rather
    than propagated, so the function always returns ``None``.

    Args:
        data: Object to serialise.
        output_file: Destination path; opened in binary write mode.
    """
    try:
        with open(output_file, mode='wb') as sink:
            pickle.dump(data, sink)
        print(f"Dictionary gespeichert als: {output_file}")
    except Exception as error:
        print(f"Fehler beim Speichern der Datei: {error}")

# Path of the raw corpus text and of the cached stem-frequency pickle
file_path = 'DeReKo_text.txt'
pickle_file_path = 'word_counts.pkl'

if os.path.exists(pickle_file_path):
    # The cache is already there; counting the corpus again is skipped.
    print(f"Die Datei {pickle_file_path} existiert bereits.")
else:
    # Count the word stems of the corpus
    word_counts = count_words(file_path)

    if word_counts:
        # Store the counts so later runs can skip the counting step
        save_as_pickle(word_counts, pickle_file_path)

        # Show only the most frequent stems instead of dumping the whole
        # vocabulary into the cell output.
        print(Counter(word_counts).most_common(20))
    else:
        # count_words returns an empty mapping when it fails. Pickling that
        # would create a cache file that every later run accepts as
        # "already exists", so nothing is written in this case.
        print(f"Keine Wortstämme gezählt; {pickle_file_path} wurde nicht geschrieben.")


Die Datei word_counts.pkl existiert bereits.


## Informativeness function

In [None]:
import math
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def load_word_frequencies(path=None):
    """Load the pickled stem-frequency table.

    Args:
        path: Pickle file to read. When omitted, the notebook-level
            ``pickle_file_path`` is used.

    Returns:
        The unpickled object. If the file cannot be opened or unpickled, the
        error is printed and an empty dict is returned (previously the
        function failed with ``UnboundLocalError`` in that case, hiding the
        real cause).
    """
    if path is None:
        path = pickle_file_path

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # pickle files that were produced locally by this notebook.
    try:
        with open(path, 'rb') as file:
            word_frequencies = pickle.load(file)
    except Exception as e:
        print(f"An error occurred while loading the pickle file: {e}")
        return {}

    print("Pickle file loaded successfully.")
    return word_frequencies

# Turn raw input text into a list of German word stems
def preprocess_text(text):
    """Tokenise, filter and stem ``text``.

    The text is lower-cased and tokenised with NLTK. Tokens that are German
    stopwords or contain non-alphabetic characters are discarded, and every
    remaining token is reduced to its Snowball stem.

    Args:
        text: The raw input string.

    Returns:
        List of stems in the order the kept tokens appear in ``text``.
    """
    lowered_tokens = word_tokenize(text.lower())
    german_stopwords = set(stopwords.words('german'))
    german_stemmer = SnowballStemmer("german")

    return [
        german_stemmer.stem(token)
        for token in lowered_tokens
        if token.isalpha() and token not in german_stopwords
    ]

# Calculate the surprisal of a word based on its frequency
def calculate_surprisal(word, word_frequencies, total_words=None):
    """Return the surprisal ``-ln(p)`` of ``word``.

    ``p`` is the count of ``word`` divided by the sum of all counts in
    ``word_frequencies``. Words that are missing (or stored with a count of
    0) are treated as if they had been seen once, which avoids ``log(0)``.

    Args:
        word: The (already stemmed) word to score.
        word_frequencies: Mapping of word -> occurrence count.
        total_words: Optional precomputed ``sum(word_frequencies.values())``.
            Pass it when scoring many words against the same table so the
            table is not summed again on every call.

    Returns:
        The surprisal as a float (natural logarithm).

    Raises:
        ValueError: If the total count is not positive, e.g. because
            ``word_frequencies`` is empty, so no probability can be formed.
    """
    if total_words is None:
        total_words = sum(word_frequencies.values())
    if total_words <= 0:
        raise ValueError(
            "word_frequencies has no counts; cannot estimate word probabilities"
        )

    # Default to 1 for unseen words (and zero counts) to avoid log(0)
    word_freq = word_frequencies.get(word) or 1
    probability = word_freq / total_words
    return -math.log(probability)

# Score how informative a piece of user input is
def calculate_informativeness(user_input, word_frequencies):
    """Sum the min-max normalised surprisals of the stems in ``user_input``.

    The input is preprocessed into stems and each stem is scored with
    ``calculate_surprisal``. The scores are rescaled to the range 0..1 using
    the smallest and largest score of this input, then added up. When all
    scores are identical, each one counts as 1.

    Args:
        user_input: Raw text to score.
        word_frequencies: Mapping of stem -> occurrence count.

    Returns:
        0 if no stem survives preprocessing, otherwise the sum of the
        normalised surprisals.
    """
    stems = preprocess_text(user_input)
    if not stems:
        return 0

    scores = [calculate_surprisal(stem, word_frequencies) for stem in stems]
    lowest = min(scores)
    highest = max(scores)

    # No spread to normalise by: every stem contributes exactly 1.
    if highest == lowest:
        return len(scores)

    return sum((score - lowest) / (highest - lowest) for score in scores)

# Example usage: score one sample sentence against the cached frequencies.
# NOTE(review): word_frequencies is bound at notebook level here; keep the
# name stable in case other cells read it.
word_frequencies = load_word_frequencies()  
user_input = "Ich bin Manuel Neuer und liebe eckbälle und phillip lahm und wohne in Mannheim"

# Sum of the normalised surprisals of the stems found in user_input
informativeness_score = calculate_informativeness(user_input, word_frequencies)

print(f"Informativeness of user input: {informativeness_score}")


Pickle file loaded successfully.
Informativeness of user input: 5.219114520662927


# Word counting

In [6]:
import re

def _chatbot_answers(text):
    """Yield the non-empty user turns of a chat log split on HH:MM:SS stamps."""
    for absatz in re.split(r'\d{2}:\d{2}:\d{2} ', text):
        if absatz.startswith("user:") and absatz[5:] != "":
            yield absatz[5:]


def _human_answers(text):
    """Yield the non-empty answer text of every "Proband" paragraph."""
    for absatz in text.split("\n\n"):
        # Strip leading newlines BEFORE dropping the first line; otherwise a
        # paragraph that starts with a newline keeps its speaker line and the
        # label would be counted as part of the answer.
        stripped = absatz.lstrip("\n")
        if stripped.startswith("Proband"):
            answer_text = "\n".join(stripped.split("\n")[1:])
            if answer_text != "":
                yield answer_text


def calculate_metrics(dir:str, type:str, word_freqeuncies):
    """Aggregate word count, answer count and informativeness of interviews.

    Walks ``dir`` recursively and evaluates every matching file:

    * ``type == "chatbot"``: files ending in ``Chat.txt`` (ISO-8859-1); an
      answer is the text after ``user:`` in a timestamp-delimited segment.
    * ``type == "human"``: files ending in ``Transkript.txt`` (UTF-8); an
      answer is the text below the first line of a paragraph that starts
      with ``Proband``.

    Args:
        dir: Directory to search.
        type: ``"chatbot"`` or ``"human"``.
        word_freqeuncies: Mapping of stem -> count passed on to
            ``calculate_informativeness``. The misspelled parameter name is
            kept so existing callers are unaffected.

    Returns:
        Tuple ``(word_count, answer_count, informativeness)`` summed over all
        answers found.

    Raises:
        ValueError: If ``type`` is neither ``"human"`` nor ``"chatbot"``.
    """
    if type == "chatbot":
        suffix, encoding, extract_answers = "Chat.txt", 'ISO-8859-1', _chatbot_answers
    elif type == "human":
        suffix, encoding, extract_answers = "Transkript.txt", 'utf-8', _human_answers
    else:
        raise ValueError(f"type has to be human or chatbot not: {type}")

    word_count = 0
    answer_count = 0
    informativeness = 0
    for root, _subdirs, files in os.walk(dir):
        for file_name in files:
            if not file_name.endswith(suffix):
                continue
            # Join with root (the directory os.walk is currently in), not
            # with dir, so that files in subdirectories can be opened.
            with open(os.path.join(root, file_name), 'r', encoding=encoding) as handle:
                text = handle.read()
            for answer in extract_answers(text):
                answer_count += 1
                word_count += len(answer.split())
                # Use the table passed in by the caller instead of relying on
                # a notebook-level global of a similar name.
                informativeness += calculate_informativeness(answer, word_freqeuncies)

    return (word_count, answer_count, informativeness)
        


# Metrics

In [None]:
def calculate_metrics_for_all_dirs(base_directory:str,):
    """Write one metrics JSON per interview type into every subdirectory.

    Every directory found below ``base_directory`` (at any depth) is
    evaluated with ``calculate_metrics`` for the types ``"human"`` and
    ``"chatbot"``. The computed values are stored together with placeholder
    entries for the manually assessed metrics in
    ``metrics_<type>_<timestamp>.json`` inside that directory.

    Args:
        base_directory: Root directory whose subdirectories are processed.

    Returns:
        A list that is created empty and never filled.
    """
    extracted_surveys = []
    current_date = datetime.now().strftime('%Y-%m-%d-%H-%M')
    word_frequencies = load_word_frequencies()

    insight_topics = ("Nutzer", "Ziele", "Aufgaben", "Umgebung", "Ressourcen")
    manual_metric_keys = (
        "klarstellungsanfragen",
        "leading_biased_fragen",
        "geschlossene_fragen",
        "interview_laenge",
    )

    for parent, subdirs, _files in os.walk(base_directory):
        for subdir in subdirs:
            for type in ("human", "chatbot"):
                word_count_total, answer_count, informativeness = calculate_metrics(
                    os.path.join(parent, subdir), type, word_frequencies
                )

                # Computed values plus placeholders (-1 / "Aspekt n") that are
                # meant to be filled in by manual evaluation.
                metrics = {
                    "Erkenntnisse": {
                        topic: ["Aspekt 1", "Aspekt 2"] for topic in insight_topics
                    },
                    "Woerter_gesamt": word_count_total,
                    "informativität": informativeness,
                    "fragen_gesamt": answer_count,
                    **dict.fromkeys(manual_metric_keys, -1),
                }

                json_file_name = f"metrics_{type}_{current_date}.json"
                json_file_path = os.path.join(parent, subdir, json_file_name)
                with open(json_file_path, 'w') as sink:
                    json.dump(metrics, sink, indent=4)

    return extracted_surveys

# Build the relative path with os.path.join: a plain "..\..\name" literal
# contains invalid escape sequences and only resolves on Windows.
base_directory = os.path.join("..", "..", "Interviewdaten")
extracted_human_surveys = calculate_metrics_for_all_dirs(base_directory)

Pickle file loaded successfully.
Versuch_17
