In [28]:
import os
import re
import numpy as np
import pandas as pd
from math import log

In [29]:
df = pd.read_csv('../PROCESS-V1/dem-info.csv')
df.head()

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE
0,Process-rec-001,train,MCI,male,62,25.0
1,Process-rec-002,dev,MCI,male,61,25.0
2,Process-rec-003,train,MCI,female,62,29.0
3,Process-rec-004,dev,MCI,female,67,29.0
4,Process-rec-005,train,MCI,male,65,27.0


In [30]:
# fix 66* string value in Age column
# df['Age'].value_counts()

df['Class'].value_counts()
def load_process(path):
    df = pd.read_csv(f'{path}/dem-info.csv')
    # fix 66* string value in Age column
    df['Age'] = df['Age'].apply(lambda x: x.replace('66*', '66'))
    df['Age'] = df['Age'].astype(int)
    # get_file_names
    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.txt'

    # fix 66* string value in Age column
    df_train = df.loc[df['TrainOrDev'] == 'train']
    df_dev = df.loc[df['TrainOrDev'] == 'dev']
    return df_train, df_dev

In [31]:
def calculate_tf_idf(texts):
    """
    Berechnet die Term Frequency (TF) und Inverse Document Frequency (IDF) für eine Sammlung von Texten.

    Parameter:
        texts (list of str): Eine Liste von Texten, in denen die Häufigkeit der Wörter analysiert wird.

    Rückgabe:
        tf (list of dict): Liste von Dictionaries, die die TF-Werte pro Dokument enthalten.
        idf (dict): Dictionary, das die IDF-Werte für alle Wörter enthält.
    """
    # Tokenisierung der Texte
    tokenized_texts = [re.findall(r'\b\w+\b', text.lower()) for text in texts]

    # Berechnung der TF
    tf = []
    for tokens in tokenized_texts:
        word_counts = {}
        total_words = len(tokens)
        for word in tokens:
            word_counts[word] = word_counts.get(word, 0) + 1
        tf.append({word: count / total_words for word, count in word_counts.items()})

    # Berechnung der IDF
    doc_count = len(tokenized_texts)
    word_doc_counts = {}
    for tokens in tokenized_texts:
        unique_words = set(tokens)
        for word in unique_words:
            word_doc_counts[word] = word_doc_counts.get(word, 0) + 1

    idf = {word: log(doc_count / (1 + count)) for word, count in word_doc_counts.items()}

    return tf, idf

In [32]:
def process_text_files(df, text_column):
    """
    Liest Textdateien ein, berechnet TF und IDF und fügt die Ergebnisse dem DataFrame hinzu.

    Parameter:
        df (pd.DataFrame): DataFrame mit Informationen zu Textdateien.
        text_column (str): Spaltenname, der die Textdateipfade enthält.

    Rückgabe:
        pd.DataFrame: Aktualisierter DataFrame mit TF und IDF-Informationen.
    """
    # Texte aus den Dateien lesen
    texts = []
    for path in df[text_column]:
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
        else:
            texts.append("")

    # Berechnung von TF und IDF
    tf, idf = calculate_tf_idf(texts)

    # Ergebnisse in den DataFrame einfügen
    df['TF'] = tf
    df['IDF'] = [idf] * len(df)

    return df

In [33]:
# CTD_txt
df_train, df_dev = load_process("../PROCESS-V1/")
result_df = process_text_files(df_train, 'CTD_txt')
result_df

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE,CTD_wav,CTD_txt,PFT_wav,PFT_txt,SFT_wav,SFT_txt,TF,IDF
0,Process-rec-001,train,MCI,male,62,25.0,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,"{'pat': 0.0091324200913242, 'but': 0.004566210...","{'looks': 0.5877866649021191, 'about': 0.71912..."
2,Process-rec-003,train,MCI,female,62,29.0,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,"{'pat': 0.006493506493506494, '3': 0.006493506...","{'looks': 0.5877866649021191, 'about': 0.71912..."
4,Process-rec-005,train,MCI,male,65,27.0,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,"{'pat': 0.020833333333333332, '50s': 0.0208333...","{'looks': 0.5877866649021191, 'about': 0.71912..."
5,Process-rec-006,train,Dementia,male,83,26.0,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,"{'pat': 0.0072992700729927005, '1': 0.02189781...","{'looks': 0.5877866649021191, 'about': 0.71912..."
8,Process-rec-009,train,HC,female,68,,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,"{'pat': 0.0053475935828877, '2': 0.00534759358...","{'looks': 0.5877866649021191, 'about': 0.71912..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Process-rec-153,train,HC,male,63,28.0,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,"{'pat': 0.008333333333333333, 'the': 0.075, 's...","{'looks': 0.5877866649021191, 'about': 0.71912..."
153,Process-rec-154,train,HC,female,79,30.0,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,"{'pat': 0.004149377593360996, 'i': 0.029045643...","{'looks': 0.5877866649021191, 'about': 0.71912..."
154,Process-rec-155,train,HC,male,86,29.0,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,"{'pat': 0.010869565217391304, 'little': 0.0108...","{'looks': 0.5877866649021191, 'about': 0.71912..."
155,Process-rec-156,train,Dementia,male,61,26.0,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,"{'pat': 0.015873015873015872, '3': 0.015873015...","{'looks': 0.5877866649021191, 'about': 0.71912..."


In [34]:
result_df.to_csv("TF-IDF_CTD.csv", index = False)

In [35]:
# PFT_txt
df_train, df_dev = load_process("../PROCESS-V1/")
result_df = process_text_files(df_train, 'PFT_txt')
result_df

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE,CTD_wav,CTD_txt,PFT_wav,PFT_txt,SFT_wav,SFT_txt,TF,IDF
0,Process-rec-001,train,MCI,male,62,25.0,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,"{'pat': 0.020833333333333332, 'people': 0.0208...","{'pr': 3.6635616461296463, 'porter': 2.9704144..."
2,Process-rec-003,train,MCI,female,62,29.0,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,"{'pat': 0.01694915254237288, 'um': 0.067796610...","{'pr': 3.6635616461296463, 'porter': 2.9704144..."
4,Process-rec-005,train,MCI,male,65,27.0,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,"{'pat': 0.017543859649122806, 'it': 0.01754385...","{'pr': 3.6635616461296463, 'porter': 2.9704144..."
5,Process-rec-006,train,Dementia,male,83,26.0,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,"{'pat': 0.02564102564102564, 'paper': 0.025641...","{'pr': 3.6635616461296463, 'porter': 2.9704144..."
8,Process-rec-009,train,HC,female,68,,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,"{'pat': 0.0196078431372549, '1': 0.13725490196...","{'pr': 3.6635616461296463, 'porter': 2.9704144..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Process-rec-153,train,HC,male,63,28.0,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,"{'pat': 0.009523809523809525, 'um': 0.05714285...","{'pr': 3.6635616461296463, 'porter': 2.9704144..."
153,Process-rec-154,train,HC,female,79,30.0,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,"{'pat': 0.03278688524590164, 'buzzer': 0.03278...","{'pr': 3.6635616461296463, 'porter': 2.9704144..."
154,Process-rec-155,train,HC,male,86,29.0,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,"{'pat': 0.044444444444444446, 'countries': 0.0...","{'pr': 3.6635616461296463, 'porter': 2.9704144..."
155,Process-rec-156,train,Dementia,male,61,26.0,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,"{'pat': 0.07142857142857142, 'phew': 0.1428571...","{'pr': 3.6635616461296463, 'porter': 2.9704144..."


In [36]:
result_df.to_csv("TF-IDF_PFT.csv", index = False)

In [37]:
# SFT_txt
df_train, df_dev = load_process("../PROCESS-V1/")
result_df = process_text_files(df_train, 'SFT_txt')
result_df

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE,CTD_wav,CTD_txt,PFT_wav,PFT_txt,SFT_wav,SFT_txt,TF,IDF
0,Process-rec-001,train,MCI,male,62,25.0,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,../PROCESS-V1//Process-rec-001/Process-rec-001...,"{'pat': 0.024390243902439025, '3': 0.146341463...","{'jaguar': 2.68273239311792, '4': 1.1786549963..."
2,Process-rec-003,train,MCI,female,62,29.0,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,../PROCESS-V1//Process-rec-003/Process-rec-003...,"{'pat': 0.017543859649122806, 'cow': 0.0175438...","{'jaguar': 2.68273239311792, '4': 1.1786549963..."
4,Process-rec-005,train,MCI,male,65,27.0,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,../PROCESS-V1//Process-rec-005/Process-rec-005...,"{'pat': 0.015151515151515152, 'dog': 0.0151515...","{'jaguar': 2.68273239311792, '4': 1.1786549963..."
5,Process-rec-006,train,Dementia,male,83,26.0,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,../PROCESS-V1//Process-rec-006/Process-rec-006...,"{'pat': 0.029411764705882353, 'dog': 0.0294117...","{'jaguar': 2.68273239311792, '4': 1.1786549963..."
8,Process-rec-009,train,HC,female,68,,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,../PROCESS-V1//Process-rec-009/Process-rec-009...,"{'pat': 0.025, 'lion': 0.025, 'tiger': 0.025, ...","{'jaguar': 2.68273239311792, '4': 1.1786549963..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Process-rec-153,train,HC,male,63,28.0,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,../PROCESS-V1//Process-rec-153/Process-rec-153...,"{'pat': 0.008695652173913044, 'um': 0.14782608...","{'jaguar': 2.68273239311792, '4': 1.1786549963..."
153,Process-rec-154,train,HC,female,79,30.0,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,../PROCESS-V1//Process-rec-154/Process-rec-154...,"{'pat': 0.022727272727272728, 'cat': 0.0340909...","{'jaguar': 2.68273239311792, '4': 1.1786549963..."
154,Process-rec-155,train,HC,male,86,29.0,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,../PROCESS-V1//Process-rec-155/Process-rec-155...,"{'pat': 0.014084507042253521, 'horse': 0.01408...","{'jaguar': 2.68273239311792, '4': 1.1786549963..."
155,Process-rec-156,train,Dementia,male,61,26.0,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,../PROCESS-V1//Process-rec-156/Process-rec-156...,"{'pat': 0.024390243902439025, 'pig': 0.0731707...","{'jaguar': 2.68273239311792, '4': 1.1786549963..."


In [38]:
result_df.to_csv("TF-IDF_SFT.csv", index = False)