In [49]:
import os
import re
import numpy as np
import pandas as pd
from math import log

In [50]:
df = pd.read_csv('../PROCESS-V1/dem-info.csv')
df.head()

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE
0,Process-rec-001,train,MCI,male,62,25.0
1,Process-rec-002,dev,MCI,male,61,25.0
2,Process-rec-003,train,MCI,female,62,29.0
3,Process-rec-004,dev,MCI,female,67,29.0
4,Process-rec-005,train,MCI,male,65,27.0


In [51]:
# fix 66* string value in Age column
# df['Age'].value_counts()

df['Class'].value_counts()
def load_process(path):
    df = pd.read_csv(f'{path}/dem-info.csv')
    # fix 66* string value in Age column
    df['Age'] = df['Age'].apply(lambda x: x.replace('66*', '66'))
    df['Age'] = df['Age'].astype(int)
    # get_file_names
    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.txt'

    # fix 66* string value in Age column
    df_train = df.loc[df['TrainOrDev'] == 'train']
    df_dev = df.loc[df['TrainOrDev'] == 'dev']
    return df_train, df_dev

In [52]:
def calculate_tf_idf(texts):
    """
    Berechnet die Term Frequency (TF) und Inverse Document Frequency (IDF) für eine Sammlung von Texten.

    Parameter:
        texts (list of str): Eine Liste von Texten, in denen die Häufigkeit der Wörter analysiert wird.

    Rückgabe:
        tf (list of dict): Liste von Dictionaries, die die TF-Werte pro Dokument enthalten.
        idf (dict): Dictionary, das die IDF-Werte für alle Wörter enthält.
    """
    # Tokenisierung der Texte
    tokenized_texts = [re.findall(r'\b\w+\b', text.lower()) for text in texts]

    # Berechnung der TF
    tf = []
    for tokens in tokenized_texts:
        word_counts = {}
        total_words = len(tokens)
        for word in tokens:
            word_counts[word] = word_counts.get(word, 0) + 1
        tf.append({word: count / total_words for word, count in word_counts.items()})

    # Berechnung der IDF
    doc_count = len(tokenized_texts)
    word_doc_counts = {}
    for tokens in tokenized_texts:
        unique_words = set(tokens)
        for word in unique_words:
            word_doc_counts[word] = word_doc_counts.get(word, 0) + 1

    idf = {word: log(doc_count / (1 + count)) for word, count in word_doc_counts.items()}

    return tf, idf

In [53]:
def process_text_files(df, text_column):
    """
    Liest Textdateien ein, berechnet TF und IDF und fügt die Ergebnisse dem DataFrame hinzu.

    Parameter:
        df (pd.DataFrame): DataFrame mit Informationen zu Textdateien.
        text_column (str): Spaltenname, der die Textdateipfade enthält.

    Rückgabe:
        pd.DataFrame: Aktualisierter DataFrame mit TF und IDF-Informationen.
    """
    # Texte aus den Dateien lesen
    texts = []
    for path in df[text_column]:
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
        else:
            texts.append("")

    # Berechnung von TF und IDF
    tf, idf = calculate_tf_idf(texts)

    # Ergebnisse in den DataFrame einfügen
    df['TF'] = tf
    df['IDF'] = [idf] * len(df)

    return df

In [57]:
# CTD_txt_train
result_df_train = process_text_files(df_train, 'CTD_txt')
final_columns = [ "Record-ID", "TrainOrDev", "Class", "Gender", "Converted-MMSE", "TF", "IDF"]
result_df_train = result_df_train[final_columns]
result_df_train

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Converted-MMSE,TF,IDF
0,Process-rec-001,train,MCI,male,25.0,"{'pat': 0.0091324200913242, 'but': 0.004566210...","{'really': 1.8177349556313156, 'amusing': 4.06..."
2,Process-rec-003,train,MCI,female,29.0,"{'pat': 0.006493506493506494, '3': 0.006493506...","{'really': 1.8177349556313156, 'amusing': 4.06..."
4,Process-rec-005,train,MCI,male,27.0,"{'pat': 0.020833333333333332, '50s': 0.0208333...","{'really': 1.8177349556313156, 'amusing': 4.06..."
5,Process-rec-006,train,Dementia,male,26.0,"{'pat': 0.0072992700729927005, '1': 0.02189781...","{'really': 1.8177349556313156, 'amusing': 4.06..."
8,Process-rec-009,train,HC,female,,"{'pat': 0.0053475935828877, '2': 0.00534759358...","{'really': 1.8177349556313156, 'amusing': 4.06..."
...,...,...,...,...,...,...,...
152,Process-rec-153,train,HC,male,28.0,"{'pat': 0.008333333333333333, 'the': 0.075, 's...","{'really': 1.8177349556313156, 'amusing': 4.06..."
153,Process-rec-154,train,HC,female,30.0,"{'pat': 0.004149377593360996, 'i': 0.029045643...","{'really': 1.8177349556313156, 'amusing': 4.06..."
154,Process-rec-155,train,HC,male,29.0,"{'pat': 0.010869565217391304, 'little': 0.0108...","{'really': 1.8177349556313156, 'amusing': 4.06..."
155,Process-rec-156,train,Dementia,male,26.0,"{'pat': 0.015873015873015872, '3': 0.015873015...","{'really': 1.8177349556313156, 'amusing': 4.06..."


In [32]:
result_df_train.to_csv("TF-IDF_CTD_train.csv", index = False)

In [58]:
# CTD_txt_dev
result_df_dev = process_text_files(df_dev, 'CTD_txt')
final_columns = [ "Record-ID", "TrainOrDev", "Class", "Gender", "Converted-MMSE", "TF", "IDF"]
result_df_dev = result_df_dev[final_columns]
result_df_dev

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Converted-MMSE,TF,IDF
1,Process-rec-002,dev,MCI,male,25.0,"{'pat': 0.011363636363636364, '4': 0.011363636...","{'jar': 0.13353139262452257, 'is': 0.077961541..."
3,Process-rec-004,dev,MCI,female,29.0,"{'pat': 0.005917159763313609, 'a': 0.029585798...","{'jar': 0.13353139262452257, 'is': 0.077961541..."
6,Process-rec-007,dev,HC,male,,"{'pat': 0.0045662100456621, 'washing': 0.00456...","{'jar': 0.13353139262452257, 'is': 0.077961541..."
7,Process-rec-008,dev,MCI,female,25.0,"{'pat': 0.008403361344537815, '3': 0.016806722...","{'jar': 0.13353139262452257, 'is': 0.077961541..."
15,Process-rec-016,dev,Dementia,female,,"{'pat': 0.006535947712418301, '2': 0.006535947...","{'jar': 0.13353139262452257, 'is': 0.077961541..."
20,Process-rec-021,dev,MCI,male,,"{'pat': 0.018518518518518517, 'there': 0.01851...","{'jar': 0.13353139262452257, 'is': 0.077961541..."
21,Process-rec-022,dev,Dementia,male,27.0,"{'pat': 0.014285714285714285, '2': 0.014285714...","{'jar': 0.13353139262452257, 'is': 0.077961541..."
31,Process-rec-032,dev,MCI,male,,"{'pat': 0.006896551724137931, '6': 0.006896551...","{'jar': 0.13353139262452257, 'is': 0.077961541..."
32,Process-rec-033,dev,MCI,female,,"{'pat': 0.0064516129032258064, 'a': 0.05806451...","{'jar': 0.13353139262452257, 'is': 0.077961541..."
34,Process-rec-035,dev,Dementia,male,27.0,"{'pat': 0.011235955056179775, '1': 0.044943820...","{'jar': 0.13353139262452257, 'is': 0.077961541..."


In [34]:
result_df_dev.to_csv("TF-IDF_CTD_dev.csv", index = False)

In [59]:
# PFT_txt_train
result_df_dev = process_text_files(df_train, 'PFT_txt')
final_columns = ["Record-ID", "TrainOrDev", "Class", "Gender", "Converted-MMSE", "TF", "IDF"]
result_df_train = result_df_train[final_columns]
result_df_train

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Converted-MMSE,TF,IDF
0,Process-rec-001,train,MCI,male,25.0,"{'pat': 0.0091324200913242, 'but': 0.004566210...","{'really': 1.8177349556313156, 'amusing': 4.06..."
2,Process-rec-003,train,MCI,female,29.0,"{'pat': 0.006493506493506494, '3': 0.006493506...","{'really': 1.8177349556313156, 'amusing': 4.06..."
4,Process-rec-005,train,MCI,male,27.0,"{'pat': 0.020833333333333332, '50s': 0.0208333...","{'really': 1.8177349556313156, 'amusing': 4.06..."
5,Process-rec-006,train,Dementia,male,26.0,"{'pat': 0.0072992700729927005, '1': 0.02189781...","{'really': 1.8177349556313156, 'amusing': 4.06..."
8,Process-rec-009,train,HC,female,,"{'pat': 0.0053475935828877, '2': 0.00534759358...","{'really': 1.8177349556313156, 'amusing': 4.06..."
...,...,...,...,...,...,...,...
152,Process-rec-153,train,HC,male,28.0,"{'pat': 0.008333333333333333, 'the': 0.075, 's...","{'really': 1.8177349556313156, 'amusing': 4.06..."
153,Process-rec-154,train,HC,female,30.0,"{'pat': 0.004149377593360996, 'i': 0.029045643...","{'really': 1.8177349556313156, 'amusing': 4.06..."
154,Process-rec-155,train,HC,male,29.0,"{'pat': 0.010869565217391304, 'little': 0.0108...","{'really': 1.8177349556313156, 'amusing': 4.06..."
155,Process-rec-156,train,Dementia,male,26.0,"{'pat': 0.015873015873015872, '3': 0.015873015...","{'really': 1.8177349556313156, 'amusing': 4.06..."


In [36]:
result_df_train.to_csv("TF-IDF_PFT_train.csv", index = False)

In [60]:
# PFT_txt_dev
result_df_dev = process_text_files(df_dev, 'PFT_txt')
final_columns = ["Record-ID", "TrainOrDev", "Class", "Gender", "Converted-MMSE", "TF", "IDF"]
result_df_dev = result_df_dev[final_columns]
result_df_dev

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Converted-MMSE,TF,IDF
1,Process-rec-002,dev,MCI,male,25.0,"{'pat': 0.027777777777777776, '1': 0.111111111...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."
3,Process-rec-004,dev,MCI,female,29.0,"{'pat': 0.016129032258064516, 'plank': 0.01612...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."
6,Process-rec-007,dev,HC,male,,"{'pat': 0.058823529411764705, 'place': 0.05882...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."
7,Process-rec-008,dev,MCI,female,25.0,"{'pat': 0.017857142857142856, '2': 0.160714285...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."
15,Process-rec-016,dev,Dementia,female,,"{'pat': 0.02040816326530612, '2': 0.0816326530...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."
20,Process-rec-021,dev,MCI,male,,"{'pat': 0.011494252873563218, '3': 0.011494252...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."
21,Process-rec-022,dev,Dementia,male,27.0,"{'pat': 0.025, '3': 0.05, 'seconds': 0.15, 'po...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."
31,Process-rec-032,dev,MCI,male,,"{'pat': 0.056818181818181816, '1': 0.034090909...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."
32,Process-rec-033,dev,MCI,female,,"{'pat': 0.017857142857142856, '2': 0.107142857...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."
34,Process-rec-035,dev,Dementia,male,27.0,"{'pat': 0.04225352112676056, 'perfect': 0.0281...","{'5': 0.7985076962177716, 'pilot': 2.302585092..."


In [38]:
result_df_dev.to_csv("TF-IDF_PFT_dev.csv", index = False)

In [61]:
# SFT_txt_train
result_df_train = process_text_files(df_train, 'SFT_txt')
final_columns = ["Record-ID", "TrainOrDev", "Class", "Gender", "Converted-MMSE", "TF", "IDF"]
result_df_train = result_df_train[final_columns]
result_df_train

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Converted-MMSE,TF,IDF
0,Process-rec-001,train,MCI,male,25.0,"{'pat': 0.024390243902439025, '3': 0.146341463...","{'mosquito': 3.6635616461296463, 'giraffe': 0...."
2,Process-rec-003,train,MCI,female,29.0,"{'pat': 0.017543859649122806, 'cow': 0.0175438...","{'mosquito': 3.6635616461296463, 'giraffe': 0...."
4,Process-rec-005,train,MCI,male,27.0,"{'pat': 0.015151515151515152, 'dog': 0.0151515...","{'mosquito': 3.6635616461296463, 'giraffe': 0...."
5,Process-rec-006,train,Dementia,male,26.0,"{'pat': 0.029411764705882353, 'dog': 0.0294117...","{'mosquito': 3.6635616461296463, 'giraffe': 0...."
8,Process-rec-009,train,HC,female,,"{'pat': 0.025, 'lion': 0.025, 'tiger': 0.025, ...","{'mosquito': 3.6635616461296463, 'giraffe': 0...."
...,...,...,...,...,...,...,...
152,Process-rec-153,train,HC,male,28.0,"{'pat': 0.008695652173913044, 'um': 0.14782608...","{'mosquito': 3.6635616461296463, 'giraffe': 0...."
153,Process-rec-154,train,HC,female,30.0,"{'pat': 0.022727272727272728, 'cat': 0.0340909...","{'mosquito': 3.6635616461296463, 'giraffe': 0...."
154,Process-rec-155,train,HC,male,29.0,"{'pat': 0.014084507042253521, 'horse': 0.01408...","{'mosquito': 3.6635616461296463, 'giraffe': 0...."
155,Process-rec-156,train,Dementia,male,26.0,"{'pat': 0.024390243902439025, 'pig': 0.0731707...","{'mosquito': 3.6635616461296463, 'giraffe': 0...."


In [45]:
result_df_train.to_csv("TF-IDF_SFT_train.csv", index = False)

In [62]:
# SFT_txt_dev
result_df_dev = process_text_files(df_dev, 'SFT_txt')
final_columns = ["Record-ID", "TrainOrDev", "Class", "Gender", "Converted-MMSE", "TF", "IDF"]
result_df_dev = result_df_dev[final_columns]
result_df_dev

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Converted-MMSE,TF,IDF
1,Process-rec-002,dev,MCI,male,25.0,"{'pat': 0.02040816326530612, '1': 0.0816326530...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."
3,Process-rec-004,dev,MCI,female,29.0,"{'pat': 0.016129032258064516, 'um': 0.09677419...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."
6,Process-rec-007,dev,HC,male,,"{'pat': 0.010752688172043012, 'goat': 0.010752...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."
7,Process-rec-008,dev,MCI,female,25.0,"{'pat': 0.01818181818181818, 'giraffe': 0.0181...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."
15,Process-rec-016,dev,Dementia,female,,"{'pat': 0.017857142857142856, '2': 0.053571428...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."
20,Process-rec-021,dev,MCI,male,,"{'pat': 0.007518796992481203, 'dog': 0.0075187...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."
21,Process-rec-022,dev,Dementia,male,27.0,"{'pat': 0.01639344262295082, 'antelope': 0.016...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."
31,Process-rec-032,dev,MCI,male,,"{'pat': 0.05660377358490566, 'rabbit': 0.00943...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."
32,Process-rec-033,dev,MCI,female,,"{'pat': 0.017543859649122806, 'cat': 0.0175438...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."
34,Process-rec-035,dev,Dementia,male,27.0,"{'pat': 0.047619047619047616, 'cat': 0.0158730...","{'rabbits': 2.5902671654458267, '5': 0.9808292..."


In [47]:
result_df_dev.to_csv("TF-IDF_SFT_dev.csv", index = False)