In [2]:
import pandas as pd
import os
import re

In [3]:
# Load the raw demographic table for a quick visual check. In the cells shown
# here this `df` is only inspected; load_process() and process_recordings()
# read the CSV again on their own.
df = pd.read_csv('../PROCESS-V1/dem-info.csv')
# Preview the first rows. The "Unnamed: 0" column in the output is the CSV's
# unnamed first column, which read_csv loads as regular data.
df.head()

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE
0,Process-rec-001,train,MCI,male,62,25.0
1,Process-rec-002,dev,MCI,male,61,25.0
2,Process-rec-003,train,MCI,female,62,29.0
3,Process-rec-004,dev,MCI,female,67,29.0
4,Process-rec-005,train,MCI,male,65,27.0


In [4]:
def calculate_z_scores(df, columns):
    """
    Add a standardized companion column for each requested column.

    For every name in ``columns`` that is present in ``df``, a column called
    ``"<name> Z-Score"`` is written with ``(value - mean) / std``, using the
    column's own ``mean()`` and ``std()``. If the standard deviation is not
    strictly positive, the Z-Score column is filled with 0 instead. Names
    that are not columns of ``df`` are skipped without an error.

    The frame is modified in place and the same object is returned.
    """
    for name in columns:
        if name not in df.columns:
            continue

        values = df[name]
        center = values.mean()
        spread = values.std()

        # Guard against dividing by a non-positive spread (e.g. constant column).
        df[f'{name} Z-Score'] = (values - center) / spread if spread > 0 else 0

    return df

In [5]:
# The Age column apparently contains the literal string '66*'; the actual fix
# lives in load_process() below. To inspect the raw values, run:
# df['Age'].value_counts()

# NOTE(review): this expression is not the last statement of the cell, so its
# result is computed and discarded rather than displayed.
df['Class'].value_counts()
def load_process(path):
    """
    Load ``dem-info.csv`` from ``path`` and split it into train and dev frames.

    The ``Age`` column is cleaned (the annotated value ``'66*'`` becomes
    ``'66'``) and cast to ``int``. For each task type (CTD, PFT, SFT) two
    columns are added, ``<type>_wav`` and ``<type>_txt``, holding the path
    ``<path>/<Record-ID>/<Record-ID>__<type>.<ext>`` built by plain string
    concatenation.

    Parameters:
        path: directory that contains ``dem-info.csv`` and one sub-folder
            per recording.

    Returns:
        ``(df_train, df_dev)``: the rows whose ``TrainOrDev`` value is
        ``'train'`` and ``'dev'`` respectively, as independent copies.

    Raises:
        Whatever ``pd.read_csv`` raises for a missing/unreadable file, and
        ``ValueError`` from the integer cast if an ``Age`` value is still not
        numeric after cleaning.
    """
    df = pd.read_csv(f'{path}/dem-info.csv')

    # Only string cells can carry the '66*' annotation. Guarding on the type
    # keeps this working when pandas already parsed Age as numbers (e.g. once
    # the CSV has been corrected), matching process_recordings().
    df['Age'] = df['Age'].apply(
        lambda x: x.replace('66*', '66') if isinstance(x, str) else x
    )
    df['Age'] = df['Age'].astype(int)

    # Derive the audio / transcript file locations for every task type.
    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.txt'

    # Copies, so callers can add columns to one split without touching the
    # other or triggering chained-assignment warnings.
    df_train = df.loc[df['TrainOrDev'] == 'train'].copy()
    df_dev = df.loc[df['TrainOrDev'] == 'dev'].copy()
    return df_train, df_dev

# Build the train/dev splits once at cell execution.
# NOTE(review): load_process() appends its own '/' to the path, so the
# trailing slash here produces '//' inside the generated file paths.
df_train, df_dev = load_process("../PROCESS-V1/")

In [6]:
def get_word_count_and_wait_time(text_path):
    """
    Read a transcript file and return ``(word_count, total_wait_time)``.

    Pause markers of the form ``(N second)`` / ``(N seconds)`` with an integer
    ``N`` are summed into ``total_wait_time``. They are then removed from the
    text, and the remaining whitespace-separated tokens are counted as words.

    Any exception (e.g. a missing file) is reported with ``print`` and the
    function returns ``(0, 0)`` instead of raising.
    """
    try:
        with open(text_path, 'r') as handle:
            transcript = handle.read()

        # Add up the seconds from every pause marker.
        total_wait_time = 0
        for pause in re.finditer(r'\((\d+) seconds?\)', transcript):
            total_wait_time += int(pause.group(1))

        # Strip the pause markers so they are not counted as words.
        spoken_text = re.sub(r'\(\d+ seconds?\)', '', transcript)
        tokens = spoken_text.split()

        return len(tokens), total_wait_time

    except Exception as e:
        print(f'Fehler beim Verarbeiten von {text_path}: {e}')
        return 0, 0

In [7]:
def process_recordings(folder_path, dem_info_path):
    """
    Build one feature row per recording from the transcripts under ``folder_path``.

    Every ``.txt`` file whose name contains ``__CTD``, ``__PFT`` or ``__SFT``
    is analysed with ``get_word_count_and_wait_time``. The name of the folder
    that contains the file is used as its ``Record-ID`` and is looked up in
    the demographic CSV to attach ``TrainOrDev``, ``Class``, ``Gender``,
    ``Age`` and ``Converted-MMSE`` (all ``None`` when the id is unknown).

    Parameters:
        folder_path: root directory that is walked recursively.
        dem_info_path: path of the ``dem-info.csv`` file.

    Returns:
        A DataFrame with one row per ``Record-ID`` and the columns
        ``Word Count <type>`` / ``Total Wait Time (s) <type>`` for each task
        type. A type column that never occurred at all is added and filled
        with 0 / 0.0; a recording that merely lacks one transcript keeps NaN
        in that type's columns. If no matching transcript is found, an empty
        frame with the full column set is returned.

    Raises:
        Whatever ``pd.read_csv`` raises for the CSV, and ``ValueError`` if an
        ``Age`` value cannot be cast to ``int`` after cleaning.
    """
    text_types = ("CTD", "PFT", "SFT")

    # Load the demographic table and repair the annotated '66*' age value.
    dem_info = pd.read_csv(dem_info_path)
    dem_info['Age'] = dem_info['Age'].apply(
        lambda x: x.replace('66*', '66') if isinstance(x, str) else x
    )
    dem_info['Age'] = dem_info['Age'].astype(int)

    # Index the metadata once instead of filtering the whole frame per file.
    # Keeping the first row per id mirrors the previous `.iloc[0]` choice.
    meta_by_id = dem_info.drop_duplicates(subset='Record-ID').set_index('Record-ID')

    records = []

    for root, dirs, files in os.walk(folder_path):
        # os.walk yields entries in arbitrary order; sorting makes the row and
        # column order of the result reproducible across machines.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".txt"):
                continue

            # Determine the task type (first match wins: CTD, PFT, SFT).
            text_type = next((t for t in text_types if f"__{t}" in file), None)
            if text_type is None:
                continue  # not one of the known transcript kinds

            text_path = os.path.join(root, file)

            # The containing folder is named after the recording.
            record_id = os.path.basename(root)

            word_count, total_wait_time = get_word_count_and_wait_time(text_path)

            # Unknown ids fall back to an empty mapping, so every .get() is None.
            meta = meta_by_id.loc[record_id] if record_id in meta_by_id.index else {}

            records.append({
                "Record-ID": record_id,
                "TrainOrDev": meta.get('TrainOrDev'),
                "Class": meta.get('Class'),
                "Gender": meta.get('Gender'),
                "Age": meta.get('Age'),
                f'Word Count {text_type}': word_count,
                f'Total Wait Time (s) {text_type}': total_wait_time,
                "Converted-MMSE": meta.get('Converted-MMSE')
            })

    if not records:
        # Without any rows there is no "Record-ID" column to group on;
        # return an empty frame that still has the documented columns.
        empty_columns = ["Record-ID", "TrainOrDev", "Class", "Gender", "Age"]
        for ext in text_types:
            empty_columns += [f'Word Count {ext}', f'Total Wait Time (s) {ext}']
        empty_columns.append("Converted-MMSE")
        return pd.DataFrame(columns=empty_columns)

    results_df = pd.DataFrame(records)

    # Collapse the per-file rows: first() takes the first non-null value of
    # each column, which merges the CTD/PFT/SFT rows of one recording.
    results_df = results_df.groupby("Record-ID").first().reset_index()

    # Add columns for task types that did not occur in any file.
    for ext in text_types:
        if f'Word Count {ext}' not in results_df.columns:
            results_df[f'Word Count {ext}'] = 0
        if f'Total Wait Time (s) {ext}' not in results_df.columns:
            results_df[f'Total Wait Time (s) {ext}'] = 0.0

    return results_df

## Ausgabe der Z-Score Spalten

In [11]:
folder_path = "../PROCESS-V1"
dem_info_path = "../PROCESS-V1/dem-info.csv"

# Rebuild the per-recording feature table from disk.
# NOTE(review): the first row (label 0 after grouping) is discarded without an
# explanation -- confirm that it is not a valid recording.
results_df = (
    process_recordings(folder_path, dem_info_path)
    .drop(index=0)
    .reset_index(drop=True)
)

# Numeric columns to standardize. Mean and std are taken over every row of
# results_df, i.e. train and dev recordings together.
numerical_columns = [
    "Age",
    "Word Count CTD",
    "Total Wait Time (s) CTD",
    "Word Count PFT",
    "Total Wait Time (s) PFT",
    "Word Count SFT",
    "Total Wait Time (s) SFT",
    "Converted-MMSE",
]

results_df = calculate_z_scores(results_df, numerical_columns)

# Keep the metadata plus the Z-Score companion of each numeric column, in the
# same order as numerical_columns.
final_columns = ["Record-ID", "TrainOrDev", "Class", "Gender"] + [
    f'{column} Z-Score' for column in numerical_columns
]

results_df = results_df.loc[:, final_columns]
results_df.to_csv("Z-Score.csv", index=False)

## Ausgabe der Spalten ohne Z-Score

In [12]:
folder_path = "../PROCESS-V1"
dem_info_path = "../PROCESS-V1/dem-info.csv"

# Rebuild the per-recording feature table from disk.
# NOTE(review): the first row (label 0 after grouping) is discarded without an
# explanation -- confirm that it is not a valid recording.
results_df = (
    process_recordings(folder_path, dem_info_path)
    .drop(index=0)
    .reset_index(drop=True)
)

# Output column order: metadata, then word count and wait time per task type
# (CTD, PFT, SFT), then the MMSE score.
final_columns = [
    "Record-ID", "TrainOrDev", "Class", "Gender", "Age",
    *(
        f"{measure} {task}"
        for task in ("CTD", "PFT", "SFT")
        for measure in ("Word Count", "Total Wait Time (s)")
    ),
    "Converted-MMSE",
]

results_df = results_df.loc[:, final_columns]
results_df.to_csv("no-Z-Score.csv", index=False)

## Ausgabe aller Spalten

In [13]:
folder_path = "../PROCESS-V1"
dem_info_path = "../PROCESS-V1/dem-info.csv"

# Rebuild the per-recording feature table from disk.
# NOTE(review): the first row (label 0 after grouping) is discarded without an
# explanation -- confirm that it is not a valid recording.
results_df = process_recordings(folder_path, dem_info_path)
results_df = results_df.drop(index = 0).reset_index(drop = True)

# Numeric columns to standardize. Mean and std are taken over every row of
# results_df, i.e. train and dev recordings together.
numerical_columns = [
    "Age", "Word Count CTD", "Total Wait Time (s) CTD",
    "Word Count PFT", "Total Wait Time (s) PFT",
    "Word Count SFT", "Total Wait Time (s) SFT",
    "Converted-MMSE"
]

results_df = calculate_z_scores(results_df, numerical_columns)

# Leading column order for the export.
final_columns = [
    "Record-ID", "TrainOrDev", "Class", "Gender", "Age", 
    "Word Count CTD", "Total Wait Time (s) CTD",
    "Word Count PFT", "Total Wait Time (s) PFT",
    "Word Count SFT", "Total Wait Time (s) SFT",
    "Converted-MMSE"
]

# Apply the order (it was defined but never used before, so the CSV layout
# depended on the order in which transcript files were discovered). All other
# columns -- the Z-Score columns -- follow in their existing order, so the
# file still contains every column.
all_columns = final_columns + [
    column for column in results_df.columns if column not in final_columns
]

results_df = results_df[all_columns]
results_df.to_csv("all-columns.csv", index=False)