In [1]:
import os
import re
import pandas as pd
from math import log

In [2]:
# Load the participant metadata table (dem-info.csv) from the dataset folder
# and preview its first rows.
df = pd.read_csv('../PROCESS-V1/dem-info.csv')
df.head()

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE
0,Process-rec-001,train,MCI,male,62,25.0
1,Process-rec-002,dev,MCI,male,61,25.0
2,Process-rec-003,train,MCI,female,62,29.0
3,Process-rec-004,dev,MCI,female,67,29.0
4,Process-rec-005,train,MCI,male,65,27.0


In [3]:
# The Age column contains a stray '66*' string value (it is cleaned inside
# load_process below). Uncomment the next line to inspect the raw Age values.
# df['Age'].value_counts()

# Class distribution of the metadata table.
# NOTE(review): this is not the last expression of the cell, so Jupyter will
# not display its result — move it to its own cell if the counts are wanted.
df['Class'].value_counts()
def load_process(path):
    """
    Load dem-info.csv from ``path`` and return the train and dev metadata frames.

    The Age column is cleaned (the stray ``'66*'`` entry becomes ``66``) and cast
    to int. For each task type (CTD, PFT, SFT) two columns are added holding the
    expected audio (``<type>_wav``) and transcript (``<type>_txt``) file paths,
    built as ``<path>/<Record-ID>/<Record-ID>__<type>.<ext>``.

    Parameters
    ----------
    path : str
        Folder that contains ``dem-info.csv`` and one sub-folder per Record-ID.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_dev)``: the rows whose ``TrainOrDev`` value is
        ``'train'`` and ``'dev'`` respectively, as independent copies.
    """
    df = pd.read_csv(f'{path}/dem-info.csv')

    # Fix the '66*' string value in the Age column. Only strings are touched:
    # when the CSV holds clean numbers pandas reads the column as numeric and
    # calling .replace on an int would raise AttributeError.
    df['Age'] = df['Age'].apply(lambda x: x.replace('66*', '66') if isinstance(x, str) else x)
    df['Age'] = df['Age'].astype(int)

    # Build the expected audio / transcript file names for every task type
    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.txt'

    # Split into train and dev partitions; copy so that later column
    # assignments on the results do not act on slices of ``df``.
    df_train = df.loc[df['TrainOrDev'] == 'train'].copy()
    df_dev = df.loc[df['TrainOrDev'] == 'dev'].copy()
    return df_train, df_dev

# Build the train and dev metadata frames from the dataset root folder.
df_train, df_dev = load_process("../PROCESS-V1/")

In [4]:
def get_word_count_and_wait_time(text_path):
    """
    Read a transcript file and return its word count and total wait time.

    Wait times are annotations of the form ``(N seconds)`` / ``(N second)``
    with an integer ``N``. They are summed for the total wait time and removed
    from the text before the remaining whitespace-separated tokens are counted.

    Returns a ``(word_count, total_wait_time)`` tuple. If anything goes wrong
    while reading or analysing the file, a message is printed and ``(0, 0)``
    is returned instead of raising.
    """
    try:
        with open(text_path, 'r') as handle:
            transcript = handle.read()

        # Extract the wait-time annotations and add them up
        total_wait_time = 0
        for pause in re.finditer(r'\((\d+) seconds?\)', transcript):
            total_wait_time += int(pause.group(1))

        # Remove the wait-time annotations from the text, then count the words
        spoken_text = re.sub(r'\(\d+ seconds?\)', '', transcript)
        word_count = len(spoken_text.split())
    except Exception as e:
        print(f'Fehler beim Verarbeiten von {text_path}: {e}')
        return 0, 0

    return word_count, total_wait_time

In [5]:
def process_recordings(folder_path, dem_info_path):
    """
    Compute word count and total wait time for every transcript and join the
    result with the metadata from dem-info.csv.

    ``folder_path`` is walked recursively. Every ``.txt`` file whose name
    contains ``__CTD``, ``__PFT`` or ``__SFT`` is analysed with
    ``get_word_count_and_wait_time``; the name of the directory that contains
    the file is used as its Record-ID. Metadata is taken from the first
    dem-info row with that Record-ID, or left as ``None`` when there is none.

    Parameters
    ----------
    folder_path : str
        Root folder to search for transcript files.
    dem_info_path : str
        Path to the dem-info.csv file.

    Returns
    -------
    pandas.DataFrame
        One row per Record-ID with the columns ``Record-ID``, ``TrainOrDev``,
        ``Class``, ``Gender``, ``Age``, ``Converted-MMSE`` plus
        ``Word_Count_<type>`` and ``Total_Wait_Time_<type>`` for each of CTD,
        PFT and SFT. A type for which no transcript was found at all gets
        columns filled with 0 / 0.0. If no transcript is found for any type,
        an empty frame with all of these columns is returned.
    """
    text_types = ("CTD", "PFT", "SFT")
    metadata_fields = ("TrainOrDev", "Class", "Gender", "Age", "Converted-MMSE")

    # load dem-info.csv file
    dem_info = pd.read_csv(dem_info_path)

    # fix 66* string value in Age column
    dem_info['Age'] = dem_info['Age'].apply(lambda x: x.replace('66*', '66') if isinstance(x, str) else x)
    dem_info['Age'] = dem_info['Age'].astype(int)

    records = []

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if not file.endswith(".txt"):
                continue

            # kind of text (CTD, PFT, SFT); first match wins, in that order
            text_type = next((t for t in text_types if f'__{t}' in file), None)
            if text_type is None:
                continue  # skip files with no appropriate text type

            text_path = os.path.join(root, file)

            # the Record-ID is the name of the folder holding the transcript
            record_id = os.path.basename(root)

            # calculate word count and total wait time
            word_count, total_wait_time = get_word_count_and_wait_time(text_path)

            # look up the metadata of this recording in dem-info
            dem_info_row = dem_info[dem_info['Record-ID'] == record_id]
            if dem_info_row.empty:
                meta = dict.fromkeys(metadata_fields)
            else:
                first_match = dem_info_row.iloc[0]
                meta = {field: first_match[field] for field in metadata_fields}

            records.append({
                "Record-ID": record_id,
                "TrainOrDev": meta["TrainOrDev"],
                "Class": meta["Class"],
                "Gender": meta["Gender"],
                "Age": meta["Age"],
                f'Word_Count_{text_type}': word_count,
                f'Total_Wait_Time_{text_type}': total_wait_time,
                "Converted-MMSE": meta["Converted-MMSE"]
            })

    if records:
        # One record per transcript -> one row per Record-ID: ``first`` picks
        # the first non-null value of every column within each group.
        results_df = pd.DataFrame(records).groupby("Record-ID").first().reset_index()
    else:
        # Without any transcript there is no 'Record-ID' column to group by;
        # return an empty frame with the expected schema instead of raising.
        results_df = pd.DataFrame(columns=["Record-ID", *metadata_fields])

    # Fill missing columns (text types for which no transcript was found)
    for ext in text_types:
        if f'Word_Count_{ext}' not in results_df.columns:
            results_df[f'Word_Count_{ext}'] = 0
        if f'Total_Wait_Time_{ext}' not in results_df.columns:
            results_df[f'Total_Wait_Time_{ext}'] = 0.0

    return results_df

In [6]:
# Dataset root (searched recursively for transcripts) and its metadata CSV.
folder_path = "../PROCESS-V1"
dem_info_path = "../PROCESS-V1/dem-info.csv"

# Compute word counts and wait times per recording, joined with the metadata.
results_df = process_recordings(folder_path, dem_info_path)
# NOTE(review): this unconditionally discards the first row of the table.
# Presumably it removes a stray entry that is not a real recording — confirm
# against the folder contents; if no such entry exists, this silently drops a
# valid participant. Filtering on the metadata columns would be more robust.
results_df = results_df.drop(results_df.index[0])

# Fix column order
final_columns = [ "Record-ID", "TrainOrDev", "Class", "Gender", "Age", "Word_Count_CTD", "Total_Wait_Time_CTD", "Word_Count_PFT", "Total_Wait_Time_PFT",
    "Word_Count_SFT", "Total_Wait_Time_SFT", "Converted-MMSE" ]

# Selecting by name raises KeyError if any expected column is missing.
results_df = results_df[final_columns]
results_df

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Word_Count_CTD,Total_Wait_Time_CTD,Word_Count_PFT,Total_Wait_Time_PFT,Word_Count_SFT,Total_Wait_Time_SFT,Converted-MMSE
1,Process-rec-001,train,MCI,male,62.0,210.0,3.0,30.0,23.0,23.0,32.0,25.0
2,Process-rec-002,dev,MCI,male,61.0,69.0,14.0,16.0,32.0,25.0,32.0,25.0
3,Process-rec-003,train,MCI,female,62.0,143.0,3.0,35.0,18.0,43.0,19.0,29.0
4,Process-rec-004,dev,MCI,female,67.0,161.0,2.0,36.0,20.0,41.0,19.0,29.0
5,Process-rec-005,train,MCI,male,65.0,45.0,0.0,41.0,13.0,62.0,2.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...
153,Process-rec-153,train,HC,male,63.0,112.0,0.0,85.0,6.0,98.0,8.0,28.0
154,Process-rec-154,train,HC,female,79.0,222.0,4.0,46.0,16.0,78.0,9.0,30.0
155,Process-rec-155,train,HC,male,86.0,91.0,0.0,69.0,24.0,47.0,20.0,29.0
156,Process-rec-156,train,Dementia,male,61.0,48.0,16.0,21.0,43.0,27.0,38.0,26.0


In [7]:
# Write the final table to a CSV file in the current working directory,
# leaving out the DataFrame index.
results_df.to_csv("Calculate_Word_Count_Total_Wait_Time.csv", index = False)