In [1]:
import os
import re
import torch
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
df = pd.read_csv('../PROCESS-V1/dem-info.csv')
df.head()

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Converted-MMSE
0,Process-rec-001,train,MCI,male,62,25.0
1,Process-rec-002,dev,MCI,male,61,25.0
2,Process-rec-003,train,MCI,female,62,29.0
3,Process-rec-004,dev,MCI,female,67,29.0
4,Process-rec-005,train,MCI,male,65,27.0


In [3]:
# fix 66* string value in Age column
# df['Age'].value_counts()

df['Class'].value_counts()
def load_process(path):
    df = pd.read_csv(f'{path}/dem-info.csv')
    
    # fix 66* string value in Age column
    df['Age'] = df['Age'].apply(lambda x: x.replace('66*', '66'))
    df['Age'] = df['Age'].astype(int)
    
    # get_file_names
    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.txt'

    # fix 66* string value in Age column
    df_train = df.loc[df['TrainOrDev'] == 'train']
    df_dev = df.loc[df['TrainOrDev'] == 'dev']
    return df_train, df_dev

df_train, df_dev = load_process("../PROCESS-V1/")

In [4]:
def get_word_count_and_wait_time(text_path):
    """
    Analyze text file, extract word count and total wait time.
    """
    try:
        with open(text_path, 'r') as file:
            content = file.read()

        # Wartezeiten extrahieren und summieren
        wait_times = re.findall(r'\((\d+) seconds?\)', content)
        total_wait_time = sum(int(seconds) for seconds in wait_times)

       # Wörter zählen
        words = re.sub(r'\(\d+ seconds?\)', '', content)  # Entferne die Wartezeiten aus dem Text
        word_count = len(words.split())

        return word_count, total_wait_time

    except Exception as e:
        print(f'Fehler beim Verarbeiten von {text_path}: {e}')
        return 0, 0

In [5]:
def process_recordings(folder_path, dem_info_path):
    """
    Iterate through all CTD text files in file, calculate word count and total wait time and add more information of dem-info.csv.
    """
    # load dem-info.csv file
    dem_info = pd.read_csv(dem_info_path)
    
    # fix 66* string value in Age column
    dem_info['Age'] = dem_info['Age'].apply(lambda x: x.replace('66*', '66') if isinstance(x, str) else x)
    dem_info['Age'] = dem_info['Age'].astype(int)
    
    records = []
    
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".txt") and "__CTD" in file:
                text_type = "CTD"
                text_path = os.path.join(root, file)

                # extract Record-ID
                record_id = os.path.basename(root)

                # calcualte word count and total wait time
                word_count, total_wait_time = get_word_count_and_wait_time(text_path)

                # get more information of DataFrame
                dem_info_row = dem_info[dem_info['Record-ID'] == record_id]
                if not dem_info_row.empty:
                    train_or_dev = dem_info_row.iloc[0]['TrainOrDev']
                    rec_class = dem_info_row.iloc[0]['Class']
                    gender = dem_info_row.iloc[0]['Gender']
                    age = dem_info_row.iloc[0]['Age']
                    converted_mmse = dem_info_row.iloc[0]['Converted-MMSE']
                else:
                    train_or_dev = None
                    rec_class = None
                    gender = None
                    age = None
                    converted_mmse = None

                records.append({
                    "Record-ID": record_id,
                    "TrainOrDev": train_or_dev,
                    "Class": rec_class,
                    "Gender": gender,
                    "Age": age,
                    "Word_Count_CTD": word_count,
                    "Total_Wait_Time_CTD": total_wait_time,
                    "Converted-MMSE": converted_mmse
                })

    # Convert results in DataFrame
    results_df = pd.DataFrame(records)

    # Group by Record-ID and summarize
    results_df = results_df.groupby("Record-ID").first().reset_index()

    # Fill missing columns
    for ext in ["CTD"]:
        if "Word_Count_CTD" not in results_df.columns:
            results_df["Word_Count_CTD"] = 0
        if "Total_Wait_Time_CTD" not in results_df.columns:
            results_df["Total_Wait_Time_CTD"] = 0.0

    return results_df

## Select needed columns

In [6]:
folder_path = "../PROCESS-V1"
dem_info_path = "../PROCESS-V1/dem-info.csv"

results_df = process_recordings(folder_path, dem_info_path)
results_df = results_df.drop(results_df.index[0])

# Fix column order
final_columns = [ "Record-ID", "TrainOrDev", "Class", "Gender", "Age", "Word_Count_CTD", "Total_Wait_Time_CTD", "Converted-MMSE" ]

results_df = results_df[final_columns]
results_df

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Word_Count_CTD,Total_Wait_Time_CTD,Converted-MMSE
1,Process-rec-001,train,MCI,male,62.0,210,3,25.0
2,Process-rec-002,dev,MCI,male,61.0,69,14,25.0
3,Process-rec-003,train,MCI,female,62.0,143,3,29.0
4,Process-rec-004,dev,MCI,female,67.0,161,2,29.0
5,Process-rec-005,train,MCI,male,65.0,45,0,27.0
...,...,...,...,...,...,...,...,...
153,Process-rec-153,train,HC,male,63.0,112,0,28.0
154,Process-rec-154,train,HC,female,79.0,222,4,30.0
155,Process-rec-155,train,HC,male,86.0,91,0,29.0
156,Process-rec-156,train,Dementia,male,61.0,48,16,26.0


## Calculate TF and IDF on behalf of word count and total wait time

In [7]:
# Function to read and clean text files
def read_and_clean_text(text_path, stop_words):
    with open(text_path, 'r') as file:
        content = file.read()

    # Remove numbers and filler words
    content = re.sub(r'\b\d+\b', '', content) # Remove numbers
    filler_words = ["second", "seconds"]
    words = content.split()
    content = ' '.join([word for word in words if word.lower() not in stop_words and word.lower() not in filler_words])
    return content

In [8]:
# Load text files and clean content
texts = []

record_ids = results_df['Record-ID'].tolist()

custom_stop_words = {'second', 'seconds'}
stop_words = text.ENGLISH_STOP_WORDS.union(custom_stop_words) # Using Scikit-Learn's built-in stop words
# pattern = re.compile(r'.*_*.[A-Za-z].*\.txt$') # Regex pattern to match files with "_" followed by any character

for record_id in record_ids:
    text_path = f'{folder_path}/{record_id}/{record_id}__CTD.txt'
    texts.append(read_and_clean_text(text_path, stop_words))

# Calculate TF-IDF for the cleaned texts
vectorizer = TfidfVectorizer(max_features = 10) # Limit to top 10 frequent words tfidf_matrix
tfidf_matrix = vectorizer.fit_transform(texts)

# Integrate TF-IDF values into the DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns = vectorizer.get_feature_names_out())
tfidf_df['Record-ID'] = results_df['Record-ID']
results_df = pd.merge(results_df, tfidf_df, on = 'Record-ID')
results_df

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Word_Count_CTD,Total_Wait_Time_CTD,Converted-MMSE,cookie,er,jar,looks,second,seconds,sheâ,sink,thereâ,um
0,Process-rec-001,train,MCI,male,62.0,210,3,25.0,0.340812,0.340812,0.157611,0.000000,0.498002,0.546340,0.371680,0.159659,0.180894,0.000000
1,Process-rec-002,dev,MCI,male,61.0,69,14,25.0,0.399672,0.199836,0.184832,0.245363,0.000000,0.213565,0.000000,0.187233,0.424272,0.667300
2,Process-rec-003,train,MCI,female,62.0,143,3,29.0,0.554507,0.369671,0.170958,0.000000,0.540173,0.000000,0.403154,0.173179,0.000000,0.205737
3,Process-rec-004,dev,MCI,female,67.0,161,2,29.0,0.265732,0.797195,0.245780,0.000000,0.000000,0.000000,0.289800,0.248973,0.000000,0.295781
4,Process-rec-005,train,MCI,male,65.0,45,0,27.0,0.289786,0.144893,0.268028,0.000000,0.423443,0.774240,0.000000,0.135755,0.000000,0.161278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,Process-rec-152,dev,HC,female,67.0,415,7,29.0,0.283071,0.566142,0.261818,0.000000,0.000000,0.000000,0.000000,0.265219,0.600989,0.315081
152,Process-rec-153,train,HC,male,63.0,112,0,28.0,0.126761,0.380282,0.117243,0.155640,0.246968,0.135470,0.829451,0.118766,0.000000,0.141095
153,Process-rec-154,train,HC,female,79.0,222,4,30.0,0.000000,0.568473,0.525791,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.632757
154,Process-rec-155,train,HC,male,86.0,91,0,29.0,0.195114,0.390227,0.180464,0.000000,0.190070,0.834075,0.212786,0.000000,0.000000,0.000000


In [9]:
# Save table with filled values
results_df.to_csv("Calculation_of_TF_IDF.csv", index = False)