In [None]:
import os
import re
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
df = pd.read_csv('../PROCESS-V1/dem-info.csv')
df.head()

In [None]:
# fix 66* string value in Age column
# df['Age'].value_counts()
# Funktion zum Einlesen und Vorprozessieren der Daten

def load_process(path):
    df = pd.read_csv(f'{path}/dem-info.csv')
    df['Age'] = df['Age'].apply(lambda x: x.replace('66*', '66'))
    df['Age'] = df['Age'].astype(int)

    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.txt'

    return df

## Calculate Word_Count_CTD and Total_Wait_Time_CTD

In [None]:
def get_word_count_and_wait_time(text_path):
    """
    Analysiert eine Textdatei, extrahiert die Anzahl der Wörter und die gesamte Wartezeit.
    """
    try:
        with open(text_path, 'r') as file:
            content = file.read()

        # Wartezeiten extrahieren und summieren
        wait_times = re.findall(r'\((\d+) seconds?\)', content)
        total_wait_time = sum(int(seconds) for seconds in wait_times)

       # Wörter zählen
        words = re.sub(r'\(\d+ seconds?\)', '', content)  # Entferne die Wartezeiten aus dem Text
        word_count = len(words.split())

        return word_count, total_wait_time

    except Exception as e:
        print(f'Fehler beim Verarbeiten von {text_path}: {e}')
        return 0, 0

In [None]:
def process_recordings(folder_path, dem_info_path):
    """
    Iteriert durch alle Textdateien im Ordner, berechnet die Wortanzahl und Wartezeit,
    und fügt zusätzliche Informationen aus der dem-info.csv hinzu.
    """
    # Lade die dem-info.csv-Datei
    dem_info = pd.read_csv(dem_info_path)
    
    # Fixiere fehlerhafte Werte in der Age-Spalte
    dem_info['Age'] = dem_info['Age'].apply(lambda x: x.replace('66*', '66') if isinstance(x, str) else x)
    dem_info['Age'] = dem_info['Age'].astype(int)
    
    records = []
    
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith((".txt")):
                # Art des Textes bestimmen (CTD, PFT, SFT)
                if "__CTD" in file:
                    text_type = "CTD"
                else:
                    continue  # Überspringe Dateien ohne passenden Texttyp

                text_path = os.path.join(root, file)

                # Record-ID extrahieren
                record_id = os.path.basename(root)

                # Wortanzahl und Wartezeit berechnen
                word_count, total_wait_time = get_word_count_and_wait_time(text_path)

                # Zusätzliche Informationen aus dem DataFrame holen
                dem_info_row = dem_info[dem_info['Record-ID'] == record_id]
                if not dem_info_row.empty:
                    train_or_dev = dem_info_row.iloc[0]['TrainOrDev']
                    rec_class = dem_info_row.iloc[0]['Class']
                    gender = dem_info_row.iloc[0]['Gender']
                    age = dem_info_row.iloc[0]['Age']
                    converted_mmse = dem_info_row.iloc[0]['Converted-MMSE']
                else:
                    train_or_dev = None
                    rec_class = None
                    gender = None
                    age = None
                    converted_mmse = None

                records.append({
                    "Record-ID": record_id,
                    "TrainOrDev": train_or_dev,
                    "Class": rec_class,
                    "Gender": gender,
                    "Age": age,
                    "Word_Count_CTD": word_count,
                    "Total_Wait_Time_CTD": total_wait_time,
                    "Converted-MMSE": converted_mmse
                })

    # Konvertiere die Ergebnisse in einen DataFrame
    results_df = pd.DataFrame(records)

    # Gruppieren nach Record-ID und zusammenfassen
    results_df = results_df.groupby("Record-ID").first().reset_index()

    return results_df

In [None]:
df.to_csv("Zwischenstand.csv", index = False)

In [None]:
folder_path = "../PROCESS-V1"
dem_info_path = "../PROCESS-V1/dem-info.csv"

results_df = process_recordings(folder_path, dem_info_path)
results_df = results_df.drop(index = 0).reset_index(drop = True)

# Reihenfolge der Spalten festlegen
final_columns = [ "Record-ID", "TrainOrDev", "Class", "Gender", "Age", "Word_Count_CTD", "Total_Wait_Time_CTD", "Converted-MMSE" ]

results_df = results_df[final_columns]
results_df.to_csv("Zwischenstand.csv", index = False)

## Calculate TF and IDF on behalf of word count and total wait time

In [22]:
# Function to read and clean text files
def read_and_clean_text(text_path):
    with open(text_path, 'r') as file:
        content = file.read()
    
    # Remove numbers and filler words
    content = re.sub(r'\b\d+\b', '', content) # Remove numbers
    filler_words = ["second", "seconds", "um"]
    content = ' '.join([word for word in content.split() if word.lower() not in filler_words])
    return content

In [24]:
# Load text files and clean content
texts = []
record_ids = results_df['Record-ID'].tolist()
for record_id in record_ids:
    text_path = f'{folder_path}/{record_id}/{record_id}__CTD.txt'
    texts.append(read_and_clean_text(text_path))

# Calculate TF-IDF for the cleaned texts
vectorizer = TfidfVectorizer(max_features = 10) # Limit to top 10 frequent words tfidf_matrix
tfidf_matrix = vectorizer.fit_transform(texts)

# Integrate TF-IDF values into the DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns = vectorizer.get_feature_names_out())
tfidf_df['Record-ID'] = results_df['Record-ID']
results_df = pd.merge(results_df, tfidf_df, on = 'Record-ID')
results_df

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Word_Count_CTD,Total_Wait_Time_CTD,Converted-MMSE,and_x,er_x,...,and_y,er_y,in_y,is_y,it_y,of_y,on_y,second_y,the_y,to_y
0,Process-rec-001,train,MCI,male,62.0,210,3,25.0,0.264722,0.220946,...,0.264722,0.220946,0.051420,0.278501,0.174735,0.205679,0.183324,0.053809,0.812166,0.178732
1,Process-rec-002,dev,MCI,male,61.0,69,14,25.0,0.454769,0.284675,...,0.454769,0.284675,0.000000,0.239220,0.000000,0.000000,0.354301,0.415974,0.550749,0.230284
2,Process-rec-003,train,MCI,female,62.0,143,3,29.0,0.326950,0.081865,...,0.326950,0.081865,0.076209,0.343969,0.086324,0.152417,0.339627,0.000000,0.633526,0.463567
3,Process-rec-004,dev,MCI,female,67.0,161,2,29.0,0.395836,0.141591,...,0.395836,0.141591,0.131807,0.475932,0.074651,0.197711,0.176222,0.206896,0.547861,0.400884
4,Process-rec-005,train,MCI,male,65.0,45,0,27.0,0.278998,0.523938,...,0.278998,0.523938,0.162578,0.000000,0.184158,0.650314,0.000000,0.000000,0.405457,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Process-rec-153,train,HC,male,63.0,112,0,28.0,0.087269,0.218513,...,0.087269,0.218513,0.101707,0.275434,0.230414,0.305121,0.090652,0.000000,0.760947,0.353527
153,Process-rec-154,train,HC,female,79.0,222,4,30.0,0.302879,0.189595,...,0.302879,0.189595,0.176495,0.424860,0.266562,0.176495,0.104874,0.123129,0.635791,0.357865
154,Process-rec-155,train,HC,male,86.0,91,0,29.0,0.000000,0.129056,...,0.000000,0.129056,0.240276,0.433797,0.272169,0.360414,0.000000,0.000000,0.599229,0.417592
155,Process-rec-156,train,Dementia,male,61.0,48,16,26.0,0.438701,0.366155,...,0.438701,0.366155,0.000000,0.000000,0.000000,0.170427,0.151904,0.178345,0.708386,0.296197


In [25]:
df.to_csv("Calculation_of_TF_IDF.csv", index = False)

## Calculate the missing converted MMSEs on behalf of TF and IDF using bert

In [26]:
# Custom dataset for BERT
class CustomDataset(Dataset):
    def __init__(self, texts, targets = None, tokenizer = None, max_len = 512):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens = True,
            max_length = self.max_len,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensors = 'pt',
        )
        
        ids = inputs['input_ids'].squeeze()
        mask = inputs['attention_mask'].squeeze()

        if self.targets is not None:
            target = torch.tensor(self.targets[index], dtype = torch.float)
            return { 'ids': ids, 'mask': mask, 'target': target }
        else:
            return { 'ids': ids, 'mask': mask }

In [27]:
# Function to train the BERT model
def train_model(train_dataset, model, tokenizer, device, learning_rate = 1e-5, epochs = 3):
        train_loader = DataLoader(train_dataset, batch_size = 4, shuffle = True)
        model.train()
        
        optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate)
        loss_fn = torch.nn.MSELoss()

        for epoch in range(epochs):
            for batch in train_loader:
                ids = batch['ids'].to(device)
                mask = batch['mask'].to(device)
                targets = batch['target'].to(device)
                outputs = model(input_ids = ids, attention_mask = mask)[0]
                optimizer.zero_grad()
                loss = loss_fn(outputs, targets)
                loss.backward()
                optimizer.step()
        return model

In [None]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Prepare training data
train_texts = results_df[~results_df['Converted-MMSE'].isna()]['Record-ID'].apply(lambda x: read_and_clean_text(f'{folder_path}/{x}/{x}__CTD.txt')).tolist()
train_targets = results_df[~results_df['Converted-MMSE'].isna()]['Converted-MMSE'].tolist()
train_dataset = CustomDataset(train_texts, train_targets, tokenizer)

# Train the BERT model
model = train_model(train_dataset, model, tokenizer, device)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


In [None]:
# Predict MMSE for missing values
test_texts = results_df[results_df['Converted-MMSE'].isna()]['Record-ID'].apply(lambda x: read_and_clean_text(f'{folder_path}/{x}/{x}__CTD.txt')).tolist()
test_dataset = CustomDataset(test_texts, tokenizer = tokenizer)
test_loader = DataLoader(test_dataset, batch_size = 4, shuffle = False)

model.eval()
predicted_mmse = []

for batch in test_loader:
    ids = batch['ids'].to(device)
    mask = batch['mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids = ids, attention_mask = mask)[0]
    predicted_mmse.extend(outputs.cpu().numpy())

In [None]:
# Update the DataFrame with the predicted MMSE values
results_df.loc[results_df['Converted-MMSE'].isna(), 'Converted-MMSE'] = predicted_mmse

In [None]:
results_df

In [None]:
# Speichere das Ergebnis
results_df.to_csv("final_results.csv", index = False)