In [18]:
import os
import re
import torch
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertForSequenceClassification

In [19]:
df = pd.read_csv('Calculate_Word_Count_Total_Wait_Time.csv')
df.head()

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Word_Count_CTD,Total_Wait_Time_CTD,Word_Count_PFT,Total_Wait_Time_PFT,Word_Count_SFT,Total_Wait_Time_SFT,Converted-MMSE
0,Process-rec-001,train,MCI,male,62.0,210.0,3.0,30.0,23.0,23.0,32.0,25.0
1,Process-rec-002,dev,MCI,male,61.0,69.0,14.0,16.0,32.0,25.0,32.0,25.0
2,Process-rec-003,train,MCI,female,62.0,143.0,3.0,35.0,18.0,43.0,19.0,29.0
3,Process-rec-004,dev,MCI,female,67.0,161.0,2.0,36.0,20.0,41.0,19.0,29.0
4,Process-rec-005,train,MCI,male,65.0,45.0,0.0,41.0,13.0,62.0,2.0,27.0


In [20]:
# fix 66* string value in Age column
# df['Age'].value_counts()

def load_process(path):
    df = pd.read_csv(f'{path}/dem-info.csv')
    df['Age'] = df['Age'].apply(lambda x: x.replace('66*', '66'))
    df['Age'] = df['Age'].astype(int)

    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.txt'

    return df

## Select needed columns

In [21]:
# Fix columns needed
final_columns = [ "Record-ID", "TrainOrDev", "Class", "Gender", "Age", "Word_Count_CTD", "Total_Wait_Time_CTD", "Converted-MMSE" ]
results_df = df[final_columns]
results_df

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Word_Count_CTD,Total_Wait_Time_CTD,Converted-MMSE
0,Process-rec-001,train,MCI,male,62.0,210.0,3.0,25.0
1,Process-rec-002,dev,MCI,male,61.0,69.0,14.0,25.0
2,Process-rec-003,train,MCI,female,62.0,143.0,3.0,29.0
3,Process-rec-004,dev,MCI,female,67.0,161.0,2.0,29.0
4,Process-rec-005,train,MCI,male,65.0,45.0,0.0,27.0
...,...,...,...,...,...,...,...,...
152,Process-rec-153,train,HC,male,63.0,112.0,0.0,28.0
153,Process-rec-154,train,HC,female,79.0,222.0,4.0,30.0
154,Process-rec-155,train,HC,male,86.0,91.0,0.0,29.0
155,Process-rec-156,train,Dementia,male,61.0,48.0,16.0,26.0


## Calculate TF and IDF on behalf of word count and total wait time

In [44]:
# Function to read and clean text files
def read_and_clean_text(text_path, stop_words):
    with open(text_path, 'r') as file:
        content = file.read()

    # Remove numbers and filler words
    content = re.sub(r'\b\d+\b', '', content) # Remove numbers
    filler_words = ["second", "seconds"]
    words = content.split()
    content = ' '.join([word for word in words if word.lower() not in stop_words and word.lower() not in filler_words])
    return content

In [45]:
# Load text files and clean content
texts = []
record_ids = results_df['Record-ID'].tolist()
stop_words = text.ENGLISH_STOP_WORDS # Using Scikit-Learn's built-in stop words
for record_id in record_ids:
    text_path = f'{folder_path}/{record_id}/{record_id}__CTD.txt'
    texts.append(read_and_clean_text(text_path, stop_words))

# Calculate TF-IDF for the cleaned texts
vectorizer = TfidfVectorizer(max_features = 10) # Limit to top 10 frequent words tfidf_matrix
tfidf_matrix = vectorizer.fit_transform(texts)

# Integrate TF-IDF values into the DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns = vectorizer.get_feature_names_out())
tfidf_df['Record-ID'] = results_df['Record-ID']
results_df = pd.merge(results_df, tfidf_df, on = 'Record-ID')
results_df

MergeError: Passing 'suffixes' which cause duplicate columns {'sheâ_x', 'jar_x', 'seconds_x', 'cookie_x', 'thereâ_x', 'second_x', 'looks_x', 'sink_x'} is not allowed.

In [46]:
# Save table with filled values
results_df.to_csv("Calculation_of_TF_IDF.csv", index = False)