In [27]:
import re
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification

In [25]:
df = pd.read_csv('Calculate_Word_Count_Total_Wait_Time.csv')
df.head()

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Word_Count_CTD,Total_Wait_Time_CTD,Word_Count_PFT,Total_Wait_Time_PFT,Word_Count_SFT,Total_Wait_Time_SFT,Converted-MMSE
0,Process-rec-001,train,MCI,male,62.0,210.0,3.0,30.0,23.0,23.0,32.0,25.0
1,Process-rec-002,dev,MCI,male,61.0,69.0,14.0,16.0,32.0,25.0,32.0,25.0
2,Process-rec-003,train,MCI,female,62.0,143.0,3.0,35.0,18.0,43.0,19.0,29.0
3,Process-rec-004,dev,MCI,female,67.0,161.0,2.0,36.0,20.0,41.0,19.0,29.0
4,Process-rec-005,train,MCI,male,65.0,45.0,0.0,41.0,13.0,62.0,2.0,27.0


In [19]:
# fix 66* string value in Age column
# df['Age'].value_counts()

df['Class'].value_counts()
def load_process(path):
    df = pd.read_csv(f'{path}/dem-info.csv')
    
    # fix 66* string value in Age column
    df['Age'] = df['Age'].apply(lambda x: x.replace('66*', '66'))
    df['Age'] = df['Age'].astype(int)
    
    # get_file_names
    for ext in ["CTD", "PFT", 'SFT']:
        df[f'{ext}_wav'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.wav'
        df[f'{ext}_txt'] = f'{path}/' + df['Record-ID'] + '/' + df['Record-ID'] + f'__{ext}.txt'

    # fix 66* string value in Age column
    df_train = df.loc[df['TrainOrDev'] == 'train']
    df_dev = df.loc[df['TrainOrDev'] == 'dev']
    return df_train, df_dev

df_train, df_dev = load_process("../PROCESS-V1/")

## Select needed columns

In [26]:
# Fix columns needed
final_columns = [ "Record-ID", "TrainOrDev", "Class", "Gender", "Age", "Word_Count_CTD", "Total_Wait_Time_CTD", "Converted-MMSE" ]
results_df = df[final_columns]
results_df

Unnamed: 0,Record-ID,TrainOrDev,Class,Gender,Age,Word_Count_CTD,Total_Wait_Time_CTD,Converted-MMSE
0,Process-rec-001,train,MCI,male,62.0,210.0,3.0,25.0
1,Process-rec-002,dev,MCI,male,61.0,69.0,14.0,25.0
2,Process-rec-003,train,MCI,female,62.0,143.0,3.0,29.0
3,Process-rec-004,dev,MCI,female,67.0,161.0,2.0,29.0
4,Process-rec-005,train,MCI,male,65.0,45.0,0.0,27.0
...,...,...,...,...,...,...,...,...
152,Process-rec-153,train,HC,male,63.0,112.0,0.0,28.0
153,Process-rec-154,train,HC,female,79.0,222.0,4.0,30.0
154,Process-rec-155,train,HC,male,86.0,91.0,0.0,29.0
155,Process-rec-156,train,Dementia,male,61.0,48.0,16.0,26.0


## Calculate the missing converted MMSEs using bert

In [28]:
# Custom dataset for BERT
class CustomDataset(Dataset):
    def __init__(self, texts, targets = None, tokenizer = None, max_len = 512):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt',
        )
        
        ids = inputs['input_ids'].squeeze()
        mask = inputs['attention_mask'].squeeze()

        if self.targets is not None:
            target = torch.tensor(self.targets[index], dtype = torch.float)
            return { 'ids': ids, 'mask': mask, 'target': target }
        else:
            return { 'ids': ids, 'mask': mask }

In [29]:
# Function to train the BERT model
def train_model(train_dataset, model, tokenizer, device, learning_rate = 1e-5, epochs = 3):
        train_loader = DataLoader(train_dataset, batch_size = 4, shuffle = True)
        model.train()
        
        optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate)
        loss_fn = torch.nn.MSELoss()

        for epoch in range(epochs):
            for batch in train_loader:
                ids = batch['ids'].to(device)
                mask = batch['mask'].to(device)
                targets = batch['target'].to(device)
                outputs = model(input_ids = ids, attention_mask = mask)[0].squeeze()
                optimizer.zero_grad()
                loss = loss_fn(outputs, targets)
                loss.backward()
                optimizer.step()
        return model

In [22]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [30]:
# Prepare training data
train_texts = df[~df['Converted-MMSE'].isna()]['Record-ID'].tolist()
train_targets = df[~df['Converted-MMSE'].isna()]['Converted-MMSE'].tolist()
train_dataset = CustomDataset(train_texts, train_targets, tokenizer)

In [None]:
# Train the BERT model
model = train_model(train_dataset, model, tokenizer, device)
model

  return F.mse_loss(input, target, reduction=self.reduction)


In [None]:
# Predict MMSE for missing values
test_texts = df_dev[df_dev['Converted-MMSE'].isna()]['Record-ID'].tolist()
test_dataset = CustomDataset(test_texts, tokenizer = tokenizer)
test_loader = DataLoader(test_dataset, batch_size = 4, shuffle = False)

model.eval()
predicted_mmse = []

for batch in test_loader:
    ids = batch['ids'].to(device)
    mask = batch['mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids = ids, attention_mask = mask)[0].squeeze()
    predicted_mmse.extend(outputs.cpu().numpy())

In [None]:
# Update the DataFrame with the predicted MMSE values
predicted_mmse = [int(round(pred)) for pred in predicted_mmse]
results_df.loc[results_df['Converted-MMSE'].isna
results_df

In [None]:
# Save table with filled values
results_df.to_csv("final_results.csv", index = False)