In [5]:
import pandas as pd

control_df = pd.read_csv("Control_db.csv")
dementia_df = pd.read_csv("Dementia_db.csv")
test_df = pd.read_csv("Testing_db.csv")


# Combine the datasets
df = pd.concat([control_df, dementia_df], ignore_index=True)

df = df.sample(frac=1)
df

Unnamed: 0,Language,Data,Participant,Age,Gender,Diagnosis,Category,mmse,Filename,Transcript
70,eng,Pitt,PAR,68,female,ProbableAD,1,14.0,S100,+< okay . &uh there's <a cook> [//] a [/] &b ...
32,eng,Pitt,PAR,57,female,Control,0,30.0,S041,water's pouring out_of the sink . and the wom...
78,eng,Pitt,PAR,66,male,Vascular,1,16.0,S114,mhm . well the kids is [* m:a] robbin(g) a co...
35,eng,Pitt,PAR,65,male,Control,0,30.0,S049,there's a child reaching for a cookie . the c...
43,eng,Pitt,PAR,64,male,Control,0,30.0,S062,"well, there's a kid stealin(g) cookies from t..."
...,...,...,...,...,...,...,...,...,...,...
84,eng,Pitt,PAR,73,male,ProbableAD,1,13.0,S126,(. (loo)ks like somebody took some pencils or...
17,eng,Pitt,PAR,64,female,Control,0,30.0,S021,&um the boy is taking &uh cookies . &uh the g...
103,eng,Pitt,PAR,58,male,ProbableAD,1,20.0,S150,well the boy on the chair [: stool] [* s:r] i...
22,eng,Pitt,PAR,72,female,Control,0,29.0,S029,action . +< alright . a lady's drying dishes ...


In [6]:
from sklearn.model_selection import train_test_split

X_train = df['Transcript']  # Features
y_train = df['Category']       # 0 means no AD, 1 means AD

X_test = test_df['Transcript'] # From test_df so different than train data
y_test = test_df['Category']


In [7]:
print(X_train.head()) 
print("----------")
print(y_train.head())

70     +< okay . &uh there's <a cook> [//] a [/] &b ...
32     water's pouring out_of the sink . and the wom...
78     mhm . well the kids is [* m:a] robbin(g) a co...
35     there's a child reaching for a cookie . the c...
43     well, there's a kid stealin(g) cookies from t...
Name: Transcript, dtype: object
----------
70    1
32    0
78    1
35    0
43    0
Name: Category, dtype: int64


In [8]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512)

'''The result, train_encodings and test_encodings are dictionaries containing tokenized 
representations of the training and testing text data, respectively, suitable for input 
into a RoBERTa model for training or prediction. --Details on Notion'''

'The result, train_encodings and test_encodings are dictionaries containing tokenized \nrepresentations of the training and testing text data, respectively, suitable for input \ninto a RoBERTa model for training or prediction. --Details on Notion'

In [9]:
import torch

class DementiaDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = DementiaDataset(train_encodings, y_train.tolist())
test_dataset = DementiaDataset(test_encodings, y_test.tolist())


In [10]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='/content/result',          # Directory where the model predictions and checkpoints will be written.
    num_train_epochs=9,              # Total number of training epochs to perform.
    per_device_train_batch_size=8,   # Batch size per device during training.
    per_device_eval_batch_size=8,    # Batch size for evaluation.
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler.
    weight_decay=0.01,               # Strength of weight decay.
    logging_dir='/content/logs',            # Directory for storing logs.
    logging_steps=1,                # Log every X updates steps.
    evaluation_strategy="steps",     # Evaluation is done (and logged) every X steps.
    eval_steps=2000,                  # Number of steps to perform evaluation.
    save_strategy="steps",           # The checkpoint save strategy to adopt during training.
    save_steps=2000                   # Save checkpoint every X steps.
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics 
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


TrainOutput(global_step=126, training_loss=0.5616350000546801, metrics={'train_runtime': 2521.519, 'train_samples_per_second': 0.385, 'train_steps_per_second': 0.05, 'total_flos': 255743945809920.0, 'train_loss': 0.5616350000546801, 'epoch': 9.0})

In [13]:
import numpy as np
trainer.evaluate()

{'eval_loss': 0.8392927646636963,
 'eval_accuracy': 0.7708333333333334,
 'eval_f1': 0.7317073170731708,
 'eval_precision': 0.8823529411764706,
 'eval_recall': 0.625,
 'eval_runtime': 36.1828,
 'eval_samples_per_second': 1.327,
 'eval_steps_per_second': 0.166,
 'epoch': 9.0}

In [14]:
trainer.save_model('/content/model')

In [15]:
tokenizer.save_pretrained('/content/tokenizer')


('/content/tokenizer\\tokenizer_config.json',
 '/content/tokenizer\\special_tokens_map.json',
 '/content/tokenizer\\vocab.json',
 '/content/tokenizer\\merges.txt',
 '/content/tokenizer\\added_tokens.json')

In [16]:
!zip -r /content/model_and_tokenizer.zip /content/result


  adding: content/result/ (192 bytes security) (stored 0%)


In [17]:
from google.colab import files

files.download('/content/model_and_tokenizer.zip')


ModuleNotFoundError: No module named 'google.colab'

In [3]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt


In [18]:
import numpy as np
from sklearn.metrics import accuracy_score

predictions = trainer.predict(test_dataset)
pred_probs = torch.softmax(torch.tensor(predictions.predictions), dim=1).numpy()
pred_labels = np.argmax(pred_probs, axis=1)

true_labels = [label for label in test_dataset.labels]  
accuracy = accuracy_score(true_labels, pred_labels)
print(f"Accuracy: {accuracy}")


Accuracy: 0.7708333333333334


In [None]:
import torch

# Apply softmax to convert logits to probabilities
pred_probs = torch.softmax(torch.tensor(predictions.predictions), dim=1).numpy()

# Convert probabilities to predicted class indices
pred_labels = np.argmax(pred_probs, axis=1)


In [None]:
# Get true labels from the test dataset
true_labels = test_dataset.labels  # Adjust this according to how your labels are stored

# Compute the confusion matrix
cm = confusion_matrix(true_labels, pred_labels)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.show()
