<a href="https://colab.research.google.com/github/JKEVIN2010/LLMs-for-Dementia-Detection/blob/main/Dementia_GPT_2_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import TextDataset, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TextDataset, DataCollatorWithPadding, Trainer, TrainingArguments
import re
import nltk

In [None]:
class DementiaTextDataset(Dataset):
    """Fixed-length classification dataset built from tokenizer output.

    Every item is truncated or right-padded to ``block_size`` tokens, so all
    items have the same length.

    Args:
        encodings: Mapping with ``"input_ids"`` and ``"attention_mask"``
            entries, each indexable by sample position and yielding a
            sequence of ints.
        labels: Sequence of integer class labels, one per sample.
        block_size: Number of tokens every returned item will contain.
        pad_token_id: Token id used to fill ``input_ids`` up to
            ``block_size``. When omitted, the id is looked up on a
            module-level ``tokenizer`` object for backward compatibility
            with existing notebook cells; passing it explicitly is preferred
            because it removes the dependency on cell execution order.
    """

    def __init__(self, encodings, labels, block_size=384, pad_token_id=None):
        self.encodings = encodings
        self.labels = labels
        self.block_size = block_size
        self.pad_token_id = pad_token_id

    def _resolve_pad_token_id(self):
        """Return the padding id, falling back to the notebook-level tokenizer.

        Raises:
            ValueError: If no ``pad_token_id`` was supplied and no usable
                module-level ``tokenizer`` with a pad token is defined.
        """
        if self.pad_token_id is not None:
            return self.pad_token_id
        # Legacy path: earlier versions of this class read the global directly.
        legacy_tokenizer = globals().get("tokenizer")
        legacy_pad_id = getattr(legacy_tokenizer, "pad_token_id", None)
        if legacy_pad_id is None:
            raise ValueError(
                "pad_token_id was not provided and no module-level `tokenizer` "
                "with a pad token is available; pass pad_token_id explicitly."
            )
        return legacy_pad_id

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors.

        The dict holds ``input_ids`` and ``attention_mask`` (both of length
        ``block_size``) and the scalar ``labels`` tensor.
        """
        input_ids = list(self.encodings["input_ids"][idx][:self.block_size])
        attention_mask = list(self.encodings["attention_mask"][idx][:self.block_size])

        # Pad only when the (possibly truncated) sequence is shorter than the
        # block; padded positions get attention 0.
        padding_length = self.block_size - len(input_ids)
        if padding_length > 0:
            pad_id = self._resolve_pad_token_id()
            input_ids = input_ids + [pad_id] * padding_length
            attention_mask = attention_mask + [0] * padding_length

        return {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "labels": torch.tensor(self.labels[idx]),
        }

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)


In [None]:
# Load and preprocess your data
def _read_samples(file_path):
    """Return the non-blank lines of ``file_path`` without their line endings."""
    # Explicit UTF-8 keeps the result independent of the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.rstrip('\r\n') for line in file if line.strip()]


def load_data(dementia_file_path, non_dementia_file_path):
    """Load one-sample-per-line text files into a labelled DataFrame.

    Args:
        dementia_file_path: Path to the text file of dementia samples.
        non_dementia_file_path: Path to the text file of non-dementia samples.

    Returns:
        A ``pandas.DataFrame`` with a ``"text"`` column (one row per
        non-blank line, dementia samples first) and a ``"label"`` column
        (1 for the dementia file, 0 for the non-dementia file).

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    dementia_data = _read_samples(dementia_file_path)
    non_dementia_data = _read_samples(non_dementia_file_path)

    # Blank lines are skipped above so they cannot become empty, labelled samples.
    data = {
        "text": dementia_data + non_dementia_data,
        "label": [1] * len(dementia_data) + [0] * len(non_dementia_data),
    }
    return pd.DataFrame(data)

def compute_metrics(eval_pred):
    """Compute binary classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            arg-max of the logits along the last axis.

    Returns:
        Dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    # The fourth element returned here (support) is not reported.
    prf = precision_recall_fscore_support(gold, predicted, average="binary")
    metrics = {"accuracy": accuracy_score(gold, predicted)}
    metrics.update(zip(("precision", "recall", "f1"), prf[:3]))
    return metrics

In [None]:
def preprocess_text(text):
    """Lowercase ``text`` and replace each non-alphanumeric character with a space.

    Characters outside ASCII letters and digits (punctuation, whitespace,
    newlines, accented letters) are each replaced by a single space; runs of
    spaces are not collapsed.
    """
    lowered = text.lower()
    non_alnum = re.compile('[^a-zA-Z0-9]')
    return non_alnum.sub(' ', lowered)

In [None]:
# Input files, given relative to the notebook's working directory.
# load_data expects one sample per line in each file.
dementia_file_path = "dementia_samples.txt"
non_dementia_file_path = "non_dementia_samples.txt"
# Combined frame with a "text" column and a "label" column
# (1 = dementia file, 0 = non-dementia file).
df = load_data(dementia_file_path, non_dementia_file_path)

# Display the labelled frame as the cell output
df

In [None]:
# Clean the transcripts with preprocess_text (lowercase, then replace every
# non-alphanumeric character with a space). The "text" column is overwritten,
# so the raw text is no longer available in `df` after this cell.
df['text'] = df['text'].apply(preprocess_text)
# Display the cleaned frame as the cell output
df

Unnamed: 0,text,label
0,mhm alright there is a young boy that is g...,1
1,mhm there is a young boy going in a cookie j...,1
2,heres a cookie jar and the lid is off the co...,1
3,the boy is slipping off the stool he is tryi...,1
4,okay he is falling off a stool she is runnin...,1
...,...,...
547,well the little girl is reaching for a cookie ...,0
548,mhm mhm a lot of things are happening yes ...,0
549,alright the little girls reaching up there a...,0
550,okay well in the first place the mother forg...,0


In [None]:
# Initialize GPT-2 tokenizer and model (two output classes)
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Reuse the end-of-sequence token as the padding token, and register the same
# id on the model config so the model knows which id marks padding.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Hold out 20% of the samples for validation (fixed random_state so the split
# is the same on every run).
train_texts, val_texts, train_labels, val_labels = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)
train_texts = train_texts.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
train_labels = train_labels.to_list()
val_labels = val_labels.to_list()


# Maximum number of tokens kept per sample; longer transcripts are truncated.
block_size = 384
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=block_size)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=block_size)

# Create PyTorch datasets. block_size is forwarded explicitly so the dataset's
# fixed item length always matches the tokenizer's truncation length instead of
# relying on the class default happening to be the same number.
train_dataset = DementiaTextDataset(train_encodings, train_labels, block_size=block_size)
val_dataset = DementiaTextDataset(val_encodings, val_labels, block_size=block_size)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define training arguments and instantiate Trainer.
# NOTE(review): newer transformers releases reportedly rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version before upgrading.
training_args = TrainingArguments(
    # Directory that receives the per-epoch checkpoints.
    output_dir='./GPT_results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # After training, reload the checkpoint with the best "accuracy" value
    # (the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# No data_collator is passed: every item from DementiaTextDataset is already
# padded/truncated to the same length, so presumably the Trainer's default
# collation can batch them as-is.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Train the model (3 epochs, evaluated on the validation set after each one)
trainer.train()

# Evaluate the model on the validation set.
# Because load_best_model_at_end=True, this runs on the reloaded best
# checkpoint rather than the final epoch's weights; in the recorded output the
# reported eval_loss matches epoch 2, the epoch with the highest accuracy.
eval_results = trainer.evaluate()
print(eval_results)




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9238,0.542902,0.72973,0.843137,0.661538,0.741379
2,0.6063,0.46318,0.765766,0.791045,0.815385,0.80303
3,0.428,0.481881,0.756757,0.787879,0.8,0.793893


{'eval_loss': 0.46317964792251587, 'eval_accuracy': 0.7657657657657657, 'eval_precision': 0.7910447761194029, 'eval_recall': 0.8153846153846154, 'eval_f1': 0.803030303030303, 'eval_runtime': 53.6049, 'eval_samples_per_second': 2.071, 'eval_steps_per_second': 0.261, 'epoch': 3.0}


In [None]:
# Persist the fine-tuned model to the local "dementia_gpt2" directory.
# NOTE(review): the tokenizer is never handed to the Trainer in this notebook,
# so it is presumably not written here — confirm, and save it separately if the
# directory must be loadable on its own.
trainer.save_model("dementia_gpt2")