In [1]:
#FIRST TRY WITH RF
# 1. Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import nltk

# 2. Load the Datasets
dementia_df = pd.read_csv('Dementia_db.csv')
control_df = pd.read_csv('Control_db.csv')
testing_df = pd.read_csv('Testing_db.csv')  # This might be split further for validation

# 3. Data Preprocessing
combined_df = pd.concat([dementia_df.assign(label=1), control_df.assign(label=0)])
removing missing values
combined_df.dropna(inplace=True)

print(dementia_df.columns)
print(control_df.columns)

nltk.download('punkt') 
nltk.download('averaged_perceptron_tagger')

combined_df['unique_words'] = combined_df['Transcript'].apply(lambda x: len(set(nltk.word_tokenize(x))))
combined_df['total_words'] = combined_df['Transcript'].apply(lambda x: len(nltk.word_tokenize(x)))
combined_df['lexical_diversity'] = combined_df.apply(lambda row: row['unique_words'] / row['total_words'] if row['total_words'] > 0 else 0, axis=1)
combined_df['noun_count'] = combined_df['Transcript'].apply(lambda x: len([word for word, pos in nltk.pos_tag(nltk.word_tokenize(x)) if pos.startswith('NN')]))
combined_df['unique_words'] = combined_df['Transcript'].apply(lambda x: len(set(nltk.word_tokenize(x))))

#train Test Split
X_train, X_test, y_train, y_test = train_test_split(combined_df[['unique_words']], combined_df['label'], test_size=0.2, random_state=42)

# training a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# model Evaluation
predictions = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))

Index(['Language', 'Data', 'Participant', 'Age', 'Gender', 'Diagnosis',
       'Category', 'mmse', 'Filename', 'Transcript'],
      dtype='object')
Index(['Language', 'Data', 'Participant', 'Age', 'Gender', 'Diagnosis',
       'Category', 'mmse', 'Filename', 'Transcript'],
      dtype='object')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\juvic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\juvic\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Accuracy: 0.4090909090909091


In [None]:
pip install torch transformers pandas
--------------------------------------------------------------------------

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

dementia_df = pd.read_csv('Dementia_db.csv')
control_df = pd.read_csv('Control_db.csv')

# Add a 'Diagnosis' column to each dataset before concatenation
dementia_df['Diagnosis'] = 1  # Assuming 'Dementia' is labeled as 1
control_df['Diagnosis'] = 0  # Assuming 'Control' is labeled as 0

df = pd.concat([dementia_df, control_df]).sample(frac=1).reset_index(drop=True)


df.dropna(subset=['Transcript'], inplace=True)

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Transcript'], df['Diagnosis'], test_size=0.1)


In [3]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)


In [4]:
import torch

class DementiaDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Ensure labels are converted to a tensor correctly
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = DementiaDataset(train_encodings, list(train_labels))
val_dataset = DementiaDataset(val_encodings, list(val_labels))


In [5]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    eval_steps=2000,  
    save_steps=2000,  
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,0.6996,0.715447
2,0.6891,0.712044
3,0.6815,0.70746
4,0.7159,0.703706
5,0.6847,0.698005
6,0.6657,0.659799


TrainOutput(global_step=78, training_loss=0.6878250806759565, metrics={'train_runtime': 1261.8342, 'train_samples_per_second': 0.461, 'train_steps_per_second': 0.062, 'total_flos': 153130634219520.0, 'train_loss': 0.6878250806759565, 'epoch': 6.0})

In [16]:
test_df.dropna(subset=['Diagnosis'], inplace=True)

In [17]:
assert test_df['Diagnosis'].isna().sum() == 0, "There are still NaN values in the 'Diagnosis' column"

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
test_encodings = tokenizer(list(test_df['Transcript']), truncation=True, padding=True, max_length=512)
test_dataset = DementiaDataset(test_encodings, list(test_df['Diagnosis']))

# Evaluate the model
test_results = trainer.evaluate(test_dataset)
print(test_results)


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'eval_loss': 0.5942063331604004, 'eval_runtime': 16.602, 'eval_samples_per_second': 1.446, 'eval_steps_per_second': 0.181, 'epoch': 6.0}


In [21]:
from datasets import load_metric
import numpy as np
accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # Add this line
)
test_results = trainer.evaluate(test_dataset)
print(test_results)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'eval_loss': 0.5942063331604004, 'eval_accuracy': 0.9583333333333334, 'eval_runtime': 16.3804, 'eval_samples_per_second': 1.465, 'eval_steps_per_second': 0.183}
