<a href="https://colab.research.google.com/github/JKEVIN2010/LLMs-for-Dementia-Detection/blob/main/LLAMA2_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import regex as re
import pandas as pd

import torch
from datasets import Dataset, DatasetDict
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
# Second handle on the pipeline factory: a later cell rebinds the name
# `pipeline` to the pipeline object it creates.
from transformers import pipeline as hf_pipeline


In [None]:
def post_process(text):
    """Normalise one raw transcript string into a single lowercase line.

    The string is stripped and lowercased, a fixed sequence of literal
    substitutions is applied, runs of spaces are collapsed to one space,
    and finally every backslash is dropped.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned transcript string.
    """
    # Applied strictly in this order; each pair is (literal to find, replacement).
    # The final "//." entry can no longer match once every "/" has been
    # removed by the earlier entry, so it has no effect.
    substitutions = (
        ('\n', ' '),
        (" .", "."),
        (" ?", "?"),
        ("/.", ""),
        ("/", ""),
        (" ,", ","),
        ("//.", ""),
    )

    cleaned = text.strip().lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    # Collapse space runs only after the substitutions above.
    cleaned = re.sub(r' +', ' ', cleaned)
    return cleaned.replace('\\', '')

# Raw table read from the CSV in the working directory.
ds = pd.read_csv("cookie.csv")
# Placeholder frame: one (initially empty) row per CSV row, columns text/label.
df = pd.DataFrame(index=range(len(ds)), columns=['text', 'label'])

In [None]:
# Build the cleaned frame in one step instead of writing it row by row.
# Columns are taken by position: column 1 is the transcript (cleaned with
# post_process) and column 2 is the label; column 0 is not used.
df = pd.DataFrame({
    'text': ds.iloc[:, 1].map(post_process),
    'label': ds.iloc[:, 2],
})

# First 80% of the rows (in file order) for training, the rest for validation.
# NOTE(review): the split is positional and unshuffled, so it depends on the
# row order of the CSV — confirm this matches the split used for fine-tuning.
split = int(len(df) * 0.8)
df_train = df.iloc[:split]
df_val = df.iloc[split:]

In [None]:
# Class index -> answer phrase used in the prompt.
id_to_label = {0:'diseased subject', 1:'healthy subject'}
# Instruction shown before every transcript (the trailing space is part of it).
template = "### Human: The given speech transcript is either from a healthy subject or a diseased subject. Categorize it as one of them. "


def _build_instructions(frame):
    """Render one prompt-plus-answer string per row of `frame`.

    `frame` must expose `.text` and `.label`; each label is looked up in
    `id_to_label` to obtain the answer phrase.
    """
    return [
        f"{template}\nTranscript: {x}\n\n ### Assistant: {id_to_label[y]}"
        for x, y in zip(frame.text, frame.label)
    ]


train_instructions = _build_instructions(df_train)
val_instructions = _build_instructions(df_val)

In [None]:
# Wrap each list of prompt strings in a single-column ("text") Dataset.
ds_train, ds_val = (
    Dataset.from_dict({"text": texts})
    for texts in (train_instructions, val_instructions)
)
# Bundle both under the split names "train" and "eval".
instructions_ds_dict = DatasetDict({"train": ds_train, "eval": ds_val})

In [None]:
# Show the first training prompt as a sanity check of the template.
print(instructions_ds_dict['train'][0]['text'])

### Human: The given speech transcript is either from a healthy subject or a diseased subject. Categorize it as one of them. 
Transcript: well â€¡ the stool is falling over. the boy is taking cookies out of the cookie jar. the little girl is reaching for a cookie and holding her arm up. the mother is doing dishes. the water is spilling out of the sink. and the mothers standing in the water. the mother is looking out of the window. the waters coming out of the faucet. that is about all i can get out of that.

 ### Assistant: diseased subject


In [None]:
# Tokenizer comes from the hub checkpoint; model weights come from the local
# "./ft" directory.
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = LlamaForCausalLM.from_pretrained(
    "./ft",
    load_in_8bit=False,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Put the model into evaluation mode before generating.
model.eval()

# The result is stored under the name `pipeline` because the generation cell
# calls it by that name. It is built through the `hf_pipeline` alias so that
# re-running this cell still reaches the transformers factory instead of the
# pipeline object created by a previous run.
pipeline = hf_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# Read the evaluation 'text' column once rather than once per row.
eval_texts = instructions_ds_dict['eval']['text']

# Cut each example just before its gold answer, then re-append the bare
# assistant marker so the model has to produce the answer itself.
queries = [text.split('### Assistant: ')[0] + '### Assistant:' for text in eval_texts]

# NOTE(review): no sampling flag or seed is set here, so whether repeated runs
# give identical generations depends on the checkpoint's generation settings —
# confirm. max_new_tokens=3 is presumably sized to the answer phrase — verify.
sequences = pipeline(
    queries,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=3,
    early_stopping=True
)

In [None]:
# Text following the assistant marker in each generation (first returned
# sequence per query).
results = [seq[0]['generated_text'].split('### Assistant:')[1] for seq in sequences]

# Gold answers: the same slice taken from the full evaluation examples.
labels = [text.split('### Assistant:')[1] for text in instructions_ds_dict['eval']['text']]

# A prediction counts as a hit when the gold answer is a substring of it.
n_hits = sum(1 for x, y in zip(results, labels) if y in x)
print("Accuracy: ", (n_hits / len(labels)))

Accuracy:  0.8018018018018018


In [None]:
# Map answer text to a class index: 1 when it mentions 'healthy', otherwise 0.
# Any generation without 'healthy' therefore counts as class 0.
results_idx = [1 if 'healthy' in result else 0 for result in results]
labels_idx = [1 if 'healthy' in label else 0 for label in labels]

In [None]:
# NOTE(review): consider moving this import up to the notebook's import cell.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# sklearn takes (y_true, y_pred): gold labels first, predictions second.
# With the arguments reversed the matrix is transposed and fp/fn trade places.
# labels=[0, 1] forces a 2x2 matrix so the four-way unpack also works when one
# class is absent from the data.
tn, fp, fn, tp = confusion_matrix(labels_idx, results_idx, labels=[0, 1]).ravel()
# Specificity = share of true class-0 examples that were predicted as class 0.
specificity = tn / (tn+fp)

In [None]:
# All sklearn scorers take (y_true, y_pred). Accuracy and F1 are unaffected by
# the argument order, but precision and recall swap if it is reversed.
# Class 1 ('healthy') is the positive class for precision/recall/F1.
# NOTE(review): confirm that 'healthy' rather than 'diseased' is the intended
# positive class for the reported sensitivity.
print("Accuracy:           {:.4f}".format(accuracy_score(labels_idx, results_idx)))
print("F1 Score:           {:.4f}".format(f1_score(labels_idx, results_idx)))
print("Precision:          {:.4f}".format(precision_score(labels_idx, results_idx)))
print("Specificity:        {:.4f}".format(specificity))
print("Recall/Sensitivity: {:.4f}".format(recall_score(labels_idx, results_idx)))

Accuracy:           0.8018
F1 Score:           0.8281
Precision:          0.9138
Specificity:        0.8780
Recall/Sensitivity: 0.7571
