In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llama-3.1/transformers/8b-instruct/1/model.safetensors.index.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/model-00003-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/1/config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/model-00001-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/1/README.md
/kaggle/input/llama-3.1/transformers/8b-instruct/1/USE_POLICY.md
/kaggle/input/llama-3.1/transformers/8b-instruct/1/tokenizer.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/tokenizer_config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/model-00004-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/1/special_tokens_map.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/model-00002-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/1/generation_config.json
/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv


# Fine-tuning Llama 3.1 on Mental Health Disorder Classification 


*We must load the dataset, process it, and fine-tune the Llama 3.1 model. We will also compare the model's performance before and after fine-tuning.*

## 1. Setting up
*First, we’ll start the new Kaggle notebook and Llama 3.1 model*

In [2]:
# *We will then install the necessary Python packages as outlined below :*
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m786.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
  

In [3]:
# We can then initiate the Weights and Biases project by using the API key.
import wandb

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune llama-3.1-8b-it on Sentiment Analysis Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmed-houbid[0m ([33mmed-houbid-enset-mohammedia[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Next, we need to import all the necessary Python packages and functions.
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

## 2. Loading and processing the dataset

In [5]:
# Now it’s time for us to load the dataset, perform data cleaning, and drop three ambiguous categories. 
df = pd.read_csv("/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv",index_col = "Unnamed: 0")
df.loc[:,'status'] = df.loc[:,'status'].str.replace('Bi-Polar','Bipolar')
df = df[(df.status != "Personality disorder") & (df.status != "Stress") & (df.status != "Suicidal")]
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [6]:
# To save training time, we will fine-tune the model on only 3000 samples.
# For that, we will shuffle the dataset and select 3000 rows. 

In [7]:
# Shuffle the DataFrame and select only 3000 rows
df = df.sample(frac=1, random_state=85).reset_index(drop=True).head(3000)

# Split the DataFrame
train_size = 0.8
eval_size = 0.1

# Calculate sizes
train_end = int(train_size * len(df))
eval_end = train_end + int(eval_size * len(df))

# Split the data
X_train = df[:train_end]
X_eval = df[train_end:eval_end]
X_test = df[eval_end:]

# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
            Classify the text into Normal, Depression, Anxiety, Bipolar, and return the answer as the corresponding mental health disorder label.
text: {data_point["statement"]}
label: {data_point["status"]}""".strip()

def generate_test_prompt(data_point):
    return f"""
            Classify the text into Normal, Depression, Anxiety, Bipolar, and return the answer as the corresponding mental health disorder label.
text: {data_point["statement"]}
label: """.strip()

# Generate prompts for training and evaluation data
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

# Generate test prompts and extract true labels
y_true = X_test.loc[:,'status']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)


In [8]:
# At this point, we want to check the distribution of categories in the train set. 
X_train.status.value_counts()

status
Normal        1028
Depression     938
Anxiety        258
Bipolar        176
Name: count, dtype: int64

In [9]:
# convert the train and eval set from pandas dataframe into the Hugging Face datasets. 
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [10]:
# display the 4th sample from the “text” column.
train_data['text'][3]

'Classify the text into Normal, Depression, Anxiety, Bipolar, and return the answer as the corresponding mental health disorder label.\ntext: I am so sad. Everything in my work life is going fine, but my personal life is a wreck. No one ever takes me seriously because I am the funny friend. I do not want to talk to anyone anymore. I just want to die sometimes. Please help me. I have never had this feeling in my entire life. Why am I so sad\nlabel: Depression'

## 3. Loading the model and tokenizer

In [11]:
# we want to load the Llama-3.1-8b-instruct model in 4-bit quantization to save the GPU memory. 

# We will then load the tokenizer and set the pad token id. 

base_model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# 4. Model evaluation before fine-tuning

In [12]:
def predict(test, model, tokenizer):
    y_pred = []
    categories = ["Normal", "Depression", "Anxiety", "Bipolar"]
    
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=2, 
                        temperature=0.1)
        
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("label:")[-1].strip()
        
        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")
    
    return y_pred

y_pred = predict(X_test, model, tokenizer)

100%|██████████| 300/300 [01:50<00:00,  2.72it/s]


In [13]:
# After, we create the evaluate function that will use the predicted labels and true labels 
# to calculate the overall accuracy of the model and the accuracy per category, 
# generate a classification report, and print out a confusion matrix.
# Running the function will give us a detailed model evaluation summary. 

def evaluate(y_true, y_pred):
    labels = ["Normal", "Depression", "Anxiety", "Bipolar"]
    mapping = {label: idx for idx, label in enumerate(labels)}
    
    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data
    
    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

evaluate(y_true, y_pred)

Accuracy: 0.797
Accuracy for label Normal: 0.755
Accuracy for label Depression: 0.930
Accuracy for label Anxiety: 0.593
Accuracy for label Bipolar: 0.533

Classification Report:
              precision    recall  f1-score   support

      Normal       0.90      0.76      0.82       143
  Depression       0.72      0.93      0.81       115
     Anxiety       0.73      0.59      0.65        27
     Bipolar       0.89      0.53      0.67        15

    accuracy                           0.80       300
   macro avg       0.81      0.70      0.74       300
weighted avg       0.81      0.80      0.79       300


Confusion Matrix:
[[108  32   3   0]
 [  4 107   3   1]
 [  6   5  16   0]
 [  2   5   0   8]]


## 5. Building the model

In [14]:
"""
When building the model, we start by extracting the linear module names from the model using the bits and bytes library. 
We then configure LoRA using the target modules, task type, and other arguments before setting up training arguments. 
These training arguments are optimized for the Kaggle notebook. You might need to change them if you are using them locally. 
"""

'\nWhen building the model, we start by extracting the linear module names from the model using the bits and bytes library. \nWe then configure LoRA using the target modules, task type, and other arguments before setting up training arguments. \nThese training arguments are optimized for the Kaggle notebook. You might need to change them if you are using them locally. \n'

In [15]:
# We will then create the model trainer using training arguments, a model, a tokenizer, a LoRA configuration, and a dataset. 
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
modules = find_all_linear_names(model)
modules

['gate_proj', 'down_proj', 'up_proj', 'k_proj', 'q_proj', 'o_proj', 'v_proj']

In [16]:
output_dir="llama-3.1-fine-tuned-model"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    logging_steps=1,                         
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="wandb",                  # report metrics to w&b
    eval_strategy="steps",              # save checkpoint every epoch
    eval_steps = 0.2
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=512,
    packing=False,
    dataset_kwargs={
    "add_special_tokens": False,
    "append_concat_token": False,
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


## 6. Model training

In [17]:
# It’s now time to initiate the model training:
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
60,1.655,2.180394
120,1.7006,2.151644
180,1.3567,2.137661
240,1.7423,2.12949
300,1.9255,2.126186


TrainOutput(global_step=300, training_loss=1.9099050623178482, metrics={'train_runtime': 3127.1476, 'train_samples_per_second': 0.767, 'train_steps_per_second': 0.096, 'total_flos': 1.5064971940184064e+16, 'train_loss': 1.9099050623178482, 'epoch': 1.0})

In [18]:
# we finish the weights and biases run.
wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▄▂▁▁
eval/runtime,▄█▄▁▃
eval/samples_per_second,▅▁▅█▆
eval/steps_per_second,█▁███
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▅▇█▂▂▃▂▃▁▃▂▂▂▂▂▂▁▁▂▃▃▃▄▂▄▁▂▃▂▂▂▂▄▂▁▂▂▂▂▃
train/learning_rate,▄▇██████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
train/loss,█▆▄▃▃▃▂▂▂▂▃▂▂▃▂▂▄▂▃▃▄▁▃▂▃▁▃▃▂▃▃▃▃▂▃▃▂▃▃▃

0,1
eval/loss,2.12619
eval/runtime,119.4628
eval/samples_per_second,2.511
eval/steps_per_second,0.318
total_flos,1.5064971940184064e+16
train/epoch,1.0
train/global_step,300.0
train/grad_norm,0.31326
train/learning_rate,0.0
train/loss,1.9255


In [19]:
model_dir = "Llama-3.1-8B-Instruct-Fine-Tuned-for-Mental-Health-Classification"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('llama-3.1-fine-tuned-model/tokenizer_config.json',
 'llama-3.1-fine-tuned-model/special_tokens_map.json',
 'llama-3.1-fine-tuned-model/tokenizer.json')

## 7. Testing model after fine-tuning

In [20]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████| 300/300 [02:36<00:00,  1.92it/s]

Accuracy: 0.907
Accuracy for label Normal: 0.972
Accuracy for label Depression: 0.913
Accuracy for label Anxiety: 0.630
Accuracy for label Bipolar: 0.733

Classification Report:
              precision    recall  f1-score   support

      Normal       0.91      0.97      0.94       143
  Depression       0.92      0.91      0.92       115
     Anxiety       0.74      0.63      0.68        27
     Bipolar       1.00      0.73      0.85        15

    accuracy                           0.91       300
   macro avg       0.89      0.81      0.85       300
weighted avg       0.91      0.91      0.90       300


Confusion Matrix:
[[139   3   1   0]
 [  5 105   5   0]
 [  7   3  17   0]
 [  1   3   0  11]]





In [25]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HUGGINGFACE_TOKEN")
secret_value_1 = user_secrets.get_secret("Pass")
secret_value_2 = user_secrets.get_secret("User")
secret_value_3 = user_secrets.get_secret("wandb")

In [28]:
# push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(output_dir, use_temp_dir=False ,token="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")
tokenizer.push_to_hub(output_dir, use_temp_dir=False ,token="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")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.81G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Houbid/llama-3.1-fine-tuned-model/commit/4911cd30c473305b28531896bc4967765bad4fc1', commit_message='Upload tokenizer', commit_description='', oid='4911cd30c473305b28531896bc4967765bad4fc1', pr_url=None, pr_revision=None, pr_num=None)