In [1]:
%%capture
# Install Unsloth from its GitHub repository (with the `colab-new` extra), then
# install xformers, trl, peft, accelerate and bitsandbytes with `--no-deps` so
# pip does not install or replace their dependencies.
# `%%capture` on the first line hides the pip output of this cell.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
# Print the metadata (version, summary, license, ...) of the installed Unsloth package.
!pip show unsloth

Name: unsloth
Version: 2024.8
Summary: 2-5X faster LLM finetuning
Home-page: http://www.unsloth.ai
Author: Unsloth AI team
Author-email: info@unsloth.ai
License: Apache License
                                   Version 2.0, January 2004
                                http://www.apache.org/licenses/
        
           TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
        
           1. Definitions.
        
              "License" shall mean the terms and conditions for use, reproduction,
              and distribution as defined by Sections 1 through 9 of this document.
        
              "Licensor" shall mean the copyright owner or entity authorized by
              the copyright owner that is granting the License.
        
              "Legal Entity" shall mean the union of the acting entity and all
              other entities that control, are controlled by, or are under common
              control with that entity. For the purposes of this definition,
     

In [3]:
# Install FlashAttention (`flash-attn`). No version is pinned here, so pip
# resolves whichever release is current when the cell runs.
!pip install flash-attn

Collecting flash-attn
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->flash-attn)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->flash-attn)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->flash-attn)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->flash-attn)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->flash-attn)
  Using cached nvidia_cublas_cu12-

In [4]:
# Print the version of the CUDA compiler (nvcc) available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [5]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers import pipeline

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [6]:
# Loading options for the base model.
#   load_in_4bit   - load the weights 4-bit quantized to reduce memory usage.
#   dtype          - None asks Unsloth to auto-detect the dtype for the GPU.
#   max_seq_length - requested context length; the same value is passed to the
#                    trainer later in the notebook.
load_in_4bit = True
dtype = None
max_seq_length = 8192

# Gated checkpoints would additionally need a `token="hf_..."` argument here.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/tinyllama-bnb-4bit",
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: unsloth/tinyllama-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 4.0, it can be magically be extended to 8192!


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/948 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [7]:
# Wrap the loaded model with LoRA adapters (rank 16, alpha 16) on the attention
# and MLP projection layers. Dropout 0 and bias "none" are the settings the
# Unsloth template describes as optimized.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth's checkpointing mode (True is the alternative)
    use_rslora=False,     # rank-stabilized LoRA disabled
    loftq_config=None,    # LoftQ disabled
    random_state=3407,
)

Unsloth 2024.8 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


In [9]:
import pandas as pd
import numpy as np
# Load the training data. Later cells read the "Conversation" and
# "GPT Response" columns from this frame.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, presumably an
# index that was written into the CSV; `index_col=0` would absorb it. Confirm.
df = pd.read_csv("/content/SOAP_notes_Training_Aug5.csv")
# Bare expression: renders the DataFrame as this cell's output.
df

Unnamed: 0,Conversation,GPT Response
0,The right kidney is slightly hyperechoic with ...,Subjective: - Chief Complaint: Patient prese...
1,"Hello, I have upper abdominal pain that radia...",Subjective: CC: Upper abdominal pain radiating...
2,"hello sir, almost 2 years has passed but my wi...",Subjective: Chief Complaint (CC): Concerns of ...
3,My three year old son has white crusty scabs o...,Subjective: - Chief Complaint (CC): White crus...
4,I got my two lower wisdom teeth removed a week...,Subjective: - Chief Complaint (CC): Patient is...
...,...,...
7309,Customer: Im really struggling with my current...,The given conversation is not a doctor-patient...
7310,Customer: Im having trouble finding a solution...,The given conversation is not a doctor-patient...
7311,Customer: Im really struggling to manage my fi...,The given conversation is not a doctor-patient...
7312,Customer: Im really struggling with managing m...,The given conversation is not a doctor-patient...


In [10]:
# Drop every row that has a missing value in any column and rebind `df` to the result.
df = df.dropna()

In [11]:
# Count the missing values left in each column of `df`.
df.isnull().sum()

Unnamed: 0,0
Conversation,0
GPT Response,0


In [12]:
# Alpaca-style training prompt. The first `{}` receives the doctor-patient
# conversation and the second `{}` receives the SOAP notes (the target text).
alpaca_prompt = """Below is a Doctor-patient conversation. Based on this conversation, create SOAP notes.

### Doctor-Patient Conversation:
{}

### SOAP Notes:
{}"""

# End-of-sequence token string of the loaded tokenizer; it is appended to every
# formatted training example.
EOS_TOKEN = tokenizer.eos_token

# Batch formatter used with `Dataset.map(..., batched=True)`
def formatting_prompts_func(examples):
    """Render a batch of rows into full training prompts.

    Args:
        examples: Mapping with parallel sequences under "Conversation" and
            "GPT Response".

    Returns:
        A dict with one key, "text", holding one rendered prompt per row.
        Each prompt is `alpaca_prompt` filled with the conversation and the
        response, followed by `EOS_TOKEN` so the model learns where to stop.
    """
    rendered = [
        alpaca_prompt.format(dialogue, notes) + EOS_TOKEN
        for dialogue, notes in zip(examples["Conversation"], examples["GPT Response"])
    ]
    return { "text" : rendered }

# Build a Hugging Face Dataset that holds only the two columns the prompt uses.
from datasets import Dataset

dataset_dict = {
    column: df[column].tolist() for column in ("Conversation", "GPT Response")
}
dataset = Dataset.from_dict(dataset_dict)

# Add the rendered "text" column; rows are passed to the formatter in batches.
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/7313 [00:00<?, ? examples/s]

In [13]:
# Display the unformatted dataset; the rendered "text" column is on
# `formatted_dataset`, not on this object.
dataset

Dataset({
    features: ['Conversation', 'GPT Response'],
    num_rows: 7313
})

In [14]:
def predict_soap_notes(input_conversation):
  """Generate SOAP notes for a single doctor-patient conversation.

  Uses the notebook-level `model` and `tokenizer`. The tokens are also
  streamed to stdout while they are being generated.

  Args:
    input_conversation: Conversation text that is inserted into the prompt.

  Returns:
    The generated notes as one string with newlines removed. Only the text
    produced after the prompt is returned (the prompt, including its
    "### SOAP Notes:" header, is not part of the result), cut at the first
    "###" the model emits, if any.
  """
  # `TextStreamer` is not among the notebook's top-level imports; importing it
  # here avoids the NameError the function would otherwise raise.
  from transformers import TextStreamer

  # NOTE(review): this instruction text differs from the module-level
  # `alpaca_prompt` used to format the training data, and the section headers
  # below carry two leading spaces because the literal is indented with the
  # function body. Confirm the mismatch with the training prompt is intended.
  alpaca_prompt = """You are tasked with creating detailed SOAP notes from the information given in a provided doctor-patient conversation. It is imperative that you do not add any additional information, make assumptions, or include recommendations and suggestions. Focus strictly on parsing the details provided in the conversation to generate SOAP notes that reflect only the factual content discussed between the doctor and patient.

  ### Doctor-Patient Conversation:
  {}

  ### SOAP Notes:
  {}"""

  # Tokenize the prompt with the response slot left empty so the model fills it in.
  inputs = tokenizer(
      [alpaca_prompt.format(input_conversation, "")],
      return_tensors="pt"
  ).to("cuda")

  # Echo tokens to stdout as they are generated.
  text_streamer = TextStreamer(tokenizer)

  outputs = model.generate(
      **inputs,
      streamer=text_streamer,
      max_new_tokens=512
  )

  # `generate` returns the prompt tokens followed by the new tokens. Decode only
  # the new ones instead of splitting the full text on "###", which picked the
  # wrong section whenever the conversation itself contained "###".
  prompt_length = inputs["input_ids"].shape[1]
  generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

  # Keep only the first generated section, as the original "###" split did.
  first_section = generated_text.split("###", 1)[0]
  result_text = first_section.replace("\n","").replace("<|end_of_text|>","")
  return result_text


In [15]:
#@title Show current memory stats
# Report the name and total memory of CUDA device 0 together with the maximum
# memory reserved so far; both figures are converted from bytes by dividing by
# 1024**3 and rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
0.822 GB of memory reserved.


In [None]:
import mlflow
import mlflow.data
from transformers import pipeline
import os
from getpass import getpass
from urllib.parse import urlparse

# DagsHub credentials are asked for at run time and are never stored in the
# notebook; the access token is read with getpass so it is not echoed.
os.environ['MLFLOW_TRACKING_USERNAME'] = input('Enter your DAGsHub username: ')
os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('Enter your DAGsHub access token: ')
os.environ['MLFLOW_TRACKING_PROJECTNAME'] = input('Enter your DAGsHub project name: ')

# Tracking URI has the form https://dagshub.com/<username>/<project>.mlflow
mlflow.set_tracking_uri('https://dagshub.com/' + os.environ['MLFLOW_TRACKING_USERNAME'] + '/' + os.environ['MLFLOW_TRACKING_PROJECTNAME'] + '.mlflow')
mlflow.set_experiment("SOAP_notes-experiment")

# URI scheme of the tracking store; the model is only registered when the
# store is not a plain local "file" store.
tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

with mlflow.start_run(run_name="MLflow for SOAP notes"):
  # mlflow.log_input expects an MLflow dataset object, not a Hugging Face
  # Dataset, so wrap the training data via its pandas representation.
  mlflow.log_input(
      mlflow.data.from_pandas(dataset.to_pandas(), name="SOAP_notes_training"),
      context="training",
  )

  epochs = 2
  learning_rate = 2e-4
  # Log the hyperparameters before training so they are recorded even if the
  # training step fails.
  mlflow.log_param("epochs", epochs)
  mlflow.log_param("learning_rate", learning_rate)

  trainer = SFTTrainer(
      model=model,
      tokenizer=tokenizer,
      train_dataset=formatted_dataset,
      dataset_text_field="text",
      max_seq_length=max_seq_length,
      dataset_num_proc=2,
      args=TrainingArguments(
          # NOTE(review): the saved output of this cell ends in a CUDA
          # out-of-memory error. If it recurs, lowering the per-device batch
          # size while raising gradient_accumulation_steps keeps the effective
          # batch size of 8 - confirm on the target GPU.
          per_device_train_batch_size=2,
          gradient_accumulation_steps=4,
          warmup_steps=5,
          num_train_epochs=epochs,
          learning_rate=learning_rate,
          # Use bf16 when the GPU supports it, otherwise fall back to fp16.
          fp16=not torch.cuda.is_bf16_supported(),
          bf16=torch.cuda.is_bf16_supported(),
          logging_steps=1,
          optim="adamw_8bit",
          weight_decay=0.01,
          lr_scheduler_type="linear",
          seed=3407,
          output_dir="outputs",
      ),
  )
  trainer_stats = trainer.train()
  mlflow.log_metrics(trainer_stats.metrics)

  # Log the fine-tuned model and its tokenizer as components. The original code
  # passed the string returned by predict_soap_notes(...) (called with an
  # undefined `input_conversation`) as the model, which cannot be logged.
  # NOTE(review): logging a PEFT-wrapped model presumably needs an MLflow
  # release with PEFT support in the transformers flavor - confirm the
  # installed version.
  log_kwargs = {}
  if tracking_url_type_store != "file":
    # Register the model
    # There are other ways to use the Model Registry, which depends on the use case,
    # please refer to the doc for more information:
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    log_kwargs["registered_model_name"] = "SOAP_notes_generation"
  mlflow.transformers.log_model(
      {"model": trainer.model, "tokenizer": tokenizer},
      "model",
      task="text-generation",
      **log_kwargs,
  )

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,313 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,828
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.5946
2,1.6416
3,1.6453
4,1.6081
5,1.4817
6,1.4793
7,1.1813
8,1.5535
9,1.402
10,1.418


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.02 GiB. GPU 

In [None]:
# Display the metrics dictionary of the object returned by `trainer.train()`.
trainer_stats.metrics

{'train_runtime': 2604.3469,
 'train_samples_per_second': 5.635,
 'train_steps_per_second': 0.704,
 'total_flos': 6.626523773362176e+16,
 'train_loss': 1.4607295248590397,
 'epoch': 2.9975470155355683}

In [None]:
# Save the model and the tokenizer into the same directory.
# NOTE(review): the path is under /content/drive, so Google Drive presumably
# has to be mounted first; no mount call is visible in this notebook - confirm.
# NOTE(review): for the LoRA-wrapped model this presumably writes only the
# adapter weights, not the base model - confirm before relying on it.
model.save_pretrained("/content/drive/MyDrive/Tigmaminds-tinyllama-Jul17")
# Last expression of the cell: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained("/content/drive/MyDrive/Tigmaminds-tinyllama-Jul17")

('/content/drive/MyDrive/Tigmaminds-tinyllama-Jul17/tokenizer_config.json',
 '/content/drive/MyDrive/Tigmaminds-tinyllama-Jul17/special_tokens_map.json',
 '/content/drive/MyDrive/Tigmaminds-tinyllama-Jul17/tokenizer.model',
 '/content/drive/MyDrive/Tigmaminds-tinyllama-Jul17/added_tokens.json',
 '/content/drive/MyDrive/Tigmaminds-tinyllama-Jul17/tokenizer.json')