In [None]:
!pip install --upgrade transformers datasets peft accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-1.2.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiproces

In [None]:
# Cell 2: Setup Environment and Mount Google Drive

import os
from google.colab import drive

# Directory under which Google Drive is mounted
mount_path = '/content/drive'

def is_drive_mounted(mount_path):
    """Return True when a 'MyDrive' entry exists directly under ``mount_path``."""
    my_drive = os.path.join(mount_path, 'MyDrive')
    return os.path.exists(my_drive)

# Skip the mount call when Drive is already visible under mount_path
if is_drive_mounted(mount_path):
    print("Google Drive is already mounted.")
else:
    drive.mount(mount_path)

# Project folders inside Google Drive
BASE_DIR = '/content/drive/MyDrive/DL'  # Ensure this path is correct and exists
DATA_DIR, MODEL_DIR, LOG_DIR = (
    os.path.join(BASE_DIR, leaf) for leaf in ('data', 'fine_tuned_model', 'logs')
)

# Create each project folder; existing folders are left as they are
for project_dir in (DATA_DIR, MODEL_DIR, LOG_DIR):
    os.makedirs(project_dir, exist_ok=True)

print(f"Directories set up successfully:\n- Data Directory: {DATA_DIR}\n- Model Directory: {MODEL_DIR}\n- Logs Directory: {LOG_DIR}")


Mounted at /content/drive
Directories set up successfully:
- Data Directory: /content/drive/MyDrive/DL/data
- Model Directory: /content/drive/MyDrive/DL/fine_tuned_model
- Logs Directory: /content/drive/MyDrive/DL/logs


In [None]:
# Cell 3: Data Preprocessing

import pandas as pd
from datasets import Dataset, DatasetDict
import os

# Location of the raw CSV inside Google Drive
dataset_path = os.path.join('/content/drive/MyDrive/DL', 'Diseases_Symptoms.csv')

# Read the CSV; a missing file is reported and the error is re-raised to stop the cell
try:
    dataset = pd.read_csv(dataset_path)
except FileNotFoundError:
    print(f"File not found at {dataset_path}. Please ensure the file exists in the specified directory.")
    raise
else:
    print("Dataset loaded successfully.")

# Show the first rows as a quick sanity check
print("Dataset Preview:")
print(dataset.head())

# Turn one CSV row into an instruction/output training pair
def process_data(row):
    """Build an instruction/output pair from a row's Name, Symptoms and Treatments fields.

    Args:
        row: A mapping-like row (for example a pandas Series) indexable by
            'Name', 'Symptoms' and 'Treatments'.

    Returns:
        dict: A dict with the keys "instruction" and "output".
    """
    name, symptom_text, treatment_text = row['Name'], row['Symptoms'], row['Treatments']
    return {
        "instruction": f"Symptoms: {symptom_text}\n\n### Response:",
        "output": f"Disease: {name}\nTreatments: {treatment_text}",
    }

# Build one instruction/output record per CSV row
processed_data = dataset.apply(process_data, axis=1).tolist()

# Wrap the records in a Hugging Face Dataset (via a DataFrame)
processed_df = pd.DataFrame(processed_data)
train_dataset = Dataset.from_pandas(processed_df)

# 80% train; the held-out 20% is then halved into validation and test (10% each)
split_dataset = train_dataset.train_test_split(test_size=0.2, seed=42)
test_valid_split = split_dataset['test'].train_test_split(test_size=0.5, seed=42)

# Gather the three splits under their final names
named_splits = {
    'train': split_dataset['train'],
    'validation': test_valid_split['test'],
    'test': test_valid_split['train'],
}
final_dataset = DatasetDict(named_splits)

# Output files for the processed splits
processed_dir = os.path.join('/content/drive/MyDrive/DL', 'data')
train_path = os.path.join(processed_dir, 'train_data.json')
validation_path = os.path.join(processed_dir, 'validation_data.json')
test_path = os.path.join(processed_dir, 'test_data.json')

# Write each split to Google Drive in JSON format
for split_name, out_path in (
    ('train', train_path),
    ('validation', validation_path),
    ('test', test_path),
):
    final_dataset[split_name].to_json(out_path)

print("Datasets have been processed and saved to Google Drive successfully.")


Dataset loaded successfully.
Dataset Preview:
   Code                         Name  \
0     1               Panic disorder   
1     2             Vocal cord polyp   
2     3              Turner syndrome   
3     4               Cryptorchidism   
4     5  Ethylene glycol poisoning-1   

                                            Symptoms  \
0  Palpitations, Sweating, Trembling, Shortness o...   
1           Hoarseness, Vocal Changes, Vocal Fatigue   
2  Short stature, Gonadal dysgenesis, Webbed neck...   
3  Absence or undescended testicle(s), empty scro...   
4  Nausea, vomiting, abdominal pain, General mala...   

                                          Treatments  
0  Antidepressant medications, Cognitive Behavior...  
1       Voice Rest, Speech Therapy, Surgical Removal  
2  Growth hormone therapy, Estrogen replacement t...  
3  Observation and monitoring (in cases of mild o...  
4  Supportive Measures, Gastric Decontamination, ...  


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Datasets have been processed and saved to Google Drive successfully.


In [None]:
# Cell 4: Fine-Tuning the Mistral-7B Model

import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    TrainerCallback,
    TrainerState,
    TrainerControl
)

from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from datasets import Dataset, DatasetDict
from accelerate import infer_auto_device_map, init_empty_weights, dispatch_model

# Hugging Face authentication: the token is read from the HF_TOKEN environment
# variable (set it before running this cell, e.g. from a secrets store) instead
# of being hard-coded here. An already-set token is never overwritten, and the
# 'your_hf_token_here' placeholder is never exported as if it were a real token.
if os.environ.get("HF_TOKEN", "") in ("", "your_hf_token_here"):
    os.environ.pop("HF_TOKEN", None)
    print("HF_TOKEN is not set; set it before this cell if the model repository requires authentication.")

# PyTorch CUDA allocator configuration (expandable segments)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Project layout inside Google Drive
BASE_DIR = '/content/drive/MyDrive/DL'  # Base directory in Google Drive
DATA_DIR, MODEL_DIR, LOG_DIR = (
    os.path.join(BASE_DIR, folder) for folder in ('data', 'fine_tuned_model', 'logs')
)

# JSON files holding the processed splits, one file per split
train_path, validation_path, test_path = (
    os.path.join(DATA_DIR, filename)
    for filename in ('train_data.json', 'validation_data.json', 'test_data.json')
)

# Tokenizer of the checkpoint that will be fine-tuned
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Reuse the EOS token for padding when the tokenizer defines no pad token
tokenizer_has_pad = tokenizer.pad_token is not None
if not tokenizer_has_pad:
    tokenizer.pad_token = tokenizer.eos_token

# Configure quantization using BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,  # Use float16 for computations
    bnb_4bit_use_double_quant=True,  # Double quantization
    bnb_4bit_quant_type="nf4"  # NormalFloat4 quantization
)

# Per-device memory budget used when the model is placed on devices
max_memory = {"cpu": "12GiB", 0: "14GiB"}  # Adjust based on available GPU memory

# Load the quantized weights. from_pretrained(device_map="auto") performs the
# device placement itself, so the call is not wrapped in init_empty_weights()
# and the loaded model is not dispatched a second time; the memory budget is
# handed to from_pretrained directly.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Automatically use the available GPU
    max_memory=max_memory,
    trust_remote_code=True
)

# Placement chosen while loading, kept under the name this cell has always defined
device_map = getattr(model, "hf_device_map", None)

# Get the quantized model ready for k-bit training
model = prepare_model_for_kbit_training(model)

# Gradient checkpointing for memory optimization; use_cache is turned off for training
model.gradient_checkpointing_enable()
model.config.use_cache = False

# LoRA hyper-parameters
lora_settings = dict(
    r=4,                                  # Rank for LoRA
    lora_alpha=8,                         # Scaling factor for LoRA
    target_modules=["q_proj", "k_proj"],  # Target layers in the Mistral model
    lora_dropout=0.1,                     # Dropout for LoRA
    bias="none",
    task_type=TaskType.CAUSAL_LM,         # Causal Language Modeling task
)
lora_config = LoraConfig(**lora_settings)

# Wrap the model with the LoRA adapters
model = get_peft_model(model, lora_config)

# Reload the three processed splits from their JSON files on Google Drive
split_files = {'train': train_path, 'validation': validation_path, 'test': test_path}
final_dataset = DatasetDict({
    split_name: Dataset.from_json(json_file)
    for split_name, json_file in split_files.items()
})

# Tokenization function
def tokenize(example):
    """Tokenize one instruction/output pair into a fixed-length causal-LM example.

    The training text is the instruction, a single '### Response:' marker and
    the output. Instructions that already end with the marker (as produced by
    the preprocessing cell) do not get a second one, so the training text keeps
    the same 'Symptoms: ... ### Response:' shape as the inference prompt.

    Args:
        example (dict): Record with 'instruction' and 'output' strings.

    Returns:
        The tokenizer output, truncated/padded to 256 tokens, with an added
        'labels' entry that is a copy of 'input_ids'.
    """
    response_marker = "### Response:"
    prompt = example['instruction'].rstrip()
    response = example['output']
    if prompt.endswith(response_marker):
        # Marker already present: only append the answer
        full_prompt = f"{prompt}\n{response}"
    else:
        full_prompt = f"{prompt}\n\n{response_marker}\n{response}"
    tokenized = tokenizer(
        full_prompt,
        truncation=True,
        max_length=256,
        padding='max_length'
    )
    # Causal LM objective: labels start out as a copy of the inputs
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

# Tokenize every split and drop the raw text columns afterwards
text_columns = ['instruction', 'output']
tokenized_datasets = final_dataset.map(
    tokenize,
    remove_columns=text_columns,
)

# Collator for causal language modelling (masked-LM mode disabled)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

class LossPrinterCallback(TrainerCallback):
    """Trainer callback that prints training and validation losses as they are logged."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the 'loss' and/or 'eval_loss' entries of ``logs`` with the current global step."""
        if logs is None:
            return
        step = state.global_step
        if 'loss' in logs:
            print(f"Training Loss at step {step}: {logs['loss']}")
        if 'eval_loss' in logs:
            print(f"Validation Loss at step {step}: {logs['eval_loss']}")

# Training arguments
training_args = TrainingArguments(
    output_dir=os.path.join(LOG_DIR, 'results_medical_symptom_checker'),  # Directory to save the results
    per_device_train_batch_size=1,  # Batch size for each device
    gradient_accumulation_steps=32,  # Accumulate gradients to simulate larger batch sizes
    num_train_epochs=3,  # Number of epochs
    logging_steps=10,
    save_steps=500,  # Adjusted to save less frequently for larger models
    save_total_limit=2,
    learning_rate=2e-5,  # Learning rate
    bf16=True,  # Use bfloat16 for better performance
    eval_strategy='steps',  # Evaluate during training (`evaluation_strategy` is the deprecated name)
    eval_steps=10,  # Evaluate every 10 steps, matching logging_steps
    logging_dir=LOG_DIR,
    optim="paged_adamw_8bit",  # Optimizer for memory efficiency
    report_to="none",  # Disable reporting to external platforms
)

# Destination of the fine-tuned model on Google Drive
model_save_path = os.path.join(MODEL_DIR, 'fine_tuned_mistral_medical_symptom_checker')

# Assemble the Trainer from the pieces prepared above
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    callbacks=[LossPrinterCallback()],
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()

# Persist the model first, then the tokenizer, into the same directory
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(model_save_path)
print(f"Fine-tuned model saved to {model_save_path}.")

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
10,2.1006,1.969916
20,2.0554,1.935979
30,2.024,1.921518


Training Loss at step 10: 2.1006
Validation Loss at step 10: 1.969915747642517
Training Loss at step 20: 2.0554
Validation Loss at step 20: 1.9359785318374634
Training Loss at step 30: 2.024
Validation Loss at step 30: 1.9215176105499268
Fine-tuned model saved to /content/drive/MyDrive/DL/fine_tuned_model/fine_tuned_mistral_medical_symptom_checker.


In [None]:
# Cell 5: Inference - Predicting Disease and Treatments from Symptoms

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import sys

# Make sure bitsandbytes can be imported: if the import fails, install it with
# pip (IPython shell escape) and retry the import. An existing installation is
# used as-is; nothing here upgrades it.
try:
    import bitsandbytes as bnb
except ImportError:
    !pip install bitsandbytes
    import bitsandbytes as bnb

# Define paths based on Google Drive directories
BASE_DIR = '/content/drive/MyDrive/DL'  # Base directory in Google Drive
MODEL_DIR = os.path.join(BASE_DIR, 'fine_tuned_model')
model_save_path = os.path.join(MODEL_DIR, 'fine_tuned_mistral_medical_symptom_checker')

# Hugging Face token: taken from the HF_TOKEN environment variable only.
# When it is missing (or still the 'your_hf_token_here' placeholder) nothing is
# exported, so the placeholder is never used as if it were a real credential.
HF_TOKEN = os.getenv("HF_TOKEN", "")
if HF_TOKEN in ("", "your_hf_token_here"):
    HF_TOKEN = ""
    os.environ.pop("HF_TOKEN", None)
    print("HF_TOKEN is not set; continuing without Hugging Face authentication.")

# Stop early when the fine-tuned model directory is missing
model_dir_found = os.path.exists(model_save_path)
if not model_dir_found:
    print(f"Model path '{model_save_path}' does not exist. Please check the path and try again.")
    sys.exit()

# Load the tokenizer stored in the fine-tuned model directory
try:
    tokenizer = AutoTokenizer.from_pretrained(
        model_save_path,
        trust_remote_code=True,
    )
    print("Tokenizer loaded successfully.")
except Exception as load_error:
    print(f"Error loading tokenizer: {load_error}")
    sys.exit()

# Load the base model with 4-bit quantization
try:
    # Imported here so this cell stays self-contained
    from peft import PeftConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit loading
        bnb_4bit_compute_dtype=torch.float16,  # Use float16 for computations
        bnb_4bit_use_double_quant=True,  # Double quantization
        bnb_4bit_quant_type="nf4"  # NormalFloat4 quantization
    )

    # model_save_path is an adapter directory (it is what PeftModel.from_pretrained
    # loads below), so the base weights are resolved from the checkpoint name
    # recorded in the adapter config rather than loaded from that directory.
    # This keeps the adapter from being loaded twice.
    base_model_name = PeftConfig.from_pretrained(model_save_path).base_model_name_or_path

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map="auto",  # Automatically maps the model to available devices
        trust_remote_code=True
    )
    print("Base model loaded successfully with 4-bit quantization.")
except Exception as e:
    print(f"Error loading base model: {e}")
    sys.exit()

# Load the LoRA adapters on top of the base model
try:
    model = PeftModel.from_pretrained(
        base_model,
        model_save_path,
        torch_dtype=torch.float16  # NOTE(review): forwarded as an extra keyword; confirm PEFT honors it
    )
    print("LoRA adapters loaded successfully.")
except Exception as e:
    print(f"Error loading LoRA adapters: {e}")
    sys.exit()

# Device used for the tokenized inputs. The model was loaded with
# device_map="auto", which already places it, so the quantized model is not
# moved again with .to() here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Model is on device: {device}")

# Inference only: evaluation mode and no gradient tracking on any parameter
model.eval()
model.requires_grad_(False)

# Function to predict disease and treatments
def predict_disease_and_treatment(symptoms):
    """
    Predicts the disease and recommends treatments based on provided symptoms.

    The prompt has the form 'Symptoms: <symptoms>' followed by a blank line and
    the '### Response:' marker. Generation uses deterministic beam search and
    may produce up to 150 new tokens on top of the prompt.

    Args:
        symptoms (str): A string describing the symptoms.

    Returns:
        str: The text generated after the last '### Response:' marker, or a
        short error message if tokenization, generation or decoding fails.
    """
    # Define the prompt aligned with training
    prompt = f"Symptoms: {symptoms}\n\n### Response:"

    # Tokenize the input
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
    except Exception as e:
        print(f"Error during tokenization: {e}")
        return "Error during tokenization."

    # Use EOS for padding when the tokenizer defines no dedicated pad token
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate output
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,                    # input_ids together with the attention_mask
                max_new_tokens=150,          # Budget for the answer only; the prompt does not count
                num_beams=5,                 # Beam search
                no_repeat_ngram_size=2,      # Prevents repetition
                early_stopping=True,         # Stops generation when all beams reach EOS
                do_sample=False,             # Deterministic decoding (no temperature/top_p)
                pad_token_id=pad_token_id,
            )
    except Exception as e:
        print(f"Error during generation: {e}")
        return "Error during generation."

    # Decode the generated tokens
    try:
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Keep only the text after the response marker
        response = response.split("### Response:")[-1].strip()
    except Exception as e:
        print(f"Error during decoding: {e}")
        return "Error during decoding."

    return response

# Example usage
# symptoms_input = "I have a High fever, body aches, fatigue, cough, sore throat, congestion" => output => Influenza (Flu) => Rest, fluids, over-the-counter medications for symptom relief, antiviral medications (in some cases)
#
# Input => I have Palpitations, Sweating, Trembling, Shortness of breath, Fear of losing control, Dizziness => Output => Panic disorder => Antidepressant medications, Cognitive Behavioral Therapy, Relaxation Techniques
# response = predict_disease_and_treatment(symptoms_input)
# print("Predicted Disease and Treatments:")
# print(response)

# Continuous loop to prompt user for symptoms until 'exit'
print("Medical Symptom Checker")
print("Enter 'exit' to quit.\n")
while True:
    # Surrounding whitespace is ignored so that inputs such as " exit " still quit
    symptoms_input = input("Please describe your symptoms: ").strip()
    if symptoms_input.lower() == 'exit':
        print("Exiting the symptom checker. Stay healthy!")
        break
    if not symptoms_input:
        # Nothing to analyse: ask again instead of generating from an empty prompt
        print("No symptoms entered. Please describe your symptoms or type 'exit' to quit.\n")
        continue
    response = predict_disease_and_treatment(symptoms_input)
    print("\nPredicted Disease and Treatments:")
    print("=" * 50)
    # Format the output for better visibility
    print(response)
    print("=" * 50 + "\n")

Tokenizer loaded successfully.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Base model loaded successfully with 4-bit quantization.
LoRA adapters loaded successfully.
Model is on device: cuda
Medical Symptom Checker
Enter 'exit' to quit.

Please describe your symptoms: I have a High fever, body aches, fatigue, cough, sore throat, congestion


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Predicted Disease and Treatments:
Based on the symptoms you've described, it's possible that you have the flu or another viral infection. Here are some steps you can take to help alleviate your symptoms and speed up your recovery:
1. Stay hydrated: Drink plenty of fluids, such as water, clear broths, or fruit juices. Avoid caffeinated and alcoholic beverages, as they can dehydrate you.
2. Get rest: Try to get as much sleep as possible. Resting will help your body focus its energy on fighting

Please describe your symptoms: I have a Palpitations, Sweating, Trembling, Shortness of breath, Fear of losing control, Dizziness Ouput


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Predicted Disease and Treatments:
It sounds like you may be experiencing symptoms of anxiety or a panic attack. These symptoms can be quite distressing, but they are not life-threatening. It's important to remember that your body is responding to your thoughts and emotions, and that you have the power to calm yourself down. Here are some things you can try to help alleviate your symptoms:
1. Focus on your breath: Take slow, deep breaths in through your nose and out throughyour mouth. Count to five on each inhale

Please describe your symptoms: exit
Exiting the symptom checker. Stay healthy!
