### Data Preparation

In [1]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [2]:
# Import pandas for data analysis
import pandas as pd
df = pd.read_csv("./Data_cleaned_medication_QA_data.csv", encoding='windows-1252', encoding_errors='replace')
df = df[['Question', 'Answer']]

In [3]:
df.head() #show first five rows

Unnamed: 0,Question,Answer
0,What are the signs of a healthy body?,The signs of a healthy body include consistent...
1,How can I boost my immune system?,"To boost your immune system, focus on a balanc..."
2,immune system,The *immune system* is your body's natural def...
3,"What is a balanced diet, and how do I follow one?",A **balanced diet** is one that provides your ...
4,How much water should I drink daily?,The amount of water you should drink daily dep...


In [4]:
df.Question[0]

'What are the signs of a healthy body?'

In [5]:
df.Answer[0]

'The signs of a healthy body include\xa0consistent energy levels\xa0throughout the day, indicating proper nutrition and rest. You should have a\xa0strong immune system, with fewer instances of colds or infections.\xa0Healthy skin, hair, and nails\xa0are also indicators, as they reflect good internal health. A\xa0stable weight\xa0within a normal range for your age and height is another sign. Regular\xa0digestive health\xa0and normal bowel movements suggest a well-functioning digestive system. Additionally,\xa0good sleep quality\xa0and waking up refreshed are key markers.\xa0Mental clarity, focus, and emotional stability\xa0also contribute to overall health. Finally, the ability to engage in\xa0physical activity\xa0without excessive fatigue or discomfort is a strong sign of a healthy body.'

In [6]:
df.shape # 690 rows | 2 cols

(690, 2)

In [7]:
#pip install cleantext uncomment when needed

In [8]:
import os
print(os.getcwd())
print("Current directory:", os.getcwd())
print("Directory contents:", os.listdir())


import cleantext

# Function to clean text data by removing unwanted characters and formatting
def clean(textdata):
    cleaned_text = []
    for i in textdata:
        cleaned_text.append(cleantext.clean(str(i), extra_spaces=True, lowercase=True, stopwords=False, stemming=False, numbers=True, punct=True, clean_all = True))

    return cleaned_text

c:\Users\mohda\Documents\AI Health Assistant\Notebook
Current directory: c:\Users\mohda\Documents\AI Health Assistant\Notebook
Directory contents: ['Couselling Chat.ipynb', 'Data_cleaned_counsel_QA_data.csv', 'Data_cleaned_medication_QA_data.csv', 'Diabetes Classification.ipynb', 'diabetes.csv', 'logs', 'Medication Chat.ipynb', 'Medicine Classification.ipynb', 'random_forest_model.joblib', 'results']


In [9]:
# Apply the clean function to the questions and answers columns

df.Question = list(clean(df.Question))
df.Answer = list(clean(df.Answer))

In [10]:
# Save the cleaned data into a new CSV file & save
df.to_csv("Clean_medication_output.csv", index=False)

### GPT-2 Model

In [11]:
#pip install datasets

In [12]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
from datasets import load_dataset

# Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')




In [13]:
# Set the padding token for the tokenizer to be the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Maximum sequence length that GPT-2 can handle
max_length = tokenizer.model_max_length
print(max_length)

1024


In [15]:
# Load the cleaned QA dataset as a training set using the 'datasets' library
dataset = load_dataset('csv', data_files={'train': 'Clean_medication_output.csv'}, split='train')

Generating train split: 0 examples [00:00, ? examples/s]

In [16]:
#Function to tokenize questions and answers and prepare them for the model
def tokenize_function(examples):
  '''1. Combine each question and answer into a single input string
     2. Tokenize the combined text using the GPT-2 tokenizer
     3. Set the labels to be the same as the input_ids (shifted to predict the next word)
     4. Return the tokenized output. '''
    
  combined_text = [str(q) + " " + str(a) for q, a in zip(examples['Question'], examples['Answer'])]
  tokenized_output = tokenizer(combined_text, padding='max_length', truncation=True, max_length=128)
  # Set the labels to be the same as the input_ids (shifted to predict the next word)
  tokenized_output['labels'] = tokenized_output['input_ids'].copy()
  
  return tokenized_output
# Tokenize the entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/690 [00:00<?, ? examples/s]

In [19]:
import torch

if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    device_type = "GPU"
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB
else:
    import psutil
    device_name = "CPU"
    device_type = "CPU"
    total_memory = psutil.virtual_memory().total / 1e9  # GB

print(f"Device: {device_type} ({device_name})")
print(f"Memory: {total_memory:.2f} GB")

from transformers import TrainingArguments, Trainer


# Define training arguments for the GPT-2 model
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save model outputs
    num_train_epochs=3,  # Train for 50 epochs
    fp16 = True,
    per_device_train_batch_size=8, # Batch size during training
    per_device_eval_batch_size=32,  # Batch size during evaluation
    eval_strategy="no", # Evaluation can slow training.
    save_strategy="epoch", #Saves checkpoints only once per epoch instead of every few steps
    warmup_steps=500,  # Warmup steps for learning rate scheduler
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir='./logs',  # Directory for saving logs
    logging_steps=500,  # Log every 10 steps
    save_steps=2000,  # Save model checkpoints every 1000 steps
)

# Trainer class to handle training process
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

Device: CPU (CPU)
Memory: 6.38 GB


  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=261, training_loss=2.8027467691121886, metrics={'train_runtime': 21627.442, 'train_samples_per_second': 0.096, 'train_steps_per_second': 0.012, 'total_flos': 135218626560000.0, 'train_loss': 2.8027467691121886, 'epoch': 3.0})

In [32]:
# Save the model
trainer.save_model('medication_info_model.joblib')

### Testing

In [33]:
# Function to generate a response based on a user prompt (testing the model)
def generate_response(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt")  # no .to('cuda')
    outputs = model.generate(
        inputs,
        max_length=150,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [34]:
# Example conversation
user_input = "what is desonide ointment used for"
bot_response = generate_response(user_input)
print("Bot Response:", bot_response)

Bot Response: what is desonide ointment used for the treatment of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a severe case of a


## Use This for Google Collab file 

In [35]:
# Copying the model to Google Drive (optional)
# import shutil

# Path to the file in Colab
#colab_file_path = '/content/med_info_model/model.safetensors'

# Path to your Google Drive
# drive_file_path = '/content/drive/MyDrive'

# Copy the file
# shutil.copy(colab_file_path, drive_file_path)

## Use This for vs code and jupyter notebook

In [36]:
import shutil

# Path to the source file on your system
source_file_path = r'C:\Users\mohda\Documents\AI Health Assistant\Notebook\medication_info_model\model.safetensors'

# Destination folder where you want to copy the file
destination_folder = r'C:\Users\mohda\Documents\AI Health Assistant\backend\models\medication_info'

destination_file_path = destination_folder + r'\model.safetensors'
# Copy the fil
#;[{}] ea
shutil.copy(source_file_path, destination_file_path)

print("File copied successfully!")

File copied successfully!
