In [1]:
import pandas as pd
import numpy as np
import re

# Load the dataset
dataset = pd.read_csv("dataset.csv")

# Preprocess the dataset
def preprocess_text(text):
    # Remove unwanted characters
    text = re.sub(r"[^a-zA-Z0-9\s.,']", "", text)
    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Apply preprocessing to the dataset
dataset['Paragraph'] = dataset['Paragraph'].apply(preprocess_text)

# Split the paragraphs into sentences
dataset['Sentences'] = dataset['Paragraph'].str.split('.')

# Drop rows with empty sentences
dataset = dataset.dropna()

# Split the sentences into tokens
dataset['Tokens'] = dataset['Sentences'].apply(lambda x: [sentence.split() for sentence in x])

# Display the preprocessed dataset
print(dataset.head())
dataset.to_csv("preprocessed_dataset.csv", index=False)


                                           Paragraph  \
0  Patient of Patient ID 1 is a smoker and an alc...   
1  Patient of Patient ID 1 is a smoker and an alc...   
2  Patient of Patient ID 1 is a smoker and an alc...   
3  Patient of Patient ID 1 is a smoker and an alc...   
4  Patient of Patient ID 1 is a smoker and an alc...   

                                           Sentences  \
0  [Patient of Patient ID 1 is a smoker and an al...   
1  [Patient of Patient ID 1 is a smoker and an al...   
2  [Patient of Patient ID 1 is a smoker and an al...   
3  [Patient of Patient ID 1 is a smoker and an al...   
4  [Patient of Patient ID 1 is a smoker and an al...   

                                              Tokens  
0  [[Patient, of, Patient, ID, 1, is, a, smoker, ...  
1  [[Patient, of, Patient, ID, 1, is, a, smoker, ...  
2  [[Patient, of, Patient, ID, 1, is, a, smoker, ...  
3  [[Patient, of, Patient, ID, 1, is, a, smoker, ...  
4  [[Patient, of, Patient, ID, 1, is, a, smoker, ..

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load GPT-Neo tokenizer and model with ignore_mismatched_sizes=True
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
model = GPT2LMHeadModel.from_pretrained("EleutherAI/gpt-neo-125M", ignore_mismatched_sizes=True)

# Prepare the dataset for fine-tuning
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="preprocessed_dataset.csv",  # Path to your preprocessed dataset
    block_size=128  # Set appropriate block size
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./text-generation-model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    ),
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

You are using a model of type gpt_neo to instantiate a model of type gpt2. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.10.attn.c_attn.bias', 'transformer.h.10.attn.c_attn.weight', 'transformer.h.10.attn.c_proj.bias', 'transformer.h.10.attn.c_proj.weight', 'transformer.h.11.attn.c_attn.bias', 'transformer.h.11.attn.c_attn.weight', 'transformer.h.11.attn.c_proj.bias', 'transformer.h.11.attn.c_proj.weight', 'transformer.h.2.attn.c_attn.bias', 'transformer.h.2.attn.c_attn.weight', 'transformer.h.2.attn.c_proj.bias', 'transformer.h.2.attn.c_proj.weight', 'transformer.h.3.attn.c_attn.bias', 'transformer.h.3.attn.c_attn.weight', 'transformer.h.3.attn.c_proj.bias', 'transfo

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,2.2976
1000,0.431
1500,0.3148
2000,0.2994
2500,0.2872
3000,0.2812
3500,0.2764
4000,0.2769
4500,0.2712
5000,0.2684


TrainOutput(global_step=19902, training_loss=0.2937912958708855, metrics={'train_runtime': 1000.3378, 'train_samples_per_second': 79.578, 'train_steps_per_second': 19.895, 'total_flos': 5200038051840000.0, 'train_loss': 0.2937912958708855, 'epoch': 3.0})

In [17]:
def generate_next_hour(patient_info):
    # Tokenize the input text
    input_ids = tokenizer.encode(patient_info, return_tensors="pt").to(model.device)

    # Generate text for the next hour
    output = model.generate(input_ids, max_length=200, num_return_sequences=1, temperature=0.7)

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract only the information for the next hour
    next_hour_info = generated_text.split(" at hour ")[-1]

    return next_hour_info

# Example usage
patient_info = "age 70 has 120 mmHg Systolic Blood Pressure, 70 mmHg Diastolic Blood Pressure, 70 beats per minute Heart Rate, 20 breaths per minute Respiratory Rate, 98 fahrenheit Body temperature, 98 mEqL SpO2"
next_hour_info = generate_next_hour(patient_info)
print(next_hour_info)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


9 is continuing treatment in the ETU.","['Patient of Patient ID 9 is not a smoker and not an alcoholic female with Family History of Ischemic Heart Diseases at age 65 has 127 mmHg Systolic Blood Pressure, 80 mmHg Diastolic Blood Pressure, 80 beats per minute Heart Rate, 20 breaths per minute Respiratory Rate, 99 fahrenheit Body temperature, 100 mEqL SpO2, 139 mEqL Sodium Level, 4 mEqL Pot


In [5]:
# Define the directory path to save the model
output_model_dir = "./final-tuned-model"

# Save the fine-tuned model and tokenizer
model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

print("Model saved successfully.")


Model saved successfully.


In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the saved model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./final-tuned-model")
tokenizer = GPT2Tokenizer.from_pretrained("./final-tuned-model")

def generate_next_hour(patient_info):
    # Tokenize the input text
    input_ids = tokenizer.encode(patient_info, return_tensors="pt").to(model.device)

    # Generate text for the next hour
    output = model.generate(input_ids, max_length=200, num_return_sequences=1, temperature=0.7)

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract only the information for the next hour
    next_hour_info = generated_text.split(" at hour ")[-1]

    return next_hour_info

# Example usage
patient_info = "Patient of Patient ID 10 is not a smoker and female with no Family History of Ischemic Heart Diseases at age 55 has 140 mmHg Systolic Blood Pressure, 70 mmHg Diastolic Blood Pressure, 80 beats per minute Heart Rate, 15 breaths per minute Respiratory Rate, 85 fahrenheit Body temperature , 98 mEq/L SpO2, 139 mEq/L Sodium Level, 4 mEq/L Potassium Level , 105 mEq/L Chloride Level, 41 mg/dL Urea, 91 mg/dL Creatinine at hour  is continuing treatment in the ETU. GENERATE ONLY THE Systolic Blood Pressure FOR THE NEXT HOUR "
next_hour_info = generate_next_hour(patient_info)
print(next_hour_info)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 is continuing treatment in the ETU. GENERATE ONLY THE Systolic Blood Pressure FOR THE NEXT HOUR astolic Blood Pressure, 75 beats per minute Heart Rate, 20 breaths per minute Respiratory Rate, 98 fahrenheit Body temperature, 98 mEqL SpO2, 139 mEqL Sodium Level, 4 mEqL


In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the saved model and tokenizer
model_path = "./final-tuned-model"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Set pad token ID
model.config.pad_token_id = tokenizer.eos_token_id

def generate_next_hour(patient_info, top_k=5):
    # Tokenize the input text
    input_ids = tokenizer.encode(patient_info, return_tensors="pt")

    # Generate text for the next hour
    output = model.generate(input_ids, 
                            max_length=250, 
                            num_return_sequences=1, 
                            temperature=0.7,
                            pad_token_id=tokenizer.eos_token_id,
                            top_k=top_k,
                            top_p=0.95,
                            do_sample=True,
                            num_beams=1)

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract only the information for the next hour
    next_hour_info = generated_text.split(" at hour ")[-1]

    return next_hour_info

# Example usage
patient_info = "Patient of Patient ID 50 is not a smoker and male"
next_hour_info = generate_next_hour(patient_info)
print(next_hour_info)


36 is continuing treatment in the ETU.","['Patient of Patient ID 37 is a smoker and an alcoholic male with Family History of Ischemic Heart Diseases at age 51 has 95 mmHg Systolic Blood Pressure, 51 mmHg Diastolic Blood Pressure, 56 beats per minute Heart Rate, 18 breaths per minute Respiratory Rate, 98 fahrenheit Body temperature, 99 mEqL SpO2, 143 mEqL Sodium Level, 4 mEqL SpO2, 4 mEqL Sodium Level, 4 mEqL Potassium Level, 102 mEqL Chloride Level
