In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config, Trainer, TrainingArguments, AdamW, DataCollatorWithPadding
from transformers.optimization import get_scheduler
from transformers import EarlyStoppingCallback

# Load data
file_path = r'C:\Users\jites\Desktop\new_folder\synthetic_patient_data_with_lab_results.xlsx'
df = pd.read_excel(file_path)

# Prepare input text and labels
df['input_text'] = df.apply(lambda x: f"Age: {x['Age']} Gender: {x['Gender']} Symptoms: {x['Symptoms']} History: {x['Illness History']} LabTests: {x['Lab Test Results']}", axis=1)
df['Diagnosis'] = df['Diagnosis Suggested']

labels = df['Diagnosis'].astype('category').cat.codes


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['input_text'], labels, test_size=0.1, random_state=42
)

In [15]:
print(train_texts[0])
print(train_labels[0])

Age: 84 Gender: Female Symptoms: Hot Flashes, Muscle Cramps History: Leg Weakness, Strong Urine Odor LabTests: HR: 97 bpm, BP: 103/78 mmHg, RR: 13, O2 Sat: 96%, Temp: 38.3°C, Amylase: 45 U/L, Lipase: 459 U/L, WBC: (72.0, 0) lakhs/L, Na: 142 mmol/L, ALT: 17 U/L
9


In [3]:
# x_train = df_train['text']
# y_train = df_train['label']
# x_test = df_test['text']
# y_test = df_test['label']

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = torch.tensor(label_encoder.fit_transform(train_labels))
y_test_encoded = torch.tensor(label_encoder.transform(val_labels))

In [4]:
config = GPT2Config.from_pretrained('gpt2')
config.num_labels = len(label_encoder.classes_) 

# Load tokenizer and model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=config).to(device)
model.config.pad_token_id = model.config.eos_token_id

tokenizer.padding_side = "right"  # For consistent padding to the right

model.config.attn_pdrop = 0.2
model.config.embd_pdrop = 0.2
model.config.resid_pdrop = 0.2

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)

tokenizer.pad_token = tokenizer.eos_token

train_encodings = tokenize(train_texts.tolist())
test_encodings = tokenize(val_texts.tolist())

In [6]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            if isinstance(val, torch.Tensor):
                item[key] = val[idx].clone().detach()
            else:
                item[key] = torch.tensor(val[idx])
        item['labels'] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = CustomDataset(train_encodings, y_train_encoded)
test_dataset = CustomDataset(test_encodings, y_test_encoded)

In [7]:
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True
)

In [8]:

# # Create datasets
# train_dataset = Dataset.from_dict({'input_text': train_texts, 'labels': train_labels})
# val_dataset = Dataset.from_dict({'input_text': val_texts, 'labels': val_labels})


# # Add a padding token
# tokenizer.add_special_tokens({'pad_token': '<|pad|>'})  # Define a new padding token

# # Define a padding token
# tokenizer.pad_token = tokenizer.eos_token  # Use the end of text token as the padding token
# tokenizer.pad_token_id = tokenizer.eos_token_id  # Use the end of text token as the padding token

# model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=len(df['Diagnosis'].unique()))
# model.resize_token_embeddings(len(tokenizer))  # Resize token embeddings to account for new padding token
# model.to(device)

# # Tokenization function
# def tokenize_function(examples):
#     return tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=256)

# # Tokenize datasets
# train_dataset = train_dataset.map(tokenize_function, batched=True)
# val_dataset = val_dataset.map(tokenize_function, batched=True)


In [9]:
# train_dataset['labels']

In [10]:

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # Accumulate over 4 batches to simulate a batch size of 32
    per_device_eval_batch_size=8,
    eval_steps=200,
    save_steps=300,
    warmup_steps=300,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    learning_rate=2e-4,
    save_total_limit=3,
    eval_accumulation_steps=4,
)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
num_training_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
num_warmup_steps = int(0.1 * num_training_steps)

scheduler = get_scheduler("linear", optimizer=optimizer,
                            num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./gpt_diagnosis_model')
tokenizer.save_pretrained('./gpt_diagnosis_tokenizer')


  attn_output = torch.nn.functional.scaled_dot_product_attention(
                                                
 10%|█         | 28/280 [00:31<04:23,  1.05s/it]

{'eval_loss': 3.2788782119750977, 'eval_runtime': 0.982, 'eval_samples_per_second': 101.835, 'eval_steps_per_second': 13.239, 'epoch': 0.99}


                                                
 20%|██        | 56/280 [01:03<03:56,  1.06s/it]

{'eval_loss': 3.265268564224243, 'eval_runtime': 0.9897, 'eval_samples_per_second': 101.04, 'eval_steps_per_second': 13.135, 'epoch': 1.98}


                                                
 30%|███       | 84/280 [01:36<03:28,  1.06s/it]

{'eval_loss': 3.4272756576538086, 'eval_runtime': 0.9929, 'eval_samples_per_second': 100.717, 'eval_steps_per_second': 13.093, 'epoch': 2.97}


 36%|███▌      | 100/280 [01:54<03:12,  1.07s/it]

{'loss': 3.431, 'grad_norm': 4.4741692543029785, 'learning_rate': 0.0001785714285714286, 'epoch': 3.54}


                                                 
 40%|████      | 113/280 [02:09<02:52,  1.03s/it]

{'eval_loss': 3.480830669403076, 'eval_runtime': 1.0289, 'eval_samples_per_second': 97.192, 'eval_steps_per_second': 12.635, 'epoch': 4.0}


 40%|████      | 113/280 [02:11<03:14,  1.16s/it]


{'train_runtime': 131.4705, 'train_samples_per_second': 68.456, 'train_steps_per_second': 2.13, 'train_loss': 3.415597223602565, 'epoch': 4.0}


('./gpt_diagnosis_tokenizer\\tokenizer_config.json',
 './gpt_diagnosis_tokenizer\\special_tokens_map.json',
 './gpt_diagnosis_tokenizer\\vocab.json',
 './gpt_diagnosis_tokenizer\\merges.txt',
 './gpt_diagnosis_tokenizer\\added_tokens.json')

In [1]:
from sklearn.preprocessing import LabelEncoder
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch
import json
from torch.nn.functional import softmax

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
label_encoder = LabelEncoder()
# def predict(texts):
#     encodings = tokenize(texts)
#     outputs = model(**encodings).to(device)
#     predictions = torch.argmax(outputs.logits, dim=1)
#     return label_encoder.inverse_transform(predictions.tolist())



tokenizer = GPT2Tokenizer.from_pretrained('./gpt_diagnosis_tokenizer')
model = GPT2ForSequenceClassification.from_pretrained('./gpt_diagnosis_model').to(device)       ##, config=config

# def tokenize(texts):
#     return tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)

def predict(texts):
    # Tokenize and move input tensors to the correct device
    # encodings = tokenize(texts)
    encodings = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    
    # Ensure the model is on the correct device
    model.to(device)
    
    # Forward pass through the model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    # Move outputs to CPU for processing
    logits = outputs.logits.cpu()
    
    # Get predictions
    predictions = torch.argmax(logits, dim=1)

    top_values, top_indices = torch.topk(logits, k=3)

    # js_data = open(r'C:\Users\jites\Desktop\Project_folder\Jupyter_practice\diesease_data_reverse.json')
    # json_data = json.load(js_data)

    # predictions = json_data.get(str(int(predictions[-1])), 'Unknown disease')

    return predictions
    # return label_encoder.inverse_transform(predictions.tolist())
    #### return top_values, top_indices, logits

predictions = predict(["Age: 84 Gender: Female Symptoms: Hot Flashes, Muscle Cramps History: Leg Weakness, Strong Urine Odor LabTests: HR: 97 bpm, BP: 103/78 mmHg, RR: 13, O2 Sat: 96%, Temp: 38.3°C, Amylase: 45 U/L, Lipase: 459 U/L, WBC: (72.0, 0) lakhs/L, Na: 142 mmol/L, ALT: 17 U/L"])

print("Output disease=> ", predictions)



# def get_probabilities(logits):
#     return softmax(logits, dim=-1)

# def calculate_top_k_accuracy(predictions, labels):
#     top_values, top_indices = get_top_k_predictions(predictions, k=k)
#     probabilities = get_probabilities(predictions)
    

#     js_data = open(r'C:\Users\jites\Desktop\Project_folder\Jupyter_practice\diesease_data_reverse.json')
#     json_data = json.load(js_data)


#     correct = 0
#     for i in range(len(labels)):
#         true_label = labels[i].item()
#         top_k_indices = top_indices[i].tolist()
#         top_k_probabilities = probabilities[i][top_k_indices].tolist()
        
#         if true_label in top_k_indices:
#             correct += 1
    
#     accuracy = correct / len(labels)
#     return top_values, top_indices, probabilities, accuracy




  from .autonotebook import tqdm as notebook_tqdm


Output disease=>  tensor([19])


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

start = time.time()

model = GPT2LMHeadModel.from_pretrained('./gpt2-diagnosis-model').to(device)
tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-diagnosis-tokenizer')
# Set pad token to eos token
tokenizer.pad_token = tokenizer.eos_token

# Enable mixed precision if using a compatible GPU
model = model.half() if torch.cuda.is_available() else model

# Example of a generation function using a pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

def generate_labels(sent):
    paragraph = sent
    input_text = f"{paragraph} =>"

    # Encode with truncation and padding
    inputs = tokenizer.encode_plus(
        input_text,
        return_tensors='pt',
        max_length=256,  # Adjust based on your requirement
        truncation=True,  # Explicit truncation
        padding='max_length'  # Padding to the max length
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate predictions with optimized settings
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # max_new_tokens=32,
            max_length=288,  # 512
            no_repeat_ngram_size=3,  # Prevent repeating 3-grams
            repetition_penalty=2.0,  # Penalize repetition
            num_return_sequences=1,
            num_beams=2,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Process and return the output
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(output_text)
    labels = output_text.split('=>')
    if len(labels) >= 2:
        labels = labels[1].strip()
    return labels


# sent = "Age: 48 Gender: Male Symptoms: Episodes of fainting, heart palpitations, and sweating IllnessHistory: Chest discomfort started suddenly after physical exertion, radiating to the left arm, with associated sweating. VitalSigns: BP 160/100, HR 110 bpm, RR 24, O2 Sat 90%, Temp 37.5°C LabResults: Troponin 0.22 ng/mL, WBC 12.5 x10^9/L, Na 138 mmol/L, K 4.1 mmol/L, CRP 30 mg/L ImagingResults: Ultrasound abdomen ! Gallbladder stones with signs of cholecystitis. AdditionalInfo: Chronic alcohol consumption for 10 years, with elevated liver enzymes."
sent = "Age: 44 Gender: Male Symptoms: Joint pain and stiffness, especially in the morning IllnessHistory: Frequent urination and excessive thirst for 2 weeks, blurred vision when reading small print. VitalSigns: BP 135/85, HR 95 bpm, RR 20, O2 Sat 96%, Temp 37.0°C LabResults: D-dimer 0.85 ug/mL, WBC 11.0 x10^9/L, Na 137 mmol/L, BNP 350 pg/mL ImagingResults: CT chest ! Large pulmonary embolism in the right lung. AdditionalInfo: Patient has a family history of diabetes and heart disease."
# sent = "Age: 50 Gender: Female Symptoms: Dizziness, blurred vision IllnessHistory: Chronic cough for the past 3 weeks, with significant weight loss. VitalSigns: BP 130/85, HR 100 bpm, RR 22, O2 Sat 98% LabResults: CRP 15 mg/L, WBC 15.5 x10^9/L, ESR 50 mm/h ImagingResults: MRI lumbar spine ! Herniated disc at L4-L5. AdditionalInfo: nan" ##  => Severe UTI
# sent = "Age: 34 Gender: Female Symptoms: Severe lower back pain radiating to legs IllnessHistory: Frequent urination and excessive thirst for 2 weeks, blurred vision when reading small print. VitalSigns: BP 145/95, HR 100 bpm, RR 18, O2 Sat 93 percent, Temp 38.0°C LabResults: Amylase 520 U per liter, Lipase 720 U per liter, WBC 14.0 x10^9 per liter, Na 135 mmol per liter, K 4.2 mmol per liter, ALT 75 U per liter ImagingResults: MRI brain ! Small ischemic infarct in the right parietal lobe. AdditionalInfo: Chronic alcohol consumption for 10 years, with elevated liver enzymes."  ## => Stroke (Ischemic)

result = generate_labels(sent)
print(result)