In [1]:
import numpy as np
import pandas as pd


# NOTE(review): machine-specific absolute path; the notebook only runs where this file exists.
file_path = r'C:\Users\jites\Desktop\Project_folder\Medicine_and_count_multiple_sentences.xlsx'
df = pd.read_excel(file_path)

# Fill missing cells BEFORE casting to str. Casting first turns NaN into the literal
# text "nan", which fillna("") can no longer recognise as missing, so rows with an
# empty 'Item Count' would carry the string "nan" into the training text.
df = df.fillna("").astype(str)
df.head(2)

Unnamed: 0,Item Name,Item Count
0,"HUMALOG MIX 50 CARTRIDGE, ZECAL FEM TABLET",
1,HIFENAC P TABLETS,HIFENAC P : 1 TABLETS


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [3]:
import torch

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
# (Nothing is moved here; later code sends the model and tensors to `device`.)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

# Alternative starting point (stock GPT-2 instead of the previous fine-tune):
# model_name = "gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

# Regularization: raise the dropout probabilities to 0.2.
# The values are passed to from_pretrained() as config overrides so they are in place
# while the network is constructed. Assigning model.config.*_pdrop afterwards only
# edits the config object; the dropout layers already built keep their old rate.
model = GPT2LMHeadModel.from_pretrained(
    './gpt2-finetuned-model-medicine-1',
    attn_pdrop=0.2,
    embd_pdrop=0.2,
    resid_pdrop=0.2,
).to(device)
tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-finetuned-tokenizer-medicine-1')

# Reuse the end-of-sequence token as the padding token so the tokenizer can pad.
tokenizer.pad_token = tokenizer.eos_token




In [5]:
def create_dataset(df):
    """Turn each row of ``df`` into one ``{'text': ...}`` record.

    The text joins the row's 'Item Name' and 'Item Count' values as
    ``"<Item Name> => <Item Count>"``.

    Args:
        df: DataFrame with 'Item Name' and 'Item Count' columns.

    Returns:
        A list of single-key dicts, in row order.
    """
    records = []
    for item_name, item_count in zip(df['Item Name'].tolist(), df['Item Count'].tolist()):
        records.append({'text': f"{item_name} => {item_count}"})
    return records

# Sanity check: build the records from the full frame and display the one at index 19
# to eyeball the "<Item Name> => <Item Count>" text format.
xx = create_dataset(df)
xx[19]

{'text': 'HAPPIBIOTIC CAP, MAXILIV INJECTION, DAPEFY S 10100MG TABLET => MAXILIV : 1 INJECTION'}

In [6]:
from datasets import Dataset

def create_dataset(df):
    """Build a Hugging Face ``Dataset`` with one 'text' entry per row of ``df``.

    Each entry reads ``"<Item Name> => <Item Count>"``, taken from the
    'Item Name' and 'Item Count' columns of the same row.

    Args:
        df: DataFrame with 'Item Name' and 'Item Count' columns.

    Returns:
        The result of ``Dataset.from_list`` over the per-row records.
    """
    def as_record(pair):
        # pair is (item name, item count) for one row
        item_name, item_count = pair
        return {'text': f"{item_name} => {item_count}"}

    pairs = zip(df['Item Name'].tolist(), df['Item Count'].tolist())
    return Dataset.from_list(list(map(as_record, pairs)))

# Create Hugging Face datasets from the train and validation DataFrames
# (one 'text' record per row, built by create_dataset).
train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)

# Tokenize datasets
def tokenize_function(examples):
    """Tokenize a batch of records for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose 'text' entry is a list of strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No fixed-length padding here: padding every short item string out to 512 tokens
    # makes each training step process mostly pad tokens. Padding is left to the data
    # collator, which pads each batch to the length of its longest example.
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Run the tokenizer over both splits in batched mode (train first, then validation).
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Collator for language-model training with the masked-LM objective switched off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/2208 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]

In [7]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from transformers import EarlyStoppingCallback

#### Optimizer and learning rate scheduler

# AdamW over all model parameters; weight_decay acts as the L2-style regularizer.
optimizer = AdamW(
    model.parameters(),
    lr=5e-5,
    weight_decay=0.001,
)

# Reduce the learning rate when the monitored metric stops improving, never below 7e-7.
scheduler = ReduceLROnPlateau(optimizer, min_lr=7e-7)

# Earlier configuration, kept for reference:
# optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
# scheduler = CosineAnnealingLR(optimizer, T_max=500, eta_min=5e-6)

In [8]:
# Training arguments, grouped by concern.
training_args = TrainingArguments(
    # --- output and logging ---
    output_dir='./results',
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=100,
    # --- epochs and batch sizes ---
    num_train_epochs=2,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # 8 x 4 -> simulates a batch size of 32
    per_device_eval_batch_size=8,
    # --- optimization ---
    # NOTE(review): the Trainer below is handed a pre-built optimizer/scheduler pair, so
    # these two values presumably have no effect on the actual run; confirm.
    warmup_steps=500,
    weight_decay=0.001,  # L2 regularization
    # --- evaluation and checkpointing: once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): step-based intervals; presumably unused with the "epoch" strategies; confirm.
    eval_steps=400,
    save_steps=500,
    # --- best-checkpoint selection: the lowest eval_loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # fp16=True,  # mixed precision (currently disabled)
)

# training_args = TrainingArguments(
#     output_dir=repo_name,
#     group_by_length=True,
#     length_column_name='input_length',
#     per_device_train_batch_size=24,
#     gradient_accumulation_steps=2,
#     evaluation_strategy="steps",
#     num_train_epochs=20,
#     fp16=True,
#     save_steps=1000,
#     save_strategy='steps', # we cannot set it to "no". Otherwise, the model cannot guess the best checkpoint.
#     eval_steps=1000,
#     logging_steps=1000,
#     learning_rate=5e-5,
#     warmup_steps=500,
#     save_total_limit=3,
#     load_best_model_at_end = True # this will let the model save the best checkpoint
# )

# Trainer setup: tokenized splits, the language-modeling collator, and the optimizer /
# scheduler pair built earlier. Early stopping uses a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping],
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model and its tokenizer
model.save_pretrained('./gpt2-finetuned-model-medicine-2')
tokenizer.save_pretrained('./gpt2-finetuned-tokenizer-medicine-2')

  0%|          | 0/138 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline
import time
import json

# Inference setup: choose the device, start the wall-clock timer, then load the
# fine-tuned tokenizer and model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

start = time.time()

tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-finetuned-tokenizer-medicine-2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('./gpt2-finetuned-model-medicine-2').to(device)
if torch.cuda.is_available():
    # Half precision is only applied when running on the GPU.
    model = model.half()

# Text-generation pipeline on GPU 0 when CUDA is available, otherwise on the CPU (-1).
pipeline_device = 0 if torch.cuda.is_available() else -1
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=pipeline_device)

def generate_labels(sent):
    """Generate the label text for one item-name string.

    The prompt sent to the model is ``"<sent> =>"``. The decoded output is split
    on ``'=>'`` and the last piece, stripped of surrounding whitespace, is returned.

    Args:
        sent: Item-name text to label.

    Returns:
        The text following the last ``'=>'`` in the decoded generation.
    """
    input_text = f"{sent} =>"

    # Encode the single prompt WITHOUT padding. Padding it out to a fixed length
    # (with the tokenizer's default right-side padding) places pad tokens between the
    # prompt and the continuation, so a decoder-only model would be asked to continue
    # from a pad position instead of from the end of the prompt. One sequence needs
    # no padding at all.
    inputs = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=256,  # Upper bound on prompt length; adjust as required
        truncation=True,
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Beam search with two beams; gradients are not needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=512,  # Total length (prompt + continuation); adjust as required
            num_return_sequences=1,
            num_beams=2,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # The decoded text still contains the prompt; keep what follows the last '=>'.
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    labels = output_text.split('=>')[-1].strip()
    return labels


# Example call: generate the label text for one comma-separated list of item names.
result = generate_labels("HAPPIBIOTIC CAP, MAXILIV INJECTION, DAPEFY S 10100MG TABLET")

def parse_output(output):
    """Parse ``"LABEL : text, LABEL : text, ..."`` into a ``{label: text}`` dict.

    The input is split on commas. A segment containing ``':'`` starts a new entry:
    the part before the first colon is the label, the rest is its text (both
    stripped). A segment without a colon is treated as a continuation and is
    appended, space-separated, to the entry started most recently.

    The first occurrence of a label wins: a repeated label is ignored, together
    with any continuation segments that follow it. Continuation segments that
    appear before any label, and empty segments, are dropped.

    Args:
        output: Generated label string.

    Returns:
        Dict mapping each label to its text, in order of first appearance.
    """
    labeled_sentences = {}
    # Key that continuation segments attach to; None while there is no open entry
    # (nothing parsed yet, or the last label was a duplicate that was skipped).
    current_label = None
    for segment in output.split(','):
        if ':' in segment:
            label, sentence = segment.split(':', 1)
            label = label.strip()
            if label in labeled_sentences:
                current_label = None
            else:
                labeled_sentences[label] = sentence.strip()
                current_label = label
        else:
            continuation = segment.strip()
            # Always index with the stripped key that was actually stored.
            if continuation and current_label is not None:
                labeled_sentences[current_label] += ' ' + continuation
    return labeled_sentences

# Turn the generated label string into a {label: text} dict and show it.
dict_data = parse_output(result)
print(dict_data)

# Elapsed wall-clock seconds since `start`, which was taken before the model was loaded.
end = time.time()
print(end - start)