In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv("./data/stress_urinary_incontinence.csv")

In [3]:
# Text cleaning function
def clean_text(text):
    # Remove special characters, digits, and extra spaces
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()  # Convert to lowercase

# Apply the cleaning function to the text column
df['cleaned_text'] = df['FOI_TEXT'].apply(clean_text)

In [11]:
import nltk
nltk.download('punkt')

# Function to segment text into sentences
def segment_text(text):
    return nltk.sent_tokenize(text)

# Apply the function to segment the cleaned text
df['segmented_text'] = df['cleaned_text'].apply(segment_text)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhaohengchuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
df['segmented_text'][500]

['it was reported to boston scientific corporation that an obtryx system was used during a transobturator tape for stress incontinence procedure performed on b6 2007 according to the complainant on b6 2014 the mesh was found to have eroded until it lay across the urethral lumen and a large bladder stone formed around the mesh the patient underwent transurethral surgery to endoscopically to crush up and remove stones and all visible mesh from within the urethral lumen she was catheterized for one week after the surgery and reportedly her symptoms settled down well all other information is unknown should additional relevant details become available a supplemental report will be submitted']

In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm  # Import tqdm for progress bar

# Load T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('valhalla/t5-small-qg-hl')
model = T5ForConditionalGeneration.from_pretrained('valhalla/t5-small-qg-hl')

def generate_questions(text):
    # Prepare the input for the model
    input_text = f"generate question: {text}"
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    
    # Generate questions
    outputs = model.generate(inputs, max_length=150, num_beams=5, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

# Example of generating questions for each sentence with progress bar
for index, row in tqdm(df.iterrows(), total=len(df), desc="Generating Questions"):
    questions = []
    for sentence in row['segmented_text']:
        question = generate_questions(sentence)
        questions.append(question)
    df.at[index, 'questions'] = questions


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  return self.fget.__get__(instance, owner)()
Generating Questions: 100%|██████████| 20/20 [00:14<00:00,  1.38it/s]


In [30]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('valhalla/t5-small-qg-hl')
tokenizer.pad_token = tokenizer.eos_token
model = T5ForConditionalGeneration.from_pretrained('valhalla/t5-small-qg-hl')

def generate_questions(text):
    # Prepare the input for the model
    input_text = f"generate question: {text}"
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    
    # Generate questions
    outputs = model.generate(inputs, max_length=150, num_beams=5, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

# Example of generating questions for each sentence
for index, row in df.iterrows():
    questions = []
    for sentence in row['segmented_text']:
        question = generate_questions(sentence)
        questions.append(question)
    df.at[index, 'questions'] = questions


In [54]:
from transformers import pipeline

# Load a pre-trained QA model
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

def generate_answer(question, context):
    # Extract the answer from the context based on the question
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Generate answers based on the generated questions
qa_pairs = []
for index, row in df.iterrows():
    context = row['FOI_TEXT']
    for question in row['questions']:
        answer = generate_answer(question, context)
        qa_pairs.append({
            "question": question,
            "answer": answer,
            "context": context
        })

# Convert QA pairs to DataFrame
qa_df = pd.DataFrame(qa_pairs)
print(qa_df.head())


                                            question  \
0                 <pad> What is a medtronic product?   
1           <pad> What is the name of the complaint?   
2  <pad> What will a supplemental report be issue...   
3  <pad> What was the manufacturer reference numb...   
4  <pad> What was the preoperative and postoperat...   

                                      answer  \
0  THIS COMPLAINT IS NOT A MEDTRONIC PRODUCT   
1                                  MEDTRONIC   
2      A SUPPLEMENTAL REPORT WILL BE ISSUED.   
3                                     (B)(4)   
4                                    EROSION   

                                             context  
0  BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...  
1  BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...  
2  IF INFORMATION IS PROVIDED IN THE FUTURE, A SU...  
3  MANUFACTURER REFERENCE NUMBER: (B)(4). INCIDEN...  
4  THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AG...  


In [55]:
qa_df.head()

Unnamed: 0,question,answer,context
0,<pad> What is a medtronic product?,THIS COMPLAINT IS NOT A MEDTRONIC PRODUCT,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...
1,<pad> What is the name of the complaint?,MEDTRONIC,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...
2,<pad> What will a supplemental report be issue...,A SUPPLEMENTAL REPORT WILL BE ISSUED.,"IF INFORMATION IS PROVIDED IN THE FUTURE, A SU..."
3,<pad> What was the manufacturer reference numb...,(B)(4),MANUFACTURER REFERENCE NUMBER: (B)(4). INCIDEN...
4,<pad> What was the preoperative and postoperat...,EROSION,THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AG...


In [None]:
qa_pairs

In [32]:
from transformers import pipeline

# Load a pre-trained QA model
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

def generate_answer(question, context):
    # Extract the answer from the context based on the question
    result = qa_pipeline(question=question, context=context)
    return result['answer']

In [7]:
generate_answer("What is stress urinary incontinence","THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AGAINST THE DEVICE RESULTING IN AN UNSPECIFIED ADVERSE OUTCOME. PRODUCT WAS USED FOR THERAPEUTIC TREATMENT. THE PREOPERATIVE AND POSTOPERATIVE DIAGNOSIS WAS PELVIC PAIN, MENOMETRORRHAGIA, UTERINE FIBROID, STRESS URINARY INCONTINENCE, AND HYPERMOBILE URETHRA. THE PROCEDURE PERFORMED WAS A TRANS-OBTURATOR TAPE AND LAPAROSCOPIC SUPRA-CERVICAL HYSTERECTOMY. IN (B)(6) 2007 THE PATIENT UNDERWENT AN ADDITIONAL PROCEDURE FOR DYSMENORRHEA, PELVIC PAIN, AND PMS PELVIC CRAMPING STATUS POST SUPRACERVICAL LAPAROSCOPY HYSTERECTOMY. THE PROCEDURE PERFORMED WAS AN OPERATIVE LAPAROSCOPY WITH ADHESIOLYSIS, BILATERAL SALPINGO-OOPHORECTOMY AND TRACHELECTOMY.")

'PREOPERATIVE AND POSTOPERATIVE DIAGNOSIS'

In [33]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

# Load your QA pairs
# qa_df = pd.read_csv('generated_qa_pairs.csv')

# Prepare dataset for fine-tuning
data = []
for index, row in qa_df.iterrows():
    data.append({"input": row['question'], "output": row['answer']})

# Create a Hugging Face Dataset
dataset = Dataset.from_list(data)

In [34]:
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [35]:



# Tokenization
def tokenize_function(examples):
    # Tokenize the input and output
    input_ids = tokenizer(examples['input'], truncation=True, padding='max_length', max_length=50)
    output_ids = tokenizer(examples['output'], truncation=True, padding='max_length', max_length=50)
    
    # Prepare labels (shifted input)
    labels = output_ids['input_ids']
    
    # Return the tokenized inputs and labels
    return {
        'input_ids': input_ids['input_ids'],
        'attention_mask': input_ids['attention_mask'],
        'labels': labels
    }


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)




Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [36]:
# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

  0%|          | 0/10 [00:00<?, ?it/s]

{'train_runtime': 32.8177, 'train_samples_per_second': 0.609, 'train_steps_per_second': 0.305, 'train_loss': 4.321907424926758, 'epoch': 1.0}


('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [37]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [40]:
import torch

def generate_answer(question, max_length=50):
    # Tokenize the input question
    inputs = tokenizer.encode(question, return_tensors='pt')  # Add batch dimension

    # Generate the response
    with torch.no_grad():  # Disable gradient calculations
        outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1)

    # Decode the generated tokens
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [None]:
import torch

def generate_answer(question, context, max_length=50):
    # Prepare the input text with question and context
    input_text = f"question: {question} context: {context}"
    
    # Tokenize the input (make sure pad_token is set)
    inputs = tokenizer.encode(input_text, return_tensors='pt', truncation=True)
    
    # Ensure the pad token is correctly set
    tokenizer.pad_token = tokenizer.eos_token

    # Generate the response
    with torch.no_grad():  # Disable gradient calculations for inference
        outputs = model.generate(
            inputs, 
            max_length=max_length, 
            num_return_sequences=1, 
            pad_token_id=tokenizer.eos_token_id  # Ensure padding is handled properly
        )

    # Decode the generated tokens
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [42]:
# Example question
question = "what is sui?"

# Generate an answer
answer = generate_answer(question, "In generat, sui is a disease")

# Print the answer
print("Question:", question)
print("Answer:", answer)


Question: what is sui?
Answer: question: what is sui? context: In generat, sui is a disease that is caused by the use of a certain type of food.


In [9]:
from transformers import pipeline

# Load a pre-trained QA model
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

def generate_answer(question, context):
    # Use the pipeline to generate an answer based on the question and context
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Example usage
context = "THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AGAINST THE DEVICE RESULTING IN AN UNSPECIFIED ADVERSE OUTCOME. PRODUCT WAS USED FOR THERAPEUTIC TREATMENT. THE PREOPERATIVE AND POSTOPERATIVE DIAGNOSIS WAS PELVIC PAIN, MENOMETRORRHAGIA, UTERINE FIBROID, STRESS URINARY INCONTINENCE, AND HYPERMOBILE URETHRA. THE PROCEDURE PERFORMED WAS A TRANS-OBTURATOR TAPE AND LAPAROSCOPIC SUPRA-CERVICAL HYSTERECTOMY. IN (B)(6) 2007 THE PATIENT UNDERWENT AN ADDITIONAL PROCEDURE FOR DYSMENORRHEA, PELVIC PAIN, AND PMS PELVIC CRAMPING STATUS POST SUPRACERVICAL LAPAROSCOPY HYSTERECTOMY. THE PROCEDURE PERFORMED WAS AN OPERATIVE LAPAROSCOPY WITH ADHESIOLYSIS, BILATERAL SALPINGO-OOPHORECTOMY AND TRACHELECTOMY."
question = "What surgeries did the patient undergo??"
answer = generate_answer(question, context)
print("Answer:", answer)


Answer: LAPAROSCOPIC SUPRA-CERVICAL HYSTERECTOMY


In [5]:
import spacy

# Load a pre-trained NER model for medical text
nlp = spacy.load("en_core_sci_md")  # SciSpacy model

# Function to extract entities
def extract_context(text):
    doc = nlp(text)
    return " ".join([ent.text for ent in doc.ents])  # Join entities as potential context

# Apply the function to extract context
df['context'] = df['cleaned_text'].apply(extract_context)


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("ramsrigouthamg/t5_squad_v1")
tokenizer = T5Tokenizer.from_pretrained("ramsrigouthamg/t5_squad_v1")


  return self.fget.__get__(instance, owner)()
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Function to generate question based on context
def generate_question(context):
    input_text = "generate question: " + context
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# from tqdm.notebook import tqdm
# # Apply the function to generate questions
# df['generated_question'] = df['context'].apply(generate_question)

In [None]:
from tqdm.notebook import tqdm
# Get total number of rows for progress calculation
total_rows = len(df)

# Apply the function with a progress bar
generated_questions = []
for index, row in tqdm(df.iterrows(), total=total_rows, desc="Processing rows", unit="row"):
    generated_question = generate_question(row['context'])
    generated_questions.append(generated_question)

# Assign generated questions back to the DataFrame
df['generated_question'] = generated_questions


In [None]:
qa_pairs = []

for index, row in df.iterrows():
    qa_pairs.append({
        "question": row['generated_question'],
        "context": row['context']
    })

# Example output
print(qa_pairs[:2])


In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and validation sets
train_data, val_data = train_test_split(qa_pairs, test_size=0.2)

# Format the data for Hugging Face fine-tuning
train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)


In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Load BERT model and tokenizer
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the QA data
def tokenize_qa(examples):
    inputs = tokenizer(examples['question'], examples['context'], truncation=True, padding=True)
    return inputs

# Convert train and validation data to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.map(tokenize_qa, batched=True)
val_dataset = val_dataset.map(tokenize_qa, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()


In [44]:
qa_df.head()

Unnamed: 0,question,answer
0,<pad> What is a medtronic product?,THIS COMPLAINT IS NOT A MEDTRONIC PRODUCT
1,<pad> What is the name of the complaint?,MEDTRONIC
2,<pad> What will a supplemental report be issue...,A SUPPLEMENTAL REPORT WILL BE ISSUED.
3,<pad> What was the manufacturer reference numb...,(B)(4)
4,<pad> What was the preoperative and postoperat...,EROSION


In [45]:
questions = qa_df['question'].tolist()
answers = qa_df['answer'].tolist()

In [50]:
from transformers import BertForQuestionAnswering, BertTokenizer

# Load the pre-trained BERT tokenizer and model for QA
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [51]:
def preprocess_data(question, answer, tokenizer):
    # Tokenize the inputs for BERT
    inputs = tokenizer.encode_plus(question, answer, return_tensors='pt')
    return inputs


In [None]:
from transformers import Trainer, TrainingArguments
from datasets import Dataset

# Create a dataset object from pandas DataFrame
dataset = Dataset.from_pandas(qa_df[['question', 'answer']])

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples['question'], examples['answer'], truncation=True, padding='max_length')

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="steps",     # evaluate during training
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    save_steps=10_000,               # save checkpoint every 10k steps
    save_total_limit=2,              # limit the total amount of checkpoints
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,  # your tokenized training dataset
)

# Fine-tune the model
trainer.train()


In [56]:
qa_df.head()

Unnamed: 0,question,answer,context
0,<pad> What is a medtronic product?,THIS COMPLAINT IS NOT A MEDTRONIC PRODUCT,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...
1,<pad> What is the name of the complaint?,MEDTRONIC,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...
2,<pad> What will a supplemental report be issue...,A SUPPLEMENTAL REPORT WILL BE ISSUED.,"IF INFORMATION IS PROVIDED IN THE FUTURE, A SU..."
3,<pad> What was the manufacturer reference numb...,(B)(4),MANUFACTURER REFERENCE NUMBER: (B)(4). INCIDEN...
4,<pad> What was the preoperative and postoperat...,EROSION,THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AG...


In [61]:
# import pandas as pd

# Load your dataset (replace 'your_dataset.csv' with your actual file)
# data = pd.read_csv('your_dataset.csv')

# Example of how the dataset should look
# data = pd.DataFrame({
#     'question': ['What is SUI?', 'What are the symptoms of SUI?'],
#     'context': ['SUI stands for Stress Urinary Incontinence.', 'Symptoms include involuntary urine leakage.'],
#     'answer': ['Stress Urinary Incontinence', 'involuntary urine leakage']
# })

def prepare_dataset(data):
    dataset = []
    for index, row in data.iterrows():
        question = row['question']
        context = row['context']
        answer = row['answer']
        
        dataset.append({
            'input_text': f"question: {question} context: {context}",
            'target_text': answer
        })
    return pd.DataFrame(dataset)

# Prepare the dataset
prepared_data = prepare_dataset(qa_df)


In [58]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [62]:
from datasets import Dataset

# Convert the prepared DataFrame to a Hugging Face Dataset
huggingface_dataset = Dataset.from_pandas(prepared_data)

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples['input_text'],
        max_length=512,
        truncation=True,
        padding='max_length'
    )
    
    # Setup the labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['target_text'],
            max_length=128,
            truncation=True,
            padding='max_length'
        )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize the dataset
tokenized_dataset = huggingface_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/20 [00:00<?, ? examples/s]



In [64]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and evaluation sets
train_data, eval_data = train_test_split(prepared_data, test_size=0.2, random_state=42)

# Convert the split data into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)


In [65]:
# Tokenization function (reuse this for both datasets)
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples['input_text'],
        max_length=512,
        truncation=True,
        padding='max_length'
    )
    
    # Setup the labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['target_text'],
            max_length=128,
            truncation=True,
            padding='max_length'
        )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize the training and evaluation datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [66]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=10_000,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,  # Add eval_dataset here
)

# Start training
trainer.train()


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.824167251586914, 'eval_runtime': 1.7678, 'eval_samples_per_second': 2.263, 'eval_steps_per_second': 0.566, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.669038712978363, 'eval_runtime': 1.8499, 'eval_samples_per_second': 2.162, 'eval_steps_per_second': 0.541, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.4058331549167633, 'eval_runtime': 1.8033, 'eval_samples_per_second': 2.218, 'eval_steps_per_second': 0.555, 'epoch': 3.0}
{'train_runtime': 99.4063, 'train_samples_per_second': 0.483, 'train_steps_per_second': 0.06, 'train_loss': 5.350741068522136, 'epoch': 3.0}


TrainOutput(global_step=6, training_loss=5.350741068522136, metrics={'train_runtime': 99.4063, 'train_samples_per_second': 0.483, 'train_steps_per_second': 0.06, 'total_flos': 6496406470656.0, 'train_loss': 5.350741068522136, 'epoch': 3.0})

In [68]:
# Example function to answer questions
def answer_question(question, context):
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    
    output_ids = model.generate(input_ids)
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return answer

# Example usage
test_question = "What is SUI?"
test_context = "SUI stands for Stress Urinary Incontinence."
answer = answer_question(test_question, test_context)
print(f"Question: {test_question}\nAnswer: {answer}")



Question: What is SUI?
Answer: Stress Urinary Incontinence


In [70]:
# Example usage
test_question = "What therapeutic treatment was the product used for?"
test_context = """ 
THE PATIENT'S ATTORNEY ALLEGED A DEFICIENCY AGAINST THE DEVICE RESULTING IN AN UNSPECIFIED ADVERSE OUTCOME. PRODUCT WAS USED FOR THERAPEUTIC TREATMENT. THE PREOPERATIVE AND POSTOPERATIVE DIAGNOSIS WAS PELVIC PAIN, MENOMETRORRHAGIA, UTERINE FIBROID, STRESS URINARY INCONTINENCE, AND HYPERMOBILE URETHRA. THE PROCEDURE PERFORMED WAS A TRANS-OBTURATOR TAPE AND LAPAROSCOPIC SUPRA-CERVICAL HYSTERECTOMY. IN (B)(6) 2007 THE PATIENT UNDERWENT AN ADDITIONAL PROCEDURE FOR DYSMENORRHEA, PELVIC PAIN, AND PMS PELVIC CRAMPING STATUS POST SUPRACERVICAL LAPAROSCOPY HYSTERECTOMY. THE PROCEDURE PERFORMED WAS AN OPERATIVE LAPAROSCOPY WITH ADHESIOLYSIS, BILATERAL SALPINGO-OOPHORECTOMY AND TRACHELECTOMY.
"""
answer = answer_question(test_question, test_context)
print(f"Question: {test_question}\nAnswer: {answer}")



Question: What therapeutic treatment was the product used for?
Answer: THERAPEUTIC TREATMENT
