In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv("./data/stress_urinary_incontinence.csv")

In [6]:
# Text cleaning function
def clean_text(text):
    # Remove special characters, digits, and extra spaces
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()  # Convert to lowercase

# Apply the cleaning function to the text column
df['cleaned_text'] = df['FOI_TEXT'].apply(clean_text)

In [3]:
df = df.head(10)

In [7]:
import nltk
nltk.download('punkt')

# Function to segment text into sentences
def segment_text(text):
    return nltk.sent_tokenize(text)

# Apply the function to segment the cleaned text
df['segmented_text'] = df['cleaned_text'].apply(segment_text)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhaohengchuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('valhalla/t5-small-qg-hl')
model = T5ForConditionalGeneration.from_pretrained('valhalla/t5-small-qg-hl')

def generate_questions(text):
    # Prepare the input for the model
    input_text = f"generate question: {text}"
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    
    # Generate questions
    outputs = model.generate(inputs, max_length=150, num_beams=5, early_stopping=True)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

# Example of generating questions for each sentence
for index, row in df.iterrows():
    questions = []
    for sentence in row['segmented_text']:
        question = generate_questions(sentence)
        questions.append(question)
    df.at[index, 'questions'] = questions


tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [9]:
from transformers import pipeline

# Load a pre-trained QA model
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

def generate_answer(question, context):
    # Extract the answer from the context based on the question
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Generate answers based on the generated questions
qa_pairs = []
for index, row in df.iterrows():
    context = row['FOI_TEXT']
    for question in row['questions']:
        answer = generate_answer(question, context)
        qa_pairs.append({
            "question": question,
            "answer": answer
        })

# Convert QA pairs to DataFrame
qa_df = pd.DataFrame(qa_pairs)
print(qa_df.head())


2024-09-28 21:23:19.236339: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

                                            question  \
0                       What is a medtronic product?   
1                 What is the name of the complaint?   
2  What will a supplemental report be issued if i...   
3  What was the manufacturer reference number b4 ...   
4  What was the preoperative and postoperative di...   

                                              answer  
0  BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...  
1                                  MEDTRONIC PRODUCT  
2               A SUPPLEMENTAL REPORT WILL BE ISSUED  
3                                              (B)(4  
4                                            EROSION  


In [11]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

# Load your QA pairs
# qa_df = pd.read_csv('generated_qa_pairs.csv')

# Prepare dataset for fine-tuning
data = []
for index, row in qa_df.iterrows():
    data.append({"input": row['question'], "output": row['answer']})

# Create a Hugging Face Dataset
dataset = Dataset.from_list(data)

In [14]:
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [19]:



# Tokenization
def tokenize_function(examples):
    # Tokenize the input and output
    input_ids = tokenizer(examples['input'], truncation=True, padding='max_length', max_length=50)
    output_ids = tokenizer(examples['output'], truncation=True, padding='max_length', max_length=50)
    
    # Prepare labels (shifted input)
    labels = output_ids['input_ids']
    
    # Return the tokenized inputs and labels
    return {
        'input_ids': input_ids['input_ids'],
        'attention_mask': input_ids['attention_mask'],
        'labels': labels
    }


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)




Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [20]:
# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 44.8309, 'train_samples_per_second': 0.669, 'train_steps_per_second': 0.335, 'train_loss': 3.2952044169108072, 'epoch': 3.0}


('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [21]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [22]:
import torch

def generate_answer(question, max_length=50):
    # Tokenize the input question
    inputs = tokenizer.encode(question, return_tensors='pt')  # Add batch dimension

    # Generate the response
    with torch.no_grad():  # Disable gradient calculations
        outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1)

    # Decode the generated tokens
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [24]:
# Example question
question = "What is stress urinary incontinence?"

# Generate an answer
answer = generate_answer(question)

# Print the answer
print("Question:", question)
print("Answer:", answer)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: What is stress urinary incontinence?
Answer: What is stress urinary incontinence?


In [5]:
import spacy

# Load a pre-trained NER model for medical text
nlp = spacy.load("en_core_sci_md")  # SciSpacy model

# Function to extract entities
def extract_context(text):
    doc = nlp(text)
    return " ".join([ent.text for ent in doc.ents])  # Join entities as potential context

# Apply the function to extract context
df['context'] = df['cleaned_text'].apply(extract_context)


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("ramsrigouthamg/t5_squad_v1")
tokenizer = T5Tokenizer.from_pretrained("ramsrigouthamg/t5_squad_v1")


  return self.fget.__get__(instance, owner)()
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Function to generate question based on context
def generate_question(context):
    input_text = "generate question: " + context
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# from tqdm.notebook import tqdm
# # Apply the function to generate questions
# df['generated_question'] = df['context'].apply(generate_question)

In [None]:
from tqdm.notebook import tqdm
# Get total number of rows for progress calculation
total_rows = len(df)

# Apply the function with a progress bar
generated_questions = []
for index, row in tqdm(df.iterrows(), total=total_rows, desc="Processing rows", unit="row"):
    generated_question = generate_question(row['context'])
    generated_questions.append(generated_question)

# Assign generated questions back to the DataFrame
df['generated_question'] = generated_questions


In [None]:
qa_pairs = []

for index, row in df.iterrows():
    qa_pairs.append({
        "question": row['generated_question'],
        "context": row['context']
    })

# Example output
print(qa_pairs[:2])


In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and validation sets
train_data, val_data = train_test_split(qa_pairs, test_size=0.2)

# Format the data for Hugging Face fine-tuning
train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)


In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Load BERT model and tokenizer
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the QA data
def tokenize_qa(examples):
    inputs = tokenizer(examples['question'], examples['context'], truncation=True, padding=True)
    return inputs

# Convert train and validation data to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.map(tokenize_qa, batched=True)
val_dataset = val_dataset.map(tokenize_qa, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()
