In [1]:
import pandas as pd
import re

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
# Later cells read its 'FOI_TEXT' column as the free-text input.
df = pd.read_csv("./data/stress_urinary_incontinence.csv")

In [3]:
# Text cleaning function
def clean_text(text):
    """Normalise a free-text value for downstream processing.

    Punctuation (anything that is neither a word character nor whitespace)
    is removed, runs of whitespace are collapsed to a single space, the
    result is stripped and lower-cased. Digits and underscores are word
    characters and are therefore kept.

    Parameters
    ----------
    text : str or any
        The raw text. Non-string values (e.g. the NaN/None pandas uses for
        missing cells) are treated as missing.

    Returns
    -------
    str
        The cleaned text, or an empty string for missing input.
    """
    # Missing cells are not strings; re.sub would raise TypeError on them
    if not isinstance(text, str):
        return ""
    # Strip punctuation first: removing a token such as " - " leaves two
    # adjacent spaces, which the whitespace pass below then collapses
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# Derive a normalised copy of the narrative column; 'FOI_TEXT' itself is left untouched
df['cleaned_text'] = df['FOI_TEXT'].map(clean_text)

In [4]:
import nltk
# Make sure the 'punkt' tokenizer data is available locally;
# nltk.sent_tokenize (used below) relies on it.
nltk.download('punkt')

# Function to segment text into sentences
def segment_text(text):
    """Split ``text`` into sentences with NLTK's sentence tokenizer.

    Returns whatever ``nltk.sent_tokenize`` produces for ``text``.
    """
    sentences = nltk.sent_tokenize(text)
    return sentences

# Segment the RAW narrative, then clean each sentence.
# clean_text() strips the sentence-ending punctuation that the tokenizer
# uses to find boundaries, so segmenting 'cleaned_text' would always yield
# a single "sentence" per record. Non-string (missing) cells give an empty list.
df['segmented_text'] = df['FOI_TEXT'].apply(
    lambda raw: [clean_text(sentence) for sentence in segment_text(raw)]
    if isinstance(raw, str)
    else []
)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhaohengchuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import spacy

# Load a pre-trained NER model for medical text.
# NOTE(review): "en_core_sci_md" is presumably a separately installed
# SciSpacy model package — confirm it is listed in the environment setup.
nlp = spacy.load("en_core_sci_md")  # SciSpacy model

# Function to extract entities
def extract_context(text):
    """Return the entity mentions found in ``text`` as one string.

    The text is run through the loaded ``nlp`` pipeline and the surface
    text of every detected entity is joined with single spaces, in
    document order. The result is an empty string when no entity is found.
    """
    entity_mentions = (entity.text for entity in nlp(text).ents)
    return " ".join(entity_mentions)

# Build the 'context' column: the entity mentions of each cleaned narrative
df['context'] = df['cleaned_text'].map(extract_context)


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pre-trained T5 model and its tokenizer from the same checkpoint
t5_checkpoint = "ramsrigouthamg/t5_squad_v1"
model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint)


  return self.fget.__get__(instance, owner)()
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Function to generate question based on context
def generate_question(context, max_input_tokens=512, max_question_tokens=64):
    """Generate a question for ``context`` with the notebook-level T5 model.

    Parameters
    ----------
    context : str
        Text to generate a question about.
    max_input_tokens : int, default 512
        The encoded prompt is truncated to this many tokens so that very
        long contexts cannot blow up memory use.
    max_question_tokens : int, default 64
        Upper bound on the length of the generated sequence. Passed
        explicitly because the library's default generation length is short
        enough to cut questions off mid-sentence.

    Returns
    -------
    str
        The decoded question, or an empty string when ``context`` is
        missing or blank (nothing meaningful can be generated from it).

    Notes
    -----
    Reads the module-level ``tokenizer`` and ``model``; they must refer to
    the T5 objects when this function is called.
    NOTE(review): the published usage examples for this checkpoint appear
    to use a "context: ... answer: ..." prompt — confirm that the
    "generate question: " prefix is what the checkpoint expects.
    """
    # No usable context (e.g. no entities were extracted): skip the model
    if not isinstance(context, str) or not context.strip():
        return ""

    input_text = "generate question: " + context
    input_ids = tokenizer.encode(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    outputs = model.generate(input_ids, max_length=max_question_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# from tqdm.notebook import tqdm
# # Apply the function to generate questions
# df['generated_question'] = df['context'].apply(generate_question)

In [None]:
from tqdm.notebook import tqdm
# Row count, so the progress bar knows where it ends
total_rows = len(df)

# Generate one question per context, reporting progress as we go
generated_questions = [
    generate_question(context)
    for context in tqdm(df['context'], total=total_rows, desc="Processing rows", unit="row")
]

# Store the questions next to the rows they were generated from
df['generated_question'] = generated_questions


In [None]:
# Pair every generated question with the context it was generated from:
# one {"question": ..., "context": ...} dict per DataFrame row, in row order
qa_pairs = (
    df[['generated_question', 'context']]
    .rename(columns={'generated_question': 'question'})
    .to_dict('records')
)

# Example output
print(qa_pairs[:2])


In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and validation sets.
# A fixed random_state makes the 80/20 split identical on every run, so
# results are reproducible after a kernel restart.
train_data, val_data = train_test_split(qa_pairs, test_size=0.2, random_state=42)

# Format the data for Hugging Face fine-tuning
train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)


In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Load BERT model and tokenizer.
# NOTE(review): these assignments rebind the notebook-level `model` and
# `tokenizer` names that generate_question() reads. After this cell has run,
# generate_question() would use BERT instead of T5 — re-run the T5 loading
# cell before regenerating questions.
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Every example is padded/truncated to this fixed length.
MAX_SEQ_LENGTH = 512


# Tokenize the QA data
def tokenize_qa(examples):
    """Tokenize a batch of (question, context) pairs.

    ``examples`` is a batch from ``Dataset.map(..., batched=True)`` with
    'question' and 'context' columns. Each pair is truncated and padded to
    exactly ``MAX_SEQ_LENGTH`` tokens.

    Padding to a fixed length (rather than ``padding=True``, which pads only
    to the longest item of the current map batch) gives every row the same
    length, so rows that came from different map batches can be stacked into
    one training batch.
    """
    return tokenizer(
        examples['question'],
        examples['context'],
        truncation=True,
        padding="max_length",
        max_length=MAX_SEQ_LENGTH,
    )


# Convert train and validation data to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.map(tokenize_qa, batched=True)
val_dataset = val_dataset.map(tokenize_qa, batched=True)

# NOTE(review): the datasets contain only 'question' and 'context' (plus the
# tokenizer outputs). Extractive QA training needs answer-span labels
# ('start_positions' / 'end_positions'); without them the model appears to
# return no loss and trainer.train() is expected to fail. Answer spans must
# be added to the QA pairs before this cell can train — TODO.

# Set training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()
