In [None]:
!pip install huggingface_hub
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
!pip install transformers accelerate evaluate datasets peft -q

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; presumably needed for the
# push_to_hub calls later in the notebook.
notebook_login()

In [None]:
import json
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

In [None]:
# Location of the cpgQA JSON file. This is an absolute path under /content,
# so adjust it when running outside that environment.
file_data_name = "/content/cpgQA-v1.0.json"

In [None]:
# Read the JSON file and keep only its "data" field; each element is later
# iterated and accessed with .get(), i.e. treated as a dict.
data = pd.read_json(file_data_name)["data"]

In [None]:
# Flatten the nested QA structure into one row per question.
rows = []
for item in data:
    title = item.get("title", "")
    paragraphs = item.get("paragraphs") or {}
    # Accept either a single paragraph dict or a SQuAD-style list of
    # paragraph dicts, so both layouts produce rows.
    paragraph_list = [paragraphs] if isinstance(paragraphs, dict) else paragraphs
    for paragraph in paragraph_list:
        context = paragraph.get("context", "")
        for qa in paragraph.get("qas", []):
            # Only the first answer is kept. A missing or empty `answers` list
            # falls back to blank fields instead of raising IndexError.
            first_answer = (qa.get("answers") or [{}])[0]
            rows.append(
                [
                    title,
                    qa.get("id", ""),
                    qa.get("question", ""),
                    first_answer.get("text", ""),
                    first_answer.get("answer_start", ""),
                    context,
                ]
            )

# Create a Pandas DataFrame
columns = ["title", "id", "question", "answer_text", "answer_start", "context"]
df = pd.DataFrame(rows, columns=columns)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Rebuild the flat QA table from the extracted rows.
columns = ["title", "id", "question", "answer_text", "answer_start", "context"]
df = pd.DataFrame(data=rows, columns=columns)

# Split at the context level (10% of the distinct contexts go to test) so that
# a context never contributes questions to both train and test.
unique_contexts = df["context"].unique()
train_contexts, test_contexts = train_test_split(
    unique_contexts,
    test_size=0.1,
    random_state=42,
)

# Select the rows whose context belongs to each side of the split.
in_train = df["context"].isin(train_contexts)
in_test = df["context"].isin(test_contexts)
train_df = df.loc[in_train]
test_df = df.loc[in_test]

In [None]:
# Distinct contexts present in the training split. The split frame is named
# `train_df`; no `train_set` is defined anywhere in the notebook.
arr = list(train_df["context"].unique())

In [None]:
# Distinct contexts present in the test split. The split frame is named
# `test_df`; no `test_set` is defined anywhere in the notebook.
test_df["context"].unique()

# Validation set

In [None]:
# Second name for the full QA table (plain assignment, no copy); it is the
# input to the row-level split below.
dataset = df

In [None]:
# Row-level 80/10/10 split: hold out 20% of the rows, then halve the hold-out
# into validation and test. Rows are split individually here, so questions that
# share a context may land in different splits.
train_dataset, temp_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
validation_dataset, test_dataset = train_test_split(
    temp_dataset, test_size=0.5, random_state=42
)

In [None]:
# Wrap each row-level split in a DataFrame.
# NOTE: this rebinds train_df and test_df, which until now held the
# context-level split computed earlier in the notebook.
train_df = pd.DataFrame(train_dataset)
validation_df = pd.DataFrame(validation_dataset)
test_df = pd.DataFrame(test_dataset)

In [None]:
# Give every split a fresh 0..n-1 index; drop=True discards the old labels
# instead of keeping them as a column.
train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Inspect the training split.
train_df

Unnamed: 0,title,id,question,answer_text,answer_start,context
0,Features and overview,2,What should be done if an opioid thrapy is dis...,plan,127,Opioids are not first-line or routine therapy ...
1,Features and overview,6,What is not first-line or routine therapy for ...,Opioids,0,Opioids are not first-line or routine therapy ...
2,Background information,590,Where can the DoD Opioid Prescriber Safety Tr...,http://opstp.cds.pesgce.com/hub.php,828,The presidential memorandum of October 2015 ma...
3,Background information,634,Since when has there been a significant increm...,the late 1990s and early 2000s,273,"Chronic pain is among the most common, costly,..."
4,Recommendations,842,All patients who take opioids chronically are ...,OUD and overdose,58,All patients who take opioids chronically are ...
...,...,...,...,...,...,...
872,Algorithm,466,What is module D for?,patients currently on opioid therapy,16,Module D is for patients currently on opioid t...
873,Features and overview,121,What to prescribe to patients at increased ris...,naloxone,965,"When formulating an opioid taper plan, determi..."
874,Recommendations,1044,Which specific safety precautions should all c...,Transdermal fentanyl should not be used in opi...,413,Given the potential serious risks with startin...
875,Features and overview,1095,What does the OTRR do?,allows VA providers to review clinical data re...,252,There are electronic tools to facilitate clini...


In [None]:
# Convert each pandas split into a Hugging Face Dataset
# (order: train, validation, test).
train_ds, validation_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, validation_df, test_df)
)

In [None]:
# Display the training Dataset.
train_ds

In [None]:
# Bundle the three splits under their conventional split names.
hf_dataset = DatasetDict(
    train=train_ds,
    validation=validation_ds,
    test=test_ds,
)

In [None]:
# Upload the train/validation/test bundle to the Hub.
# NOTE(review): unlike the later pushes, this repo id has no "minh21/"
# namespace prefix; confirm the intended destination.
hf_dataset.push_to_hub("cpgQA-v1.0")

# Unique Context

In [None]:
# Rebuild the context-disjoint split in this section. By this point train_df
# and test_df have been rebound to the row-level split from the "Validation set"
# section, so reusing them would publish that split under the "unique-context"
# name.
unique_contexts = df["context"].unique()
train_contexts, test_contexts = train_test_split(
    unique_contexts, test_size=0.1, random_state=42
)
train_df = df[df["context"].isin(train_contexts)].reset_index(drop=True)
test_df = df[df["context"].isin(test_contexts)].reset_index(drop=True)

# Guard the invariant this dataset variant is named after.
assert set(train_df["context"]).isdisjoint(test_df["context"])

In [None]:
# Convert the two pandas splits into Hugging Face Datasets (order: train, test).
train_ds, test_ds = (Dataset.from_pandas(frame) for frame in (train_df, test_df))

In [None]:
# Inspect the training split used for this variant.
train_df

In [None]:
# Bundle the two splits; this variant has no validation split.
hf_dataset = DatasetDict({"train": train_ds, "test": test_ds})

In [None]:
# Upload the train/test bundle to the Hub under the minh21 namespace.
hf_dataset.push_to_hub("minh21/cpgQA-v1.0-unique-context")

# Unique context, with examples exceeding the Flan-T5 token limit (512) removed

In [None]:
# Renumber the split frames from 0 before converting them.
# NOTE(review): this section is titled "unique context", but train_df/test_df
# are whatever the preceding cells last assigned; confirm they hold the
# context-level split and not the row-level one from the "Validation set" section.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Convert the pandas splits into Hugging Face Datasets so they can be filtered
# with Dataset.filter below.
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

In [None]:
# Checkpoint whose tokenizer is loaded below to measure example lengths.
model_name = "google/flan-t5-large"

In [None]:
# Load the tokenizer that belongs to the checkpoint; use_fast=True requests the
# fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, use_fast=True
)  # Maps text to token ids ("input_ids"), which are counted in the filter below

In [None]:
MAX_INPUT_TOKENS = 512  # token limit named in this section's heading
# Margin added to every example's token count before comparing to the limit.
# NOTE(review): presumably head-room for prompt/answer tokens; confirm.
TOKEN_MARGIN = 30


def fits_token_budget(example):
    """Return True when context + question (plus the margin) stays under the limit.

    `example` is a single dataset row with "context" and "question" strings.
    The two are joined with a newline and tokenized with the notebook's
    `tokenizer`; the row is kept only if the token count plus TOKEN_MARGIN is
    strictly below MAX_INPUT_TOKENS.
    """
    prompt = example["context"] + "\n" + example["question"]
    return len(tokenizer(prompt)["input_ids"]) + TOKEN_MARGIN < MAX_INPUT_TOKENS


# Apply the same predicate to both splits so train and test share one limit.
filtered_train = train_ds.filter(fits_token_budget)
filtered_test = test_ds.filter(fits_token_budget)

# Only the last expression of a cell is rendered automatically, so the training
# split is shown explicitly.
display(filtered_train)
filtered_test

In [None]:
# Display the length-filtered test split.
filtered_test

In [None]:
# Bundle the length-filtered splits produced by the filter cell above. Both
# sides use the filtered datasets, so no published example exceeds the limit.
hf_dataset = DatasetDict({"train": filtered_train, "test": filtered_test})

In [None]:
# Upload the Flan-T5 variant of the dataset to the Hub under the minh21 namespace.
hf_dataset.push_to_hub("minh21/cpgQA-v1.0-unique-context-for-flan-t5")