In [None]:
!pip install huggingface_hub
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
!pip install transformers accelerate evaluate datasets peft -q

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# The final cell calls `push_to_hub`, so this must be run first.
from huggingface_hub import notebook_login

notebook_login()

In [41]:
import json
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

In [4]:
# Location of the cpgQA v1.0 JSON file that is read with pd.read_json below.
# NOTE(review): absolute /content path — presumably a Colab upload location;
# adjust when running elsewhere.
file_data_name = "/content/cpgQA-v1.0.json"

In [6]:
# Parse the JSON file into a frame and keep only its "data" column
# (one entry per item, iterated over when the rows are flattened).
data = pd.read_json(file_data_name).loc[:, "data"]

In [7]:
# Flatten the nested records in `data` into one row per question:
# [title, id, question, answer_text, answer_start, context].
rows = []
for item in data:
    title = item.get("title", "")

    # The original layout stores `paragraphs` as a single dict with "context"
    # and "qas" keys. A list of such dicts is accepted as well, so both
    # layouts are flattened the same way.
    paragraphs = item.get("paragraphs") or {}
    if isinstance(paragraphs, dict):
        paragraphs = [paragraphs]

    for paragraph in paragraphs:
        context = paragraph.get("context", "")
        for qa in paragraph.get("qas", []):
            # Only the first answer is kept. `or [{}]` also covers a missing,
            # None or empty "answers" list, which would otherwise fail on [0].
            first_answer = (qa.get("answers") or [{}])[0]
            rows.append(
                [
                    title,
                    qa.get("id", ""),
                    qa.get("question", ""),
                    first_answer.get("text", ""),
                    # NOTE(review): the "" fallback is kept from the original;
                    # mixing "" with integer offsets in one column may need
                    # cleaning before conversion to a Dataset — confirm.
                    first_answer.get("answer_start", ""),
                    context,
                ]
            )

# Create a Pandas DataFrame
columns = ["title", "id", "question", "answer_text", "answer_start", "context"]
df = pd.DataFrame(rows, columns=columns)

In [None]:
# Display the flattened question/answer table for a visual sanity check.
df

In [39]:
# Context-level train/test split: every context ends up entirely in train or
# entirely in test, and the test set holds at most TEST_FRACTION of all rows.
# `df` is the flattened QA table built from `rows` in the flattening cell.
TEST_FRACTION = 0.1
SPLIT_SEED = 42

# Randomly nominate a share of the distinct contexts as test candidates.
unique_contexts = df["context"].unique()
train_contexts, test_contexts = train_test_split(
    unique_contexts, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Upper bound on the number of rows allowed in the test set.
max_test_size = int(TEST_FRACTION * len(df))

# Number of QA rows per context, computed once instead of re-filtering `df`
# for every candidate context.
rows_per_context = df["context"].value_counts().to_dict()

# Largest candidates first. sorted() is stable, so candidates with equal row
# counts keep the order returned by train_test_split.
test_contexts_sorted = sorted(
    test_contexts, key=lambda context: -rows_per_context.get(context, 0)
)  # DESC

# Greedily accept candidates while the row budget allows. Candidates that do
# not fit are skipped and therefore remain in the training set.
selected_contexts = []
n_test_rows = 0
for context in test_contexts_sorted:
    n_rows = rows_per_context.get(context, 0)
    if n_test_rows + n_rows <= max_test_size:
        selected_contexts.append(context)
        n_test_rows += n_rows

# Assemble the test frame with a single concat, in acceptance order, instead
# of growing it from an empty DataFrame inside the loop (which is quadratic
# and can change the column dtypes).
if selected_contexts:
    test_df = pd.concat(
        [df[df["context"] == context] for context in selected_contexts], axis=0
    )
else:
    # Nothing fits the budget: empty frame with the same columns and dtypes.
    test_df = df.iloc[0:0]

# Everything that was not selected for test is training data; test_df keeps
# the original index labels of `df`, so the index identifies its rows.
train_df = df[~df.index.isin(test_df.index)]

In [None]:
# Display the training portion of the split for a visual sanity check.
train_df

# Unique Context

In [42]:
# Replace the index labels inherited from `df` with a fresh 0..n-1 index on
# both splits, discarding the old labels.
train_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, test_df)
)

In [43]:
# Convert each pandas split into a `datasets.Dataset`.
train_ds, test_ds = map(Dataset.from_pandas, (train_df, test_df))

In [45]:
# Bundle both splits under their split names, "train" first and "test" second.
hf_dataset = DatasetDict()
hf_dataset["train"] = train_ds
hf_dataset["test"] = test_ds

In [None]:
# Upload the train/test DatasetDict to the Hugging Face Hub under this repo id.
hub_repo_id = "minh21/cpgQA-v1.0-unique-context-test-10-percent"
hf_dataset.push_to_hub(hub_repo_id)