In [None]:
! pip install huggingface_hub
! pip install -U sentence-transformers
! pip install nltk
! pip install datasets

In [None]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
import re
from nltk import sent_tokenize, word_tokenize
import nltk
import math

# Tokenizer data used by sent_tokenize / word_tokenize. Older NLTK releases read
# "punkt"; recent releases look up "punkt_tab" instead. NLTK is installed
# unpinned above, so fetch both.
for tokenizer_resource in ("punkt", "punkt_tab"):
    nltk.download(tokenizer_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Location of the raw COVID-QA JSON dump
json_file_path = "/content/COVID-QA.json"

# Parse the file with pandas and keep only its top-level "data" column
data = pd.read_json(json_file_path)
format_data = data.loc[:, "data"]

In [None]:
# Expand each entry's "paragraphs" list into one DataFrame row per paragraph
data_df = pd.json_normalize(data=format_data, record_path="paragraphs")

In [None]:
# Transform and restructure data: one row per question, taken from the nested
# "qas" list of every paragraph.

schema = {
    "question": str,
    "answer_text": str,
    "answer_start": int,
    "is_impossible": bool,
    "document_id": int,
    "id": int,
    "context": str,
}

# Collect plain dicts and build the frame once. Growing a DataFrame row by row
# copies it on every iteration, and `DataFrame._append` is a private method.
question_records = []
for _, paragraph in data_df.iterrows():
    for qas in paragraph["qas"]:
        # Only the first listed answer of each question is kept.
        first_answer = qas["answers"][0]
        question_records.append(
            {
                "question": qas["question"],
                "answer_text": first_answer["text"],
                "answer_start": first_answer["answer_start"],
                "is_impossible": qas["is_impossible"],
                "document_id": paragraph["document_id"],
                "id": qas["id"],
                "context": "",
            }
        )

# `columns=` fixes the column order and keeps the columns present even when
# there are no records.
current_df = pd.DataFrame(question_records, columns=list(schema.keys()))

In [None]:
# Pull the raw context text and the matching document ids out of the paragraph frame
data_context, data_document_id = (
    data_df[column] for column in ("context", "document_id")
)

In [None]:
def clean_data(text):
    r"""Strip boilerplate from a raw document text.

    Keeps only what follows the first "\nAbstract: " marker (the whole text
    when the marker is absent), then deletes URLs, DOI references,
    "(0.NN MB DOC)" notes and bare "www." host names.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text as a string.
    """
    # Extract abstract content; keep the original text if the marker is missing
    marker = "\nAbstract: "
    marker_pos = text.find(marker)
    cleaned_text = text[marker_pos + len(marker) :] if marker_pos != -1 else text

    # Remove both http and https links, including the malformed "http//:" form
    # and an optional space right after the scheme
    cleaned_text = re.sub(
        r"(http(s|)\/\/:( |)\S+)|(http(s|):\/\/( |)\S+)", "", cleaned_text
    )

    # Remove DOI patterns like "doi:10.1371/journal.pone.0007211.s003".
    # `\S*\w` consumes the whole identifier (dots and slashes included) but
    # stops at its last word character, so trailing punctuation such as ")"
    # or "." stays in the text. The previous `\w+` stopped at the first ".".
    cleaned_text = re.sub(r"doi:( |)\S*\w", "", cleaned_text)

    # Remove the "(0.11 MB DOC)" pattern
    cleaned_text = re.sub(r"\(0\.\d+ MB DOC\)", "", cleaned_text)

    # Remove bare host names such as "www.plos.org". The dot before "org" is
    # escaped so that it no longer matches an arbitrary character
    # (e.g. the space in "www.plos organization").
    cleaned_text = re.sub(r"www\.\w+(\.org|)", "", cleaned_text)

    return cleaned_text

In [None]:
# Read the raw contexts straight from `data_df`: `data_context` is rebound to a
# two-column DataFrame in the next cell, so applying `clean_data` to it would
# fail when this cell is run again.
data_cleaned = data_df["context"].apply(clean_data)

In [None]:
# Put the cleaned contexts and their document ids side by side in one frame
data_context = pd.concat(objs=[data_cleaned, data_document_id], axis="columns")

In [None]:
# Count the questions per document in one pass and attach the counts by
# document id, instead of scanning both frames once per document.
# Documents that have no question get NaN; the cast keeps the column float.
questions_per_document = current_df["document_id"].value_counts().to_dict()
data_context["document_id_count"] = (
    data_context["document_id"].map(questions_per_document).astype(float)
)

## Check answer in context

In [None]:
# Sanity check: each answer string must occur verbatim in the cleaned context
# of its document. Offending rows are printed and counted.
invalid_count = 0
for _, question_row in current_df.iterrows():
    same_document = data_context["document_id"] == question_row["document_id"]
    document_context = data_context.loc[same_document, "context"].values[0]

    if question_row["answer_text"] in document_context:
        continue
    invalid_count += 1
    print(question_row)
print("Number of invalid: ", invalid_count)

Number of invalid:  0


# Split the dataset by document: 90% / 10%

In [None]:
sentence_transformer_percent = 0.90
test_set_percent = 0.10

# Accepted deviation (in questions) from the exact 10% target, the first seed
# to try, and a cap on the number of seeds so the search cannot loop forever.
question_tolerance = 10
first_seed = 13
max_seed_attempts = 10_000

target_questions = test_set_percent * len(current_df)

# Documents are split, not questions. Try successive seeds until the documents
# on the test side carry roughly 10% of all questions.
for x in range(first_seed, first_seed + max_seed_attempts):
    # train_test_split returns (train, test). With test_size = 90%, the first
    # ("train") part is the 10% of documents kept as the test set.
    test_set_context, sentence_transformer_context = train_test_split(
        data_context, test_size=1 - test_set_percent, random_state=x
    )

    test_sum = test_set_context["document_id_count"].sum()

    if (
        target_questions - question_tolerance
        <= test_sum
        <= target_questions + question_tolerance
    ):
        break
else:
    raise RuntimeError(
        f"No seed in [{first_seed}, {first_seed + max_seed_attempts}) gives a test set "
        f"within {question_tolerance} questions of {target_questions:.1f}"
    )

# Question count on the sentence-transformer side of the accepted split
sentence_sum = sentence_transformer_context["document_id_count"].sum()

print(f"x = {x}")

x = 13


In [None]:
# Assign each question to the split that its document ended up in
in_transformer_split = current_df["document_id"].isin(
    sentence_transformer_context["document_id"]
)
in_test_split = current_df["document_id"].isin(test_set_context["document_id"])

sentence_transformer_data = current_df.loc[in_transformer_split]
test_set_data = current_df.loc[in_test_split]

In [None]:
# Number of questions in each split, one per line
print(len(sentence_transformer_data), len(test_set_data), sep="\n")

1816
203


In [None]:
# Number of distinct documents in each split, one per line
print(
    sentence_transformer_data["document_id"].nunique(dropna=False),
    test_set_data["document_id"].nunique(dropna=False),
    sep="\n",
)

133
14


In [None]:
# Chunking configuration read by chunk_splitter
min_sentences_per_chunk = 3  # sentences are added unconditionally below this count
chunk_size = 64  # token limit checked before another sentence is added

# The overlap carried into the next chunk is a quarter of each setting:
# a sentence window (rounded up) and a token budget.
overlap_fraction = 0.25
window_size = math.ceil(overlap_fraction * min_sentences_per_chunk)
over_lap_chunk_size = overlap_fraction * chunk_size


def chunk_splitter(context):
    """Split ``context`` into overlapping chunks made of whole sentences.

    Sentences come from ``sent_tokenize``. A sentence is added to the current
    chunk while the chunk holds fewer than ``min_sentences_per_chunk``
    sentences, or while chunk plus sentence tokenizes to fewer than
    ``chunk_size`` tokens. Otherwise the chunk is emitted and the next chunk
    starts with trailing sentences of the emitted one, followed by the
    sentence that did not fit.

    Reads the module-level settings ``min_sentences_per_chunk``,
    ``chunk_size``, ``window_size`` and ``over_lap_chunk_size``.

    Args:
        context: Text to split.

    Returns:
        List of chunk strings in order; each is its sentences joined by a space.
    """
    chunks = []
    current_chunk = []

    for sentence in sent_tokenize(context):
        # `or` short-circuits: the token count is only computed once the chunk
        # already holds the minimum number of sentences.
        if len(current_chunk) < min_sentences_per_chunk or (
            len(nltk.word_tokenize(" ".join(current_chunk) + " " + sentence))
            < chunk_size
        ):
            current_chunk.append(sentence)
            continue

        # The sentence does not fit: close the current chunk.
        chunks.append(" ".join(current_chunk))

        # Choose the overlap. Start with the last `window_size` sentences and
        # widen by one sentence while the candidate stays within the overlap
        # token budget. `carried` is the last candidate that passed the check,
        # or the initial window if even that one is over budget.
        overlap_width = window_size
        candidate = current_chunk[-overlap_width:]
        carried = candidate

        while len(word_tokenize(" ".join(candidate))) <= over_lap_chunk_size:
            carried = candidate
            overlap_width += 1
            candidate = current_chunk[-overlap_width:]
            if overlap_width >= len(current_chunk):
                # The window has grown to the whole chunk; stop widening.
                break

        # Next chunk = overlap sentences + the sentence that triggered the split.
        current_chunk = carried + [sentence]

    # Emit whatever is left after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


texts = {}
chunks_list = []

# Chunk every cleaned context once: `texts` maps a document id to its chunk
# list, `chunks_list` collects the chunks of all documents in order.
for doc_id, document_context in zip(
    data_context["document_id"], data_context["context"]
):
    chunks = chunk_splitter(document_context)
    texts[doc_id] = chunks
    chunks_list.extend(chunks)

# Test set data preparation

In [None]:
schema = {
    "question": str,
    "answer": str,
    "context_chunks": list,
    "document_id": int,
    "id": int,
    "context": str,
}

# Resolve each document's context once instead of filtering `data_context` for
# every question. `drop_duplicates` keeps the first row per document id, which
# is the row the previous `.values[0]` lookup returned.
context_by_document = (
    data_context.drop_duplicates("document_id")
    .set_index("document_id")["context"]
    .to_dict()
)

# Collect plain dicts and build the frame once. Growing a DataFrame row by row
# copies it on every iteration, and `DataFrame._append` is a private method.
testset_records = []
for _, question_row in test_set_data.iterrows():
    question_document_id = question_row["document_id"]
    testset_records.append(
        {
            "question": question_row["question"],
            "answer": question_row["answer_text"],
            "context": context_by_document[question_document_id],
            # each row gets its own copy of the document's chunk list
            "context_chunks": list(texts[question_document_id]),
            "document_id": question_document_id,
            "id": question_row["id"],
        }
    )

# `columns=` fixes the column order declared in `schema`.
testset_data_results = pd.DataFrame(testset_records, columns=list(schema.keys()))

In [None]:
# Summary statistics of the number of chunks attached to each test question
testset_data_results["context_chunks"].map(len).describe()

count    203.000000
mean      87.866995
std       36.200310
min       14.000000
25%       49.000000
50%       83.000000
75%      115.000000
max      157.000000
Name: context_chunks, dtype: float64

In [None]:
# Display the assembled test-set frame (pandas abbreviates the rendering).
testset_data_results

Unnamed: 0,question,answer,context_chunks,document_id,id,context
0,What is IFITM?,interferon-induced transmembrane,"[Recently, one of the interferon-induced trans...",650,568,"Recently, one of the interferon-induced transm..."
1,How many cysteine residues are contained in th...,three,"[Recently, one of the interferon-induced trans...",650,569,"Recently, one of the interferon-induced transm..."
2,What inhibits S-palmitoylation?,2-bromopalmitic acid (2BP),"[Recently, one of the interferon-induced trans...",650,570,"Recently, one of the interferon-induced transm..."
3,What interaction is inhibited by the presence ...,IFITM5 with FKBP11,"[Recently, one of the interferon-induced trans...",650,571,"Recently, one of the interferon-induced transm..."
4,What is a function associated with IFITM5?,bone formation factor.,"[Recently, one of the interferon-induced trans...",650,572,"Recently, one of the interferon-induced transm..."
...,...,...,...,...,...,...
198,"for the 2009 influenza pandemic, what were the...",that most cases of H1N1 influenza A virus infe...,[nan\n\nText: Influenza and influenza viruses ...,776,300,nan\n\nText: Influenza and influenza viruses a...
199,What factors would contribute now to the fast...,"Nowadays, we travel faster, and we travel more...",[nan\n\nText: Influenza and influenza viruses ...,776,301,nan\n\nText: Influenza and influenza viruses a...
200,What factors would be responsible in future fo...,influenza virus infections are controllable a...,[nan\n\nText: Influenza and influenza viruses ...,776,302,nan\n\nText: Influenza and influenza viruses a...
201,What was the detected fatality rate of H7N9 Av...,"the detected 32.14% (45/140, one case from Ta...",[nan\n\nText: Influenza and influenza viruses ...,776,303,nan\n\nText: Influenza and influenza viruses a...


# QA transformer dataset preparation

In [None]:
schema = {
    "question": str,
    "answer": str,
    "context_chunks": list,
    "document_id": int,
    "id": int,
}
qa_transformer_data = sentence_transformer_data

# One record per question. The frame is built once from the list: growing a
# DataFrame row by row copies it on every iteration, and `DataFrame._append`
# is a private method.
qa_records = [
    {
        "question": question_row["question"],
        "answer": question_row["answer_text"],
        # each row gets its own copy of the document's chunk list
        "context_chunks": list(texts[question_row["document_id"]]),
        "document_id": question_row["document_id"],
        "id": question_row["id"],
    }
    for _, question_row in qa_transformer_data.iterrows()
]

# `columns=` fixes the column order declared in `schema`.
qa_transformer_data_results = pd.DataFrame(qa_records, columns=list(schema.keys()))

In [None]:
# Display the assembled question-answering frame (pandas abbreviates the rendering).
qa_transformer_data_results

Unnamed: 0,question,answer,context_chunks,document_id,id
0,What is the main cause of HIV-1 infection in c...,Mother-to-child transmission (MTCT) is the mai...,[BACKGROUND: Mother-to-child transmission (MTC...,630,262
1,What plays the crucial role in the Mother to C...,DC-SIGNR plays a crucial role in MTCT of HIV-1...,[BACKGROUND: Mother-to-child transmission (MTC...,630,276
2,How many children were infected by HIV-1 in 20...,"more than 400,000 children were infected world...",[BACKGROUND: Mother-to-child transmission (MTC...,630,278
3,What is the role of C-C Motif Chemokine Ligand...,"High copy numbers of CCL3L1, a potent HIV-1 su...",[BACKGROUND: Mother-to-child transmission (MTC...,630,316
4,What is DC-GENR and where is it expressed?,Dendritic cell-specific ICAM-grabbing non-inte...,[BACKGROUND: Mother-to-child transmission (MTC...,630,305
...,...,...,...,...,...
1811,What is the structure of the Ebolavirus?,single-strand RNA filoviruses,[The maintenance mechanisms of ebolaviruses in...,1713,5315
1812,When was the West African Ebolavirus outbreak?,2013-2016,[The maintenance mechanisms of ebolaviruses in...,1713,5316
1813,What animals are considered to be maintenance ...,African bats,[The maintenance mechanisms of ebolaviruses in...,1713,5317
1814,What do circles indicate in Figure 1?,a maintenance function play by the host(s),[The maintenance mechanisms of ebolaviruses in...,1713,5318


# Push to hub

In [None]:
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [None]:
# Authenticate through the interactive prompt so that no access token is ever
# written into the notebook.
# SECURITY: a write-scoped token used to be hardcoded on this line. Treat it as
# leaked and revoke it in the Hugging Face account settings.
notebook_login()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Helper: document-level train / validation split

In [None]:
def split(param_df, tolerance=10, max_seed_tries=10_000):
    """Split ``param_df`` 90/10 by document, balancing the row counts.

    Documents (unique ``document_id`` values), not rows, are split with
    ``train_test_split``. Seeds 1, 2, 3, ... are tried until the rows belonging
    to the test-side documents number within ``tolerance`` of 10% of
    ``len(param_df)``. The sizes of both parts are printed.

    Args:
        param_df: DataFrame with a ``document_id`` column.
        tolerance: Accepted deviation, in rows, from the 10% target.
        max_seed_tries: Number of seeds to try before giving up.

    Returns:
        Tuple ``(train_data, test_data)`` of row subsets of ``param_df``.

    Raises:
        RuntimeError: If no tried seed meets the tolerance.
    """
    trainset_percent = 0.9
    testset_percent = 0.1

    # One row per document with its number of rows in `param_df`, in order of
    # first appearance. The seeded shuffle depends on that order.
    doc_ids = param_df["document_id"]
    rows_per_doc = doc_ids.value_counts().to_dict()
    unique_doc_ids = doc_ids.unique()
    context_pd = pd.DataFrame(
        {
            "document_id": unique_doc_ids,
            "document_id_count": [
                int(rows_per_doc.get(doc_id, 0)) for doc_id in unique_doc_ids
            ],
        }
    )

    target_rows = testset_percent * len(param_df)

    # Bounded seed search: stop at the first seed whose test side is close
    # enough to the target.
    for x in range(1, max_seed_tries + 1):
        trainset_context, testset_context = train_test_split(
            context_pd, test_size=1 - trainset_percent, random_state=x
        )

        test_sum = int(testset_context["document_id_count"].sum())

        if target_rows - tolerance <= test_sum <= target_rows + tolerance:
            break
    else:
        raise RuntimeError(
            f"No seed in [1, {max_seed_tries}] gives a test set within "
            f"{tolerance} rows of {target_rows:.1f}"
        )

    train_data = param_df[param_df["document_id"].isin(trainset_context["document_id"])]
    test_data = param_df[param_df["document_id"].isin(testset_context["document_id"])]

    print("Train_data context: ", len(train_data["document_id"].unique()))
    print("Test_data context: ", len(test_data["document_id"].unique()))
    print("Train_data: ", len(train_data))
    print("Test_data: ", len(test_data))
    return (train_data, test_data)

# testset_data_results

In [None]:
# Replace the index with a fresh 0..n-1 range (the old index is dropped), then
# convert the frame into a `Dataset`.
testset_data_results = testset_data_results.reset_index(drop=True)
testset_data_results_dataset = Dataset.from_pandas(testset_data_results)

In [None]:
# Upload the test set to its Hub repository
testset_repo_id = "minh21/COVID-QA-Chunk-64-testset-biencoder-data-90_10"
testset_data_results_dataset.push_to_hub(testset_repo_id)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

# qa_transformer_data_results

In [None]:
# Replace the index with a fresh 0..n-1 range; the old index is dropped.
qa_transformer_data_results = qa_transformer_data_results.reset_index(drop=True)

In [None]:
# Document-level split of the question-answering data into train and validation parts
train_df, val_df = split(param_df=qa_transformer_data_results)

Train_data context:  119
Test_data context:  14
Train_data:  1631
Test_data:  185


In [None]:
# Renumber both frames from 0 (old index dropped), then wrap each in a `Dataset`
train_df, val_df = (frame.reset_index(drop=True) for frame in (train_df, val_df))
train_ds, val_ds = map(Dataset.from_pandas, (train_df, val_df))

In [None]:
# Bundle the two splits under the names "train" and "validation"
hf_dataset = DatasetDict(train=train_ds, validation=val_ds)

In [None]:
# Display the split names, features and row counts of the bundled dataset.
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'context_chunks', 'document_id', 'id'],
        num_rows: 1631
    })
    validation: Dataset({
        features: ['question', 'answer', 'context_chunks', 'document_id', 'id'],
        num_rows: 185
    })
})

In [None]:
# Upload both splits to the question-answering Hub repository
qa_repo_id = "minh21/COVID-QA-Chunk-64-question-answering-biencoder-data-90_10"
hf_dataset.push_to_hub(qa_repo_id)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]