#### QA DATASET CREATION

* Requirements:
    * The dataset must have been annotated beforehand. We used Deepset.ai’s annotation tool and exported the annotations as a JSON file.
    * An initial version of the QA dataset, already hosted on the Hugging Face (HF) Hub, into which the new annotations are merged.
* This notebook finalizes the processing into SQuAD-style records and splits them into train/dev/test sets.

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub
notebook_login()

In [None]:
import json
import pandas as pd
import json

In [None]:
import string

def replace_weird_characters(sentence, replacement_char_default="'"):
    """Swap every character outside ``string.printable`` for a plain substitute.

    An em dash ("—") becomes "-"; any other non-printable character becomes
    ``replacement_char_default``. After the substitution, newlines are turned
    into single spaces.

    Args:
        sentence: The text to clean (iterated character by character).
        replacement_char_default: Substitute used for every replaced character
            other than the em dash.

    Returns:
        A ``(cleaned_sentence, weird_characters)`` tuple, where
        ``weird_characters`` lists the replaced characters in order of appearance.
    """
    allowed = set(string.printable)
    pieces = []
    weird_characters = []

    for ch in sentence:
        if ch in allowed:
            pieces.append(ch)
            continue
        # Remember what was replaced so callers can inspect it
        weird_characters.append(ch)
        pieces.append("-" if ch == "—" else replacement_char_default)

    # Newlines are themselves printable, so they are flattened in a final pass
    return "".join(pieces).replace("\n", " "), weird_characters

In [5]:
# Path of the exported annotation file (JSON) that this notebook processes
annotations_file = "./for_qa_data_annotated/annotations_part_B_Nov03_Dec23_2023.json"

In [6]:
fname = annotations_file

with open(fname, 'r', encoding='utf-8') as json_file:
    squad_data = json.load(json_file)

# Initialize empty lists to store the data (one entry per question)
titles = []
contexts = []
questions = []
answers = []
answer_starts = []
is_impossible_values = []
answer_id_list = []
document_id_list = []
question_id_list = []


# Extract title, context, question, answer, answer_start, and is_impossible data from the JSON
for article in squad_data['data']:
    # title = article['title']
    title = "US_Irrigation"
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            # Treat a missing flag as "answerable"
            is_impossible = qa.get('is_impossible', False)

            if not is_impossible and qa['answers']:
                # Only the first annotated answer of a question is kept
                first_answer = qa['answers'][0]
                answer = first_answer['text']
                answer_start = first_answer['answer_start']

                answer_id = first_answer['answer_id']
                document_id = first_answer['document_id']
                question_id = first_answer['question_id']
            else:
                answer = None  # Set to None if no answer available for possible questions
                answer_start = None
                # Reset the ids explicitly: otherwise the values of the previous
                # question would be appended again (or a NameError raised when the
                # very first question has no answer).
                answer_id = None
                document_id = None
                # NOTE(review): assumes the export stores the question id under
                # qa['id'] when there is no answer; falls back to None otherwise.
                question_id = qa.get('id')

            answer_id_list.append(answer_id)
            document_id_list.append(document_id)
            question_id_list.append(question_id)

            titles.append(title)
            contexts.append(context)
            questions.append(question)
            answers.append(answer)
            answer_starts.append(answer_start)
            is_impossible_values.append(is_impossible)

df_aequad_latest = pd.DataFrame({
                                'document_id': document_id_list,
                                'title': titles,
                                'context': contexts,
                                'question_id': question_id_list,
                                'question': questions,
                                'answer_id': answer_id_list,
                                'answers.text': answers,
                                'answers.answer_start': answer_starts,
                                'is_impossible': is_impossible_values})

# Swap only the file extension; str.replace("json", "csv") would also rewrite
# any other "json" occurring in the directory or file name.
if fname.endswith(".json"):
    fout = fname[:-len(".json")] + ".csv"
else:
    fout = fname + ".csv"
df_aequad_latest.to_csv(fout, index=False)

In [7]:
# Add a string-typed 'id' column derived from the annotation's question_id
df_aequad_latest.loc[:, 'id'] = df_aequad_latest["question_id"].astype(str)

In [8]:
# Columns carried over into the final dataset, in the order they are displayed
new_cols_for_final_ds = [
    'title', 'answers.answer_start', 'question', 'answers.text',
    'id', 'context', 'is_impossible',
]

df_all_annotations_recent = df_aequad_latest.loc[:, new_cols_for_final_ds]
display(df_all_annotations_recent.head())

Unnamed: 0,title,answers.answer_start,question,answers.text,id,context,is_impossible
0,US_Irrigation,37,what is the key to early water management whil...,to apply irrigation only when it is needed to ...,1,The key to early water management is to apply ...,False
1,US_Irrigation,182,what can help guide early season irrigation de...,Precision water and nitrogen management,2,The key to early water management is to apply ...,False
2,US_Irrigation,32,what can leave room for the soil to any rainfa...,Monitoring soil moisture and leaving it modera...,3,Leave Room for Storing Rainfall\nMonitoring so...,False
3,US_Irrigation,230,if the soil is too wet what will you lose rain...,runoff or deep percolation,4,Leave Room for Storing Rainfall\nMonitoring so...,False
4,US_Irrigation,272,"what saves irrigation, input costs and prevent...",rainfall you store,5,Leave Room for Storing Rainfall\nMonitoring so...,False


In [9]:
def _clean_text(text):
    """Return only the cleaned string produced by replace_weird_characters."""
    return replace_weird_characters(text)[0]


def _locate_answer(context, answer_text, annotated_start):
    """Return the start offset of ``answer_text`` inside ``context``.

    The annotated offset is preferred: with its default arguments the cleaning
    step substitutes characters one-for-one, so offsets into the raw context
    remain valid in the cleaned context. ``str.find`` is only a fallback,
    because it always reports the first occurrence and would point at the wrong
    span whenever the answer text appears more than once in the context.
    Returns -1 when the text cannot be found at all.
    """
    if pd.notna(annotated_start):
        start = int(annotated_start)
        if start >= 0 and context[start:start + len(answer_text)] == answer_text:
            return start
    return context.find(answer_text)


df_all_annotations_recent_final = pd.DataFrame()
df_all_annotations_recent_final["id"] = df_all_annotations_recent["id"]
df_all_annotations_recent_final["title"] = df_all_annotations_recent["title"]

# Clean the free-text columns. Mapping element-wise (instead of unpacking
# zip(*...)) also works when the frame has no rows.
# NOTE(review): rows whose answer text is None are not supported here, because
# replace_weird_characters iterates over its input.
for text_col in ("context", "question", "answers.text"):
    df_all_annotations_recent_final[text_col] = df_all_annotations_recent[text_col].map(_clean_text)

df_all_annotations_recent_final['answers.answer_start'] = [
    _locate_answer(context, answer_text, annotated_start)
    for context, answer_text, annotated_start in zip(
        df_all_annotations_recent_final["context"],
        df_all_annotations_recent_final["answers.text"],
        df_all_annotations_recent["answers.answer_start"],
    )
]

display(df_all_annotations_recent_final.head())

Unnamed: 0,id,title,context,question,answers.text,answers.answer_start
0,1,US_Irrigation,The key to early water management is to apply ...,what is the key to early water management whil...,to apply irrigation only when it is needed to ...,37
1,2,US_Irrigation,The key to early water management is to apply ...,what can help guide early season irrigation de...,Precision water and nitrogen management,182
2,3,US_Irrigation,Leave Room for Storing Rainfall Monitoring soi...,what can leave room for the soil to any rainfa...,Monitoring soil moisture and leaving it modera...,32
3,4,US_Irrigation,Leave Room for Storing Rainfall Monitoring soi...,if the soil is too wet what will you lose rain...,runoff or deep percolation,230
4,5,US_Irrigation,Leave Room for Storing Rainfall Monitoring soi...,"what saves irrigation, input costs and prevent...",rainfall you store,272


Find the cases where a question is duplicated. In such cases there are several possible answers, which we group together.
We use these multi-answer questions only for the non-training datasets (validation and/or test).

In [11]:
# Flag every row whose question text occurs more than once
# (keep=False marks all occurrences, not only the repeats)
duplicate_mask = df_all_annotations_recent_final.duplicated(subset='question', keep=False)

# Questions that appear exactly once, renumbered from 0
df_mostly_training_ds = df_all_annotations_recent_final[~duplicate_mask].reset_index(drop=True)

# Questions that appear several times, renumbered from 0
df_notfor_training_ds = df_all_annotations_recent_final[duplicate_mask].reset_index(drop=True)


Because `df_notfor_training_ds` still contains duplicated questions, aggregate them so that:
- each question is mapped to the list of its corresponding `answers.text` and `answers.answer_start` values

In [14]:
# Aggregation applied per 'question' group: 'id', 'title' and 'context' keep the
# value of the group's first row, while 'answers.text' and 'answers.answer_start'
# are gathered into lists (one entry per row of the group).
agg_functions = {
    'id': 'first',
    'title': 'first',
    'context': 'first',
    'answers.text': list,
    'answers.answer_start': list
}

# Column layout applied to the aggregated frames
column_order = ['id', 'title', 'question', 'context', 'answers.text', 'answers.answer_start']


In [17]:

# One row per duplicated question, with its answers gathered into lists
df_notfor_training_ds_final = (
    df_notfor_training_ds
    .groupby('question')
    .agg(agg_functions)
    .reset_index()
    .loc[:, column_order]
)
df_notfor_training_ds_final

Unnamed: 0,id,title,question,context,answers.text,answers.answer_start
0,34,US_Irrigation,what is one contributing factor that leads to ...,May and June are particularly vulnerable times...,[the fields are left fairly wet from last seas...,"[127, 189, 284, 338]"
1,272,US_Irrigation,what is one method that can be used in determi...,Different methods can be used to determine how...,['speed chart' (water application depth per pa...,"[200, 384]"
2,376,US_Irrigation,what is the minimum balance in top 4 feet at 4...,Table II. Total available water in top 4 feet ...,"[3.2 in/ft, 2.5 in/ft]","[443, 609]"


An example of a question with several answers:

In [18]:
# Print one multi-answer question next to each of its answers
sample = df_notfor_training_ds_final.iloc[2]
c, q = sample["context"], sample["question"]
for e in sample["answers.text"]:
    print(q, e)
            

what is the minimum balance in top 4 feet at 40% of available water for loam, very fine sandy loam, or silt loam topsoil which is silty clay loam or silty clay subsoil? 3.2 in/ft
what is the minimum balance in top 4 feet at 40% of available water for loam, very fine sandy loam, or silt loam topsoil which is silty clay loam or silty clay subsoil? 2.5 in/ft


In [19]:

# Same aggregation for the single-occurrence questions, so that their answer
# columns also hold lists (of length one)
df_mostly_training_ds_final = (
    df_mostly_training_ds
    .groupby('question')
    .agg(agg_functions)
    .reset_index()
    .loc[:, column_order]
)
df_mostly_training_ds_final.head(2)

Unnamed: 0,id,title,question,context,answers.text,answers.answer_start
0,375,US_Irrigation,what is the available water in 4ft of soil at...,Table II. Total available water in top 4 feet ...,[8.0 in/4 ft],[367]
1,193,US_Irrigation,Does the amount of irrigation applied in years...,The amount of irrigation applied last year or ...,[very little to do with the amount needed this...,[71]


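##### Finally, do the splitting
* Do a 76%-24% split on `df_mostly_training_ds_final` (`test_size=0.24`)
    * use the 76% for additional training
    * call the remaining 24% `additional_ds`

* Split `additional_ds` once more (`test_size=0.55`) and keep 100% of `df_notfor_training_ds_final`
    * Dev set:        the remaining ~45% of `additional_ds`
    * Testing set:    100% of `df_notfor_training_ds_final` + ~55% of `additional_ds`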

In [24]:
from sklearn.model_selection import train_test_split

In [119]:
# First cut: hold 24% of the single-answer questions back from training
xtra_train_df_, additional_ds = train_test_split(
    df_mostly_training_ds_final, test_size=0.24, random_state=42
)
# Second cut: divide the held-back part between dev (45%) and test (55%)
xtra_dev_df_, xtra_test_df_ = train_test_split(
    additional_ds, test_size=0.55, random_state=42
)

# Shuffle each split reproducibly and renumber its rows
new_train_df = xtra_train_df_.sample(frac=1, random_state=42).reset_index(drop=True)
new_dev_df = xtra_dev_df_.sample(frac=1, random_state=42).reset_index(drop=True)

# The test set additionally receives every multi-answer question
new_test_df_ = pd.concat([df_notfor_training_ds_final, xtra_test_df_], ignore_index=True)
new_test_df = new_test_df_.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Size of train set:      {len(new_train_df)}")
print(f"Size of dev set:      {len(new_dev_df)}")
print(f"Size of test set:     {len(new_test_df)}")

Size of train set:      427
Size of dev set:      60
Size of test set:     78


In [87]:
from datasets import load_dataset

# Load the previously published version of the QA dataset from the Hugging Face Hub
prev_qa_dataset = load_dataset("eusojk/aequad-2023-11-19")
prev_qa_dataset

Found cached dataset json (/home/kpodojos/.cache/huggingface/datasets/eusojk___json/eusojk--aequad-2023-11-19-f3a26905552111be/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1076
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 293
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 252
    })
})

In [141]:
# Split name -> DataFrame to export.
# NOTE(review): final_train_, final_dev_ and final_test_ are not defined in any
# cell visible in this notebook — presumably they combine new_train_df /
# new_dev_df / new_test_df with the splits of prev_qa_dataset. Confirm and
# restore that cell so the notebook survives Restart & Run All.
dfs = {
    'train': final_train_,
    'validation': final_dev_,
    'test': final_test_,
}

In [None]:
import os
# import json
# from datasets import load_dataset

# input_filename = TRAIN_V2_FILE
# # input_filename = DEV_V2_FILE
# output_filename = f"{DATA_DIR}/squad-train-v2.0-line-sep.json"
# # output_filename = f"{DATA_DIR}/squad-dev-v2.0-line-sep.json"
# source: https://discuss.huggingface.co/t/question-answering-bot-fine-tuning-with-custom-dataset/4412
def fix_to_hf_squad_format(input_filename):
    """Flatten a nested SQuAD-style JSON file into a JSON Lines file.

    The input is expected to hold ``data -> paragraphs -> qas``. One JSON
    object per question is written, with the keys ``id``, ``title``,
    ``context``, ``question`` and ``answers`` (a dict with parallel ``text``
    and ``answer_start`` lists).

    Args:
        input_filename: Path of the nested JSON file to read.

    Returns:
        None. The result is written next to the input, using the same file
        name with its extension replaced by ``.jsonl``.
    """
    # Replace only the extension: str.replace("json", "jsonl") would also
    # rewrite any "json" occurring in a directory name or in the file stem.
    root, _ext = os.path.splitext(input_filename)
    output_filename = root + ".jsonl"

    with open(input_filename, encoding="utf-8") as f:
        dataset = json.load(f)

    with open(output_filename, "w", encoding="utf-8") as f:
        for article in dataset["data"]:
            title = article["title"]
            for paragraph in article["paragraphs"]:
                context = paragraph["context"]
                for qa in paragraph["qas"]:
                    # Build a fresh record per question so nothing is shared
                    # between the questions of one paragraph
                    record = {
                        "id": qa["id"],
                        "title": title,
                        "context": context,
                        "question": qa["question"],
                        "answers": {
                            "text": [a["text"] for a in qa["answers"]],
                            "answer_start": [a["answer_start"] for a in qa["answers"]],
                        },
                    }
                    f.write(json.dumps(record))
                    f.write("\n")


def create_paragraphs(df):
    """Build the SQuAD ``paragraphs`` list for the rows of ``df``.

    ``df`` must provide the columns ``id``, ``context``, ``question``,
    ``answers.text`` and ``answers.answer_start``; the two answer columns hold
    one sequence per row. Every distinct ``id`` yields one paragraph that
    carries a single question. A row without answer starts is marked
    ``is_impossible`` and gets an empty ``answers`` list.

    The rows are read in a single pass rather than by running a string-built
    ``df.query`` per id, so ids of any type work (including strings that
    contain quotes) and the frame is not rescanned for every row. If an id
    occurs more than once, only its first row is used.

    Args:
        df: DataFrame with the columns listed above.

    Returns:
        A list of ``{"qas": [...], "context": ...}`` dicts, in order of first
        appearance of each id.
    """
    paragraphs = []
    seen_ids = set()
    rows = zip(df["id"], df["context"], df["question"],
               df["answers.text"], df["answers.answer_start"])
    for qid, context, question, ans_text, ans_start_idxs in rows:
        if qid in seen_ids:
            continue
        seen_ids.add(qid)
        # Fill answerable questions
        if len(ans_start_idxs):
            answers = [
                {"text": text, "answer_start": answer_start}
                for text, answer_start in zip(ans_text, ans_start_idxs)]
            is_impossible = False
        else:
            answers = []
            is_impossible = True
        qas = [{"question": question, "id": qid,
                "is_impossible": is_impossible, "answers": answers}]
        # Add context and question-answer pairs to paragraphs
        paragraphs.append({"qas": qas, "context": context})
    return paragraphs

from datetime import datetime
import json

# Get the current date
current_date = datetime.now().date()

# Format the date as a string; used as the default version tag by convert_to_squad
date_string = current_date.strftime("%Y-%m-%d")
# NOTE(review): aequad_version is not referenced in the visible cells; DATA_DIR
# below hard-codes the same "1.1" — keep the two in sync or derive one from the other.
aequad_version = 1.1

# Directory the SQuAD-style JSON files are written to by convert_to_squad
DATA_DIR = r"./for_qa_datasets_final/v_1.1"

def convert_to_squad(dfs, version=date_string):
    """Write every split of ``dfs`` to ``DATA_DIR`` as a nested SQuAD-style JSON file.

    Args:
        dfs: Mapping of split name to DataFrame. Each frame is grouped by its
            ``title`` column and every group is converted with
            ``create_paragraphs``.
        version: Tag embedded in the output file names; defaults to the date
            string computed when this function was defined.

    Returns:
        The list of written file paths, one per split, named
        ``aequad-<split>-<version>.json``.
    """
    # Create the output directory up front so open() does not fail when it is missing
    os.makedirs(DATA_DIR, exist_ok=True)

    list_files_returned = []
    for split, df in dfs.items():
        # Create `paragraphs` for each title
        groups = (df.groupby("title").apply(create_paragraphs)
            .to_frame(name="paragraphs").reset_index())
        aequad_data = {"data": groups.to_dict(orient="records")}
        # Save the result to disk
        fout = os.path.join(DATA_DIR, f"aequad-{split}-{version}.json")
        list_files_returned.append(fout)
        with open(fout, "w", encoding="utf-8") as f:
            # default=int is the fallback for values json cannot serialise
            # natively (presumably NumPy integer offsets — confirm)
            json.dump(aequad_data, f, default=int)
    return list_files_returned
            
# Write the SQuAD-style JSON file of every split and show the resulting paths
list_files_returned = convert_to_squad(dfs)
list_files_returned

In [143]:
# Convert each SQuAD JSON file to JSON Lines. The comprehension is run for its side
# effect only: the function returns None, so the displayed list carries no information.
[fix_to_hf_squad_format(f) for f in list_files_returned]

[None, None, None]

After manually uploading the `*.jsonl` files to the HF dataset hub, check that they can be downloaded:
 * [dataset repo](https://huggingface.co/datasets/eusojk/aequad-2023-09-20)
 * [tutorial to structure your folder](https://huggingface.co/docs/datasets/repository_structure#structure-your-repository)