In [35]:
import pandas as pd
from datasets import load_dataset

def _context_mentions_keyword(context, needle):
    """Return True if any string passage in ``context['contexts']`` contains ``needle``.

    ``needle`` must already be lower-cased; each passage is lower-cased before
    the substring test. A non-dict ``context``, a missing or non-list
    ``'contexts'`` entry, and non-string passages never match.
    """
    if not isinstance(context, dict):
        return False
    passages = context.get('contexts')
    if not isinstance(passages, list):
        return False
    return any(needle in passage.lower() for passage in passages if isinstance(passage, str))


def extract_qa_dataset_with_keyword(keyword='intelligence', split='train', config='pqa_artificial'):
    """Load a 'pubmed_qa' split and keep the QA pairs whose context mentions ``keyword``.

    Args:
        keyword: Substring searched for in every passage of each example's
            ``context['contexts']`` list. Matching is case-insensitive.
        split: Dataset split passed to ``load_dataset``.
        config: Dataset configuration passed to ``load_dataset``.

    Returns:
        A list of ``(question, long_answer, final_decision, context)`` tuples,
        one per matching example, in dataset order.
    """
    # Load the specified dataset
    dataset = load_dataset('pubmed_qa', config, split=split)
    # Check size for debugging
    print('Dataset Size: ', len(dataset))

    # Passages are lower-cased before comparison, so the keyword must be too;
    # otherwise a keyword such as 'Intelligence' could never match.
    needle = keyword.lower()

    # Extract questions, answers, and contexts
    questions = dataset['question']
    long_answers = dataset['long_answer']
    final_answers = dataset['final_decision']
    contexts = dataset['context']

    # Keep the question and both kinds of answer when the context mentions the keyword
    filtered_qa_pairs = [
        (q, la, fa, c)
        for q, la, fa, c in zip(questions, long_answers, final_answers, contexts)
        if _context_mentions_keyword(c, needle)
    ]

    # Print information for debugging
    print(f"Number of pairs before filtering: {len(questions)}")
    print(f"Number of pairs after filtering: {len(filtered_qa_pairs)}")

    return filtered_qa_pairs

# Example usage: pull the 'pubmedqa' pairs whose context mentions 'intelligence'
keyword = 'intelligence'
split = 'train'
filtered_pubmedqa_qa_dataset = extract_qa_dataset_with_keyword(keyword, split)

print('length of filtered dataset: ', len(filtered_pubmedqa_qa_dataset))

# Preview at most the first five matching pairs
for question, long_answer, final_answer, context in filtered_pubmedqa_qa_dataset[:5]:
    print(f"Question: {question}")
    print(f"Long Answer: {long_answer}")
    print(f"Final Answer: {final_answer}")
    print(f"Context: {context}")
    print('-' * 50)

# Tabulate the filtered pairs, one row per (question, long answer, decision, context)
df = pd.DataFrame(
    filtered_pubmedqa_qa_dataset,
    columns=['Question', 'LongAnswer', 'Yes/No', 'Contexts'],
)

# Persist the table as CSV without the positional index column
csv_filename = 'filtered_qa_dataset.csv'
df.to_csv(csv_filename, index=False)

print(f"Filtered dataset saved to {csv_filename}")

Using the latest cached version of the module from C:\Users\maria\.cache\huggingface\modules\datasets_modules\datasets\pubmed_qa\dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924 (last modified on Sun Dec 24 20:54:37 2023) since it couldn't be found locally at pubmed_qa., or remotely on the Hugging Face Hub.


Dataset Size:  211269
Number of pairs before filtering: 211269
Number of pairs after filtering: 253
length of filtered dataset:  253
Question: Is cognitive reserve a determinant of health-related quality of life in patients with cirrhosis , independent of covert hepatic encephalopathy and model for end-stage liver disease score?
Long Answer: A higher cognitive reserve is associated with a better HRQOL in patients with cirrhosis, despite similar disease severity and prevalence. This indicates that patients with good cognitive reserve are better able to withstand the demands of cirrhosis progression and CHE, leading to a better HRQOL. Patients with lower cognitive reserve may need more dedicated and earlier measures to improve HRQOL. Cognitive reserve should be considered when interpreting HRQOL and cognitive tests to evaluate patients with cirrhosis.
Final Answer: yes
Context: {'contexts': ['Covert hepatic encephalopathy (CHE) is associated with cognitive dysfunction, which affects dail

In [5]:
from datasets import load_dataset

def verify_context_keys(dataset_name, split='train', config='pqa_labeled'):
    """Print the keys and the raw value of the first example's 'context' field.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Dataset split passed to ``load_dataset``.
        config: Dataset configuration passed to ``load_dataset``.

    Returns:
        None. The information is written to stdout only.
    """
    # Load the dataset the caller asked for (previously the name was hard-coded
    # and the ``dataset_name`` argument was silently ignored).
    dataset = load_dataset(dataset_name, config, split=split)

    # Fetch just the first row instead of materialising the whole 'context' column
    first_example_context = dataset[0]['context']
    print("Keys in the 'context' dictionary:", first_example_context.keys())
    print("Original 'context' value:", first_example_context)

# Example usage: inspect the 'context' dictionary of the first example
dataset_name = 'pubmed_qa'
verify_context_keys(dataset_name=dataset_name)


Keys in the 'context' dictionary: dict_keys(['contexts', 'labels', 'meshes', 'reasoning_required_pred', 'reasoning_free_pred'])
Original 'context' value: {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.', 'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PC