In [36]:
import ast
import json

import pandas as pd

In [146]:
# Directory holding the stratified train/dev/test CSV splits.
data_path = "./annotated_data/data_splits/stratified_entities/"

# Full path of each split file, built by prefixing the shared directory.
train_file, dev_file, test_file = (
    data_path + csv_name
    for csv_name in (
        "ct_neuro_train_merged_787.csv",
        "ct_neuro_dev_merged_153.csv",
        "ct_neuro_test_merged_153.csv",
    )
)

In [148]:
# Read the three CSV splits into DataFrames (same read order as before:
# dev, test, train).
dev, test, train = (
    pd.read_csv(csv_path) for csv_path in (dev_file, test_file, train_file)
)

In [150]:
# Show the raw 'text' value of the row labelled 0 in the dev split.
dev.loc[0, 'text']

"The Contribution of a Speech Perception Intervention to the Prevention of Phonological Awareness Deficits in Children With Speech Sound Disorders | Recent research reveals genetic and symptomatic overlap among children with speech sound disorders (i.e., those who (misarticulate more sounds than would be expected for their age) and children with dyslexia (i.e., those who struggle to learn to read). Children who have speech sound disorders as preschoolers are at risk for the later emergence of dyslexia, a risk that often reveals itself in the form of poor phonological awareness skills during the preschool period. Traditional speech therapy methods focus on articulation accuracy and do not focus on the child's more abstract knowledge of the sound system of the language. The ultimate objective of this research program is to prevent reading disability in children who present with speech sounds disorders. The relative effectiveness of different interventions to help these children achieve a

In [154]:
def _split_title_and_summary(frame):
    """Add 'study_official_title' and 'study_brief_summary' columns to ``frame``.

    The 'text' column is first cast to ``str`` and then split on the first
    '|' only (``n=1``), so any later '|' characters stay inside the summary.
    The frame is modified in place and also returned.

    Rows without a '|' get ``None`` as their summary. If no row at all
    contains a '|' (or the frame is empty), ``str.split(expand=True)`` yields
    fewer than two columns; the missing ones are created here so the
    two-column assignment cannot fail with a length mismatch.
    """
    frame['text'] = frame['text'].astype(str)

    parts = frame['text'].str.split('|', n=1, expand=True)
    for position in (0, 1):
        if position not in parts.columns:
            # Keep the same "missing" marker (None) that split() itself uses
            # for rows lacking the delimiter.
            parts[position] = None

    frame[['study_official_title', 'study_brief_summary']] = parts[[0, 1]]
    return frame


# Apply the same split to every dataset split instead of repeating the code.
for _frame in (dev, train, test):
    _split_title_and_summary(_frame)

In [156]:
# Preview the first five dev rows, including the two newly added columns.
dev.head(n=5)

Unnamed: 0,tokens,ner_tags,id,nct_id,text,ner_manual_ct_target,study_official_title,study_brief_summary
0,"['The', 'Contribution', 'of', 'a', 'Speech', '...","['O', 'O', 'O', 'O', 'B-OTHER', 'I-OTHER', 'I-...",NCT00818428,NCT00818428,The Contribution of a Speech Perception Interv...,"[(22, 52, 'OTHER', 'Speech Perception Interven...",The Contribution of a Speech Perception Interv...,Recent research reveals genetic and symptomat...
1,"['A', 'Pilot', 'Study', 'of', 'Mitoxantrone', ...","['O', 'O', 'O', 'O', 'B-DRUG', 'O', 'O', 'O', ...",NCT00304291,NCT00304291,A Pilot Study of Mitoxantrone for the Treatmen...,"[(17, 29, 'DRUG', 'Mitoxantrone'), (51, 81, 'C...",A Pilot Study of Mitoxantrone for the Treatmen...,Neuromyelitis optica (NMO) is a severe demyel...
2,"['Protection', 'Against', 'Emboli', 'During', ...","['O', 'O', 'B-COND', 'O', 'B-COND', 'I-COND', ...",NCT04201132,NCT04201132,Protection Against Emboli During Carotid Arter...,"[(19, 25, 'CONDITION', 'Emboli'), (33, 56, 'CO...",Protection Against Emboli During Carotid Arter...,"A prospective, multicenter single-arm, open l..."
3,"['Additional', 'Effects', 'of', 'Motor', 'Imag...","['O', 'O', 'O', 'B-OTHER', 'I-OTHER', 'I-OTHER...",NCT04086004,NCT04086004,Additional Effects of Motor Imagery Practice W...,"[(22, 44, 'OTHER', 'Motor Imagery Practice'), ...",Additional Effects of Motor Imagery Practice W...,The importance of potent rehabilitation with ...
4,"['Combined', 'Botulinum', 'Toxin', 'Type', 'A'...","['O', 'B-DRUG', 'I-DRUG', 'I-DRUG', 'I-DRUG', ...",NCT00723866,NCT00723866,Combined Botulinum Toxin Type A With Modified ...,"[(9, 31, 'DRUG', 'Botulinum Toxin Type A'), (3...",Combined Botulinum Toxin Type A With Modified ...,Botulinum toxin type A (BtxA) injection and m...


In [174]:
def convert_to_json_format(row):
    """Build one JSON-serialisable record from a row of an annotated split.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``DataFrame.iterrows()``)
        Must provide the keys 'nct_id', 'study_official_title',
        'study_brief_summary', 'text', 'tokens', 'ner_tags' and
        'ner_manual_ct_target'.

        'ner_manual_ct_target' may be a string holding a Python literal
        (parsed with ``ast.literal_eval``) or an already-parsed sequence.
        Each annotation is indexed as ``(start, end, type, text)``.
        A missing value (``None``, NaN, or a blank string) is treated as
        "no entities" instead of raising.

    Returns
    -------
    dict
        Keys: 'nctid', 'study_official_title', 'study_brief_summary', 'text',
        'tokens', 'token_bio_labels', 'entities'.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank annotation
        string is not a valid Python literal.
    """
    raw_annotations = row['ner_manual_ct_target']
    if isinstance(raw_annotations, str):
        # Only literals are evaluated; a blank cell means no annotations.
        annotations = ast.literal_eval(raw_annotations) if raw_annotations.strip() else []
    elif raw_annotations is None or (
        isinstance(raw_annotations, float) and raw_annotations != raw_annotations
    ):
        # NaN is the only float that is not equal to itself; pandas uses it
        # for empty CSV cells.
        annotations = []
    else:
        annotations = raw_annotations

    # Re-shape each positional annotation into a keyed entity record.
    entities = [
        {
            "start": ent[0],
            "end": ent[1],
            "text": ent[3],
            "type": ent[2]
        } for ent in annotations
    ]

    return {
        "nctid": row['nct_id'],
        # NOTE(review): the '|' is appended back onto the title, presumably to
        # restore the delimiter removed when 'text' was split - confirm.
        "study_official_title": row['study_official_title'] + "|",
        "study_brief_summary": row['study_brief_summary'],
        "text": row['text'],
        # Emitted as str(value): a stringified list stays a string, it is
        # neither parsed into a list nor joined.
        "tokens": str(row['tokens']),
        # Passed through unchanged, in whatever form the row holds it.
        "token_bio_labels": row['ner_tags'],
        "entities": entities
    }

# Turn every row of each split into its JSON record (one list per split).
json_output_dev, json_output_test, json_output_train = (
    [convert_to_json_format(record) for _, record in split_frame.iterrows()]
    for split_frame in (dev, test, train)
)


In [176]:
# Inspect the first converted dev record to sanity-check the output layout.
json_output_dev[0]

{'nctid': 'NCT00818428',
 'study_official_title': 'The Contribution of a Speech Perception Intervention to the Prevention of Phonological Awareness Deficits in Children With Speech Sound Disorders |',
 'study_brief_summary': " Recent research reveals genetic and symptomatic overlap among children with speech sound disorders (i.e., those who (misarticulate more sounds than would be expected for their age) and children with dyslexia (i.e., those who struggle to learn to read). Children who have speech sound disorders as preschoolers are at risk for the later emergence of dyslexia, a risk that often reveals itself in the form of poor phonological awareness skills during the preschool period. Traditional speech therapy methods focus on articulation accuracy and do not focus on the child's more abstract knowledge of the sound system of the language. The ultimate objective of this research program is to prevent reading disability in children who present with speech sounds disorders. The rela

In [178]:
def save_to_json(output_file_path, json_output):
    """Write ``json_output`` to ``output_file_path`` as JSON indented by 4 spaces.

    Parameters
    ----------
    output_file_path : str or path-like
        Destination file; it is opened in write mode, so existing content is
        replaced.
    json_output : JSON-serialisable object
        The data handed to ``json.dump``.
    """
    # Explicit encoding so the written file does not depend on the platform's
    # default text encoding.
    with open(output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_output, json_file, indent=4)

In [180]:
# Output directory and the per-split payloads / file stems, kept in matching order.
bigbio_path = "./annotated_data/bigbio/"
file_suffix = ["dev", "test", "train"]
json_outputs = [json_output_dev, json_output_test, json_output_train]

# Write each split to "<bigbio_path><stem>.json".
for filename, json_out in zip(file_suffix, json_outputs):
    save_to_json(bigbio_path + filename + ".json", json_out)

In [182]:
import json

In [91]:
# Read the saved dev split back from disk.
with open("./annotated_data/bigbio/dev.json") as f:
    data_dev = json.loads(f.read())

In [99]:
# Read the saved test split back from disk.
with open("./annotated_data/bigbio/test.json") as f:
    data_test = json.loads(f.read())

In [101]:
# Read the saved train split back from disk. It is stored under its own name:
# the earlier version assigned it to `data_test`, which overwrote the test
# data and left `data_train` undefined.
with open("./annotated_data/bigbio/train.json", "r", encoding="utf-8") as f:
    data_train = json.load(f)