In [36]:
import ast
import json
import os

import pandas as pd

In [20]:
# Input CSV locations: one file per split, all under the same directory.
data_path = "./annotated_data/data_splits/stratified_entities/"
train_file = data_path + "ct_neuro_train_merged_787.csv"
dev_file = data_path + "ct_neuro_dev_merged_153.csv"
test_file = data_path + "ct_neuro_test_merged_153.csv"

In [66]:
# Load every split into its own DataFrame (files are read in the order dev, test, train).
dev, test, train = (
    pd.read_csv(csv_path) for csv_path in (dev_file, test_file, train_file)
)

In [68]:
# Preview the first five rows of the dev split to check the column layout.
dev.head(5)

Unnamed: 0,tokens,ner_tags,id,nct_id,text,ner_manual_ct_target
0,"['The', 'Contribution', 'of', 'a', 'Speech', '...","['O', 'O', 'O', 'O', 'B-OTHER', 'I-OTHER', 'I-...",NCT00818428,NCT00818428,The Contribution of a Speech Perception Interv...,"[(22, 52, 'OTHER', 'Speech Perception Interven..."
1,"['A', 'Pilot', 'Study', 'of', 'Mitoxantrone', ...","['O', 'O', 'O', 'O', 'B-DRUG', 'O', 'O', 'O', ...",NCT00304291,NCT00304291,A Pilot Study of Mitoxantrone for the Treatmen...,"[(17, 29, 'DRUG', 'Mitoxantrone'), (51, 81, 'C..."
2,"['Protection', 'Against', 'Emboli', 'During', ...","['O', 'O', 'B-COND', 'O', 'B-COND', 'I-COND', ...",NCT04201132,NCT04201132,Protection Against Emboli During Carotid Arter...,"[(19, 25, 'CONDITION', 'Emboli'), (33, 56, 'CO..."
3,"['Additional', 'Effects', 'of', 'Motor', 'Imag...","['O', 'O', 'O', 'B-OTHER', 'I-OTHER', 'I-OTHER...",NCT04086004,NCT04086004,Additional Effects of Motor Imagery Practice W...,"[(22, 44, 'OTHER', 'Motor Imagery Practice'), ..."
4,"['Combined', 'Botulinum', 'Toxin', 'Type', 'A'...","['O', 'B-DRUG', 'I-DRUG', 'I-DRUG', 'I-DRUG', ...",NCT00723866,NCT00723866,Combined Botulinum Toxin Type A With Modified ...,"[(9, 31, 'DRUG', 'Botulinum Toxin Type A'), (3..."


In [70]:
# Build one JSON-serializable record per row; only 'ner_manual_ct_target' is parsed
# from its string form, the token and tag columns are passed through.
def convert_to_json_format(row):
    """Turn one row of a split into a JSON-serializable record.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` or a ``dict``)
        Must provide the keys ``'nct_id'``, ``'text'``, ``'tokens'``,
        ``'ner_tags'`` and ``'ner_manual_ct_target'``. The last one is either
        a sequence of ``(start, end, type, text)`` tuples or the string
        representation of such a sequence (as read back from a CSV file).

    Returns
    -------
    dict
        Keys ``'nctid'``, ``'text'``, ``'tokens'``, ``'token_bio_labels'``
        and ``'entities'``. ``'entities'`` is a list of dicts with keys
        ``'start'``, ``'end'``, ``'text'`` and ``'type'``.

    Raises
    ------
    ValueError, SyntaxError
        If ``'ner_manual_ct_target'`` is a non-blank string that is not a
        valid Python literal.
    """
    raw_annotations = row['ner_manual_ct_target']

    if isinstance(raw_annotations, str):
        # CSV round-trips store the annotations as a Python-literal string;
        # a blank cell means "no entities" rather than a parse error.
        annotations = ast.literal_eval(raw_annotations) if raw_annotations.strip() else []
    elif raw_annotations is None or (
        isinstance(raw_annotations, float) and raw_annotations != raw_annotations
    ):
        # Missing value: None, or the float NaN that pandas uses for empty
        # cells (NaN is the only float that is not equal to itself).
        annotations = []
    else:
        annotations = raw_annotations

    # Each annotation is laid out as (start, end, type, text).
    entities = [
        {
            "start": ent[0],
            "end": ent[1],
            "text": ent[3],
            "type": ent[2]
        } for ent in annotations
    ]

    return {
        "nctid": row['nct_id'],
        "text": row['text'],
        "tokens": str(row['tokens']),  # kept as a single string (str() of the cell value)
        "token_bio_labels": row['ner_tags'],  # passed through unchanged
        "entities": entities
    }

# Build the list of JSON records for each split, one record per DataFrame row.
json_output_dev, json_output_test, json_output_train = (
    [convert_to_json_format(record) for _, record in split.iterrows()]
    for split in (dev, test, train)
)


In [82]:
def save_to_json(output_file_path, json_output):
    """Write ``json_output`` to ``output_file_path`` as JSON indented by 4 spaces.

    Parameters
    ----------
    output_file_path : str or path-like
        Destination file. It is overwritten if it already exists, and its
        parent directory is created if it is missing.
    json_output : JSON-serializable object
        Data handed to ``json.dump``.

    Raises
    ------
    TypeError
        If ``json_output`` contains values ``json.dump`` cannot serialize.
    OSError
        If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(output_file_path)
    if parent_dir:
        # A fresh checkout may not have the output directory yet.
        os.makedirs(parent_dir, exist_ok=True)

    # Pin the encoding so the file does not depend on the platform default.
    with open(output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_output, json_file, indent=4)

In [84]:
bigbio_path = "./annotated_data/bigbio/"
file_suffix = ["dev", "test", "train"]
json_outputs = [json_output_dev, json_output_test, json_output_train]

# Split names and record lists are paired by position; each split is written
# to "<bigbio_path><split>.json".
for filename, json_out in zip(file_suffix, json_outputs):
    save_to_json(bigbio_path + filename + ".json", json_out)