In [11]:
import json
import pandas as pd

In [12]:
import os

# Benchmark datasets whose replacement-count files are aggregated below.
datasets = [
    "anatomy",
    "clinical_knowledge",
    "college_biology",
    "college_medicine",
    "medical_genetics",
    "medmcqa",
    "medqa",
    "professional_medicine",
    "pubmed_qa",
    "usmle_sa_step1",
    "usmle_sa_step2",
    "usmle_sa_step3",
]
# One `<direction>_replacements.json` file is looked up per dataset folder.
replacement_directions = ["brand_to_generic", "generic_to_brand"]

# Files live at <REPLACEMENT_COUNTS_DIR>/<dataset>/<RUN_DATE>/<direction>_replacements.json
REPLACEMENT_COUNTS_DIR = "onBrand/replacement_counts"
RUN_DATE = "2024-05-13"

# Column order of the combined table; kept even when no file could be read so
# that later cells can still sort/select by column name.
RESULT_COLUMNS = ["Drug", "Replacements", "Dataset", "Direction"]


def load_replacement_counts(
    dataset_names,
    directions,
    base_dir=REPLACEMENT_COUNTS_DIR,
    run_date=RUN_DATE,
):
    """Gather the replacement counts of every dataset/direction pair.

    For each combination the file
    ``{base_dir}/{dataset}/{run_date}/{direction}_replacements.json`` is read.
    Each file is expected to hold a JSON object; its keys fill the ``Drug``
    column and its values the ``Replacements`` column.

    Parameters
    ----------
    dataset_names : iterable of str
        Dataset folder names to visit.
    directions : iterable of str
        File-name prefixes to look up inside every dataset folder.
    base_dir : str
        Root folder containing one sub-folder per dataset.
    run_date : str
        Name of the sub-folder inside each dataset folder.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The combined table with columns ``RESULT_COLUMNS`` (empty, but with
        those columns, when nothing was found) and the paths that were
        skipped because the file does not exist.
    """
    rows = []
    missing = []
    for dataset in dataset_names:
        for direction in directions:
            file_path = f"{base_dir}/{dataset}/{run_date}/{direction}_replacements.json"
            # Try to open directly instead of checking existence first, so a
            # file removed in between cannot crash the run.
            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    drug_replacement_counts = json.load(file)
            except FileNotFoundError:
                missing.append(file_path)
                continue
            rows.extend(
                (drug, count, dataset, direction)
                for drug, count in drug_replacement_counts.items()
            )
    # Build the frame once from all rows rather than concatenating per file.
    return pd.DataFrame(rows, columns=RESULT_COLUMNS), missing


final_df, missing_files = load_replacement_counts(datasets, replacement_directions)

# Make incomplete aggregates visible instead of skipping files silently.
if missing_files:
    print(f"Skipped {len(missing_files)} missing file(s):", *missing_files, sep="\n  ")

# Rank the rows from the most to the fewest replacements.
final_df = final_df.sort_values(by="Replacements", ascending=False)

# Preview the top of the ranking; as the last expression it is rendered as
# the cell's output.
final_df.head(n=20)

Unnamed: 0,Drug,Replacements,Dataset,Direction
39283,glucose,64,medqa,generic_to_brand
39451,metformin,50,medqa,generic_to_brand
33990,water,44,medmcqa,generic_to_brand
39147,aspirin,41,medqa,generic_to_brand
39877,lisinopril,41,medqa,generic_to_brand
39610,nitrogen,36,medqa,generic_to_brand
39761,water,35,medqa,generic_to_brand
39083,albuterol,35,medqa,generic_to_brand
35892,Ery,35,medqa,brand_to_generic
39278,chlorothiazide,35,medqa,generic_to_brand


In [13]:
# Write the combined table to disk; index=False keeps the row index out of
# the CSV.
final_df.to_csv(path_or_buf="all_replacement_counts.csv", index=False)