In [1]:
import pandas as pd
import plotly as plt
import plotly.express as px
import kaleido


In [2]:
data = pd.read_excel('20240429_metrics_output.xlsx').drop(columns='Unnamed: 0').reset_index(drop=True)
data[0:5]

Unnamed: 0,brand_name,application,original_text,metrics,values,descriptors
0,Rabeprazole sodium,ANDA202376,14 CLINICAL STUDIES 14.1 Healing of Erosive or...,Healing of Erosive or Ulcerative GERD in Adults,Percentage of Patients Healed,"Rabeprazole sodium delayed-release tablets, pl..."
1,Rabeprazole sodium,ANDA202376,14 CLINICAL STUDIES 14.1 Healing of Erosive or...,Complete resolution of GERD heartburn frequency,Percentage of Patients Healed,"Rabeprazole sodium delayed-release tablets, pl..."
2,Phentermine hydrochloride,ANDA205017,14 CLINICAL STUDIES In relatively short-term c...,Weight Loss,Drug-treated patients lost more weight on aver...,The magnitude of increased weight loss is only...
3,venlafaxine,ANDA090555,CLINICAL TRIALS The efficacy of venlafaxine hy...,Efficacy of Venlafaxine Hydrochloride as a Tre...,"Established in 5 placebo-controlled, short-ter...",The studies involved adult outpatients and inp...
4,Duloxetine Delayed-Release,ANDA203088,14 CLINICAL STUDIES 14.1 Overview of the Clini...,Efficacy of duloxetine in Major Depressive Dis...,"Established in 4 randomized, double-blind, pla...",Studies MDD-1 and MDD-2: Patients were randomi...


## Broad Buckets

In [3]:
categories = {
    "SURVIVAL_RESPONSE" : [
        "Clinical success",
        "clinical success",
        "Clinical Success Rate",
        "Clinical Success",
        "clinical success rate",
        "Success rate",
        "Clinical success rate",
        "treatment success",
        "Overall Treatment Success",
        "clinical efficacy rate",
        "Efficacy",
        "Treatment efficacy",
        "Clinical Effectiveness",
        "Overall Survival",
        "Survival",
        "Patient Survival",
        "1 year patient survival",
        "1 year graft survival",
        "Graft Survival",
        "overall survival",
        "survival",
        "survival rate",
        "Mortality Rate",
        "Total Mortality",
        "total mortality",
        "Mortality",
        "mortality",
        "3-month mortality",
        "response rate",
        "Response rate",
        "clinical response",
        "Clinical response",
        "Overall Response Rate",
        "Complete response rate",
        "overall response rate",
        "Clinical Response Rate",
        "Treatment response",
        "Overall response rate",
        "Objective Response Rate",
        "Clinical Response",
        "Number of responders",
        "Response Rate",
        "Clinical response rate",
        "responder rate",
        "Overall response rate (first randomized drug)",
        "Responder Rate",
        "Response rate to treatment",
        "Treatment Response",
        "Physician's Assessment of Clinical Response",
        "Complete Response",
        "Complete Response Rate",
        "CR rate",
        "Complete Remission Rate",
        "Complete Remission",
        "complete remission rate",
        "Annualized Relapse Rate (ARR)",
        "Relapse Rate",
        "Percentage of patients relapse-free",
        "disease-free survival",
        "Disease-Free Survival",
        "Progression-free survival",
        "proportion of patients meeting escape criteria",
        "Proportion of patients who met escape criteria",
        "number of patients meeting exit criteria",
        "Percentage of patients meeting escape criteria",
        "Number of patients meeting exit criteria",
        "Clinical Failure",
        "Time to exit from the trial",
        "Risk reduction"
    ],
"MICRO_INFECTION_ERADICATION" : [
    "Bacteriological success rate",
    "Bacteriologic Eradication by Patient at 5 to 9 Days Post-Treatment",
    "presumptive microbiologic eradication rates",
    "microbiological eradication rate",
    "Bacteriologic Eradication by Patient",
    "Bacteriologic Eradication of the Baseline Pathogen at 5 to 9 Days Post-Treatment",
    "Microbiological Eradication Rate",
    "H. pylori Eradication Rates",
    "Microbiologic eradication",
    "MAC Bacteremia",
    "pathogen eradication rate",
    "Mycological Cure",
    "bacteriologic eradication",
    "Mycologic eradication rate",
    "Complete Cure",
    "Therapeutic cure rate",
    "Mycological Cure Plus Clinical Cure",
    "microbiological success rates",
    "Clinical Cure Rate",
    "clinical cure rates",
    "clinical cure rate",
    "Cure rate",
    "cure rate",
    "eradication rates",
    "Bacteriologic Eradication",
    "Helicobacter pylori Eradication",
    "Bacterial Eradication",
    "Percentage of endoscopically confirmed healed duodenal ulcers",
    "Cure Rate",
    "bacteriologic eradication rate",
    "Microbiological eradication",
    "Bacteriologic Eradication of Baseline Pathogen",
    "Percentage change in inflammatory lesion counts from Baseline to 12 weeks",
    "Effective Treatment",
    "clinical improvement rate",
    "Wound Infection"
],
"HEART_HEALTH_AND_METRICS" : [
    "LDL-C",
    "HDL-C",
    "LDL-C reduction",
    "Non-HDL-C",
    "VLDL-C",
    "Lipid parameters",
    "Low-Density Lipoprotein Cholesterol (LDL-C)",
    "Low-Density Lipoprotein Cholesterol",
    "blood pressure reduction",
    "HbA1c reduction from baseline",
    "HbA1c",
    "HbA1C",
    "HbA 1C",
    "HbA1c reduction",
    "A1C",
    "serum triglycerides",
    "Triglycerides",
    "TG",
    "Total Cholesterol",
    "Total-C",
    "Apolipoprotein B (Apo B)",
    "apo B",
    "Apo-B",
    "Apo B",
    "Apo B levels",
    "rate of first coronary events",
    "Death, MI, revascularization",
    "Time to first occurrence of major cardiovascular events",
    "Cardiovascular deaths",
    "Coronary events",
    "Myocardial Infarction",
    "CHD events",
    "myocardial infarction mortality",
    "VTE rate",
    "CHD Mortality",
    "Coronary Heart Disease",
    "CHD mortality",
    "cardiovascular deaths",
    "Reduction in combined rate of coronary heart disease death plus non-fatal myocardial infarction",
    "Reduction of myocardial revascularization procedures",
    "Myocardial Revascularization Procedures",
    "Myocardial revascularization procedures",
    "myocardial revascularization procedures",
    "Reduction of first coronary events",
    "Primary Prevention of Cardiovascular Disease",
    "Suppression of Holter monitor evidence of sustained ventricular tachycardia",
    "Suppression of PES induced ventricular tachycardia",
    "Conversion rate",
    "AF/AFl to NSR",
    "Reversible myocardial perfusion defects",
    "Risk Reduction at 2 Years"
],

"GASTROINTESTINAL": [
    "Ulcer Healing",
    "Ulcer Healing Rate",
    "Healing rates",
    "Healing rate",
    "Percentage of patients healed",
    "Percentage of Patients Healed",
    "Healing of Erosive or Ulcerative GERD in Adults",
    "Percentage of endoscopically confirmed healed DU",
    "Duodenal Ulcer Healing Rates",
    "Proportion of Adult Patients with Successful Colon Cleansing",
    "Percentage of Patients Improved",
    "Complete relief of heartburn and regurgitation"
],

"KIDNEY_FUNCTION": [
    "serum phosphorus levels",
    "stone-passage remission rate",
    "serum calcium levels"
],

"RESPIRATORY": [
    "FEV1",
    "PCP event rate"
],

"ANTIEPILEPTIC":[
    "Percentage Reduction in Weekly Partial Seizure Frequency",
    "median percent reduction from baseline in partial seizure frequency"
],

"DEPRESSION_ANXIETY": [
    "HDRS total score",
    "Panic Disorder"
],

"VIROLOGIC": [
    "HIV-1 disease progression or death",
    "Virologic response",
    "Virologic failure"
],

"PREGNANCY": [
    "Pregnancy rate",
    "Pregnancy rate"
],

"OCULAR_HYPERTENSION_GLAUCOMA": [
    "IOP-lowering effect",
    "ocular pain resolution"
],

"UNSURE": [
    "Percentage",
    "Adverse Reactions",
    "incidence",
    "percentage",
    "percentage change",
    "14.2",
    "Completion rate",
    "treatment outcomes",
    "Adverse events",
    "Adverse Events",
    "Percentage of patients",
    "Safety",
    "Specificity",
    "Sensitivity",
    "Toxicity",
    "adverse reactions",
    "discontinuation rate"
],

"STEROIDAL": [
    "Percentage of patients achieving at least 90% improvement based on physician's global evaluation of clinical response",
],

"GAUCHER_DISEASE": [
    "Liver Volume",
    "Spleen Volume",
    "Spleen volume reduction"
],

"DERMATOLOGIC": [
    "clearance rate",
    "proportion of subjects with at least a 90% reduction in total nodular lesion count",
    "proportion of subjects with at least a 90% reduction in total nodular lesion count",
    "Percentage improvement in inflammatory lesions from baseline to 12 weeks",
    "Major Abscess",
    "Percentage of subjects with an Evaluator's Global Severity Assessment (EGSA) of clear or almost clear at 12 weeks",
    "pruritus improvement",
    "Inflammatory Lesion Counts",
    "Percentage of patients achieving treatment success",
    "Treatment success",
    "Percent reduction in lesion counts"
],

"PAIN_RELIEF": [
    "completion rate",
    "Pain Reduction",
    "pain reduction",
    "relief from starting backache",
    "Morphine Requirement",
    "Morphine requirement",
    "global impression of change",
    "proportion of responders",
    "Headache response rate",
    "proportion of successful anesthesia",
    "Pain intensity",
    "No pain at post-operative day 8"
]
}

In [4]:
term_to_category = {term: category for category, terms in categories.items() for term in terms}


In [5]:
def categorize_metrics(metric):
    return term_to_category.get(metric, "Uncategorized")  # Defaults to "Uncategorized" if not found

data = data[data['values'].str.contains('%')==True]
data['bucket'] = data['metrics'].apply(categorize_metrics)

In [6]:
data['bucket'].value_counts()

bucket
SURVIVAL_RESPONSE               364
Uncategorized                   348
HEART_HEALTH_AND_METRICS        294
MICRO_INFECTION_ERADICATION     270
GASTROINTESTINAL                 91
UNSURE                           81
PAIN_RELIEF                      36
DERMATOLOGIC                     32
ANTIEPILEPTIC                    26
RESPIRATORY                      21
KIDNEY_FUNCTION                  15
VIROLOGIC                        12
PREGNANCY                         6
GAUCHER_DISEASE                   6
DEPRESSION_ANXIETY                5
OCULAR_HYPERTENSION_GLAUCOMA      5
STEROIDAL                         2
Name: count, dtype: int64

### RESPONSE RATE

In [7]:
sub_categories = {
    "RESPONSE_RATE": [
        "response rate",
        "Response rate",
        "clinical response",
        "Clinical response",
        "Overall Response Rate",
        "Complete response rate",
        "overall response rate",
        "Clinical Response Rate",
        "Treatment response",
        "Overall response rate",
        "Objective Response Rate",
        "Clinical Response",
        "Number of responders",
        "Response Rate",
        "Clinical response rate",
        "responder rate",
        "Overall response rate (first randomized drug)",
        "Responder Rate",
        "Response rate to treatment",
        "Treatment Response",
        "Physician's Assessment of Clinical Response"
        ],
    "COMPLETE_RESPONSE_RATE": [
        "Complete Response",
        "Complete Response Rate",
        "CR rate"
        ],
    "OVERALL_SURVIVAL": [
        "Overall Survival",
        "Survival",
        "Patient Survival",
        "1 year patient survival",
        "1 year graft survival",
        "Graft Survival",
        "overall survival",
        "survival",
        "survival rate"
        ],
    "CLINICAL_SUCCESS_RATE": [
        "Clinical success",
        "clinical success",
        "Clinical Success Rate",
        "Clinical Success",
        "clinical success rate",
        "Success rate",
        "Clinical success rate",
        "treatment success",
        "Overall Treatment Success"
        ],
    "CLINICAL_EFFICACY": [
        "clinical efficacy rate",
        "Efficacy",
        "Treatment efficacy",
        "Clinical Effectiveness"
        ],
    "TOTAL_MORTALITY": [
        "Mortality Rate",
        "Total Mortality",
        "total mortality",
        "Mortality",
        "mortality",
        "3-month mortality"
        ],
    "DISEASE_FREE_SURVIVAL": [
        "disease-free survival",
        "Disease-Free Survival",
        "Progression-free survival"
        ],
    "ESCAPE_CRITERIA_PROPORTION": [
        "proportion of patients meeting escape criteria",
        "Proportion of patients who met escape criteria",
        "number of patients meeting exit criteria",
        "Percentage of patients meeting escape criteria",
        "Number of patients meeting exit criteria"
        ],
    "RELAPSE_RATE": [
        "Annualized Relapse Rate (ARR)",
        "Relapse Rate",
        "Percentage of patients relapse-free"
        ],
    "COMPLETE_REMISSION_RATE": [
        "Complete Remission Rate",
        "Complete Remission",
        "complete remission rate"
        ]
}

In [8]:
term_to_subcategory = {term: category for category, terms in sub_categories.items() for term in terms}
def categorize_metrics(metric):
    return term_to_subcategory.get(metric, "Uncategorized")  # Defaults to "Uncategorized" if not found

data['sub_bucket'] = data['metrics'].apply(categorize_metrics)

In [9]:
tdf = data[data['bucket']=='SURVIVAL_RESPONSE'].reset_index(drop=True)
tdf[0:5]

Unnamed: 0,brand_name,application,original_text,metrics,values,descriptors,bucket,sub_bucket
0,clobetasol propionate,ANDA090974,14 CLINICAL STUDIES The safety and efficacy of...,Success rate,40 (42.1%),"Study A, defined as the proportion of subjects...",SURVIVAL_RESPONSE,CLINICAL_SUCCESS_RATE
1,clobetasol propionate,ANDA090974,14 CLINICAL STUDIES The safety and efficacy of...,Success rate,28 (28.3%),"Study A, defined as the proportion of subjects...",SURVIVAL_RESPONSE,CLINICAL_SUCCESS_RATE
2,Colchicine,ANDA204711,14 CLINICAL STUDIES The evidence for the effic...,Number of responders,38%,Based on target joint pain score at 24 hours p...,SURVIVAL_RESPONSE,RESPONSE_RATE
3,Cefuroxime axetil,ANDA065496,14 CLINICAL STUDIES 14.1 Acute Bacterial Maxil...,Clinical Effectiveness,77% (US) and 64% (South America),Percentage of subjects who achieved clinical s...,SURVIVAL_RESPONSE,CLINICAL_EFFICACY
4,Clofarabine,NDA021673,14 CLINICAL STUDIES Seventy-eight (78) pediatr...,Complete Remission Rate,12%,The rate of complete remissions in pediatric p...,SURVIVAL_RESPONSE,COMPLETE_REMISSION_RATE


In [10]:
tdf['sub_bucket'].value_counts()

sub_bucket
RESPONSE_RATE                 119
CLINICAL_SUCCESS_RATE          77
TOTAL_MORTALITY                43
OVERALL_SURVIVAL               32
CLINICAL_EFFICACY              21
ESCAPE_CRITERIA_PROPORTION     19
COMPLETE_REMISSION_RATE        17
COMPLETE_RESPONSE_RATE         12
DISEASE_FREE_SURVIVAL          11
Uncategorized                   7
RELAPSE_RATE                    6
Name: count, dtype: int64

In [11]:
tdf[tdf['sub_bucket']=="RESPONSE_RATE"][['brand_name','metrics','values','descriptors']]

Unnamed: 0,brand_name,metrics,values,descriptors
2,Colchicine,Number of responders,38%,Based on target joint pain score at 24 hours p...
9,Dicyclomine Hydrochloride,clinical response,82%,"functional bowel/irritable bowel syndrome, ini..."
10,Dicyclomine Hydrochloride,clinical response,55%,placebo group
18,FLUTICASONE PROPIONATE,Physician's Assessment of Clinical Response,"Fluticasone Propionate Cream, 0.05% was signif...",Study 1 (n=59) Study 2 (n=74)
19,FLUTICASONE PROPIONATE,Physician's Assessment of Clinical Response,"Fluticasone Propionate Cream, 0.05% was equiva...",Study 1 (n=64) Study 2 (n=106)
...,...,...,...,...
355,Dicyclomine Hydrochloride,clinical response,55%,placebo group
356,Ribavirin,Response rate to treatment,52% (264/511),"Study 2, PegIntron 1.5 mcg/kg subcutaneously o..."
357,Ribavirin,Response rate to treatment,46% (231/505),"Study 2, INTRON A 3 MIU subcutaneously three t..."
360,fluoxetine hydrochloride,response rate,fluoxetine produced a significantly higher rat...,


In [12]:
res_tdf = tdf[tdf['sub_bucket']=="RESPONSE_RATE"] # check for long strings longer than 52% (264/511)

In [13]:

# Regex pattern to match two numbers followed by a % and optionally more numbers
pattern = r'^\d{2,3}%(\s\(\d+/\d+\))?$'


# Filter the DataFrame to exclude long strings
res_tdf = res_tdf[res_tdf['values'].str.match(pattern)].reset_index(drop=True)
res_tdf


Unnamed: 0,brand_name,application,original_text,metrics,values,descriptors,bucket,sub_bucket
0,Colchicine,ANDA204711,14 CLINICAL STUDIES The evidence for the effic...,Number of responders,38%,Based on target joint pain score at 24 hours p...,SURVIVAL_RESPONSE,RESPONSE_RATE
1,Dicyclomine Hydrochloride,ANDA040319,14 CLINICAL STUDIES In controlled clinical tri...,clinical response,82%,"functional bowel/irritable bowel syndrome, ini...",SURVIVAL_RESPONSE,RESPONSE_RATE
2,Dicyclomine Hydrochloride,ANDA040319,14 CLINICAL STUDIES In controlled clinical tri...,clinical response,55%,placebo group,SURVIVAL_RESPONSE,RESPONSE_RATE
3,Levoleucovorin,ANDA206263,14 CLINICAL STUDIES 14.1 Rescue after High-Dos...,Response rate,26%,Patients aged 18 years or older with metastati...,SURVIVAL_RESPONSE,RESPONSE_RATE
4,Levoleucovorin,ANDA206263,14 CLINICAL STUDIES 14.1 Rescue after High-Dos...,Response rate,43%,Patients aged 18 years or older with metastati...,SURVIVAL_RESPONSE,RESPONSE_RATE
...,...,...,...,...,...,...,...,...
56,Colchicine,NDA022352,14 CLINICAL STUDIES The evidence for the effic...,response rate,16%,primary treatment outcome for patients with go...,SURVIVAL_RESPONSE,RESPONSE_RATE
57,Dicyclomine Hydrochloride,ANDA216782,14 CLINICAL STUDIES In controlled clinical tri...,clinical response,82%,"functional bowel/irritable bowel syndrome, ini...",SURVIVAL_RESPONSE,RESPONSE_RATE
58,Dicyclomine Hydrochloride,ANDA216782,14 CLINICAL STUDIES In controlled clinical tri...,clinical response,55%,placebo group,SURVIVAL_RESPONSE,RESPONSE_RATE
59,Ribavirin,ANDA077224,14 CLINICAL STUDIES Clinical Study 1 evaluated...,Response rate to treatment,52% (264/511),"Study 2, PegIntron 1.5 mcg/kg subcutaneously o...",SURVIVAL_RESPONSE,RESPONSE_RATE


### Make a Graph?

In [24]:
data_for_graph = data['bucket'].value_counts().rename_axis('Disease_Scope').reset_index(name='counts')

fig = px.bar(data_for_graph, x='Disease_Scope',y='counts', text_auto=True)
fig.update_layout(xaxis_title='"Scope" of Metric', 
                  yaxis_title='# of Evidence Statements',
                  title='# of Evidence Statements from FDA Labels Containing a Rate-based Metric in Clinical Trial Section',
                  xaxis=dict(
                        showline=True,     
                        linewidth=2,       
                        linecolor='black', 
                        mirror=True        
                    ),
                    yaxis=dict(            
                        showline=True,
                        linewidth=2,
                        linecolor='black',
                        mirror=True
                    ))
fig.write_image('scope_of_metric_bar_chart.jpg',width=1920/1.5,height=1080/1.5,scale=2)
fig.show()


In [25]:
data_for_graph = tdf['sub_bucket'].value_counts().rename_axis('Metric').reset_index(name='counts')

fig = px.bar(data_for_graph, x='Metric', y='counts',text_auto=True)
fig.update_layout(xaxis_title='Metric Name', 
                  yaxis_title="# of Evidence Statements", 
                  title='# of Evidence Statements for SURVIVAL_RESPONSE-scoped, Rate-based Metrics',
                  xaxis=dict(
                        showline=True,     
                        linewidth=2,       
                        linecolor='black', 
                        mirror=True        
                    ),
                    yaxis=dict(            
                        showline=True,
                        linewidth=2,
                        linecolor='black',
                        mirror=True
                    ))
fig.write_image('SURVIVAL_RESPONSE_metric_bar_chart.jpg',width=1920/1.5,height=1080/1.5,scale=2)
fig.show()

In [41]:
res_tdf

Unnamed: 0,brand_name,application,original_text,metrics,values,descriptors,bucket,sub_bucket,trimmed_values
0,Colchicine,ANDA204711,14 CLINICAL STUDIES The evidence for the effic...,Number of responders,38%,Based on target joint pain score at 24 hours p...,SURVIVAL_RESPONSE,RESPONSE_RATE,38
1,Dicyclomine Hydrochloride,ANDA040319,14 CLINICAL STUDIES In controlled clinical tri...,clinical response,82%,"functional bowel/irritable bowel syndrome, ini...",SURVIVAL_RESPONSE,RESPONSE_RATE,82
2,Dicyclomine Hydrochloride,ANDA040319,14 CLINICAL STUDIES In controlled clinical tri...,clinical response,55%,placebo group,SURVIVAL_RESPONSE,RESPONSE_RATE,55
3,Levoleucovorin,ANDA206263,14 CLINICAL STUDIES 14.1 Rescue after High-Dos...,Response rate,26%,Patients aged 18 years or older with metastati...,SURVIVAL_RESPONSE,RESPONSE_RATE,26
4,Levoleucovorin,ANDA206263,14 CLINICAL STUDIES 14.1 Rescue after High-Dos...,Response rate,43%,Patients aged 18 years or older with metastati...,SURVIVAL_RESPONSE,RESPONSE_RATE,43
...,...,...,...,...,...,...,...,...,...
56,Colchicine,NDA022352,14 CLINICAL STUDIES The evidence for the effic...,response rate,16%,primary treatment outcome for patients with go...,SURVIVAL_RESPONSE,RESPONSE_RATE,16
57,Dicyclomine Hydrochloride,ANDA216782,14 CLINICAL STUDIES In controlled clinical tri...,clinical response,82%,"functional bowel/irritable bowel syndrome, ini...",SURVIVAL_RESPONSE,RESPONSE_RATE,82
58,Dicyclomine Hydrochloride,ANDA216782,14 CLINICAL STUDIES In controlled clinical tri...,clinical response,55%,placebo group,SURVIVAL_RESPONSE,RESPONSE_RATE,55
59,Ribavirin,ANDA077224,14 CLINICAL STUDIES Clinical Study 1 evaluated...,Response rate to treatment,52% (264/511),"Study 2, PegIntron 1.5 mcg/kg subcutaneously o...",SURVIVAL_RESPONSE,RESPONSE_RATE,52


In [23]:
res_tdf['trimmed_values'] = res_tdf['values'].apply(lambda x: int((x.split('%')[0])))

fig = px.histogram(res_tdf, x='trimmed_values',text_auto=True)
fig.update_layout(xaxis_title='% Response Rate', 
                  yaxis_title="# of Evidence Statements", 
                  title='Distribution of Clinical Response Rates for SURVIVAL_RESPONSE-scoped, Rate-based Metrics', 
                    xaxis=dict(
                        showline=True,     
                        linewidth=2,       
                        linecolor='black', 
                        mirror=True        
                    ),
                    yaxis=dict(            
                        showline=True,
                        linewidth=2,
                        linecolor='black',
                        mirror=True
                    ))
fig.write_image('distribution_of_rates_SURVIVAL_RESPONSE_bar_chart.jpg',width=1920/1.5,height=1080/1.5,scale=2)
fig.show()