**Notes:**
* This notebook builds a time-sequenced version of the dataset for each drug-indication pair, ordered by the trials' start and primary completion dates.
* All variables are aggregated per drug-indication pair along that date ordering.
* The aggregation is cumulative over time: the first entry in each sequence holds the trial's actual value, and each later entry accumulates the values of all trials up to that point in the sequence.


In [None]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

In [None]:
# Copy the merged dataset into the working directory so the load cell can read it
# by its bare file name. NOTE(review): the source path looks like a mounted
# Google Drive (Colab) location -- adjust when running elsewhere.
!cp /content/drive/MyDrive/Capstone2024/Datasets/nov_23/merged_df_1123.txt ./

In [None]:
# Action items on time sequence code -
#     1. move the coding of drug_outcome to 1 and 0 before the cumulative code
#     2. override the success in duplicate cases, instead of dropping (change the idxmax code line)
#     3. gender hot encoding, and then appropriately accumulating based on max value of gender
#     4. for enrollment, the cumulative values based on avg, sum, etc.
#     5. for trial_duration, the cumulative values based on avg, sum, etc.

In [None]:
# Load the big merged table (pipe-delimited) with drugs and multiple trial rows.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the saved output shows a warning raised from this read_csv call,
# presumably the mixed-dtype DtypeWarning -- confirm it disappears with this flag.
df_merged_1123_seq = pd.read_csv('merged_df_1123.txt', sep='|', low_memory=False)
df_merged_1123_seq.shape

  df_merged_1123_seq = pd.read_csv('merged_df_1123.txt', sep='|')


(24959, 140)

In [None]:
# Preview the first rows of the merged table.
df_merged_1123_seq.head()

Unnamed: 0.1,Unnamed: 0,nct_id,intervention_type,description,trial_drug_cleaned,conditions,Drug Name,Highest Status,Other Drug Names,Originator Company,Originator Company HQ,Active Companies,Active Companies HQ,Therapy Area,Active Indications,Action,Technologies,Regulatory Designations,Inactive Indications,Inactive Companies,Has Deals,Last Change Date,Added Date,First Launched Date,Extract,Drug Id,cortellis_cleaned_drug,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,results_first_posted_date,results_first_posted_date_type,disposition_first_submitted_qc_date,disposition_first_posted_date,disposition_first_posted_date_type,last_update_submitted_qc_date,last_update_posted_date,last_update_posted_date_type,start_month_year,start_date_type,start_date,verification_month_year,verification_date,completion_month_year,completion_date_type,completion_date,primary_completion_month_year,primary_completion_date_type,primary_completion_date,target_duration,study_type,acronym,baseline_population,brief_title,official_title,overall_status,last_known_status,phase,enrollment,enrollment_type,source,limitations_and_caveats,number_of_arms,number_of_groups,why_stopped,has_expanded_access,expanded_access_type_individual,expanded_access_type_intermediate,expanded_access_type_treatment,has_dmc,is_fda_regulated_drug,is_fda_regulated_device,is_unapproved_device,is_ppsd,is_us_export,biospec_retention,biospec_description,ipd_time_frame,ipd_access_criteria,ipd_url,plan_to_share_ipd,plan_to_share_ipd_description,created_at,updated_at,source_class,delayed_posting,expanded_access_nctid,expanded_access_status_for_nctid,fdaaa801_violation,baseline_type_units_analyzed,patient_registry,drug_outcome,disease_type,new_therapy_area,id_x,number_of_facilities,number_of_nsae_subjects,number_of_sae_sub
jects,registered_in_calendar_year,nlm_download_date,actual_duration,were_results_reported,months_to_report_results,has_us_facility,has_single_facility,minimum_age_num,maximum_age_num,minimum_age_unit,maximum_age_unit,number_of_primary_outcomes_to_measure,number_of_secondary_outcomes_to_measure,number_of_other_outcomes_to_measure,id_y,sampling_method,gender,minimum_age,maximum_age,healthy_volunteers,population,criteria,gender_description,gender_based,adult,child,older_adult,official_role,official_name,official_affiliation,id,trial_country,country_removed,LLM_GBT_4o_Human_Importance_Ratings,LLama3_2_Criteria_Robustness,Spacy_Pregnant_Women_Excluded
0,0,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138299,Czech Republic,t,1.0,2.0,1.0
1,1,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138896,United States,f,1.0,2.0,1.0
2,2,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138897,Argentina,f,1.0,2.0,1.0
3,3,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138898,Brazil,f,1.0,2.0,1.0
4,4,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138899,Czechia,f,1.0,2.0,1.0


In [None]:
# Derive normalised start / primary-completion timestamps, then measure how many
# days separate each trial's primary completion from the earliest trial start
# recorded for the same drug-disease pair.
def _as_timestamp(raw_dates):
    """Parse date strings; 7-character values (e.g. 'YYYY-MM') get '-15' appended first."""
    padded = raw_dates.fillna('').apply(
        lambda text: text + '-15' if len(text) == 7 else text
    )
    return pd.to_datetime(padded)


df_merged_1123_seq['start_date_norm'] = _as_timestamp(
    df_merged_1123_seq['start_month_year']
)
df_merged_1123_seq['primary_completion_date_norm'] = _as_timestamp(
    df_merged_1123_seq['primary_completion_date']
)

# earliest normalised start date per drug-disease pair
first_start_per_pair = (
    df_merged_1123_seq
    .groupby(['trial_drug_cleaned', 'disease_type'])
    .start_date_norm.min()
    .rename('start_date_first')
    .reset_index()
)

# merge() joins on the columns the two frames share (the pair keys)
df_merged_1123_seq = df_merged_1123_seq.merge(first_start_per_pair)
df_merged_1123_seq = df_merged_1123_seq.eval(
    'days_since_first_start = (primary_completion_date_norm - start_date_first).dt.days'
)

df_merged_1123_seq.head()

Unnamed: 0.1,Unnamed: 0,nct_id,intervention_type,description,trial_drug_cleaned,conditions,Drug Name,Highest Status,Other Drug Names,Originator Company,Originator Company HQ,Active Companies,Active Companies HQ,Therapy Area,Active Indications,Action,Technologies,Regulatory Designations,Inactive Indications,Inactive Companies,Has Deals,Last Change Date,Added Date,First Launched Date,Extract,Drug Id,cortellis_cleaned_drug,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,results_first_posted_date,results_first_posted_date_type,disposition_first_submitted_qc_date,disposition_first_posted_date,disposition_first_posted_date_type,last_update_submitted_qc_date,last_update_posted_date,last_update_posted_date_type,start_month_year,start_date_type,start_date,verification_month_year,verification_date,completion_month_year,completion_date_type,completion_date,primary_completion_month_year,primary_completion_date_type,primary_completion_date,target_duration,study_type,acronym,baseline_population,brief_title,official_title,overall_status,last_known_status,phase,enrollment,enrollment_type,source,limitations_and_caveats,number_of_arms,number_of_groups,why_stopped,has_expanded_access,expanded_access_type_individual,expanded_access_type_intermediate,expanded_access_type_treatment,has_dmc,is_fda_regulated_drug,is_fda_regulated_device,is_unapproved_device,is_ppsd,is_us_export,biospec_retention,biospec_description,ipd_time_frame,ipd_access_criteria,ipd_url,plan_to_share_ipd,plan_to_share_ipd_description,created_at,updated_at,source_class,delayed_posting,expanded_access_nctid,expanded_access_status_for_nctid,fdaaa801_violation,baseline_type_units_analyzed,patient_registry,drug_outcome,disease_type,new_therapy_area,id_x,number_of_facilities,number_of_nsae_subjects,number_of_sae_sub
jects,registered_in_calendar_year,nlm_download_date,actual_duration,were_results_reported,months_to_report_results,has_us_facility,has_single_facility,minimum_age_num,maximum_age_num,minimum_age_unit,maximum_age_unit,number_of_primary_outcomes_to_measure,number_of_secondary_outcomes_to_measure,number_of_other_outcomes_to_measure,id_y,sampling_method,gender,minimum_age,maximum_age,healthy_volunteers,population,criteria,gender_description,gender_based,adult,child,older_adult,official_role,official_name,official_affiliation,id,trial_country,country_removed,LLM_GBT_4o_Human_Importance_Ratings,LLama3_2_Criteria_Robustness,Spacy_Pregnant_Women_Excluded,start_date_norm,primary_completion_date_norm,start_date_first,days_since_first_start
0,0,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138299,Czech Republic,t,1.0,2.0,1.0,2008-01-15,2010-06-24,2004-11-15,2047
1,1,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138896,United States,f,1.0,2.0,1.0,2008-01-15,2010-06-24,2004-11-15,2047
2,2,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138897,Argentina,f,1.0,2.0,1.0,2008-01-15,2010-06-24,2004-11-15,2047
3,3,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138898,Brazil,f,1.0,2.0,1.0,2008-01-15,2010-06-24,2004-11-15,2047
4,4,NCT00606502,DRUG,150 mg orally in tablet form~Administered dail...,erlotinib,non-small cell lung cancer,erlotinib,Launched,CP-358774; CP-358774-01; NSC-718781; OSI-420; ...,OSI Pharmaceuticals Inc,OSI Pharmaceuticals Inc (US),Astellas Pharma Inc; Baheal Pharmaceutical gro...,Astellas Pharma Inc (Japan); Baheal Pharmaceut...,Cancer; Dermatologic,Acute myelogenous leukemia; Breast tumor; Cent...,Anticancer protein kinase inhibitor; EGFR fami...,Film coating; Oral formulation; Small molecule...,Fast Track; Orphan Drug,Cancer; Colorectal tumor; Ependymoma; Esophagu...,Nippon Roche KK; Pfizer Inc,Yes,2024-07-04,1996-03-28,2004-11-24,Erlotinib (Tarceva; OSI-744; CP-358774; NSC-71...,11961,erlotinib,,2008-01-22,2010-12-22,,2021-02-08,2008-02-01,2008-02-04,ESTIMATED,2010-12-22,2011-01-20,ESTIMATED,,,,2021-02-08,2021-03-05,ACTUAL,2008-01,,2008-01-31,2021-02,2021-02-28,2010-06-24,ACTUAL,2010-06-24,2010-06-24,ACTUAL,2010-06-24,,INTERVENTIONAL,,,Study of Pralatrexate vs. Erlotinib for Non-Sm...,"A Randomized, Phase 2b, Multi-center Study of ...",COMPLETED,,PHASE2,201.0,ACTUAL,"Spectrum Pharmaceuticals, Inc",The date of the CRF database cut-off for patie...,2.0,,,f,,,,,,,,,,,,,,,,,2024-08-04 13:24:13.009685,2024-08-04 13:24:13.009685,INDUSTRY,,,,,,,success,"Lung, Non-Small Cell",Oncology,29671336,47.0,540.0,124.0,2008,,29.0,t,6.0,t,f,18.0,,Years,,1.0,3.0,,7316876,,ALL,18 Years,,f,,Inclusion Criteria:~* Confirmed Stage IIIB/ IV...,,,t,f,t,STUDY_DIRECTOR,"Garry Weems, PharmD","Spectrum Pharmaceuticals, Inc",10138899,Czechia,f,1.0,2.0,1.0,2008-01-15,2010-06-24,2004-11-15,2047


In [None]:
# Encode drug_outcome as 1 (success) / 0 (failure), then give every row of a
# drug-disease pair the pair's maximum outcome, so one success overrides failures.
# NOTE(review): the saved output shows a warning raised from this replace() call,
# presumably pandas' downcasting FutureWarning -- confirm before upgrading pandas.
outcome_encoded = df_merged_1123_seq.replace(
    {'drug_outcome': {'failure': 0, 'success': 1}}
)
pair_outcome = (
    outcome_encoded
    .groupby(['trial_drug_cleaned', 'disease_type'])['drug_outcome']
    .max()
    .reset_index()
)
# swap the per-row outcome for the per-pair one; merge() joins on the shared pair keys
df_merged_1123_seq = outcome_encoded.drop(columns='drug_outcome').merge(pair_outcome)

  df_merged_1123_seq = df_merged_1123_seq.replace(


In [None]:
# Replace NaN enrollment with the median enrollment across trials.
# The frame holds several rows per trial (e.g. one per country), so the median is
# taken over one row per nct_id; otherwise trials with many rows would be
# over-weighted in the imputed value.
enrollment_per_trial = df_merged_1123_seq.drop_duplicates(subset='nct_id')['enrollment']
df_merged_1123_seq['enrollment'] = df_merged_1123_seq['enrollment'].fillna(
    enrollment_per_trial.median()
)

In [None]:
# Replace NaN actual trial duration with the average duration across trials.
# The frame holds several rows per trial (e.g. one per country), so the mean is
# taken over one row per nct_id; otherwise trials with many rows would be
# over-weighted in the imputed value.
duration_per_trial = df_merged_1123_seq.drop_duplicates(subset='nct_id')['actual_duration']
df_merged_1123_seq['actual_duration'] = df_merged_1123_seq['actual_duration'].fillna(
    duration_per_trial.mean()
)

In [None]:
# A trial with no recorded primary-outcome count is treated as having one.
primary_outcomes_col = 'number_of_primary_outcomes_to_measure'
df_merged_1123_seq = df_merged_1123_seq.assign(
    **{primary_outcomes_col: df_merged_1123_seq[primary_outcomes_col].fillna(1)}
)

In [None]:
# A trial with no recorded secondary-outcome count is treated as having none.
secondary_outcomes_col = 'number_of_secondary_outcomes_to_measure'
df_merged_1123_seq = df_merged_1123_seq.assign(
    **{secondary_outcomes_col: df_merged_1123_seq[secondary_outcomes_col].fillna(0)}
)

In [None]:
# A trial with no recorded arm count is treated as single-arm.
arms = df_merged_1123_seq['number_of_arms']
df_merged_1123_seq = df_merged_1123_seq.assign(number_of_arms=arms.fillna(1))

In [None]:
# Encode the adult and child flags as 1 ('t') / 0 (anything else, including NaN).
# Matching on both 't' and 1 keeps the cell idempotent: comparing only against 't'
# would reset every flag to 0 if the cell were run a second time.
for flag_col in ('adult', 'child'):
    df_merged_1123_seq[flag_col] = df_merged_1123_seq[flag_col].isin(['t', 1]).astype(int)

In [None]:
# Missing US-facility flag is read as "no US facility" ('f').
us_facility = df_merged_1123_seq['has_us_facility']
df_merged_1123_seq = df_merged_1123_seq.assign(has_us_facility=us_facility.fillna('f'))

In [None]:
# Encode the single-facility and US-facility flags as 1 ('t') / 0 (anything else).
# Matching on both 't' and 1 keeps the cell idempotent: comparing only against 't'
# would reset every flag to 0 if the cell were run a second time.
for flag_col in ('has_single_facility', 'has_us_facility'):
    df_merged_1123_seq[flag_col] = df_merged_1123_seq[flag_col].isin(['t', 1]).astype(int)

In [None]:
# Encode the healthy_volunteers flag as 1 ('t') / 0 (anything else, including NaN).
# Matching on both 't' and 1 keeps the cell idempotent: comparing only against 't'
# would reset every flag to 0 if the cell were run a second time.
df_merged_1123_seq['healthy_volunteers'] = (
    df_merged_1123_seq['healthy_volunteers'].isin(['t', 1]).astype(int)
)

In [None]:
# Treat a missing healthy_volunteers value as false.
# The column is already encoded as 0/1 by the cell before this one, so the fill
# value is the numeric 0 rather than the string 'f': filling an encoded numeric
# column with 'f' is a no-op at best and would mix strings into it at worst.
df_merged_1123_seq['healthy_volunteers'] = df_merged_1123_seq['healthy_volunteers'].fillna(0)

In [None]:
# Trials with no gender restriction recorded are treated as open to ALL.
gender_raw = df_merged_1123_seq['gender']
df_merged_1123_seq = df_merged_1123_seq.assign(gender=gender_raw.fillna('ALL'))

In [None]:
# encode gender flag
# df_merged_1123_seq['gender'] = df_merged_1123_seq['gender'].replace({'FEMALE': 1, 'MALE': 2, 'ALL': 3})

In [None]:
# Split gender into two indicator columns ('ALL' sets both), then drop the
# original column.
gender_values = df_merged_1123_seq['gender']
df_merged_1123_seq = df_merged_1123_seq.assign(
    gender_f=gender_values.isin(['FEMALE', 'ALL']).astype(int),
    gender_m=gender_values.isin(['MALE', 'ALL']).astype(int),
).drop(columns='gender')

In [None]:
# Show the frame's current column names.
df_merged_1123_seq.columns

Index(['Unnamed: 0', 'nct_id', 'intervention_type', 'description',
       'trial_drug_cleaned', 'conditions', 'Drug Name', 'Highest Status',
       'Other Drug Names', 'Originator Company',
       ...
       'LLM_GBT_4o_Human_Importance_Ratings', 'LLama3_2_Criteria_Robustness',
       'Spacy_Pregnant_Women_Excluded', 'start_date_norm',
       'primary_completion_date_norm', 'start_date_first',
       'days_since_first_start', 'drug_outcome', 'gender_f', 'gender_m'],
      dtype='object', length=145)

In [None]:
# Missing minimum age defaults to 18.
min_age = df_merged_1123_seq['minimum_age_num']
df_merged_1123_seq = df_merged_1123_seq.assign(minimum_age_num=min_age.fillna(18))

In [None]:
# Missing maximum age defaults to 99.
max_age = df_merged_1123_seq['maximum_age_num']
df_merged_1123_seq = df_merged_1123_seq.assign(maximum_age_num=max_age.fillna(99))

In [None]:
# A trial with no recorded facility count is treated as having one facility.
facilities = df_merged_1123_seq['number_of_facilities']
df_merged_1123_seq = df_merged_1123_seq.assign(number_of_facilities=facilities.fillna(1))

In [None]:
# df_merged_1123_seq['nct_id_2'] = df_merged_1123_seq['nct_id'].str[-8:]

In [None]:
# Build the per-trial rollup for every drug-indication pair and attach expanding
# (cumulative) aggregates of the trial features, ordered by primary completion
# date (pcd) within each pair.

# Columns identifying one drug-indication sequence.
PAIR_KEYS = 'trial_drug_cleaned disease_type new_therapy_area drug_outcome'.split()

# Trial-level features carried into the rollup.
TRIAL_FEATURES = (
    'enrollment actual_duration number_of_primary_outcomes_to_measure '
    'number_of_secondary_outcomes_to_measure number_of_arms adult '
    'child has_single_facility has_us_facility healthy_volunteers gender_f gender_m '
    'minimum_age_num maximum_age_num number_of_facilities '
    'Spacy_Pregnant_Women_Excluded LLama3_2_Criteria_Robustness '
    'LLM_GBT_4o_Human_Importance_Ratings'
).split()

# output column -> (source column, expanding aggregation)
CUMULATIVE_SPECS = {
    'cumsum_enrollment': ('enrollment', 'sum'),
    'cumavg_enrollment': ('enrollment', 'mean'),
    'cumsum_trial_duration': ('actual_duration', 'sum'),
    'cumavg_trial_duration': ('actual_duration', 'mean'),
    'cumulative_number_of_primary_outcomes_to_measure': (
        'number_of_primary_outcomes_to_measure', 'sum'),
    'cumulative_number_of_secondary_outcomes_to_measure': (
        'number_of_secondary_outcomes_to_measure', 'sum'),
    'cumulative_number_of_arms': ('number_of_arms', 'sum'),
    'cumulative_adult': ('adult', 'max'),
    'cumulative_child': ('child', 'max'),
    'cumulative_single_facility': ('has_single_facility', 'max'),
    'cumulative_us_facility': ('has_us_facility', 'max'),
    'cumulative_healthy_volunteers': ('healthy_volunteers', 'max'),
    'cumulative_gender_f': ('gender_f', 'max'),
    'cumulative_gender_m': ('gender_m', 'max'),
    'cumulative_min_age': ('minimum_age_num', 'min'),
    'cumulative_max_age': ('maximum_age_num', 'max'),
    'cumulative_number_of_facilities': ('number_of_facilities', 'max'),
    'cumulative_Spacy_Pregnant_Women_Excluded': ('Spacy_Pregnant_Women_Excluded', 'max'),
    'cumulative_LLama3_2_Criteria_Robustness': ('LLama3_2_Criteria_Robustness', 'max'),
    # NOTE(review): the output name says "GPT" while the source column says "GBT";
    # kept as-is so existing consumers of this column keep working -- confirm
    # which spelling is intended.
    'cumulative_LLM_GPT_4o_Human_Importance_Ratings': (
        'LLM_GBT_4o_Human_Importance_Ratings', 'max'),
}

# One row per distinct (pair, trial, pcd, features) combination, sorted so that
# the expanding windows below run in chronological order within each pair.
rollup_df_1123_seq = (
    df_merged_1123_seq
    .assign(pcd=pd.to_datetime(df_merged_1123_seq.primary_completion_date))
    [PAIR_KEYS + ['nct_id', 'pcd'] + TRIAL_FEATURES]
    .drop_duplicates()
    .sort_values(PAIR_KEYS + ['pcd'])
)

df_1123_trials_gb = rollup_df_1123_seq.groupby(PAIR_KEYS)


def _expanding_by_pair(column, how):
    """Return the expanding `how` aggregate of `column` within each pair.

    The group-key index levels are dropped so the result is indexed like
    `rollup_df_1123_seq` and aligns row-for-row when assigned back.
    """
    expanded = getattr(df_1123_trials_gb[column].expanding(), how)()
    return expanded.reset_index(level=list(range(len(PAIR_KEYS))), drop=True)


rollup_df_1123_seq = rollup_df_1123_seq.assign(
    **{
        out_col: _expanding_by_pair(src_col, how)
        for out_col, (src_col, how) in CUMULATIVE_SPECS.items()
    },
    # Position of the trial within its pair, by primary completion date.
    # NOTE(review): rank() uses its default tie handling, so trials of one pair that
    # share a pcd get the same fractional rank -- confirm that is acceptable for a
    # sequence number.
    pair_trial_seq=df_1123_trials_gb.pcd.rank(),
)

In [None]:
# if gender > 3, then force it to 3
# rollup_df_1123_seq.loc[rollup_df_1123_seq['cumulative_gender'] > 3, 'cumulative_gender'] = 3

In [None]:
# encode drug outcome
# rollup_df_1123_seq['drug_outcome'] = rollup_df_1123_seq['drug_outcome'].replace({'success': 1, 'failure': 0})

In [None]:
# remove duplicate drug_outcome by grouping and taking max of outcome
# rollup_df_1123_seq = rollup_df_1123_seq.loc[rollup_df_1123_seq.groupby(['trial_drug_cleaned', 'disease_type', 'nct_id', 'pcd'])['drug_outcome'].idxmax()]

In [None]:
# Display the rollup frame, including the cumulative columns.
rollup_df_1123_seq

Unnamed: 0,trial_drug_cleaned,disease_type,new_therapy_area,drug_outcome,nct_id,pcd,enrollment,actual_duration,number_of_primary_outcomes_to_measure,number_of_secondary_outcomes_to_measure,number_of_arms,adult,child,has_single_facility,has_us_facility,healthy_volunteers,gender,minimum_age_num,maximum_age_num,number_of_facilities,Spacy_Pregnant_Women_Excluded,LLama3_2_Criteria_Robustness,LLM_GBT_4o_Human_Importance_Ratings,cumulative_enrollment,cumulative_trial_duration,cumulative_number_of_primary_outcomes_to_measure,cumulative_number_of_secondary_outcomes_to_measure,cumulative_number_of_arms,cumulative_adult,cumulative_child,cumulative_single_facility,cumulative_us_facility,cumulative_healthy_volunteers,cumulative_gender,cumulative_min_age,cumulative_max_age,cumulative_number_of_facilities,cumulative_Spacy_Pregnant_Women_Excluded,cumulative_LLama3_2_Criteria_Robustness,cumulative_LLM_GBT_4o_Human_Importance_Ratings,pair_trial_seq
23972,123iiodometomidate,Renal,Oncology,0,NCT00454103,2008-12-31,122.0,21.0,1.0,0.0,1.0,1,0,1,0,0,3,30.0,99.0,1.0,0.0,0.0,0.0,122.0,21.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,30.0,99.0,1.0,0.0,0.0,0.0,1.0
23974,123iiodometomidate,Renal Disease,Other,0,NCT00454103,2008-12-31,122.0,21.0,1.0,0.0,1.0,1,0,1,0,0,3,30.0,99.0,1.0,0.0,0.0,0.0,122.0,21.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,30.0,99.0,1.0,0.0,0.0,0.0,1.0
23971,123iiodometomidate,Unspecified Cancer,Oncology,0,NCT00454103,2008-12-31,122.0,21.0,1.0,0.0,1.0,1,0,1,0,0,3,30.0,99.0,1.0,0.0,2.0,0.0,122.0,21.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,30.0,99.0,1.0,0.0,2.0,0.0,1.0
23973,123iiodometomidate,Unspecified Solid Tumor,Oncology,0,NCT00454103,2008-12-31,122.0,21.0,1.0,0.0,1.0,1,0,1,0,0,3,30.0,99.0,1.0,0.0,0.0,0.0,122.0,21.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,30.0,99.0,1.0,0.0,0.0,0.0,1.0
5685,2deoxyglucose,Prostate,Oncology,0,NCT00633087,2011-03-31,12.0,52.0,1.0,3.0,1.0,1,0,1,1,0,3,18.0,99.0,1.0,1.0,2.0,1.0,12.0,52.0,1.0,3.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,18.0,99.0,1.0,1.0,2.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3075,zoledronicacid,Unspecified Solid Tumor,Oncology,1,NCT00558272,2010-01-31,139.0,23.0,1.0,16.0,2.0,1,0,0,1,0,3,18.0,99.0,28.0,0.0,0.0,0.0,139.0,23.0,1.0,16.0,2.0,1.0,0.0,0.0,1.0,0.0,3.0,18.0,99.0,28.0,0.0,0.0,0.0,1.0
20176,zonisamide,Alcohol Dependence,Psychiatry and Mental Health,1,NCT00406692,2008-08-31,16.0,21.0,2.0,1.0,1.0,1,0,1,1,0,3,21.0,64.0,1.0,1.0,2.0,0.0,16.0,21.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,21.0,64.0,1.0,1.0,2.0,0.0,1.0
20173,zonisamide,Alcohol Dependence,Psychiatry and Mental Health,1,NCT00862563,2013-08-31,85.0,51.0,1.0,7.0,4.0,1,0,1,1,0,3,21.0,65.0,1.0,0.0,0.0,0.0,101.0,72.0,3.0,8.0,5.0,1.0,0.0,1.0,1.0,0.0,3.0,21.0,65.0,1.0,1.0,2.0,0.0,2.0
20175,zonisamide,Alcohol Dependence,Psychiatry and Mental Health,1,NCT01847469,2019-01-01,24.0,67.0,3.0,0.0,2.0,1,0,1,1,0,3,21.0,65.0,1.0,1.0,2.0,1.0,125.0,139.0,6.0,8.0,7.0,1.0,0.0,1.0,1.0,0.0,3.0,21.0,65.0,1.0,1.0,2.0,1.0,3.0


In [None]:
# One-hot encode the therapy area: 'new_therapy_area' is replaced by one
# indicator column per category, named 'new_therapy_area_<category>'.
# All other columns are carried over unchanged.
rollup_df_1123_seq_final = rollup_df_1123_seq.pipe(
    pd.get_dummies,
    columns=['new_therapy_area'],
)
# Preview the first five rows of the encoded frame.
rollup_df_1123_seq_final.head(5)

Unnamed: 0,trial_drug_cleaned,disease_type,drug_outcome,nct_id,pcd,enrollment,actual_duration,number_of_primary_outcomes_to_measure,number_of_secondary_outcomes_to_measure,number_of_arms,adult,child,has_single_facility,has_us_facility,healthy_volunteers,gender_f,gender_m,minimum_age_num,maximum_age_num,number_of_facilities,Spacy_Pregnant_Women_Excluded,LLama3_2_Criteria_Robustness,LLM_GBT_4o_Human_Importance_Ratings,cumsum_enrollment,cumavg_enrollment,cumsum_trial_duration,cumavg_trial_duration,cumulative_number_of_primary_outcomes_to_measure,cumulative_number_of_secondary_outcomes_to_measure,cumulative_number_of_arms,cumulative_adult,cumulative_child,cumulative_single_facility,cumulative_us_facility,cumulative_healthy_volunteers,cumulative_gender_f,cumulative_gender_m,cumulative_min_age,cumulative_max_age,cumulative_number_of_facilities,cumulative_Spacy_Pregnant_Women_Excluded,cumulative_LLama3_2_Criteria_Robustness,cumulative_LLM_GPT_4o_Human_Importance_Ratings,pair_trial_seq,new_therapy_area_Autoimmune,new_therapy_area_Bone,new_therapy_area_Cardiovascular,new_therapy_area_Dermatology,new_therapy_area_Endocrinology,new_therapy_area_Gastrointestinal,new_therapy_area_Genetic and Rare Diseases,new_therapy_area_Hematology,new_therapy_area_Hepatic diseases,new_therapy_area_Infections (bacterial),new_therapy_area_Infections (viral),new_therapy_area_Infectious Diseases,new_therapy_area_Metabolic disorders,new_therapy_area_Neurology,new_therapy_area_Neuroscience (non-progressive),new_therapy_area_Oncology,new_therapy_area_Ophthalmology,new_therapy_area_Other,new_therapy_area_Pain,new_therapy_area_Progressive neurological disorders,new_therapy_area_Psychiatry and Mental Health,new_therapy_area_Respiratory,new_therapy_area_Rheumatology,new_therapy_area_Urology and Reproductive Health
23867,123iiodometomidate,Renal,0,NCT00454103,2008-12-31,122.0,21.0,1.0,0.0,1.0,1,0,1,0,0,1,1,30.0,99.0,1.0,0.0,0.0,0.0,122.0,122.0,21.0,21.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,30.0,99.0,1.0,0.0,0.0,0.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
23869,123iiodometomidate,Renal Disease,0,NCT00454103,2008-12-31,122.0,21.0,1.0,0.0,1.0,1,0,1,0,0,1,1,30.0,99.0,1.0,0.0,0.0,0.0,122.0,122.0,21.0,21.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,30.0,99.0,1.0,0.0,0.0,0.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
23866,123iiodometomidate,Unspecified Cancer,0,NCT00454103,2008-12-31,122.0,21.0,1.0,0.0,1.0,1,0,1,0,0,1,1,30.0,99.0,1.0,0.0,2.0,0.0,122.0,122.0,21.0,21.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,30.0,99.0,1.0,0.0,2.0,0.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
23868,123iiodometomidate,Unspecified Solid Tumor,0,NCT00454103,2008-12-31,122.0,21.0,1.0,0.0,1.0,1,0,1,0,0,1,1,30.0,99.0,1.0,0.0,0.0,0.0,122.0,122.0,21.0,21.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,30.0,99.0,1.0,0.0,0.0,0.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
2482,2deoxyglucose,Prostate,0,NCT00633087,2011-03-31,12.0,52.0,1.0,3.0,1.0,1,0,1,1,0,1,1,18.0,99.0,1.0,1.0,2.0,1.0,12.0,12.0,52.0,52.0,1.0,3.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,18.0,99.0,1.0,1.0,2.0,1.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False


In [None]:
# Persist the one-hot-encoded rollup as a pipe-delimited text file in the
# current working directory.
# index=True also writes the row index as the leading column; on re-read it
# shows up as an extra column unless index_col is passed to the reader.
rollup_df_1123_seq_final.to_csv(
    path_or_buf='rollup_df_1123_seq_final.txt',
    sep='|',
    index=True,
)

In [None]:
# Copy the exported file from the working directory into the nov_23 dataset folder
# (presumably a mounted Google Drive in Colab - requires the drive to be mounted first; confirm).
!cp rollup_df_1123_seq_final.txt /content/drive/MyDrive/Capstone2024/Datasets/nov_23/

In [None]:
# df_merged_1123_seq['number_of_facilities'].unique()