In [4]:
import pandas as pd
import requests
from sqlalchemy import create_engine
from pipelineFunctions import json_flatten, json_structure
requests.packages.urllib3.disable_warnings()

In [42]:
#Protrak Portion
protrak_active_protocol_url = 'https://clinweb.cc.nih.gov/pqs/api/protocoldata/activeprotocols'
protrak_protocol_detail_url = 'https://clinweb.cc.nih.gov/pqs/api/protocoldata/bynum/'
data_fields = ['protocol_number', 'protocol_title', 'accrual_inst','accrual_status','coord_site','protrak_accrual_status','research_type','research_phase', 'study_type',
               'start_date_of_study', 'date_first_part_enrolled', 'irb_name', ]

In [43]:
session = requests.Session()
protocol_detail_session = requests.Session()
response = session.get(protrak_active_protocol_url, verify=False)

In [44]:
active_study_json = response.json()['protocols']
error_protocols = set()

In [45]:
detail_data = []
pi_data = []

#testing with first 10 protocols
for p in active_study_json:
    protocol_number = p['protocolNumber']


    #calling api for specific protocol details
    protocol_detail_response = protocol_detail_session.get(protrak_protocol_detail_url + protocol_number, verify=False)
    if protocol_detail_response.json()['responseCode'] != 200:
        error_protocols.add(protocol_number)
        continue
    detail_json = protocol_detail_response.json()['returnedProtocol']
    
    #flattened_detail_table = pd.json_normalize(json_flatten(detail_json))
    flattened_detail_json = json_flatten(detail_json)

    #getting pi information from API call
    investigators = detail_json['investigators']
    for i in investigators:
        if i['r'] == 'PI':
            pi_table = pi_data.append({'protocolNumber':protocol_number, 'firstName':i['n']['fn'], 'lastName':i['n']['ln'], 'middleName':i['n']['mn']})

    #detail_table = pd.concat([detail_table, flattened_detail_table])
    detail_data.append(flattened_detail_json)

pi_table = pd.json_normalize(pi_data)
detail_table = pd.json_normalize(detail_data)

In [51]:
protrak_table = detail_table[data_fields]

In [53]:
protrak_table

Unnamed: 0,protocol_number,protocol_title,accrual_inst,accrual_status,coord_site,protrak_accrual_status,research_type,research_phase,study_type,start_date_of_study,date_first_part_enrolled,irb_name
0,002360-I,Single Use Expanded Access for 10E8.4/iMab and...,NIAID,Special Exemption,,Special Exemption,,,Expanded Access,12/03/2024,12/03/2024,Panel 1
1,002357-AG,Deprescribing of Antipsychotic Medication amon...,NIA,"No Longer Recruiting, subject follow-up only",,No longer recruiting/follow-up only,,,Observational,12/06/2024,12/06/2024,Panel 1
2,002355-I,Olorofim Multiple Patient Access Program Singl...,NIAID,Special Exemption,,Special Exemption,,,Expanded Access,12/02/2024,12/02/2024,Panel 1
3,002350-I,Single Use Compassionate Use for 10E8.4/iMab i...,NIAID,Special Exemption,,Special Exemption,,,Expanded Access,12/03/2024,12/03/2024,Panel 1
4,002333-C,Expanded Use for Retreatment for a Single Pati...,NCI,Special Exemption,,Special Exemption,,,Expanded Access,10/30/2024,10/30/2024,Panel 1
...,...,...,...,...,...,...,...,...,...,...,...,...
1743,77-DK-0002,Natural History of Thyroid Function Disorders,NIDDK,Recruiting,,Participants currently recruited/enrolled,R:NH,,Observational,02/01/1977,02/01/1977,Panel 1
1744,76-HG-0238,Diagnosis and Treatment of Patients with Inbor...,NHGRI,Recruiting,,Participants currently recruited/enrolled,R:NH,,Observational,09/12/1978,09/12/1978,Panel 1
1745,76-H-0051,Lipoprotein Metabolism in Normal Volunteers an...,NHLBI,Open for Data Analysis,,Completed Study; data analyses ongoing,R:NH,,Observational,09/03/1976,09/03/1976,Panel 1
1746,OH76-DK-0256,Prospective Studies of Diabetes Mellitus and i...,NIDDK,Open for Data Analysis,,Completed Study; data analyses ongoing,R:NH,,Observational,09/03/1976,09/03/1976,Panel 1


In [9]:
#Pulling Protect API Data

main_api = 'https://protect-training.cc.nih.gov/TRAINING-IRB/sd/PublicCustomLayouts/PSLib/WebApi/multiResult?interfaceID=MultiIRBInformation'

session.auth = (u'crispi', '900DF7F9456F3B31A73C532C20986DEE45A0EC0A331CCBC519000E11DB4FBC0E')
session.verify = False
response = session.get(main_api)

In [10]:
study_id_list = set()
submission_list = []
all_submission_list = list()

for j in response.json():
    submission_list.append(j['ID'])
    study_id_list.add(j['parentStudyID'])
    all_submission_list.append(j)

In [14]:
basic_irb_API = "https://protect-training.cc.nih.gov/TRAINING-IRB/sd/PublicCustomLayouts/PSLib/WebApi/singleResult?interfaceID=BasicIRBInformation&resourceID="
multi_irb_api = "https://protect-training.cc.nih.gov/TRAINING-IRB/sd/PublicCustomLayouts/PSLib/WebApi/multiResult?interfaceID=MultiIRBInformation&$filter=parentProjectID eq "
add_submission_api = "https://protect-training.cc.nih.gov/TRAINING-IRB/sd/PublicCustomLayouts/PSLib/WebApi/singleResult?interfaceID=BasicIRBInformation&resourceID="

In [15]:
protect_data_fields = ['parentStudyID', 'piFirstName', 'piMiddleName', 'piLastName', 'piEmail', 'title', 'shortTitle', 'riskLevel', 'dateApproved', 'dateExpiration', 'studyTeamMembers',
                       'reportableNewInformation_category', 'reportableNewInformation_relatedStudies', 'reportableNewInformation_supportingDocuments', 'reportableNewInformation_rniDetermination']

In [16]:
protocols_done = list()

In [35]:
temp = list()

In [39]:
study_details_list = list()

for id in list(study_id_list)[:1]:
    study_detail_api = basic_irb_API + id

    response2 = session.get(study_detail_api)
    if response.status_code != 200:
        continue

    if 'data' in response2.json().keys():
        data = response2.json()['data']
    else:
        data = response2.json()


    #protocols_done.append(id)
    #study_details_list.append(data)
    temp.append(data)

In [32]:
len(study_details_list)

4264

In [31]:
len(study_id_list)

4264

In [33]:
protect_protocol_table = pd.json_normalize(study_details_list)[protect_data_fields]

In [54]:
#outer join on pi name, title, protocol number in order to find any discrepencies between the data sources
#protocolNumberAnti = protect_protocol_table.join(detail_table, how='outer', indicator=True, lsuffix='parentStudyID', rsuffix='protocolNumber')
protocolNumberAnti = pd.merge(protect_protocol_table, protrak_table, how='outer', indicator=True, left_on='parentStudyID', right_on='protocol_number')

In [55]:
protocolNumberAnti

Unnamed: 0,parentStudyID,piFirstName,piMiddleName,piLastName,piEmail,title,shortTitle,riskLevel,dateApproved,dateExpiration,...,accrual_status,coord_site,protrak_accrual_status,research_type,research_phase,study_type,start_date_of_study,date_first_part_enrolled,irb_name,_merge
0,15AA0031,Gene-Jack,,Wang,gene-jack.wang@nih.gov,Human Brain Mapping of the Apparent Diffusion ...,ADC changes during sleep,No greater than minimal risk,2024-09-03,2025-09-02,...,,,,,,,,,,left_only
1,19CH0026,Jack,A,Yanovski,yanovskj@mail.nih.gov,"Randomized, Double-Blind, Placebo-Controlled S...",DCCR for Hyperphagia,Greater than minimal risk,2024-05-07,2025-05-06,...,,,,,,,,,,left_only
2,01M0254,Carlos,,Zarate,zaratec@mail.nih.gov,The Evaluation of Patients with Mood and Anxie...,Screening Protocol for MADP,Greater than minimal risk,2024-09-25,2025-03-25,...,,,,,,,,,,left_only
3,IRB002063,Lothar,,Hennighausen,lotharh@niddk.nih.gov,Immune responses in COVID-19 patients and in v...,NHSR - COVID vaccine response,,2024-03-04,,...,,,,,,,,,,left_only
4,IRB002236,Krishnan,,Patel,krishnan.patel@nih.gov,Understanding What Evidence Oncologists Consid...,Oncology Practice Survey,,2024-07-24,,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6007,,,,,,,,,,,...,Recruiting,,Participants currently recruited/enrolled,R:NH,,Observational,02/01/1977,02/01/1977,Panel 1,right_only
6008,,,,,,,,,,,...,Recruiting,,Participants currently recruited/enrolled,R:NH,,Observational,09/12/1978,09/12/1978,Panel 1,right_only
6009,,,,,,,,,,,...,Open for Data Analysis,,Completed Study; data analyses ongoing,R:NH,,Observational,09/03/1976,09/03/1976,Panel 1,right_only
6010,,,,,,,,,,,...,Open for Data Analysis,,Completed Study; data analyses ongoing,R:NH,,Observational,09/03/1976,09/03/1976,Panel 1,right_only
