In [1]:
import os
import pickle
import subprocess

import numpy as np
import pandas as pd
import requests
import torch
from tqdm import tqdm

In [13]:
# Directory prefix for the pickled lookup tables this notebook reads and writes
# (the trailing slash is required because paths are built by string concatenation).
PICKLES_ADDRESS =  '../data/pickles/'
# Directory prefix for the SBIR award CSV files this notebook reads and writes.
SBIR_ADDRESS = '../data/sbir_dataset/'

In [2]:
def gpu_info():
    """Print a summary of the first CUDA device visible to PyTorch.

    Always prints the number of visible GPUs. When no CUDA device is
    available the function prints a short notice and returns instead of
    raising, so the notebook can still run on CPU-only machines.

    Side effects:
        Selects ``cuda:0`` as the current CUDA device when one is available.

    Returns:
        None. All information is written to stdout.
    """
    gpu_count = torch.cuda.device_count()
    print('GPUs available:', gpu_count)
    if gpu_count == 0 or not torch.cuda.is_available():
        print('No CUDA device available; skipping GPU details.')
        return

    device = torch.device('cuda:0')
    torch.cuda.set_device(device)

    allocated = torch.cuda.memory_allocated(device)
    # torch.cuda.memory_cached is a deprecated alias of memory_reserved, so the
    # value is read once and reported under both labels to keep the output stable.
    reserved = torch.cuda.memory_reserved(device)

    print('current GPU number: ', torch.cuda.current_device())
    print('GPU name: ', torch.cuda.get_device_name(device))
    print('GPU capability: ', torch.cuda.get_device_capability(device))
    print('GPU memory: ', torch.cuda.get_device_properties(device).total_memory)
    print('GPU memory allocated: ', allocated)
    print('GPU memory cached: ', reserved)
    print('GPU memory reserved: ', reserved)
    # This is the unused part of the memory PyTorch has reserved, not the free
    # memory of the whole device.
    print('GPU memory free: ', reserved - allocated)


def ram_info():
    """Print host memory usage as reported by ``free -h``.

    Uses ``subprocess`` rather than the IPython ``!`` shell escape so the
    function is valid Python outside a notebook as well. If the ``free``
    executable cannot be started, a notice is printed instead of raising.

    Returns:
        None. The command output is written to stdout.
    """
    try:
        completed = subprocess.run(
            ['free', '-h'], capture_output=True, text=True, check=False
        )
    except OSError as exc:
        print(f'ram_info: could not run `free -h` ({exc})')
        return
    print(completed.stdout, end='')
    if completed.returncode != 0:
        # Surface the tool's own error text rather than hiding a failed call.
        print(completed.stderr, end='')

# Print the GPU and host-memory summaries using the two helpers defined in this cell.
gpu_info()  
ram_info()

GPUs available: 3
current GPU number:  0
GPU name:  NVIDIA RTX 6000 Ada Generation
GPU capability:  (8, 9)
GPU memory:  51010207744
GPU memory allocated:  0
GPU memory cached:  0
GPU memory reserved:  0
GPU memory free:  0
               total        used        free      shared  buff/cache   available
Mem:           503Gi       163Gi        65Gi       193Mi       274Gi       335Gi
Swap:          3.0Ti          0B       3.0Ti




In [6]:
# Load the SBIR award table from the dataset directory.
sbir_df = pd.read_csv(SBIR_ADDRESS + 'award_data.csv')
# Give every award a synthetic id: 'S' followed by its 1-based row number,
# zero-padded to six digits (e.g. the first row becomes 'S000001').
sbir_df['sbid'] = [f'S{row_number:06d}' for row_number in sbir_df.index + 1]
sbir_df.head()

  sbir_df = pd.read_csv(SBIR_DATASET_PATH)


Unnamed: 0,Company,Award Title,Agency,Branch,Phase,Program,Agency Tracking Number,Contract,Proposal Award Date,Contract End Date,...,Contact Phone,Contact Email,PI Name,PI Title,PI Phone,PI Email,RI Name,RI POC Name,RI POC Phone,sbid
0,0 BASE DESIGN LLC,"Opportunistic Passive RF Detection, Classifica...",Department of Defense,Air Force,Phase II,STTR,FX20D-TCSO1-0113,FA864922P0007,10/13/2021,01/13/2023,...,(919) 606-5330,jmurray@0basedesign.com,John Swartz,,(919) 889-3361,john.swartz@wrc-nc.org,Wireless Research Center of North Carolina,John Swartz,(919) 889-3361,S000001
1,1109 Bravo L.L.C.,OPTIMIZING WARFIGHTER PERFORMANCE,Department of Defense,Air Force,Phase I,STTR,FX21B-TCSO1-0264,FA864922P0089,11/02/2021,02/02/2022,...,(502) 641-1887,james@1109bravo.com,James Sass,,(502) 641-1887,james@1109bravo.com,San Antonio Startup Club,Samuel Riehn,(573) 803-8882,S000002
2,1109 Bravo L.L.C.,OPTIMIZING The Human Machine,Department of Defense,Air Force,Phase I,STTR,FX21B-TCSO1-0269,FA864922P0091,11/03/2021,02/03/2022,...,(502) 641-1887,james@1109bravo.com,James Sass,,(502) 641-1887,james@1109bravo.com,San Antonio Startup Club,Samuel Riehn,(573) 803-8882,S000003
3,1st1 Technologies LLP,"Disruptive, Efficient & Flexible Image Recogni...",Department of Defense,Air Force,Phase I,SBIR,FX212-CSO1-0514,FA864922P0325,11/03/2021,02/03/2022,...,(206) 550-9539,johnny@1st1.tech,Johnny Kessler,,(206) 550-9539,johnny@1st1.tech,,,,S000004
4,231 Sheep LLC,Remote Medication Adherence solution for glauc...,Department of Defense,Air Force,Phase I,SBIR,FX212-CSO1-0247,FA864922P0277,11/03/2021,02/03/2022,...,(562) 335-3531,carl@231sheep.com,Albert Aalan,,(310) 729-5914,Sheep_AA@outlook.com,,,,S000005


In [9]:
def umls_id_to_mesh_id(cui, api_key=None, timeout=30):
    """Look up the MeSH identifier for a UMLS concept through the UTS REST API.

    Requests the atoms of ``cui`` restricted to the ``MSH`` source and returns
    the last path segment of the first atom's ``sourceDescriptor``.

    Args:
        cui: UMLS concept unique identifier to look up.
        api_key: UTS API key. When omitted, the ``UMLS_API_KEY`` environment
            variable is used, falling back to an empty string.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The trailing segment of the first atom's ``sourceDescriptor`` as a string.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
        KeyError: If the response JSON lacks the expected keys.
        IndexError: If the response contains an empty result list.
    """
    # Keep credentials out of the notebook: prefer the argument, then the environment.
    umls_api_key = api_key if api_key is not None else os.environ.get('UMLS_API_KEY', '')
    base_url = 'https://uts-ws.nlm.nih.gov'

    # Atoms endpoint for this concept.
    search_url = f'{base_url}/rest/content/current/CUI/{cui}/atoms'
    params = {
        'apiKey': umls_api_key,
        'sabs': 'MSH',  # Restrict to MeSH mappings
    }
    response = requests.get(search_url, params=params, timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque KeyError on an error payload.
    response.raise_for_status()
    result = response.json()
    mesh_id = result['result'][0]['sourceDescriptor'].split('/')[-1]
    return mesh_id

In [11]:
# Load the pickled sbid2cuis mapping; later cells iterate over its values as
# collections of CUIs keyed by sbid.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open(PICKLES_ADDRESS + 'sbid2cuis.pkl', 'rb') as f:
    sbid2cuis = pickle.load(f)
# Display how many entries were loaded.
len(sbid2cuis)

168289

In [12]:
# Collect every distinct CUI mentioned by any award. Despite its name the
# result is a list of unique values, not a set.
all_cuis_set = list({cui for award_cuis in sbid2cuis.values() for cui in award_cuis})
len(all_cuis_set)

22319

In [8]:
# Load the cached CUI -> MeSH id mapping so only unseen CUIs trigger an API call.
# A missing cache file (first run) starts from an empty mapping.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
try:
    with open(PICKLES_ADDRESS + 'cui2dui.pkl', 'rb') as f:
        cui2dui = pickle.load(f)
except FileNotFoundError:
    cui2dui = {}
print(len(cui2dui))

c = 0  # number of lookups attempted in this run (cached CUIs are not counted)
failed = 0  # number of lookups that raised and were skipped
for cui in tqdm(all_cuis_set):
    if cui in cui2dui:
        continue
    try:
        dui = umls_id_to_mesh_id(cui)
    except Exception:
        # Best-effort: skip CUIs whose lookup fails. Catching Exception (not a
        # bare except) keeps Ctrl-C / kernel interrupt working.
        failed += 1
    else:
        cui2dui[cui] = dui
        # Periodic checkpoint so a long run can be resumed after an interruption.
        if c % 1000 == 0:
            print('Saving', c)
            with open(PICKLES_ADDRESS + 'cui2dui.pkl', 'wb') as f:
                pickle.dump(cui2dui, f)
    c += 1

if failed:
    print('Lookups failed and skipped:', failed)

# Persist the final mapping.
with open(PICKLES_ADDRESS + 'cui2dui.pkl', 'wb') as f:
    pickle.dump(cui2dui, f)
len(cui2dui)

22307


100%|██████████| 22319/22319 [00:02<00:00, 9870.49it/s] 


22307

In [9]:
# Tally the mapped identifiers by their first character: 'D', 'C', or anything else.
first_letters = [mesh_id[0] for mesh_id in cui2dui.values()]
d = first_letters.count('D')
c = first_letters.count('C')
e = len(first_letters) - d - c

# Show the three tallies together with the most recently inserted mapping value.
d, c, e, list(cui2dui.values())[-1]

(18330, 3791, 186, 'D058609')

Map UMLS labels of SBIR abstracts to MeSH topics

In [10]:
# Translate each award's CUIs through cui2dui, keeping the original order;
# CUIs that have no entry in cui2dui are dropped.
sbid2duis = {
    sbid: [cui2dui[cui] for cui in cuis if cui in cui2dui]
    for sbid, cuis in tqdm(sbid2cuis.items())
}

100%|██████████| 168289/168289 [00:00<00:00, 366491.65it/s]


In [11]:
# Attach the CUIs and the mapped ids of every award as ';'-separated strings.
# Awards whose sbid is absent from the mapping get an empty string.
sbir_df['cuis'] = sbir_df['sbid'].map(lambda sbid: ';'.join(sbid2cuis.get(sbid, [])))
sbir_df['duis'] = sbir_df['sbid'].map(lambda sbid: ';'.join(sbid2duis.get(sbid, [])))
sbir_df.head()

Unnamed: 0,Company,Award Title,Agency,Branch,Phase,Program,Agency Tracking Number,Contract,Proposal Award Date,Contract End Date,...,PI Name,PI Title,PI Phone,PI Email,RI Name,RI POC Name,RI POC Phone,sbid,cuis,duis
0,0 BASE DESIGN LLC,"Opportunistic Passive RF Detection, Classifica...",Department of Defense,Air Force,Phase II,STTR,FX20D-TCSO1-0113,FA864922P0007,10/13/2021,01/13/2023,...,John Swartz,,(919) 889-3361,john.swartz@wrc-nc.org,Wireless Research Center of North Carolina,John Swartz,(919) 889-3361,S000001,C4760635;C2936504;C0220812;C0026126;C0005939;C...,D000069550;D058749;Q000191;D008889;D001846;D01...
1,1109 Bravo L.L.C.,OPTIMIZING WARFIGHTER PERFORMANCE,Department of Defense,Air Force,Phase I,STTR,FX21B-TCSO1-0264,FA864922P0089,11/02/2021,02/02/2022,...,James Sass,,(502) 641-1887,james@1109bravo.com,San Antonio Startup Club,Samuel Riehn,(573) 803-8882,S000002,,
2,1109 Bravo L.L.C.,OPTIMIZING The Human Machine,Department of Defense,Air Force,Phase I,STTR,FX21B-TCSO1-0269,FA864922P0091,11/03/2021,02/03/2022,...,James Sass,,(502) 641-1887,james@1109bravo.com,San Antonio Startup Club,Samuel Riehn,(573) 803-8882,S000003,C3263723;C2700409;C5192101;C0034992;C0086418;C...,D014947;Q000517;D000082622;Q000534;D006801;D01...
3,1st1 Technologies LLP,"Disruptive, Efficient & Flexible Image Recogni...",Department of Defense,Air Force,Phase I,SBIR,FX212-CSO1-0514,FA864922P0325,11/03/2021,02/03/2022,...,Johnny Kessler,,(206) 550-9539,johnny@1st1.tech,,,,S000004,,
4,231 Sheep LLC,Remote Medication Adherence solution for glauc...,Department of Defense,Air Force,Phase I,SBIR,FX212-CSO1-0247,FA864922P0277,11/03/2021,02/03/2022,...,Albert Aalan,,(310) 729-5914,Sheep_AA@outlook.com,,,,S000005,C0017601;C0042610;C0012634;C0456909;C0001792;C...,D005901;D014728;D004194;D001766;D000368;D013313


In [12]:
# Write the enriched table (including the new sbid, cuis and duis columns) to
# award_data_with_cuis_duis.csv in the SBIR dataset directory, comma-delimited
# and without the DataFrame index.
sbir_df.to_csv(SBIR_ADDRESS + 'award_data_with_cuis_duis.csv', index=False, sep=',')
# Preview the first rows of the frame that was just written.
sbir_df.head()

Unnamed: 0,Company,Award Title,Agency,Branch,Phase,Program,Agency Tracking Number,Contract,Proposal Award Date,Contract End Date,...,PI Name,PI Title,PI Phone,PI Email,RI Name,RI POC Name,RI POC Phone,sbid,cuis,duis
0,0 BASE DESIGN LLC,"Opportunistic Passive RF Detection, Classifica...",Department of Defense,Air Force,Phase II,STTR,FX20D-TCSO1-0113,FA864922P0007,10/13/2021,01/13/2023,...,John Swartz,,(919) 889-3361,john.swartz@wrc-nc.org,Wireless Research Center of North Carolina,John Swartz,(919) 889-3361,S000001,C4760635;C2936504;C0220812;C0026126;C0005939;C...,D000069550;D058749;Q000191;D008889;D001846;D01...
1,1109 Bravo L.L.C.,OPTIMIZING WARFIGHTER PERFORMANCE,Department of Defense,Air Force,Phase I,STTR,FX21B-TCSO1-0264,FA864922P0089,11/02/2021,02/02/2022,...,James Sass,,(502) 641-1887,james@1109bravo.com,San Antonio Startup Club,Samuel Riehn,(573) 803-8882,S000002,,
2,1109 Bravo L.L.C.,OPTIMIZING The Human Machine,Department of Defense,Air Force,Phase I,STTR,FX21B-TCSO1-0269,FA864922P0091,11/03/2021,02/03/2022,...,James Sass,,(502) 641-1887,james@1109bravo.com,San Antonio Startup Club,Samuel Riehn,(573) 803-8882,S000003,C3263723;C2700409;C5192101;C0034992;C0086418;C...,D014947;Q000517;D000082622;Q000534;D006801;D01...
3,1st1 Technologies LLP,"Disruptive, Efficient & Flexible Image Recogni...",Department of Defense,Air Force,Phase I,SBIR,FX212-CSO1-0514,FA864922P0325,11/03/2021,02/03/2022,...,Johnny Kessler,,(206) 550-9539,johnny@1st1.tech,,,,S000004,,
4,231 Sheep LLC,Remote Medication Adherence solution for glauc...,Department of Defense,Air Force,Phase I,SBIR,FX212-CSO1-0247,FA864922P0277,11/03/2021,02/03/2022,...,Albert Aalan,,(310) 729-5914,Sheep_AA@outlook.com,,,,S000005,C0017601;C0042610;C0012634;C0456909;C0001792;C...,D005901;D014728;D004194;D001766;D000368;D013313


In [13]:
# Read the written CSV back and preview the first rows as a visual sanity check.
# This rebinds sbir_df to the reloaded frame; no automated comparison is performed.
# NOTE(review): empty 'cuis'/'duis' strings presumably come back as NaN under
# read_csv's default NA handling — confirm before relying on string operations.
sbir_df = pd.read_csv(SBIR_ADDRESS + 'award_data_with_cuis_duis.csv', engine='python')
sbir_df.head()

Unnamed: 0,Company,Award Title,Agency,Branch,Phase,Program,Agency Tracking Number,Contract,Proposal Award Date,Contract End Date,...,PI Name,PI Title,PI Phone,PI Email,RI Name,RI POC Name,RI POC Phone,sbid,cuis,duis
0,0 BASE DESIGN LLC,"Opportunistic Passive RF Detection, Classifica...",Department of Defense,Air Force,Phase II,STTR,FX20D-TCSO1-0113,FA864922P0007,10/13/2021,01/13/2023,...,John Swartz,,(919) 889-3361,john.swartz@wrc-nc.org,Wireless Research Center of North Carolina,John Swartz,(919) 889-3361,S000001,C4760635;C2936504;C0220812;C0026126;C0005939;C...,D000069550;D058749;Q000191;D008889;D001846;D01...
1,1109 Bravo L.L.C.,OPTIMIZING WARFIGHTER PERFORMANCE,Department of Defense,Air Force,Phase I,STTR,FX21B-TCSO1-0264,FA864922P0089,11/02/2021,02/02/2022,...,James Sass,,(502) 641-1887,james@1109bravo.com,San Antonio Startup Club,Samuel Riehn,(573) 803-8882,S000002,,
2,1109 Bravo L.L.C.,OPTIMIZING The Human Machine,Department of Defense,Air Force,Phase I,STTR,FX21B-TCSO1-0269,FA864922P0091,11/03/2021,02/03/2022,...,James Sass,,(502) 641-1887,james@1109bravo.com,San Antonio Startup Club,Samuel Riehn,(573) 803-8882,S000003,C3263723;C2700409;C5192101;C0034992;C0086418;C...,D014947;Q000517;D000082622;Q000534;D006801;D01...
3,1st1 Technologies LLP,"Disruptive, Efficient & Flexible Image Recogni...",Department of Defense,Air Force,Phase I,SBIR,FX212-CSO1-0514,FA864922P0325,11/03/2021,02/03/2022,...,Johnny Kessler,,(206) 550-9539,johnny@1st1.tech,,,,S000004,,
4,231 Sheep LLC,Remote Medication Adherence solution for glauc...,Department of Defense,Air Force,Phase I,SBIR,FX212-CSO1-0247,FA864922P0277,11/03/2021,02/03/2022,...,Albert Aalan,,(310) 729-5914,Sheep_AA@outlook.com,,,,S000005,C0017601;C0042610;C0012634;C0456909;C0001792;C...,D005901;D014728;D004194;D001766;D000368;D013313
