In [2]:
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np
import numpy as np
import torch
import requests
import scipy.stats as stats
import copy

In [52]:
PICKLES_ADDRESS = '../data/pickles/'
LAG_DATA_ADDRESS = 'lag_data/'
PUBMED_ADDRESS = '../data/pubmed_dataset/'
SBIR_ADDRESS = '../data/sbir_dataset/'
TEMP_ADDRESS = 'temp/'

In [4]:
years = [str(year) for year in range(2010, 2022)]

with open(PICKLES_ADDRESS + 'dui2children.pkl', 'rb') as handle:
    dui2children = pickle.load(handle)

with open(PICKLES_ADDRESS + 'dui2trail.pkl', 'rb') as handle:
    dui2parents = pickle.load(handle)

with open(PICKLES_ADDRESS + 'heading2dui.pkl', 'rb') as handle:
    name2dui = pickle.load(handle)

with open(PICKLES_ADDRESS + 'level2duis.pkl', 'rb') as handle:
    level2duis = pickle.load(handle)

with open(PICKLES_ADDRESS + 'dui2level.pkl', 'rb') as handle:
    dui2level = pickle.load(handle)


#reverse keys and values (inline) in dui2heading and make header2dui
dui2name = {dui:heading for heading, dui in name2dui.items()}

dui_intersections = set(dui2children.keys()) & set(dui2parents.keys()) & set(dui2name.keys()) & set(dui2level.keys())

#gather a list of excluded DUIs, i.e. those that are not in the intersection of the three dictionaries
excluded_duis = [dui for dui in dui2children.keys() if dui not in dui_intersections]
excluded_duis += [dui for dui in dui2parents.keys() if dui not in dui_intersections]
excluded_duis += [dui for dui in dui2name.keys() if dui not in dui_intersections]
excluded_duis += [dui for dui in dui2level.keys() if dui not in dui_intersections]

print('years:', years)
print('len(dui2children)', len(dui2children))
print('len(dui2parents)', len(dui2parents))
print('len(name2dui)', len(dui2name))
print('len(dui2level)', len(dui2level))
print('Mututal DUIs count:', len(dui_intersections))

# at each dict, select only the DUIs that are in the intersection of the three dictionaries
dui2children = {k:v for k,v in dui2children.items() if k in dui_intersections}
dui2parents = {k:v for k,v in dui2parents.items() if k in dui_intersections}
dui2name = {k:v for k,v in dui2name.items() if k in dui_intersections}
name2dui = {v:k for k,v in dui2name.items()}
dui2level = {k:v for k,v in dui2level.items() if k in dui_intersections}

# remove a dui from values of level2dui if it is not in the intersection
for level, dui_list in level2duis.items():
    if level == 0:
        continue
    level2duis[level] = [dui for dui in dui_list if dui in dui_intersections]


print('=' * 50)
print('INTERSECTED')
print('Excluded DUIs:', len(excluded_duis))
print('len(dui2children)', len(dui2children))
print('len(dui2parents)', len(dui2parents))
print('len(name2dui)', len(dui2name))
print('len(dui2level)', len(dui2level))
print('len(level2dui)', len(level2duis))

years: ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
len(dui2children) 29432
len(dui2parents) 29432
len(name2dui) 29655
len(dui2level) 29432
Mututal DUIs count: 29307
INTERSECTED
Excluded DUIs: 723
len(dui2children) 29307
len(dui2parents) 29307
len(name2dui) 29307
len(dui2level) 29307
len(level2dui) 14


In [5]:
with open(PICKLES_ADDRESS + 'dui2children_final.pkl', 'wb') as handle:
    pickle.dump(dui2children, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(PICKLES_ADDRESS + 'dui2trail_final.pkl', 'wb') as handle:
    pickle.dump(dui2parents, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(PICKLES_ADDRESS + 'heading2dui_final.pkl', 'wb') as handle:
    pickle.dump(dui2name, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(PICKLES_ADDRESS + 'dui2header_final.pkl', 'wb') as handle:
    pickle.dump(name2dui, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(PICKLES_ADDRESS + 'dui2level_final.pkl', 'wb') as handle:
    pickle.dump(dui2level, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(PICKLES_ADDRESS + 'level2duis_final.pkl', 'wb') as handle:
    pickle.dump(level2duis, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(PICKLES_ADDRESS + 'dui_intersection_final.pkl', 'wb') as handle:
    pickle.dump(dui_intersections, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(PICKLES_ADDRESS + 'excluded_duis_final.pkl', 'wb') as handle:
    pickle.dump(excluded_duis, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('saved dictionaries')

saved dictionaries


In [5]:
sbir_df = pd.read_csv(SBIR_ADDRESS + 'award_data_with_cuis_duis.csv', engine='python')
sbir_df = sbir_df[sbir_df['Proposal Award Date'].notna()]
sbir_df = sbir_df[sbir_df['Proposal Award Date'].str.contains('/')]
sbir_df['Proposal Award Date'] = pd.to_datetime(sbir_df['Proposal Award Date'])
sbir_df['Year'] = sbir_df['Proposal Award Date'].dt.year.astype(str)
sbir_df = sbir_df[sbir_df['Year'].isin(years)]
sbir_df['Award Amount'] = sbir_df['Award Amount'].str.replace('$', '')
sbir_df['Award Amount'] = sbir_df['Award Amount'].str.replace(',', '')
sbir_df['Award Amount'] = sbir_df['Award Amount'].str.replace('.*', '')
sbir_df['Award Amount'] = sbir_df['Award Amount'].astype(int)
#only select rows where their agency is either 'Department of Health and Human Services' or NSF
# sbir_df = sbir_df[sbir_df['Agency'].isin(['NIH', 'NSF'])]
sbir_df = sbir_df[['sbid', 'Year', 'Award Title', 'Abstract', 'Agency', 'Award Amount', 'duis']]
sbir_df = sbir_df[sbir_df['Agency'].notna()]
sbir_df = sbir_df[sbir_df['Abstract'].notna()]
sbir_df = sbir_df[sbir_df['duis'].notna()]
print('len(sbir_df)', len(sbir_df))
sbir_df.head()

len(sbir_df) 63488


Unnamed: 0,sbid,Year,Award Title,Abstract,Agency,Award Amount,duis
0,S000001,2021,"Opportunistic Passive RF Detection, Classifica...",In the last decade there has been an exponenti...,Department of Defense,749922,D000069550;D058749;Q000191;D008889;D001846;D01...
2,S000003,2021,OPTIMIZING The Human Machine,"1109 BRAVO’s Neuropak provides an efficient, e...",Department of Defense,49999,D014947;Q000517;D000082622;Q000534;D006801;D01...
4,S000005,2021,Remote Medication Adherence solution for glauc...,"Glaucoma, a disease that damages your eye's op...",Department of Defense,49984,D005901;D014728;D004194;D001766;D000368;D013313
5,S000006,2021,Extendable Workflow Automation For The USAF Wi...,2X4Lab will investigate the applicability of i...,Department of Defense,49691,D005240;D008921
8,S000009,2021,Architecting Resilient and Sustainable Remote ...,"Technology is changing at a rapid pace, especi...",Department of Defense,49021,Q000601;D013672


In [6]:
agencies = sbir_df['Agency'].unique()
agencies

array(['Department of Defense', 'Department of Transportation',
       'Department of Health and Human Services',
       'National Science Foundation', 'Department of Agriculture',
       'Department of Energy',
       'National Aeronautics and Space Administration',
       'Department of Commerce', 'Environmental Protection Agency',
       'Department of Homeland Security', 'Department of Education'],
      dtype=object)

In [7]:
sbir_df.to_csv(SBIR_ADDRESS + 'sbir_df.csv', index=False)
print('saved sbir_df.csv')

saved sbir_df.csv


In [9]:
pubmed = pd.read_csv(PUBMED_ADDRESS + 'pubmed_dataset_with_duis.csv', engine='python')
pubmed.head()

In [None]:
# copy pubmed to pubmed_df
pubmed_df = pubmed.copy()
print(len(pubmed_df))

13517353


In [None]:
pubmed_df['pub_date'] = pd.to_datetime(pubmed_df['pub_date'])
pubmed_df['year'] = pubmed_df['pub_date'].dt.year.astype(str)
pubmed_df = pubmed_df[pubmed_df['year'].isin(years)]
pubmed_df = pubmed_df[pubmed_df['abstract'].notna()]
pubmed_df = pubmed_df[pubmed_df['duis'].notna()]
pubmed_df = pubmed_df[['pmid', 'year', 'abstract', 'citation_count', 'duis']]
print('len(pubmed_df)', len(pubmed_df))
pubmed_df.head()

len(pubmed_df) 10928078


Unnamed: 0,pmid,year,abstract,citation_count,duis
0,28867731,2017,Epithelial-to-mesenchymal transition (EMT) is ...,18,D051541;D047628;D010957;D000071250;D012333;D01...
1,23611783,2013,"In plants, flavonoids have been shown to be su...",15,D044949;D005810;D029681;D005419;D017386;D01080...
2,31440131,2019,"a priori In recent years, digital communicatio...",2,D044949;D005810;D029681;D005419;D017386;D01080...
3,30551143,2019,Toxin-antitoxin (TA) systems are ubiquitous am...,14,D013203;D004352;D013211;D006801;D001426;D01596...
5,21134855,2010,Efavirenz (EFV) tablets of different doses wer...,3,D048588;D003692;D000480;D002626;D012995;D00352...


In [11]:
pubmed_df.to_csv(PUBMED_ADDRESS + 'pubmed_df.csv', index=False)
print('saved pubmed_df.csv')

In [12]:
#load pubmed_df
pubmed_df = pd.read_csv(PUBMED_ADDRESS + 'pubmed_df.csv', engine='python')
pubmed_df.head()

Unnamed: 0,pmid,year,abstract,citation_count,duis
0,28867731,2017,Epithelial-to-mesenchymal transition (EMT) is ...,18,D051541;D047628;D010957;D000071250;D012333;D01...
1,23611783,2013,"In plants, flavonoids have been shown to be su...",15,D044949;D005810;D029681;D005419;D017386;D01080...
2,31440131,2019,"a priori In recent years, digital communicatio...",2,D044949;D005810;D029681;D005419;D017386;D01080...
3,30551143,2019,Toxin-antitoxin (TA) systems are ubiquitous am...,14,D013203;D004352;D013211;D006801;D001426;D01596...
4,21134855,2010,Efavirenz (EFV) tablets of different doses wer...,3,D048588;D003692;D000480;D002626;D012995;D00352...


In [13]:
sbir_df = pd.read_csv(SBIR_ADDRESS + 'sbir_df.csv', engine='python')
sbir_df.head()

Unnamed: 0,sbid,Year,Award Title,Abstract,Agency,Award Amount,duis
0,S000001,2021,"Opportunistic Passive RF Detection, Classifica...",In the last decade there has been an exponenti...,Department of Defense,749922,D000069550;D058749;Q000191;D008889;D001846;D01...
1,S000003,2021,OPTIMIZING The Human Machine,"1109 BRAVO’s Neuropak provides an efficient, e...",Department of Defense,49999,D014947;Q000517;D000082622;Q000534;D006801;D01...
2,S000005,2021,Remote Medication Adherence solution for glauc...,"Glaucoma, a disease that damages your eye's op...",Department of Defense,49984,D005901;D014728;D004194;D001766;D000368;D013313
3,S000006,2021,Extendable Workflow Automation For The USAF Wi...,2X4Lab will investigate the applicability of i...,Department of Defense,49691,D005240;D008921
4,S000009,2021,Architecting Resilient and Sustainable Remote ...,"Technology is changing at a rapid pace, especi...",Department of Defense,49021,Q000601;D013672


In [14]:
pubmed_df['abstract'].apply(lambda x: len(x.split())).describe()

count    1.092808e+07
mean     2.057577e+02
std      8.014588e+01
min      2.000000e+00
25%      1.530000e+02
50%      2.040000e+02
75%      2.490000e+02
max      6.274000e+03
Name: abstract, dtype: float64

In [15]:
sbir_df['Abstract'].apply(lambda x: len(x.split())).describe()

count    63488.000000
mean       251.400910
std        138.849364
min          1.000000
25%        164.000000
50%        200.000000
75%        338.000000
max       1712.000000
Name: Abstract, dtype: float64

In [16]:
len(dui_intersections), len(excluded_duis)

(29307, 723)

In [17]:
dui2pubfreq = {dui: {year: [] for year in years} for dui in dui_intersections}

for index, row in tqdm(pubmed_df.iterrows()):
    pmid = row['pmid']
    year = row['year']
    duis = row['duis'].split(';')
    for dui in duis:
        try:
            if dui not in excluded_duis:
                dui2pubfreq[dui][str(year)] += [pmid]
        except KeyError:
            pass
        except Exception as e:
            print(e)
            print(dui, year, pmid, duis)
            print('=' * 50)
            
    if index % 1000000 == 0:
        with open(PICKLES_ADDRESS + 'dui2pubfreq.pkl', 'wb') as handle:
            pickle.dump(dui2pubfreq, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Saved dui2pubfreq.pkl', index)

# replace each pmid list at each year of each dui with the list of its unique elements
for dui, year2pmids in dui2pubfreq.items():
    for year, pmids in year2pmids.items():
        dui2pubfreq[dui][year] = list(set(pmids))
    
with open(PICKLES_ADDRESS + 'dui2pubfreq.pkl', 'wb') as handle:
    pickle.dump(dui2pubfreq, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved dui2pubfreq.pkl', len(dui2pubfreq))

1765it [00:01, 2206.31it/s]

Saved dui2pubfreq.pkl 0


1001696it [01:50, 4326.75it/s]

Saved dui2pubfreq.pkl 1000000


2001420it [03:37, 2725.18it/s]

Saved dui2pubfreq.pkl 2000000


3001020it [05:25, 2072.32it/s]

Saved dui2pubfreq.pkl 3000000


4001027it [07:14, 1614.11it/s]

Saved dui2pubfreq.pkl 4000000


5001240it [09:03, 1385.00it/s]

Saved dui2pubfreq.pkl 5000000


6001676it [10:52, 1197.62it/s]

Saved dui2pubfreq.pkl 6000000


7001243it [12:41, 1029.33it/s]

Saved dui2pubfreq.pkl 7000000


8001089it [14:31, 951.73it/s] 

Saved dui2pubfreq.pkl 8000000


9001781it [16:22, 845.22it/s] 

Saved dui2pubfreq.pkl 9000000


10001070it [18:13, 768.57it/s]

Saved dui2pubfreq.pkl 10000000


10928078it [19:52, 9166.10it/s]


Saved dui2pubfreq.pkl 29307


In [18]:
# do the same for sbir
dui2sbfreq = {dui: {year: [] for year in years} for dui in dui_intersections}

for index, row in tqdm(sbir_df.iterrows()):
    sbid = row['sbid']
    year = row['Year']
    duis = row['duis'].split(';')
    for dui in duis:
        try:
            if dui not in excluded_duis:
                dui2sbfreq[dui][str(year)] += [sbid]
        except KeyError:
            pass
        except Exception as e:
            print(e, type(e))
            print(dui, year, sbid, duis)
            print('=' * 50)
    

for dui, year2sbids in dui2sbfreq.items():
    for year, sbids in year2sbids.items():
        dui2sbfreq[dui][year] = list(set(sbids))

with open(PICKLES_ADDRESS + 'dui2sbfreq.pkl', 'wb') as handle:
    pickle.dump(dui2sbfreq, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved dui2sbfreq.pkl', len(dui2sbfreq))

63488it [00:07, 8232.40it/s] 


Saved dui2sbfreq.pkl 29307


In [6]:
#load dui2pubfreq and dui2sbfreq
with open(PICKLES_ADDRESS + 'dui2pubfreq.pkl', 'rb') as handle:
    dui2pubfreq = pickle.load(handle)
print('Loaded dui2pubfreq.pkl', len(dui2pubfreq))

with open(PICKLES_ADDRESS + 'dui2sbfreq.pkl', 'rb') as handle:
    dui2sbfreq = pickle.load(handle)
print('Loaded dui2sbfreq.pkl', len(dui2sbfreq))

Loaded dui2pubfreq.pkl 29307
Loaded dui2sbfreq.pkl 29307


In [7]:
print('len(dui2pubfreq)', len(dui2pubfreq))
print('len(dui2sbfreq)', len(dui2sbfreq))
print('Machine Learning, D000069550, 2010:')
print('pubmed (raw):', len(dui2pubfreq['D000069550']['2010']))
print('sbir (raw):', len(dui2sbfreq['D000069550']['2010']))

len(dui2pubfreq) 29307
len(dui2sbfreq) 29307
Machine Learning, D000069550, 2010:
pubmed (raw): 117
sbir (raw): 13


In [22]:
dui2pubfreqchild = copy.deepcopy(dui2pubfreq)
for dui in tqdm(dui_intersections):
    for year in years:
        children_pmids = dui2pubfreqchild[dui][year]  # set the default value to the parent's pmid list
        for dui_child in dui2children[dui]:
            if dui_child in dui_intersections:
                children_pmids += dui2pubfreq[dui_child][year]
        dui2pubfreqchild[dui][year] = list(set(children_pmids))

with open(PICKLES_ADDRESS + 'dui2pubfreqchild.pkl', 'wb') as handle:
    pickle.dump(dui2pubfreqchild, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved dui2pubfreqchild.pkl', len(dui2pubfreqchild))

100%|██████████| 29307/29307 [00:19<00:00, 1468.32it/s]


Saved dui2pubfreqchild.pkl 29307


In [23]:
dui2sbfreqchild = copy.deepcopy(dui2sbfreq)
for dui in tqdm(dui_intersections):
    for year in years:
        children_pmids = dui2sbfreqchild[dui][year]  # set the default value to the parent's pmid list
        for dui_child in dui2children[dui]:
            if dui_child in dui_intersections:
                children_pmids += dui2sbfreq[dui_child][year]
        dui2sbfreqchild[dui][year] = list(set(children_pmids))

with open(PICKLES_ADDRESS + 'dui2sbfreqchild.pkl', 'wb') as handle:
    pickle.dump(dui2sbfreqchild, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved dui2sbfreqchild.pkl', len(dui2sbfreqchild))

100%|██████████| 29307/29307 [00:00<00:00, 44035.88it/s]


Saved dui2sbfreqchild.pkl 29307


In [8]:
#load dui2pubfreqchild and dui2sbfreqchild
with open(PICKLES_ADDRESS + 'dui2pubfreqchild.pkl', 'rb') as handle:
    dui2pubfreqchild = pickle.load(handle)
print('Loaded dui2pubfreqchild.pkl', len(dui2pubfreqchild))

with open(PICKLES_ADDRESS + 'dui2sbfreqchild.pkl', 'rb') as handle:
    dui2sbfreqchild = pickle.load(handle)
print('Loaded dui2sbfreqchild.pkl', len(dui2sbfreqchild))

Loaded dui2pubfreqchild.pkl 29307
Loaded dui2sbfreqchild.pkl 29307


In [9]:
print('len(dui2pubfreqchild)', len(dui2pubfreqchild))
print('len(dui2sbfreqchild)', len(dui2sbfreqchild))
print('Machine Learning, D000069550, 2010:')
print('- Solo')
print('pubmed (chilren):', len(set(dui2pubfreq['D000069550']['2010'])))
print('sbir (chilren):', len(set(dui2sbfreq['D000069550']['2010'])))
print('+ Children')
print('pubmed (chilren):', len(set(dui2pubfreqchild['D000069550']['2010'])))
print('sbir (chilren):', len(set(dui2sbfreqchild['D000069550']['2010'])))

len(dui2pubfreqchild) 29307
len(dui2sbfreqchild) 29307
Machine Learning, D000069550, 2010:
- Solo
pubmed (chilren): 117
sbir (chilren): 13
+ Children
pubmed (chilren): 210
sbir (chilren): 16


In [10]:
sample_dui = 'D000069550'
year = '2010'
sample_name = dui2name[sample_dui]

print(sample_dui, sample_name)
print(len(dui2pubfreq[sample_dui][year]), 'S072387' in dui2pubfreq[sample_dui][year])
print(dui2sbfreq[sample_dui][year])
print(len(dui2pubfreqchild[sample_dui][year]), dui2pubfreqchild[sample_dui][year])
print(dui2sbfreqchild[sample_dui][year])

D000069550 Machine Learning
117 False
['S062445', 'S073651', 'S069251', 'S073186', 'S074149', 'S063768', 'S069063', 'S069131', 'S065745', 'S067460', 'S069797', 'S061704', 'S073267']
210 [20579842, 21137411, 21836806, 21127688, 21231115, 20336142, 21322254, 21103121, 25213970, 20703761, 21374995, 21374997, 22272023, 21588509, 20571677, 20727327, 21194784, 21194783, 20572709, 20842538, 20449323, 21580332, 20866608, 20355633, 21186096, 21181491, 21833268, 20355636, 21054004, 20819009, 20590146, 22163524, 21594197, 20877909, 20234839, 21135448, 20445782, 21122139, 21230171, 23133286, 20842600, 20388970, 20201578, 20545131, 20023405, 20387950, 20545133, 27352170, 20465266, 20111987, 21346931, 20689014, 21611135, 21540992, 21052547, 21165187, 21346953, 20141202, 21346963, 20947093, 20459160, 30754458, 20508315, 21616797, 20658334, 20658336, 21045922, 21311651, 20658339, 21607077, 20145834, 21231275, 21160621, 21082798, 21386415, 26190000, 22163632, 20880056, 20877496, 21253307, 20729019, 215

In [17]:
# create a pmid2citation based on the pmid and citation_count columns of the pubmed_df
pmid2citation = {pmid: citation_count for pmid, citation_count in zip(pubmed_df['pmid'], pubmed_df['citation_count'])}
sbid2citation = {sbid: award_amount for sbid, award_amount in zip(sbir_df['sbid'], sbir_df['Award Amount'])}

with open(PICKLES_ADDRESS + 'pmid2citation.pkl', 'wb') as handle:
    pickle.dump(pmid2citation, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(PICKLES_ADDRESS + 'sbid2citation.pkl', 'wb') as handle:
    pickle.dump(sbid2citation, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved pmid2citation.pkl', len(pmid2citation))
print('Saved sbid2citation.pkl', len(sbid2citation))

In [18]:
with open(PICKLES_ADDRESS + 'pmid2citation.pkl', 'rb') as handle:
    pmid2citation = pickle.load(handle)

with open(PICKLES_ADDRESS + 'sbid2citation.pkl', 'rb') as handle:
    sbid2citation = pickle.load(handle)
    
print('Loaded pmid2citation.pkl', len(pmid2citation))
print('Loaded sbid2citation.pkl', len(sbid2citation))

Loaded pmid2citation.pkl 10928078
Loaded sbid2citation.pkl 63488


In [19]:
zero2duis = {
    'Anatomy [A]': 'U000002',
    'Organisms [B]': 'U000012',
    'Diseases [C]': 'U000006',
    'Chemicals and Drugs [D]': 'U000005',
    'Analytical, Diagnostic and Therapeutic Techniques and Equipment [E]': 'U000001',
    'Psychiatry and Psychology [F]': 'U000014',
    'Phenomena and Processes [G]': 'U000004',
    'Disciplines and Occupations [H]': 'U000013',
    'Anthropology, Education, Sociology and Social Phenomena [I]': 'U000003',
    'Technology, Industry, Agriculture [J]': 'U000009',
    'Information Science [K]': 'U000010',
    'Named Groups [M]': 'U000011',
    'Health Care [N]': 'U000008',
    'Publication Characteristics [V]': 'U000016',
    'Geographicals [Z] ': 'U000015',
}
dui2zero = {v: k for k, v in zero2duis.items()}
zero2chilren = {zdui: [] for zdui in level2duis[0]}

for zero_level_dui in level2duis[0]:
    for first_level_dui in level2duis[1]:
        parent = dui2parents[first_level_dui][-2]
        if zero_level_dui == parent:
            zero2chilren[zero_level_dui] += [first_level_dui]

zero2chilren['U000010']

['D007254']

In [20]:
zero2allpubchild = {zdui: {year: [] for year in years} for zdui in level2duis[0]}
for zero_level_dui, zero_level_dui_firstlevel_children in zero2chilren.items():
      print(zero_level_dui, zero_level_dui_firstlevel_children)
      for first_level_dui in zero_level_dui_firstlevel_children:
         for year in years:
               zero2allpubchild[zero_level_dui][year] += dui2pubfreqchild[first_level_dui][year]

print('=' * 100)

zero2allsbchild = {zdui: {year: [] for year in years} for zdui in level2duis[0]}
for zero_level_dui, zero_level_dui_firstlevel_children in zero2chilren.items():
      print(zero_level_dui, zero_level_dui_firstlevel_children)
      for first_level_dui in zero_level_dui_firstlevel_children:
         for year in years:
               zero2allsbchild[zero_level_dui][year] += dui2sbfreqchild[first_level_dui][year]

U000001 ['D013812', 'D013514', 'D003933', 'D004864']
U000013 ['D006281', 'D010811']
U000003 ['D006802', 'D004493']
U000012 ['D056890', 'D014780', 'D001419', 'D001105']
U000004 ['D055633', 'D055641', 'D055598', 'D055585', 'D055827', 'D055614', 'D008827', 'D001686', 'D008660', 'D010829', 'D018521', 'D009799', 'D002468']
U000011 []
U000015 ['D000072182']
U000006 ['D012140', 'D010038', 'D009422', 'D002318', 'D004066', 'D014947', 'D007154', 'D009140', 'D064419', 'D009057', 'D005128', 'D004700', 'D000820', 'D000091642', 'D009358', 'D009784', 'D007239', 'D009369']
U000014 ['D001523', 'D011579']
U000009 ['D006809']
U000005 ['D001697', 'D004364', 'D001685', 'D008055', 'D007287', 'D046911', 'D006571', 'D045762', 'D045424', 'D011083', 'D002241', 'D009930']
U000008 []
U000002 ['D002319', 'D018514', 'D002477', 'D056224', 'D012137', 'D034582', 'D000825', 'D009420', 'D056229', 'D004703', 'D013284', 'D014566', 'D009141', 'D012679', 'D004064', 'D056226']
U000010 ['D007254']
U000001 ['D013812', 'D013514

In [21]:
len(dui2pubfreqchild['D007254']['2010']), len(zero2allpubchild['U000010']['2010']), len(set(zero2allpubchild['U000010']['2010']))

(105622, 105622, 105622)

In [69]:
#write a function that receives a list and returns the quartiles of the list
def calculate_quartiles(audience):
    quarts = np.quantile(audience, [0.25, 0.5, 0.75], method='midpoint')
    return quarts.tolist()

def quantize_cit(cit, quarts):
    if not quarts:
        return 0
    if cit <= quarts[0]:
        return 0.25
    elif cit <= quarts[1]:
        return 0.5
    elif cit <= quarts[2]:
        return 0.75
    else:
        return 1
    
def quantize_citations(citations, quarts):
    quantized_citations = []
    for cit in citations:
        quantized_cit = quantize_cit(cit, quarts)
        quantized_citations.append(quantized_cit)
    return quantized_citations
    
audience = [1, 1, 1, 7, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 10, 10, 10, 10, 11]
test = [0, 1, 3, 7, 9, 10, 12]
quantize_citations(test, calculate_quartiles(audience))

[0.25, 0.25, 0.25, 0.5, 0.75, 1, 1]

In [70]:
zero2allpubchildquarts = {zdui: {year: [] for year in years} for zdui in level2duis[0]}
for zero_level_dui in tqdm(list(zero2allpubchild.keys())):
    for year in years:
        children_pmids = zero2allpubchild[zero_level_dui][year]
        children_citations = [pmid2citation[pmid] for pmid in children_pmids]
        try:
            zero2allpubchildquarts[zero_level_dui][year] = calculate_quartiles(children_citations)
        except:
            pass

zero2allsbchildquarts = {zdui: {year: [] for year in years} for zdui in level2duis[0]}
for zero_level_dui in tqdm(list(zero2allsbchild.keys())):
    for year in years:
        children_sbids = zero2allsbchild[zero_level_dui][year]
        children_citations = [sbid2citation[pmid] for pmid in children_sbids]
        try:
            zero2allsbchildquarts[zero_level_dui][year] = calculate_quartiles(children_citations)
        except:
            pass

  0%|          | 0/14 [00:00<?, ?it/s]

100%|██████████| 14/14 [00:33<00:00,  2.36s/it]
100%|██████████| 14/14 [00:00<00:00, 44.92it/s]


In [71]:
zero2allpubchildquarts

{'U000001': {'2010': [6.0, 16.0, 35.0],
  '2011': [5.0, 14.0, 31.0],
  '2012': [5.0, 13.0, 29.0],
  '2013': [5.0, 12.0, 27.0],
  '2014': [4.0, 11.0, 25.0],
  '2015': [4.0, 10.0, 22.0],
  '2016': [4.0, 9.0, 19.0],
  '2017': [3.0, 8.0, 17.0],
  '2018': [3.0, 7.0, 14.0],
  '2019': [2.0, 5.0, 11.0],
  '2020': [1.0, 3.0, 7.0],
  '2021': [0.0, 1.0, 3.0]},
 'U000013': {'2010': [6.0, 16.0, 34.0],
  '2011': [5.0, 14.0, 31.0],
  '2012': [5.0, 13.0, 28.0],
  '2013': [5.0, 12.0, 26.0],
  '2014': [4.0, 11.0, 24.0],
  '2015': [4.0, 10.0, 22.0],
  '2016': [3.0, 9.0, 19.0],
  '2017': [3.0, 8.0, 17.0],
  '2018': [3.0, 7.0, 14.0],
  '2019': [2.0, 5.0, 11.0],
  '2020': [1.0, 3.0, 7.0],
  '2021': [0.0, 1.0, 3.0]},
 'U000003': {'2010': [5.0, 14.0, 30.0],
  '2011': [4.0, 12.0, 27.0],
  '2012': [4.0, 11.0, 24.0],
  '2013': [4.0, 11.0, 23.0],
  '2014': [4.0, 10.0, 21.0],
  '2015': [3.0, 9.0, 19.0],
  '2016': [3.0, 7.0, 16.0],
  '2017': [3.0, 7.0, 14.0],
  '2018': [2.0, 5.0, 11.0],
  '2019': [1.0, 4.0, 8.0],
 

In [78]:
zero2allsbchildquarts

{'U000001': {'2010': [99394.0, 99992.0, 624537.0],
  '2011': [99974.5, 149894.0, 500000.0],
  '2012': [148101.0, 149999.0, 729995.0],
  '2013': [149246.0, 150000.0, 737036.0],
  '2014': [149965.0, 311003.0, 990086.0],
  '2015': [150000.0, 224993.0, 792747.0],
  '2016': [150000.0, 225000.0, 969177.0],
  '2017': [154497.0, 228079.0, 991582.0],
  '2018': [176111.5, 225001.0, 749999.0],
  '2019': [197594.5, 230690.5, 833834.0],
  '2020': [200000.0, 298198.0, 1000000.0],
  '2021': [216595.0, 299508.5, 800000.0]},
 'U000013': {'2010': [99411.0, 99991.0, 599998.0],
  '2011': [99974.0, 149934.0, 500000.0],
  '2012': [128378.0, 149996.0, 600000.0],
  '2013': [148135.0, 150000.0, 527992.5],
  '2014': [149871.5, 150000.0, 750000.0],
  '2015': [149943.0, 222566.0, 750000.0],
  '2016': [149967.0, 224934.0, 749963.0],
  '2017': [149989.0, 225000.0, 750000.0],
  '2018': [149986.0, 225000.0, 749937.0],
  '2019': [149970.5, 225000.0, 750000.0],
  '2020': [149995.0, 249373.0, 940038.5],
  '2021': [14997

In [72]:
dui2pubcit = {dui: {year: None for year in years} for dui, _ in dui2pubfreq.items()}
for c, dui in tqdm(enumerate(list(dui2pubfreq.keys()))):
    parent = dui2parents[dui][0]
    for year in years:
        pmids = dui2pubfreq[dui][year]
        citations = [pmid2citation[pmid] for pmid in pmids]
        quarts = zero2allpubchildquarts[parent][year]
        citation_percentiles = quantize_citations(citations, quarts)
        if dui in dui2pubcit:
            dui2pubcit[dui][year] = citation_percentiles
            
with open(TEMP_ADDRESS + 'dui2pubcit.pkl', 'wb') as handle:
    pickle.dump(dui2pubcit, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved dui2pubcit.pkl', len(dui2pubcit))

29307it [01:30, 322.33it/s]


Saved dui2pubcit.pkl 29307


In [74]:
dui2sbcit = {dui: {year: None for year in years} for dui, _ in dui2sbfreq.items()}
for c, dui in tqdm(enumerate(list(dui2sbfreq.keys()))):
    parent = dui2parents[dui][0]
    for year in years:
        sbids = dui2sbfreq[dui][year]
        citations = [sbid2citation[sbid] for sbid in sbids]
        quarts = zero2allsbchildquarts[parent][year]
        citation_percentiles = quantize_citations(citations, quarts)
        if dui in dui2sbcit:
            dui2sbcit[dui][year] = citation_percentiles
            
with open(TEMP_ADDRESS + 'dui2sbcit.pkl', 'wb') as handle:
    pickle.dump(dui2sbcit, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved dui2sbcit.pkl', len(dui2sbcit))

0it [00:00, ?it/s]

29307it [00:00, 32116.68it/s]


Saved dui2sbcit.pkl 29307


In [75]:
with open(TEMP_ADDRESS + 'dui2pubcit.pkl', 'rb') as handle:
    dui2pubcit = pickle.load(handle)
with open(TEMP_ADDRESS + 'dui2sbcit.pkl', 'rb') as handle:
    dui2sbcit = pickle.load(handle)
print('Loaded dui2sbcit.pkl', len(dui2sbcit))

Loaded dui2sbcit.pkl 29307


In [76]:
dui2pubcitchild = {dui: {year: None for year in years} for dui, _ in dui2pubfreqchild.items()}
for c, dui in tqdm(enumerate(list(dui2pubfreqchild.keys()))):
    parent = dui2parents[dui][0]
    for year in years:
        pmids = dui2pubfreqchild[dui][year]
        citations = [pmid2citation[pmid] for pmid in pmids]
        quarts = zero2allpubchildquarts[parent][year]
        citation_percentiles = quantize_citations(citations, quarts)
        if dui in dui2pubcitchild:
            dui2pubcitchild[dui][year] = citation_percentiles
            
with open(TEMP_ADDRESS + 'dui2pubcitchild.pkl', 'wb') as handle:
    pickle.dump(dui2pubcitchild, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved dui2pubcitchild.pkl', len(dui2pubcitchild))

29307it [05:46, 84.52it/s] 


Saved dui2pubcitchild.pkl 29307


In [77]:
dui2sbcitchild = {dui: {year: None for year in years} for dui, _ in dui2sbfreqchild.items()}
for c, dui in tqdm(enumerate(list(dui2sbfreqchild.keys()))):
    parent = dui2parents[dui][0]
    for year in years:
        sbids = dui2sbfreqchild[dui][year]
        citations = [sbid2citation[sbid] for sbid in sbids]
        quarts = zero2allsbchildquarts[parent][year]
        citation_percentiles = quantize_citations(citations, quarts)
        if dui in dui2sbcitchild:
            dui2sbcitchild[dui][year] = citation_percentiles
            
with open(TEMP_ADDRESS + 'dui2sbcitchild.pkl', 'wb') as handle:
    pickle.dump(dui2sbcitchild, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved dui2sbcitchild.pkl', len(dui2sbcitchild))

29307it [00:03, 7852.55it/s]


Saved dui2sbcitchild.pkl 29307


In [58]:
with open(TEMP_ADDRESS + 'dui2pubcitchild.pkl', 'rb') as handle:
    dui2pubcitchild = pickle.load(handle)
with open(TEMP_ADDRESS + 'dui2sbcitchild.pkl', 'rb') as handle:
    dui2sbcitchild = pickle.load(handle)

print('Loaded dui2pubcitchild.pkl', len(dui2pubcitchild))
print('Loaded dui2sbcitchild.pkl', len(dui2sbcitchild))

Loaded dui2pubcitchild.pkl 29307
Loaded dui2sbcitchild.pkl 29307


In [50]:
sample_dui = 'D000069550'
sample_year = '2010'
sample_name = dui2name[sample_dui]

print(sample_dui, sample_name)
print(len(dui2pubcit[sample_dui][sample_year]), dui2pubcit[sample_dui][sample_year])
print(len(dui2sbcit[sample_dui][sample_year]), dui2sbcit[sample_dui][sample_year], np.sum(dui2sbcit[sample_dui][sample_year]))
print(len(dui2pubcitchild[sample_dui][sample_year]), dui2pubcitchild[sample_dui][sample_year])
print(len(dui2sbcitchild[sample_dui][sample_year]), dui2sbcitchild[sample_dui][sample_year], np.sum(dui2sbcitchild[sample_dui][sample_year]))

D000069550 Machine Learning
117 [0.5, 0.5, 1.0, 0.5, 1.0, 0.25, 0.25, 0.5, 0.25, 0.25, 0.25, 1.0, 0.75, 0.5, 0.5, 0.75, 1.0, 0.25, 0.5, 0.5, 0.25, 0.5, 1.0, 0.75, 0.75, 0.5, 1.0, 0.25, 1.0, 0.5, 0.25, 0.75, 0.75, 1.0, 1.0, 0.25, 0.75, 1.0, 0.75, 0.75, 0.75, 0.5, 1.0, 0.75, 0.5, 0.25, 1.0, 0.75, 0.5, 0.5, 0.75, 1.0, 0.75, 0.75, 0.25, 0.5, 1.0, 0.25, 0.75, 1.0, 1.0, 0.75, 0.5, 0.75, 0.25, 0.75, 0.25, 0.5, 0.25, 1.0, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75, 1.0, 0.5, 0.5, 0.25, 0.75, 1.0, 0.5, 1.0, 0.25, 0.25, 0.75, 0.75, 0.5, 1.0, 0.75, 0.75, 0.75, 0.25, 0.5, 1.0, 0.75, 1.0, 1.0, 0.75, 1.0, 0.5, 1.0, 1.0, 1.0, 0.5, 1.0, 0.75, 0.5, 1.0, 0.75, 0.25, 1.0, 0.25, 0.75, 0.25, 0.25]
13 [0.5, 0.25, 0.75, 1.0, 0.5, 0.75, 0.75, 0.25, 0.5, 1.0, 1.0, 1.0, 0.25] 8.5
210 [1.0, 0.25, 0.5, 0.5, 0.5, 1.0, 0.25, 0.5, 1.0, 1.0, 0.25, 0.25, 0.5, 0.5, 0.5, 0.25, 0.5, 1.0, 0.5, 1.0, 1.0, 0.5, 0.5, 0.25, 0.75, 0.5, 1.0, 0.75, 0.5, 0.75, 0.5, 0.75, 0.5, 1.0, 0.75, 1.0, 0.75, 0.5, 0.5, 0.5, 0.25, 0.5, 1.0, 0.25, 1.0, 