In [19]:
import numpy as np
import pandas as pd
import itertools
import collections

In [20]:
def convertsubproject(subproject):
    if subproject!=subproject:
        return ""
    else:
        return " (" + str(int(subproject)) + ")"
    
def processyear(year):
    pm_year = pd.read_csv('Precision Medicine_' + str(year) + '.csv')
    pm_year['Project Number'] = pm_year['Project Number'].astype('str') + [convertsubproject(subproject) for subproject in pm_year['Sub Project #']]
    all_year = pd.read_csv('FedRePORTER_PRJ_C_FY' + str(year) + '.csv', dtype={11:'str', 12:'str', 20:'str'}, error_bad_lines=False)
    pm_year = pm_year[['Category', 'Project Number']]
    pm_year = pm_year.merge(all_year, how='left', left_on='Project Number', right_on=' PROJECT_NUMBER')
    return pm_year

def compare_names(nsf_name, nih_name):
    nsf_name = nsf_name.split(" ")
    nih_name = nih_name.split(", ")   
    if len(nsf_name)==2:
        if nih_name[0]==nsf_name[1]:
            if nih_name[1]==nsf_name[0]:
                return(True)
            elif nih_name[1].split(" ")[0] == nsf_name[0]:
                return(True)
            elif nih_name[1].split(" ")[1] == nsf_name[0]:
                return(True)
    elif len(nsf_name)==3:
        if nih_name[0] == nsf_name[2]:
            if nih_name[1] == (nsf_name[0]+' '+nsf_name[1]):
                return(True)
        elif nih_name[0] == (nsf_name[1]+' '+nsf_name[2]):
            if nih_name[1] == nsf_name[0]:
                return(True)
    return(False)

flatten = lambda l: [item for sublist in l for item in sublist]

## Find PM researchers

I get the NIH grants listed as related to PM by the RCDC (https://report.nih.gov/categorical_spending.aspx; only years 2016, 2017, 2018, and 2019), and from there I extract all PIs. Note that I consider contact PIs but also other PIs.

In [3]:
pm_grants = pd.concat([processyear(2016),
                       processyear(2017),
                       processyear(2018),
                       processyear(2019)])

pm_grants[' CONTACT_PI_PROJECT_LEADER'] = pm_grants[' CONTACT_PI_PROJECT_LEADER'].str.replace(".", "").str.lower()
pm_grants[' CONTACT_PI_PROJECT_LEADER'] = [[x] for x in pm_grants[' CONTACT_PI_PROJECT_LEADER']]
pm_grants[' OTHER_PIS'] = [[name.strip() for name in names.replace(".", "").strip()[:-1].strip().split(";")] for names in pm_grants[' OTHER_PIS'].fillna("").str.lower()]

pi_list = ['']*len(pm_grants)
for i, names in enumerate(pm_grants[' OTHER_PIS']):
    if names == ['']:
        pi_list[i] = pm_grants[' CONTACT_PI_PROJECT_LEADER'].iloc[i]
    else:
        pi_list[i] = pm_grants[' CONTACT_PI_PROJECT_LEADER'].iloc[i] + pm_grants[' OTHER_PIS'].iloc[i]
pm_grants['All PIs'] = pi_list

b'Skipping line 87965: expected 24 fields, saw 25\nSkipping line 87967: expected 24 fields, saw 25\n'
b'Skipping line 95568: expected 24 fields, saw 25\nSkipping line 95570: expected 24 fields, saw 25\n'
b'Skipping line 74736: expected 24 fields, saw 25\nSkipping line 79976: expected 24 fields, saw 25\n'


These are the PIs associated with the largest number of projects:

In [63]:
pm_researchers_counts = collections.Counter(flatten(pm_grants['All PIs']))
pm_researchers_counts.most_common(10)

[('denny, joshua charles', 31),
 ('gatenby, robert a', 25),
 ('kretzler, matthias', 24),
 ('mills, gordon b', 23),
 ('perera, minoli', 22),
 (nan, 22),
 ('mooney, kathleen h', 20),
 ('schnall, mitchell d', 19),
 ('gharavi, ali g', 19),
 ('minna, john d', 19)]

In [64]:
print('There are ' + str(len(pm_researchers_counts)) + ' NIH-PM researchers')

There are 3803 NIH-PM researchers


## NSF Covid-related grants

The search criteria for the NSF Covid-related awards is https://www.nsf.gov/awardsearch/advancedSearchResult?PIId=&PIFirstName=&PILastName=&PIOrganization=&PIState=&PIZip=&PICountry=&ProgOrganization=&ProgEleCode=&BooleanElement=All&ProgRefCode=&BooleanRef=All&Program=&ProgOfficer=&Keyword=COVID+AND+RAPID&AwardNumberOperator=&AwardAmount=&AwardInstrument=&ActiveAwards=true&OriginalAwardDateOperator=&StartDateOperator=&ExpDateOperator= , which is a modified version of the https://covidinfocommons.datascience.columbia.edu/ resource.


These are the PIs that are connected to the largest number of Covid-related NSF projects. 

In [6]:
nsf_awards = pd.read_csv('NSF_covid_awards.csv', encoding='cp1252')
nsf_awards['PrincipalInvestigator'] = nsf_awards['PrincipalInvestigator'].str.lower().replace(".", "")
nsf_pis = list(nsf_awards['PrincipalInvestigator'].unique())

print(nsf_awards['PrincipalInvestigator'].value_counts()[0:10])
print('There are ' + str(len(nsf_awards['PrincipalInvestigator'].value_counts())) + ' NSF-Covid researchers')

jeannette sutton           2
yanfang ye                 2
angelique corthals         2
gerardo chowell-puente     2
kyle bibby                 2
joshua hartshorne          2
john yin                   2
liliana davalos alvarez    2
katriona shea              2
andrew whelton             2
Name: PrincipalInvestigator, dtype: int64
There are 826 NSF-Covid researchers


Our next goal is to find if some of these NSF-Covid researchers are also NIH-PM researchers. Matching these two lists of researchers requires, however, taking into account that the name of NSF-Covod PIs is formatted in a very different way than the NIH-PM names. The NSF dataset reports names in order (e.g. "John Doe") and in most cases do not contains a middle name. Meanwhile, in the NIH dataset names are better formatted as to identify the surname (e.g. "Doe, John") and middle name or initial is often included. I built and used a simple name matching approach to find similarities, although this could maybe be improved in the future.

In [7]:
# Checking compare_names function

print(compare_names("John Doe", "Doe, John"))
print(compare_names("John Foe Doe", "Foe Doe, John"))
print(compare_names("John Doe", "Doe, John A"))
print(compare_names("John Alexander Doe", "Doe, John B"))
print(compare_names("John Doe", "Foe, John A"))

True
True
True
False
False


Using that, I find 19 *likely* matches. Note, however, that some of these may need to be checked manually, as the compare_names function only returns possible matches (e.g. it is possible that 'nicole steinmetz' is different from 'steinmetz, nicole franziska').

In [37]:
pi_list = ['']*len(nsf_awards)
for i, names in enumerate(nsf_awards['Co-PIName(s)'].str.lower()):
    if names != names:
        pi_list[i] = [nsf_awards['PrincipalInvestigator'].iloc[i].replace(".", "").strip().lower()]
    else:
        pi_list[i] = [nsf_awards['PrincipalInvestigator'].iloc[i].replace(".", "").strip().lower()] + [name.strip() for name in names.replace(".", "").strip().split(", ")]
nsf_awards['All PIS'] = pi_list

In [38]:
pm_covid_researchers_nsf = []

for i, nsf_names in enumerate(nsf_awards['All PIS']):
    for nsf_name in nsf_names:
        for j, nih_name in enumerate(flatten(pm_grants['All PIs'])):
            try: 
                if compare_names(nsf_name, nih_name):
                    pm_covid_researchers_nsf.append([nsf_name, nih_name])
            except:
                pass
        
pm_covid_researchers_nsf.sort()
pm_covid_researchers_nsf = list(k for k,_ in itertools.groupby(pm_covid_researchers_nsf))
print(len(pm_covid_researchers_nsf))
print(pm_covid_researchers_nsf)

19
[['amber simpson', 'simpson, amber'], ['andrey grigoriev', 'grigoriev, andrey'], ['bradley malin', 'malin, bradley a'], ['faming liang', 'liang, faming'], ['heather allore', 'allore, heather gwynn'], ['janet williams', 'williams, janet k'], ['john mellnik', 'mellnik, john'], ['jun yang', 'yang, jun j'], ['kun huang', 'huang, kun'], ['marco salemi', 'salemi, marco'], ['mattia prosperi', 'prosperi, mattia'], ['michael nelson', 'nelson, michael douglas'], ['munmun de choudhury', 'de choudhury, munmun'], ['nicole steinmetz', 'steinmetz, nicole franziska'], ['olivier lichtarge', 'lichtarge, olivier'], ['steven young', 'young, steven l'], ['wesley wong', 'wong, wesley philip'], ['xiaodong wu', 'wu, xiaodong'], ['xiaoqian jiang', 'jiang, xiaoqian']]


In [65]:
pis_of_interest = [names[0] for names in pm_covid_researchers_nsf]
nsf_pm = nsf_awards[[bool(np.sum([name in pis_of_interest for name in names])) for names in nsf_awards['All PIS']]]
nsf_pm['PM PIs'] = [[name if name in pis_of_interest else "" for name in names] for names in nsf_pm['All PIS']]
nsf_pm.to_csv('NSF_Covid_PM.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## NIH Covid-related grants

The search criteria for the NIH-Covid awards is https://projectreporter.nih.gov/reporter_searchresults.cfm?redir=sh&sl=14E9CE024F8FC5D27598B8961CAA4A01A2FFCEB861BF&icde=50791235&hsid=66453521&shQID=0&go2= when clicking in here https://grants.nih.gov/policy/natural-disasters/corona-virus.htm (under "All COVID-19 research grant funding") 

In [31]:
nih_awards = pd.read_csv('NIH_covid_awards_2.csv', encoding= 'unicode_escape')

pi_list = ['']*len(nih_awards)
for i, names in enumerate(nih_awards['Other PI or Project Leader(s)'].dropna().str.lower()):
    if names == 'not applicable':
        pi_list[i] = [nih_awards['Contact PI / Project Leader'].iloc[i].replace(".", "").strip().lower()]
    else:
        pi_list[i] = [nih_awards['Contact PI / Project Leader'].iloc[i].replace(".", "").strip().lower()] + [name.strip() for name in names.replace(".", "").strip().strip().split(";")]
nih_awards['All PIS'] = pi_list


print('There are ' + str(len(set(flatten(nih_awards['All PIS'])))) + ' NIH-Covid researchers')

There are 345 NIH-Covid researchers


I find 42 researchers that are in the NIH-PM dataset and that have received NIH-Covid grants. We arelooking for exact matches in the names, so for instance "Doe, John A" and "Doe, John Alexander" will not match. In the future, we could use a more advanced name-matching approach.

In [32]:
pm_covid_researchers_nih = np.intersect1d(flatten(pm_grants['All PIs']), flatten(nih_awards['All PIS']))
print(len(pm_covid_researchers_nih))
print(pm_covid_researchers_nih)

30
['baliga, nitin s' 'banchereau, jacques f' 'baric, ralph s'
 'berin, maria cecilia' 'buckner, jane hoyt' 'cameron, mark james'
 'chatila, talal amine' 'chen, xi' 'falsey, ann r' 'freishtat, robert j'
 'fuchs, charles s' 'garcia-sastre, adolfo' 'haas, david w'
 'haendel, melissa a' 'harris, paul a' 'johnson, christine c'
 'kraft, monica' 'kuo, calvin j' 'leach, steven d' 'phipatanakul, wanda'
 'platanias, leonidas c' 'reilly, muredach p' 'reis, steven e'
 'rothenberg, marc e' 'shah, binita' 'sykes, megan' 'wang, kai'
 'weng, chunhua' 'wilkins, consuelo hopkins' 'zhou, ming-ming']


In [33]:
pis_of_interest = pm_covid_researchers_nih
nih_pm = nih_awards[[bool(np.sum([name in pis_of_interest for name in names])) for names in nih_awards['All PIS']]]
nih_pm['PM PIs'] = [[name if name in pis_of_interest else "" for name in names] for names in nih_pm['All PIS']]
nih_pm.to_csv('NIH_Covid_PM_2.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [46]:
colist = list(nih_pm.columns)
colist.pop('Project Abstract')
colist

TypeError: 'str' object cannot be interpreted as an integer

In [40]:
if 'Project Abstract' in nih_pm:
    print(True)

True
