In [22]:
import pandas as pd
import os,sys,codecs,csv,pickle,string
from pprint import pprint
import numpy as np
from stop_words import get_stop_words
import multiprocessing



In [69]:
# Build (fiscal_year, file_name) pairs for the per-year abstract CSV files found
# in the current working directory.  Only names containing '_PRJABS_' are kept;
# without that condition every CSV ending in 'FY<year>.csv' would be selected.
abstract_csv_names = [
    name for name in os.listdir()
    if '_PRJABS_' in name and name.endswith('.csv')
]

# Candidate years are walked in ascending order, so the pairs come out sorted
# by year (files sharing a year keep their directory-listing order).
csv_files = [
    (fiscal_year, name)
    for fiscal_year in range(4000)
    for name in abstract_csv_names
    if name.endswith(f'FY{fiscal_year}.csv')
]
pprint(csv_files)


[(2002, 'RePORTER_PRJABS_C_FY2002.csv'),
 (2006, 'RePORTER_PRJABS_C_FY2006.csv'),
 (2018, 'RePORTER_PRJABS_C_FY2018.csv'),
 (2019, 'RePORTER_PRJABS_C_FY2019.csv')]


In [76]:
def CountGrantsInOneYearWithWord( grants_abs_list , word):
    """Count how many abstracts contain ``word``.

    Parameters
    ----------
    grants_abs_list : iterable
        One entry per grant.  Each entry is tested with ``word in entry``.
    word : object
        The keyword to look for.

    Returns
    -------
    tuple
        ``(n_matching, percent_matching, n_total)``.  When there are no
        abstracts at all the result is ``(0, 0.0, 0)`` instead of a
        ``ZeroDivisionError``.
    """
    binary_vect = [word in words for words in grants_abs_list]
    total = len(binary_vect)
    if total == 0:
        # Nothing to count; a percentage is undefined, so report 0.0.
        return (0, 0.0, 0)
    hits = sum(binary_vect)
    return (hits, hits / total * 100, total)

# load an abstracts CSV file and return, per row, a set of unique, no-stop & no-punctuation words
def LoadWordsFromAbstractFile(abstract_csv_file,year=None,queue_in_which_to_store_abstract=None):
    """Parse a two-column abstracts CSV into one word set per row.

    Each row's second column is lower-cased, split on single spaces, stripped
    of ASCII punctuation, and reduced to the distinct words that are longer
    than four characters and are not English stop words.  Every row is
    processed the same way, so a header line (if the file has one) also
    produces an entry.

    Parameters
    ----------
    abstract_csv_file : str
        Path of the CSV file; every non-blank row must have exactly two
        columns.
    year : object, optional
        Label that is sent along with the result when a queue is supplied.
        It is not otherwise used.
    queue_in_which_to_store_abstract : object with ``put``, optional
        When given, ``(year, abstracts)`` is put on it before returning, so
        the function can be used as a worker that reports through a queue.

    Returns
    -------
    list of set of str
        One set of filtered words per CSV row, in file order.

    Raises
    ------
    ValueError
        If a non-blank row does not have exactly two columns.
    """
    table = str.maketrans(dict.fromkeys(string.punctuation))
    # Fetch the stop words once and keep them in a set so that the per-word
    # membership test is constant-time.
    stop_words = frozenset(get_stop_words('english'))
    abstract_text = list()
    # newline='' lets the csv module do its own line-ending handling;
    # errors='ignore' drops bytes that are not valid UTF-8.
    with open(abstract_csv_file, 'r', encoding='utf-8', errors='ignore', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv.reader yields [] for blank lines; nothing to parse.
                continue
            app_id, app_txt = row
            words = (s.translate(table) for s in app_txt.lower().split(' '))
            abstract_text.append(
                {word for word in words if len(word) > 4 and word not in stop_words}
            )
    if queue_in_which_to_store_abstract is not None:
        queue_in_which_to_store_abstract.put((year, abstract_text))
    return abstract_text

In [None]:
# Load the abstract CSV files in parallel, one worker process per file.
def _load_abstracts_into_queue(csv_path, year, result_queue):
    """Worker entry point: parse one CSV and report (year, abstracts) via the queue."""
    result_queue.put((year, LoadWordsFromAbstractFile(csv_path, year)))


jobs = []
q = multiprocessing.Queue()
abstracts_dict = {}
for I, (file_year, csv_path) in enumerate(csv_files):
    print(I, csv_path, type(csv_path))
    process = multiprocessing.Process(target=_load_abstracts_into_queue,
                                      args=[csv_path, file_year, q])
    jobs.append(process)

# Start the processes (each one parses a single CSV file)
for j in jobs:
    j.start()

# Collect exactly one result per worker BEFORE joining: a process that has put
# data on a multiprocessing.Queue may not exit until that data is consumed, so
# joining first can deadlock.
# NOTE: if a worker dies without reporting, this q.get() blocks indefinitely.
for _ in jobs:
    year, abs_list = q.get()
    abstracts_dict[year] = abs_list

# Ensure all of the processes have finished
for j in jobs:
    print('joining: ', j)
    j.join()

print("List processing complete.")
print(abstracts_dict.keys())

In [84]:
# Load the abstract CSV files through a worker pool and keep every file's result.
if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=6)              # start 6 worker processes
    async_results = {}
    for I, (file_year, csv_path) in enumerate(csv_files):
        # `result` always refers to the most recently submitted job; the dict
        # keeps a handle on every job so that no file's output is lost.
        result = pool.apply_async(LoadWordsFromAbstractFile, [csv_path, file_year])
        async_results[file_year] = result
    print('running')
    pool.close()
    print('closed')
    pool.join()
    print('joined')
    # AsyncResult.get() returns the worker's return value and re-raises any
    # exception the worker hit, so failures are no longer silently dropped.
    pool_abstracts_by_year = {
        file_year: job.get() for file_year, job in async_results.items()
    }

running
closed
joined


In [88]:
# Debug view of the attributes stored on `result`, the handle returned by
# pool.apply_async (this includes its private fields).
pprint(vars(result))

{'_cache': {},
 '_callback': None,
 '_error_callback': None,
 '_event': <threading.Event object at 0x1155b0ed0>,
 '_job': 14,
 '_success': True,
 '_value': [{'candidate',
             'contract',
             'development',
             'diseases',
             'efficacy',
             'include',
             'including',
             'infectious',
             'licensure',
             'models',
             'nonhuman',
             'primate',
             'products',
             'provides',
             'standardization',
             'studies',
             'support',
             'testing'},
            {'absence',
             'abstract',
             'accurate',
             'activation',
             'activity',
             'addition',
             'address',
             'anaphasepromoting',
             'apcdependent',
             'apcin',
             'approach',
             'arrest',
             'associated',
             'attachment',
             'better',
           

In [25]:
# Load a single abstracts file for interactive inspection.  The last entry of
# csv_files is selected explicitly rather than through a loop index left over
# from another cell (NOTE(review): presumably that index pointed at the last
# file -- confirm), and the year is passed because the loader's second
# parameter is `year`.
last_year, last_csv_path = csv_files[-1]
abstract_text_list = LoadWordsFromAbstractFile(last_csv_path, last_year)
# pprint(abstract_text_list[:2])  # uncomment to peek at the first entries

In [37]:
# Sanity check on the loaded abstracts: overall size, the first two entries,
# and how many abstracts mention 'yeast'.
print( f'# of abstracts = {len(abstract_text_list)}')
for position in (0, 1):
    print(f"{position}: ", abstract_text_list[position])

yeast_counts = CountGrantsInOneYearWithWord(abstract_text_list, 'yeast')
s, m, l = yeast_counts
print(f'N={s}\t%={m}\ttotal N={l}')

# The same count can be restricted to one year by passing a subset such as
# abstracts[years==2010] instead of the full list.

# of abstracts = 10000
0:  {'abstracttext'}
1:  {'variety', 'testing', 'female', 'idiq0', 'center', 'support', 'operating', 'protocols', 'contract', 'contraceptive', 'shall', 'necessary', 'phase', 'develop', 'rapidly', 'statistical', 'methods', 'trials', 'provides', 'coordinating', 'preclinical', 'products', 'network', 'indefinitedelivery', 'conducts', 'clinical', 'order', 'initiate', 'contractors', 'nichds', 'candidate', 'completed', 'shortly', 'indefinitequantity'}
N=115	%=1.15	total N=10000


In [None]:
# Print, for every keyword and every year, the percentage of that year's
# abstracts containing the keyword (tab-separated: year, keyword, percent).
keywords = ['yeast','cancer','zebrafish','elegans','crispr']
unique_years = sorted(set(years))

# Select each year's abstracts once up front; the selection does not depend on
# the keyword.
grants_by_year = {year: abstracts[years==year] for year in unique_years}

for keyword in keywords:
    for year, grants_in_year in grants_by_year.items():
        s,m,l = CountGrantsInOneYearWithWord( grants_in_year , keyword)
        print(year,keyword,m,sep='\t')

In [None]:
# Merge consecutive pairs of csv_files entries on APPLICATION_ID and stack the
# merged tables into a single DataFrame `df`.
# NOTE: the original author reported that this cell crashes because of
# file-format problems; pd.read_csv may still raise on malformed input.
merged_frames = []
# Stop one short of the end so a trailing unpaired file cannot cause an
# IndexError on csv_files[I+1].
for I in range(0, len(csv_files) - 1, 2):
    print(csv_files[I])
    print(csv_files[I+1])
    df1 = pd.read_csv(csv_files[I][1],sep=',',low_memory=False)
    df2 = pd.read_csv(csv_files[I+1][1],sep=',',low_memory=False)
    df2['Year'] = csv_files[I][0]
    merged_frames.append(df1.merge(df2,on='APPLICATION_ID'))

# Concatenate once at the end: the result of DataFrame.append was previously
# discarded (and the method no longer exists in pandas >= 2.0), so only the
# first pair ever reached `df`.
df = pd.concat(merged_frames, ignore_index=True) if merged_frames else pd.DataFrame()
    
    

In [None]:
# Write `abstracts` and `years` to pickle files, then read both back and show
# one entry as a quick check.
def _save_pickle(obj, path):
    """Pickle `obj` to `path` using the highest available protocol."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


_save_pickle(abstracts, 'abstracts.pickle')
_save_pickle(years, 'years.pickle')

abstracts = _read_pickle('abstracts.pickle')
years = _read_pickle('years.pickle')

print(abstracts[2])

In [None]:

# Build two parallel arrays with one entry per CSV row:
#   `years`     -- the year associated with the file the row came from
#   `abstracts` -- the row's set of filtered words
# The per-file parsing is delegated to LoadWordsFromAbstractFile instead of
# repeating its body here, so both code paths filter words identically.
years = list()
abstracts = list()
for I, (file_year, csv_path) in enumerate(csv_files):
    print(I, file_year, csv_path)
    file_abstracts = LoadWordsFromAbstractFile(csv_path, file_year)
    abstracts.extend(file_abstracts)
    years.extend([file_year] * len(file_abstracts))

years = np.array(years)
# dtype=object keeps one set per element, also when the list is empty.
abstracts = np.array(abstracts, dtype=object)