In [1]:
import pandas as pd
import os,sys,codecs,csv,pickle,string
from pprint import pprint
import numpy as np
from stop_words import get_stop_words
import multiprocessing

import bz2
import pickle
import _pickle as cPickle


In [3]:
# Build csv_files: a list of (fiscal_year, filename) tuples, sorted by year,
# for the abstract CSV files found in the current working directory.
def fiscal_year_from_name(filename):
    """Return the fiscal year encoded at the end of a '...FY<year>.csv' name.

    Returns None when the name does not end in 'FY' + a plain decimal number
    (no sign, no leading zeros) + '.csv'.
    """
    if not filename.endswith('.csv'):
        return None
    _, marker, digits = filename[:-len('.csv')].rpartition('FY')
    if not marker or not digits.isdigit():
        return None
    try:
        fiscal_year = int(digits)
    except ValueError:
        # str.isdigit() accepts characters (e.g. superscripts) that int() rejects
        return None
    # rejects leading zeros and non-ASCII digits, which never round-trip via str()
    return fiscal_year if str(fiscal_year) == digits else None


# only abstract files (drop the '_PRJABS_' test to pick up every FY<year>.csv file)
csv_files = []
for f in os.listdir():
    if '_PRJABS_' not in f:
        continue
    fiscal_year = fiscal_year_from_name(f)
    if fiscal_year is not None:
        csv_files.append((fiscal_year, f))
# sorted() is stable: files sharing a year keep their directory-listing order
csv_files = sorted(csv_files, key=lambda pair: pair[0])
pprint(csv_files)


[(2002, 'RePORTER_PRJABS_C_FY2002.csv'),
 (2006, 'RePORTER_PRJABS_C_FY2006.csv'),
 (2017, 'RePORTER_PRJABS_C_FY2017.csv'),
 (2018, 'RePORTER_PRJABS_C_FY2018.csv'),
 (2019, 'RePORTER_PRJABS_C_FY2019.csv')]


In [24]:
# save and load compressed pickle files
def decompress_pickle(filename):
    """Load and return the object stored in a bz2-compressed pickle file.

    The path is tried exactly as given first, and then with '.pbz2' appended
    (the suffix compressed_pickle() adds when it writes a file).

    Raises:
        TypeError: if neither candidate path is an existing file. The exception
            type is kept as-is so existing `except TypeError` handlers keep
            working; the message now names the file that could not be found.
    """
    for candidate in (filename, filename + '.pbz2'):
        if os.path.isfile(candidate):
            break
    else:
        raise TypeError('cannot find ' + repr(filename) + ' (also tried with a .pbz2 suffix)')
    # NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
    # The `with` block closes the compressed stream even if unpickling fails.
    with bz2.BZ2File(candidate, 'rb') as f:
        return cPickle.load(f)

def compressed_pickle(filename, data):
    """Pickle `data` into a bz2-compressed file named `filename` + '.pbz2'."""
    target_path = filename + '.pbz2'
    with bz2.BZ2File(target_path, 'w') as compressed_stream:
        cPickle.dump(data, compressed_stream)


# grants_abs_list is a list of sets; each set contains all the words in one grant abstract.
#   Identify the sets in which the word appears and return
#   the number of matching grants, that number as a percentage of all grants,
#   and the total number of grants (number of sets).
def CountGrantsInOneYearWithWord( grants_abs_list , word):
    """Count how many abstracts contain `word`.

    Args:
        grants_abs_list: iterable of word collections (one per grant abstract);
            each is tested with `word in words`.
        word: the word to look for.

    Returns:
        A tuple (matches, percent, total): the number of abstracts containing
        the word, that number as a percentage of all abstracts, and the number
        of abstracts. For an empty input the result is (0, 0.0, 0) instead of
        a ZeroDivisionError.
    """
    matches = 0
    total = 0
    for words in grants_abs_list:
        total += 1
        if word in words:
            matches += 1
    if total == 0:
        return (0, 0.0, 0)
    return (matches, matches / total * 100, total)


# load an abstracts CSV file and return a set of unique, no-stop & no-punctuation words
#   also save the output as a pickle file
def ProcessesWordsInAbstractFile(abstract_csv_file,year):
    """Turn every row of an abstracts CSV file into a set of filtered words.

    Each row must have exactly two columns (an id and the abstract text). The
    text is lower-cased, split on single spaces, stripped of ASCII punctuation,
    and reduced to the unique words that are longer than 4 characters and are
    not English stop words. No row is skipped, so a header row (if the file
    has one) is processed like any other row.

    Side effect: the tuple (year, list_of_word_sets) is saved with
    compressed_pickle() under the CSV name with '.csv' removed.

    Args:
        abstract_csv_file: path of the CSV file to read (decoded as UTF-8,
            undecodable bytes are ignored).
        year: value stored next to the word sets in the pickle file.

    Returns:
        A list with one set of words per CSV row.
    """
    # Build both lookups once per file; fetching the stop-word list again for
    # every single word dominated the run time of the original loop.
    stop_words = frozenset(get_stop_words('english'))
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))

    abstract_text = list()
    with codecs.open( abstract_csv_file , 'r' , encoding='utf-8', errors='ignore') as csvfile:
        csvr = csv.reader(csvfile,delimiter=',')
        for app_id, app_txt in csvr:
            cleaned = (s.translate(strip_punctuation) for s in app_txt.lower().split(' '))
            abstract_text.append({word for word in cleaned
                                  if len(word)>4 and word not in stop_words})
    compressed_pickle(abstract_csv_file.replace('.csv',''), (year,abstract_text))
    return( abstract_text )

In [5]:
# in parallel, process the CSV files, and save the output in pickle format
if __name__ == '__main__':
    # (csv name, exception) for every task that raised inside a worker; without
    # this, a failed file would go unnoticed until its pickle is missing later.
    failures = []
    with multiprocessing.Pool(processes=6) as pool:       # start 6 worker processes
        for fiscal_year, csv_name in csv_files:
            # queue one file per task; the workers write the pickle files themselves
            pool.apply_async(
                ProcessesWordsInAbstractFile,
                (csv_name, fiscal_year),
                error_callback=lambda exc, name=csv_name: failures.append((name, exc)),
            )
        pool.close()
        print('all proceses started')
        pool.join()
    print(' -- all finished --')
    if failures:
        failed_names = ', '.join(name for name, _ in failures)
        raise RuntimeError('processing failed for: ' + failed_names) from failures[0][1]

closed
joined


In [34]:
# load the pickle files
loaded_abstracts = list()
years = list()
for _, csv_name in csv_files:
    (year,abstract_text_list_of_sets) = decompress_pickle(csv_name.replace('.csv',''))
    loaded_abstracts.append(abstract_text_list_of_sets)
    years.append(year)
    print('Finished processing ', year)
# One slot per file, each holding that file's list of word sets. The object
# array is filled slot by slot because np.array() on lists of different
# lengths raises ValueError on current NumPy (and would build a 2-D array
# whenever the lengths happened to be equal).
abstracts = np.empty(len(loaded_abstracts), dtype=object)
for slot, per_file_abstracts in enumerate(loaded_abstracts):
    abstracts[slot] = per_file_abstracts
years = np.array(years)
print(years)

Finished processing  2002
Finished processing  2006
Finished processing  2017
Finished processing  2018
Finished processing  2019
[2002 2006 2017 2018 2019]


In [46]:
# `year` still holds the value left behind by the loading loop (the last year loaded)
abstracts_for_year = abstracts[years==year]
print(len(abstracts_for_year[0]))   # number of abstracts stored for that year
print(len(abstracts_for_year))      # number of loaded files matching that year
# up to 25 words (in arbitrary order, they come from a set) of the second abstract in the second file
print(list(abstracts[1][1])[:25])

78916
1
['whether', 'necessary', 'intracranial', 'patients', 'determine', 'allows', 'comparison', 'present', 'elevated', 'skull', 'built', 'neurosurgeon', 'collect', 'purpose', 'preprototype', 'serious', 'successful', 'improved', 'preliminary', 'measuring', 'analysis', 'implementation', 'measured', 'arterial', 'favorably']


In [None]:
# Spot-check one file: abstract_text_list is the list of word sets of the first
# file loaded (it was never defined before, so this cell raised NameError).
abstract_text_list = abstracts[0]
print( f'# of abstracts = {len(abstract_text_list)}')
print("0: ", abstract_text_list[0])
print("1: ", abstract_text_list[1])
s,m,l = CountGrantsInOneYearWithWord( abstract_text_list, 'yeast')
print(f'N={s}\t%={m}\ttotal N={l}')

In [47]:
keywords = ['yeast','cancer','zebrafish','elegans','crispr']
unique_years = sorted(set(years))
# look up each year's list of word sets once, then reuse it for every keyword
abstracts_by_year = {yr: abstracts[years==yr][0] for yr in unique_years}
for keyword in keywords:
    for year, grants_in_year in abstracts_by_year.items():
        # index 1 of the result is the percentage of abstracts containing the keyword
        percent = CountGrantsInOneYearWithWord( grants_in_year , keyword)[1]
        print(year,keyword,percent,sep='\t')

2002	yeast	2.907286439204912
2006	yeast	2.5473700762638574
2017	yeast	1.260007152879034
2018	yeast	1.2288040636438335
2019	yeast	1.174666734249075
2002	cancer	13.328100030839105
2006	cancer	15.38642975076657
2017	cancer	19.37054664502462
2018	cancer	23.03727496825278
2019	cancer	20.012418267524964
2002	zebrafish	0.407917239059127
2006	zebrafish	0.6494221243808476
2017	zebrafish	1.0069052793749484
2018	zebrafish	0.9710913572869202
2019	zebrafish	0.9643164884180648
2002	elegans	0.5943536404160475
2006	elegans	0.7453416149068323
2017	elegans	0.7991966766623566
2018	elegans	0.7681581633923458
2019	elegans	0.7210198185412338
2002	crispr	0.0
2006	crispr	0.0015724506643604059
2017	crispr	0.550221464139316
2018	crispr	0.689723861457633
2019	crispr	0.8920877895483805


In [None]:
# save the two arrays (abstracts and years) as plain pickle files

with open('abstracts.pickle', 'wb') as f:
    pickle.dump(abstracts, f, pickle.HIGHEST_PROTOCOL)
with open('years.pickle', 'wb') as f:
    pickle.dump(years, f, pickle.HIGHEST_PROTOCOL)

# read them straight back in
# NOTE: pickle.load can execute arbitrary code -- only load files written by this notebook
with open('abstracts.pickle', 'rb') as f:
    abstracts = pickle.load(f) 

with open('years.pickle', 'rb') as f:
    years = pickle.load(f)     

# show a small sample only; printing abstracts[2] in full dumps every abstract of that file
print(abstracts[2][:3])