In [1]:
import requests
import re
from lxml import etree
import xml.etree.ElementTree as et
from time import sleep
from tqdm import tqdm
from itertools import chain
import pymysql
import pandas as pd
import numpy as np
import os

In [2]:
import string
def remove_punctuation(text):
    """ Removing punctuation """
    try: # python 2.x
        text = text.translate(None, string.punctuation) 
    except: # python 3.x
        translator = text.maketrans('', '', string.punctuation)
        text = text.translate(translator)
    return text

In [3]:
class MySQLPipline(object):
    """ Define Class for MySQL Connection"""
    def __init__(self):
        """ Initialize object """
        self.conn = pymysql.connect(
            host = 'localhost', port = 3306, user = 'root', passwd = 'Kyle9975', db = 'funding', charset = 'utf8')
        self.conn.autocommit(True)
        self.cursor = self.conn.cursor()

    def process_Query(self, sql, colnames):
        """ Processing the SQL query"""
        sql = sql
        self.cursor.execute(sql)
        data = self.cursor.fetchall()
        data = pd.DataFrame(data)
        data.columns = colnames
        return data

    def close_Conn(self):
        """ Closing Connection """
        self.cursor.close()
        self.conn.close()

# REST API

- **Endpoint:** An Endpoint is a specific address (for example, https://weather-in-london.com/forecast), by referring to which you get access to certain features/data (in our case – the weather forecast for London). 

### NIH DATASET

In [None]:
NIH1 = pd.read_csv("../Data/NIH/NIHFundedData1.csv")
NIH2 = pd.read_csv("../Data/NIH/NIHFundedData2.csv")
NIH3 = pd.read_csv("../Data/NIH/NIHFundedData3.csv")
NIH4 = pd.read_csv("../Data/NIH/NIHFundedData4.csv")
NIH5 = pd.read_csv("../Data/NIH/NIHFundedData5.csv")
NIH6 = pd.read_csv("../Data/NIH/NIHFundedData6.csv")

In [None]:
frames = [NIH1, NIH2, NIH3, NIH4, NIH5, NIH6]
df = pd.concat(frames)

In [None]:
df.columns = df.columns.map(lambda x: x.replace(" ", ""))

In [None]:
# df.to_csv("../Data/NIHFundedData.csv")

### NSF DATASET

```bash
sed 's/&/&amp;/g' 1409596.xml
grep -rl -e '\s&\s' --include="*.xml" ./
sed -i '' -e 's/&/&amp;/g' ./*.xml
```

### Find all xml formatted data

In [107]:
path = "/Users/kyle/Documents/IC/ALL_Courses/Final_Projects/Data/NSF/"
subdirnames = []
filenames = []
for (dir, subdir, files) in os.walk(path):
    for names in subdir:
        subdirnames.append(names)
    for name in files:
        if name.split(".")[1] == "xml":
            filenames.append(os.path.join(dir, name))

### Extract info from single xml formatted document

In [154]:
filenames[43942]

'/Users/kyle/Documents/IC/ALL_Courses/Final_Projects/Data/NSF/2019/1927373.xml'

In [187]:
# fileNum = 0
def xml_process(fileNum):
    tree = et.parse(filenames[fileNum])
    root = tree.getroot()
    # all item attributes
    dic = {}
    for elem in root:
        for subelem in elem:
            if subelem.text == '\n':
                for subelem2 in subelem:
                    if subelem2.text == '\n':
                        for subelem3 in subelem2:
                            name = "{}_{}".format(subelem3.tag, subelem2.tag)
                            dic[name] = [subelem3.text]
                    else:
                        name = "{}_{}".format(subelem2.tag, subelem.tag)
                        dic[name] = [subelem2.text]
            else:
                dic[subelem.tag] = [subelem.text]
    dic["FiscalYear"] = filenames[fileNum].split("/")[9]
    if 'DRECONTENT_POR' in dic:
        del dic['DRECONTENT_POR']
    return dic

In [181]:
def getNSFDataset(amt = None, path = "/Users/kyle/Documents/IC/ALL_Courses/Final_Projects/Data/NSF/", verbose = False):
    """ Parse XML file to get all NSF funded data """
    ## Getting all xml filenames we have
    subdirnames = []
    filenames = []
    for (dir, subdir, files) in os.walk(path):
        for names in subdir:
            subdirnames.append(names)
        for name in files:
            if name.split(".")[1] == "xml":
                filenames.append(os.path.join(dir, name))
                
    amt = len(filenames) if amt == None else amt
    
    ## Processing xml files
    for fileNum in tqdm(range(amt)):
        if verbose:
            print("Processing number {}, file {}".format(fileNum, filenames[fileNum].split("/")[10]))
        try:
            tree = et.parse(filenames[fileNum])
            root = tree.getroot()
        except Exception as msg:
            if re.match('no element found', str(msg)):
                print("Loading file {} failed".format(filenames[fileNum].split("/")[10]))
                pass
            else:
                raise ValueError("Parse failed in file number {}, the file path is {}".format(fileNum, filenames[fileNum]))

        ## to iterate the xml formatted 'tree'
        dic = {}
        for elem in root:
            for subelem in elem:
                if subelem.text == '\n':
                    for subelem2 in subelem:
                        if subelem2.text == '\n':
                            for subelem3 in subelem2:
                                name = "{}_{}".format(subelem3.tag, subelem2.tag)
                                dic[name] = [subelem3.text]
                        else:
                            name = "{}_{}".format(subelem2.tag, subelem.tag)
                            dic[name] = [subelem2.text]
                else:
                    dic[subelem.tag] = [subelem.text]
        dic["FiscalYear"] = filenames[fileNum].split("/")[9]
        if 'DRECONTENT_POR' in dic:
            del dic['DRECONTENT_POR'] 
        
        ## Removing punctuations for AwardTitle, AbstractNarration and POR_COPY_TXT_POR columns
        dic['AwardTitle'] = [remove_punctuation(dic['AwardTitle'][0])] if dic['AwardTitle'][0] != None else dic['AwardTitle']
        dic['AbstractNarration'] = [remove_punctuation(dic['AbstractNarration'][0])] if dic['AbstractNarration'][0] != None else dic['AbstractNarration']
        if 'POR_COPY_TXT_POR' in dic:
            dic['POR_COPY_TXT_POR'] = [remove_punctuation(dic['POR_COPY_TXT_POR'][0])] 

        if fileNum == 0:
            df = pd.DataFrame(dic)
        else:
            df = df.append(dic, ignore_index=True)
    return df

In [188]:
def remove_list(x):
    """ Romving list of each elemnt within dataframe """
    for idx in range(len(x)):
        if type(x[idx]) == list:
            x[idx] = x[idx][0]
        else:
            x[idx] = x[idx]
    return "Finished!"

## Extract NSF info from all .xml files

In [183]:
%%time
df = getNSFDataset(path = "/Users/kyle/Documents/IC/ALL_Courses/Final_Projects/Data/NSF/", verbose=False)

100%|██████████| 88696/88696 [20:23:54<00:00,  1.21it/s]         

CPU times: user 8h 15min 2s, sys: 38min 35s, total: 8h 53min 38s
Wall time: 20h 23min 55s





In [189]:
%%time
df.apply(remove_list, axis = 1)
df

CPU times: user 52.6 s, sys: 509 ms, total: 53.1 s
Wall time: 53.5 s


Unnamed: 0,AwardTitle,AGENCY,AwardEffectiveDate,AwardExpirationDate,AwardTotalIntnAmount,AwardAmount,Value_AwardInstrument,Code_Organization,Abbreviation_Directorate,LongName_Directorate,...,FUND_OBLG,POR_COPY_TXT_POR,FiscalYear,AwardInstrument,Institution,Investigator,Organization,ProgramElement,ProgramOfficer,ProgramReference
0,Collaborative Research Research on Learning a...,NSF,09/15/2014,08/31/2020,264262.00,264262,Standard Grant,03010000,MPS,Direct For Mathematical Physical Scien,...,2014~264262,The goals of this project have been to condu...,2014,,,,,,,
1,SI2SSI Collaborative Research A Sustainable In...,NSF,08/01/2015,07/31/2020,1500000.00,1500000,Standard Grant,05090000,CSE,Direct For Computer Info Scie Enginr,...,2015~1500000,In a collaboration with the University of Wis...,2014,,,,,,,
2,Nodal count magnetic potentials and Dirac cone...,,07/01/2014,06/30/2018,196937.00,196937,Standard Grant,03040000,MPS,Direct For Mathematical Physical Scien,...,,,2014,,,,,,,
3,CAREER Research and education on protein foldi...,NSF,06/01/2015,05/31/2021,795544.00,797773,Continuing Grant,08070400,BIO,Direct For Biological Sciences,...,2019~2229,,2014,,,,,,,
4,USNepal Research Planning Visit Impacts of Cl...,,01/15/2015,12/31/2015,30820.00,30820,Standard Grant,01090000,O/D,Office Of The Director,...,,,2014,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88691,Radiation and Transport in QCD Matter,NSF,07/01/2016,06/30/2020,372000.00,439118,Continuing Grant,03010000,MPS,Direct For Mathematical Physical Scien,...,2018~191118,Unraveling the mechanism of the confinement o...,2016,,,,,,,
88692,Anhydrite in Arc Magmas and its Relationship t...,,07/15/2016,06/30/2021,310000.00,310000,Continuing Grant,06030000,GEO,Directorate For Geosciences,...,,,2016,,,,,,,
88693,Collaborative Research Rocky MountainsGreat P...,,05/01/2016,04/30/2019,15495.00,15495,Continuing grant,03040000,MPS,Direct For Mathematical Physical Scien,...,,,2016,,,,,,,
88694,CAREER Global Capital Flows,NSF,03/15/2017,02/28/2022,444584.00,444584,Continuing Grant,04050000,SBE,"Direct For Social, Behav Economic Scie",...,2017~444584,,2016,,,,,,,


In [103]:
# df = df[["AwardTitle", "FiscalYear", "AGENCY", "AwardEffectiveDate", "AwardExpirationDate", 
#     "AwardTotalIntnAmount", "AwardAmount", "AbstractNarration", "MinAmdLetterDate", "MaxAmdLetterDate",
#     "TRAN_TYPE", "CFDA_NUM", "NSF_PAR_USE_FLAG", "FUND_AGCY_CODE", "AWDG_AGCY_CODE", "AwardID", "FUND_OBLG"]]

In [190]:
df.to_csv("../Data/NSF_Funded_Data.csv")