In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
import urllib.request
import urllib.error
import time, itertools
from multiprocessing import Pool

# Setup Functions

In [3]:
# valHTML is the response object returned by a GET (or None when the GET failed).
def findResultsPageMax(valHTML):
    """Return the highest results-page number linked from a search page.

    The href of the first anchor inside every <li> element is collected;
    hrefs containing '/sbirsearch/award/all?page=' are kept and the largest
    trailing integer among them is returned.

    Parameters
    ----------
    valHTML : object exposing the markup as ``.text``, or None
        getResultPage returns None when no attempt succeeded, so None is
        accepted here and treated as "no pagination found".

    Returns
    -------
    int
        The largest page number found, or -1 when valHTML is None or the
        page carries no pagination links.
    """
    if valHTML is None:
        return -1

    valBS = BeautifulSoup(valHTML.text,'lxml')
    tmpURLs = []
    for valItem in valBS.find_all('li'):
        valAnchor = valItem.find('a')
        # <li> elements without an anchor, or anchors without href, are skipped.
        valHref = valAnchor.get('href') if valAnchor is not None else None
        if valHref is not None:
            tmpURLs.append(valHref)
    if not tmpURLs:
        return -1

    tmpAllURLs = pd.Series(tmpURLs, dtype=str)
    # Raw strings: '\?' in a normal string literal is an invalid escape sequence.
    tmpPaged = tmpAllURLs[tmpAllURLs.str.contains(r'/sbirsearch/award/all\?page=')]
    if tmpPaged.empty:
        return -1
    tmpNumbers = pd.to_numeric(tmpPaged.str.extract(r'([0-9]+)$',expand=False)).dropna()
    if tmpNumbers.empty:
        return -1
    return int(tmpNumbers.max())

In [4]:
def getResultPage(valURL,valMaxTries=10,valTimeout=30):
    """GET valURL with requests, retrying until an HTTP 200 response arrives.

    Parameters
    ----------
    valURL : str
        Address to fetch.
    valMaxTries : int, default 10
        Maximum number of requests to issue. Exactly this many attempts are
        made before giving up (previously one fewer was made).
    valTimeout : float or None, default 30
        Seconds passed to requests.get as ``timeout`` so a stalled connection
        counts as a failed attempt instead of blocking forever. None disables
        the timeout.

    Returns
    -------
    The first response whose status_code is 200, or None when every attempt
    raised a request error or returned another status.
    """
    for _ in range(valMaxTries):
        try:
            tmpResponse = requests.get(valURL, timeout=valTimeout)
        except requests.RequestException:
            # Request-level failure (connection, timeout, ...): try again.
            # Other exceptions, including KeyboardInterrupt, propagate.
            continue
        if tmpResponse.status_code == 200:
            return tmpResponse
    return None

In [5]:
def getResultPageQuick(valURL):
    """Fetch valURL once with urllib and report the outcome as a list.

    Unlike getResultPage there is no retry. The result is a plain list,
    which is what the notebook passes back through Pool.map.

    Parameters
    ----------
    valURL : str
        Address to fetch.

    Returns
    -------
    list
        ``[status_code, valURL, body]``. ``body`` is the raw bytes read from
        the response, or None when the request failed. ``status_code`` is the
        value of ``getcode()`` on success, the HTTP error code when an
        HTTPError was raised, and -1 when the request failed with a URLError.
    """
    rtnValHTML = None
    rtnValStatusCode = -1
    try:
        # Read inside the context manager so the connection is always closed.
        with urllib.request.urlopen(valURL) as tmpResponse:
            rtnValStatusCode = tmpResponse.getcode()
            rtnValHTML = tmpResponse.read()
    except urllib.error.HTTPError as e:
        # The handler previously referenced an undefined name `url`.
        rtnValStatusCode = e.code
        print('HTTPError: {}'.format(e.code) + ', ' + valURL)
    except urllib.error.URLError as e:
        # Failures without an HTTP status; keep the -1 sentinel so one bad
        # URL does not abort a whole batch.
        print('URLError: {}'.format(e.reason) + ', ' + valURL)
    return([rtnValStatusCode,valURL,rtnValHTML])

In [6]:
def getResultSBIRLinks(valHTML):
    """Return the href of the first anchor inside every <h3> of a page.

    Parameters
    ----------
    valHTML : str, bytes or None
        Page markup. None or empty markup (what a failed download yields)
        produces an empty list instead of an error.

    Returns
    -------
    list
        The href values in document order. <h3> elements without an anchor,
        or whose anchor has no href, are skipped.
    """
    if not valHTML:
        return []

    valBS = BeautifulSoup(valHTML,'lxml')
    tmpURLs = []
    for valHeading in valBS.find_all('h3'):
        valAnchor = valHeading.find('a')
        if valAnchor is None:
            continue
        valHref = valAnchor.get('href')
        if valHref is not None:
            tmpURLs.append(valHref)
    return tmpURLs

# Setup

In [7]:
# Crawl configuration: first page index, number of pages to walk,
# per-request retry budget and the search URL prefix.
valMin = 0
valNumRun = 150
valRetries = 10
valURLBase = 'https://www.sbir.gov/sbirsearch/award/all?page='

# Probe the first results page for the highest page number it links to.
# This call passes 100 as valMaxTries rather than valRetries.
valMax = findResultsPageMax(
    getResultPage(valURLBase + str(valMin), 100)
)

# Only pages valMin..valNumRun-1 are walked; a stop of valMax+1 would cover
# every page reported by the site.
valRange = range(valMin, valNumRun)

# Get SBIR Results

In [8]:
%%time
tmpResultsPage = []
tmpSBIRLinks = pd.DataFrame()

for i in valRange:
    tmpURL = valURLBase+str(i)
    valSearchResultsPage = getResultPage(tmpURL,valRetries)
    tmpResultsPage.append(valSearchResultsPage)
    valSBIRLinks = getResultSBIRLinks(valSearchResultsPage.text)
    tmpDF = pd.DataFrame({'URLSBIR': valSBIRLinks})
    tmpDF['ResultsPage'] = i
    tmpSBIRLinks = tmpSBIRLinks.append(tmpDF)
    if((i % 1000)==0):
        print("Finished Retrieving "+str(i)+" Pages...")

Finished Retrieving 0 Pages...
CPU times: user 20.4 s, sys: 346 ms, total: 20.7 s
Wall time: 5min 9s


In [9]:
%%time
tmpSBIRLinks['URLSBIRev'] = r'https://www.sbir.gov'+tmpSBIRLinks['URLSBIR']
tmpSBIRLinks['SBIResponse'] = tmpSBIRLinks['URLSBIRev'].apply(lambda x: getResultPage(x,valRetries))
tmpSBIRLinks['HTMLText'] = tmpSBIRLinks['SBIResponse'].apply(lambda x: x.text)

CPU times: user 30.1 s, sys: 2.57 s, total: 32.7 s
Wall time: 50min 52s


In [10]:
# Sanity check: (rows, columns) of the frame built by the sequential crawl.
print(tmpSBIRLinks.shape)

(1500, 5)


In [11]:
# Optional export of the sequentially collected results to Excel (disabled):
#tmpSBIRLinks.to_excel('./SBIRAwards.xlsx')

# Multiprocessing

In [12]:
%%time
valRange = range(valMin,valNumRun)#valMax+1)
tmpURLs = [valURLBase+str(i) for i in list(valRange)]

if __name__ == "__main__":
    valProcessPool = Pool(processes=20)
    tmpResults = valProcessPool.map(getResultPageQuick, tmpURLs)
    tmpDFResults = pd.DataFrame(tmpResults,columns=['StatusCode','URL','Response'])
    valSBIRLinks = tmpDFResults.apply(lambda x: getResultSBIRLinks(x['Response']),axis=1)
    tmpDFSBIRLinks = pd.DataFrame(list(itertools.chain.from_iterable(valSBIRLinks)),columns=['URLSBIR'])

CPU times: user 18.1 s, sys: 311 ms, total: 18.5 s
Wall time: 1min 8s


In [13]:
%%time
if __name__ == "__main__":
    tmpDFSBIRLinks['URLSBIRev'] = r'https://www.sbir.gov'+tmpDFSBIRLinks['URLSBIR']
    tmpSBIRVals = valProcessPool.map(getResultPageQuick, tmpDFSBIRLinks['URLSBIRev'].tolist())
    tmpDFSBIRResults = pd.DataFrame(tmpSBIRVals,columns=['StatusCode','URL','HTMLText'])
    tmpSBIRResponse = pd.merge(left=tmpDFSBIRLinks,right=tmpDFSBIRResults,left_on='URLSBIRev',right_on='URL',how='outer')

CPU times: user 179 ms, sys: 119 ms, total: 297 ms
Wall time: 48.7 s


In [14]:
# Sanity check: (rows, columns) of the merged frame built by the pool-based crawl.
print(tmpSBIRResponse.shape)

(1500, 5)
