In [None]:
# Import Packages
import pandas as pd
import requests
from io import BytesIO
from urllib.request import urlopen
import glob
from zipfile import ZipFile
import os

In [None]:

# Retrieve all agency codes
url = 'https://api.usaspending.gov'
endpoint = '/api/v2/bulk_download/list_agencies/'

payload={
    "type": "award_agencies",
    "agency": 0
}
# The timeout stops the cell from hanging forever if the API never answers.
response = requests.post(f"{url}{endpoint}", json=payload, timeout=60)
# Stop here on an HTTP error instead of failing later with a JSON/KeyError
# that hides the real cause.
response.raise_for_status()
agency_json = response.json()

# One toptier_agency_id per entry listed under agencies -> cfo_agencies.
agency_list = [
    agency['toptier_agency_id']
    for agency in agency_json['agencies']['cfo_agencies']
]

In [None]:
# Iterate through all agency codes to retrieve and download all funding data
url = 'https://api.usaspending.gov'
endpoint = '/api/v2/bulk_download/list_monthly_files/'

award_types = ['assistance'] # ADD 'contracts' BACK

# Used both in the request payload and to filter the returned file list,
# so the two can never drift apart.
fiscal_year = 2022
# Folder every downloaded archive is unpacked into.
download_dir = '/Users/jack/PycharmProjects/exploration_of_data/usa_spending_files'

for x in agency_list:
    for y in award_types:
        payload= {
            "agency": x,
            "fiscal_year": fiscal_year,
            "type": y
        }

        # The timeout stops the loop from hanging forever on one agency.
        response = requests.post(f"{url}{endpoint}", json=payload, timeout=60)
        # Surface HTTP errors immediately rather than as a KeyError on
        # 'monthly_files' below.
        response.raise_for_status()
        data = response.json()

        print('Retrieved', str(x), str(y), 'zip file url for download.')

        for i in data['monthly_files']:
            if i['fiscal_year'] == fiscal_year:
                # The whole archive is read into memory, then unpacked from
                # the in-memory buffer.
                with urlopen(i['url']) as zipresp:
                    with ZipFile(BytesIO(zipresp.read())) as zfile:
                        zfile.extractall(download_dir)
                        print('Downloaded', str(x), str(y),'.csv files.')

In [None]:
# Folder that holds the extracted CSV files.
path = '/Users/jack/PycharmProjects/exploration_of_data/usa_spending_files'
csv_files = glob.glob(os.path.join(path, "*.csv"))

# Load each CSV into its own frame; recipient_city_code is read as object
# and low_memory=False makes pandas parse each file in one pass.
li = [
    pd.read_csv(
        csv_file,
        index_col=None,
        header=0,
        dtype={'recipient_city_code': object},
        low_memory=False,
    )
    for csv_file in csv_files
]

# Stack all frames row-wise under a fresh 0..n-1 index.
complete_df = pd.concat(li, axis=0, ignore_index=True)

print('Files downloaded and concatenated', complete_df.shape[0], 'rows returned.')

In [None]:
# Filter for only necessary columns
needed_columns = [
    # identifiers and amounts
    'assistance_transaction_unique_key',
    'assistance_award_unique_key',
    'business_types_code',
    'federal_action_obligation',
    'total_obligated_amount',
    'award_id_fain',
    # dates
    'action_date',
    'action_date_fiscal_year',
    'period_of_performance_start_date',
    'period_of_performance_current_end_date',
    # agencies and programs
    'awarding_agency_name',
    'awarding_sub_agency_name',
    'funding_agency_name',
    'program_activities_funding_this_award',
    # recipient
    'recipient_uei',
    'recipient_duns',
    'recipient_name',
    'recipient_address_line_1',
    'recipient_city_name',
    'recipient_state_name',
    'recipient_zip_code',
    'recipient_parent_uei',
    'recipient_parent_duns',
    'recipient_parent_name',
    # program classification
    'cfda_number',
    'cfda_title',
    'funding_opportunity_number',
]
filtered_complete_df = complete_df[needed_columns]

# Keep only rows whose federal_action_obligation is strictly positive.
positive_obligation = filtered_complete_df['federal_action_obligation'].gt(0)
filtered_complete_df = filtered_complete_df[positive_obligation]

print('Complete dataframe filtered', filtered_complete_df.shape[0], 'rows returned. Ready for Salesforce join.')

In [None]:
# Remove files from folder
pattern = r"/Users/jack/PycharmProjects/exploration_of_data/usa_spending_files/*.csv"

# The pattern has no "**" component, so only .csv files directly inside the
# folder are matched even though recursive=True is passed.
for csv_path in glob.iglob(pattern, recursive=True):
    os.unlink(csv_path)

print('Files removed from folder.')

In [None]:

# Keep only rows that carry both a recipient UEI and a recipient state name.
has_uei = filtered_complete_df['recipient_uei'].notna()
has_state = filtered_complete_df['recipient_state_name'].notna()
# .copy() detaches the result from filtered_complete_df: columns of
# prime_award_df are assigned to further down the notebook, and doing that on
# a filtered slice triggers pandas' SettingWithCopyWarning.
prime_award_df = filtered_complete_df[has_uei & has_state].copy()

print('Award dataframe filtered for null Recipient UEI and State Name,', len(prime_award_df.index), 'rows returned.')

In [None]:
client_df = pd.read_csv('/Users/Jack/Desktop/databricks_sfdc_accounts.csv')

# Clean prime award data for merge

# assign() returns a new frame, so nothing is written into a filtered slice of
# filtered_complete_df (which would raise SettingWithCopyWarning).
# - names are lower-cased, city/state are title-cased
# - performance dates are parsed as YYYY-MM-DD; values that do not parse
#   become NaT because of errors='coerce'
prime_award_df = prime_award_df.assign(
    recipient_name=prime_award_df['recipient_name'].str.lower(),
    recipient_parent_name=prime_award_df['recipient_parent_name'].str.lower(),
    recipient_city_name=prime_award_df['recipient_city_name'].str.title(),
    recipient_state_name=prime_award_df['recipient_state_name'].str.title(),
    period_of_performance_start_date=pd.to_datetime(
        prime_award_df['period_of_performance_start_date'],
        format='%Y-%m-%d',
        errors='coerce',
    ),
    period_of_performance_current_end_date=pd.to_datetime(
        prime_award_df['period_of_performance_current_end_date'],
        format='%Y-%m-%d',
        errors='coerce',
    ),
)
#prime_award_df.loc[:,'total_obligated_amount'] = prime_award_df['total_obligated_amount'].map('{:,.0f}'.format)

def df_name_clean(df_name, col_name):
    """Normalise organisation names in ``df_name[col_name]`` in place.

    Removes corporate-suffix variants ("inc", "incorporated"), exclamation
    marks and ", the"; expands two charity naming variants; collapses the
    resulting "of of"; finally strips trailing commas and spaces.

    Parameters:
        df_name: DataFrame whose column is rewritten.
        col_name: name of a string column in ``df_name``.

    Returns:
        None. The column is overwritten on ``df_name``.
    """
    # Applied strictly in this order: later entries rely on earlier ones
    # (e.g. 'of of' cleans up after the archdiocese expansion).
    replacements = (
        (',incorporated', ''),
        ('!', ''),
        ('.inc.', ''),
        (',inc.', ''),
        (', inc.', ''),
        (' inc.', ''),
        (' inc', ''),
        (',inc', ''),
        (', inc', ''),
        (',, the', ''),
        ('charities archdiocese', 'charities of the archdiocese of'),
        ('charities diocese of', 'charities of the diocese of'),
        ('of of', 'of'),
        (', the', ''),
    )

    cleaned = df_name[col_name]
    for old, new in replacements:
        # regex=False: the patterns are literal text. Several contain '.',
        # which would match any character if treated as a regex (the default
        # in pandas releases before 2.0).
        cleaned = cleaned.str.replace(old, new, regex=False)

    df_name[col_name] = cleaned.str.rstrip(', ')

# NOTE(review): the rename targets below (client_state, client_city,
# account_name, client_zip_code) are not the column names used afterwards
# ('name', 'zip_code_id', and 'billing_state'/'billing_city' in the merges).
# rename() silently ignores source labels that are absent, so this may be a
# no-op — confirm against the CSV's actual headers.
client_df = client_df.rename(columns={
    'Billing State/Province' : 'client_state',
    'Billing City' : 'client_city',
    'Unique ID' : 'id',
    'Account Name' : 'account_name',
    'Zip Code': 'client_zip_code'
})


def _zero_pad_zip(zip_codes):
    """Return ``zip_codes`` as strings left-padded with zeros to 5 characters.

    Values that cannot be parsed as numbers (including missing values) become
    0 and therefore come out as '00000'.
    """
    as_int = pd.to_numeric(zip_codes, errors='coerce').fillna(0).astype(int)
    return as_int.apply(lambda x: '{0:0>5}'.format(x))


client_df['name'] = client_df['name'].str.lower()
# assign() builds a new frame instead of writing into what may still be a
# filtered slice of another frame (avoids SettingWithCopyWarning).
prime_award_df = prime_award_df.assign(
    recipient_zip_code=_zero_pad_zip(prime_award_df['recipient_zip_code'])
)
client_df['zip_code_id'] = _zero_pad_zip(client_df['zip_code_id'])

df_name_clean(client_df, 'name')
df_name_clean(prime_award_df, 'recipient_name')

In [None]:
# Join SFDC accounts onto USA Spending Award data

# Key columns are paired positionally (name/state/city); the zip-code pair is
# left disabled, as in the trailing comments.
client_keys = ['name', 'billing_state', 'billing_city']#, 'zip_code_id']
award_keys = ['recipient_name', 'recipient_state_name', 'recipient_city_name']#, 'recipient_zip_code']

# Inner join: only clients with at least one matching award row survive.
prime_award_merged_df = pd.merge(
    client_df,
    prime_award_df,
    how = 'inner',
    left_on = client_keys,
    right_on = award_keys,
    suffixes = ['_left', '_right'],
)

match_count = prime_award_merged_df.shape[0]
print('Using Name, City, State, matching current clients onto USA spending produced ' + str(match_count) + ' matches')

# NOTE(review): 'recipient_uei_right' only exists if client_df also has a
# 'recipient_uei' column (suffixes apply to overlapping names) — confirm.
unique_uei_count = len(prime_award_merged_df[['recipient_uei_right']].drop_duplicates())
print('USA Spending returns ' + str(unique_uei_count) + ' unique accounts')

In [None]:

# Map each client id to a matched recipient UEI. Because this goes through a
# dict, a client id that appears on several merged rows keeps only the UEI of
# the last such row.
client_mapping_dict = dict(zip(prime_award_merged_df.id, prime_award_merged_df.recipient_uei_right))
client_mapping_df = pd.DataFrame.from_dict(client_mapping_dict, orient='index').reset_index()

# Both conditions are evaluated on complete_df itself. The previous mask took
# the UEI test from filtered_complete_df, whose index is only a subset of
# complete_df's, so it depended on implicit index alignment to line up.
has_positive_obligation = complete_df['federal_action_obligation'] > 0
has_recipient_uei = complete_df['recipient_uei'].notna()
contract_df = complete_df[has_positive_obligation & has_recipient_uei]

In [None]:
# Join SFDC accounts onto USA Spending Award data

# Key columns are paired positionally: name, state, city.
contract_client_keys = ['name', 'billing_state', 'billing_city']
contract_award_keys = ['recipient_name', 'recipient_state_name', 'recipient_city_name']

# Inner join of the client accounts with contract_df on those key pairs;
# overlapping non-key column names get the _left/_right suffixes.
contract_merged = pd.merge(
    client_df,
    contract_df,
    how = 'inner',
    left_on = contract_client_keys,
    right_on = contract_award_keys,
    suffixes = ['_left', '_right'],
)

In [None]:
# Report how many rows the client/contract join produced.
contract_match_count = len(contract_merged.index)
print('Using Name, City, State, matching current clients onto USA spending produced ' + str(contract_match_count) + ' matches')

# Count distinct UEIs on each side of the join: first the client-side column
# (_left), then the award-side column (_right).
for uei_column in ('recipient_uei_left', 'recipient_uei_right'):
    unique_uei_count = contract_merged[[uei_column]].drop_duplicates().shape[0]
    print('USA Spending returns ' + str(unique_uei_count) + ' unique accounts')