In [None]:
# Import Packages
import pandas as pd
import requests
from io import BytesIO
from urllib.request import urlopen
import glob
from zipfile import ZipFile
import os
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

In [None]:
# Remove CSVs left over from a previous run so they are not picked up again.
# The "/" before "*.csv" matters: without it glob looks for files *next to*
# the download folder whose names start with "federal_spending_files", and
# never looks inside the folder the archives are extracted into.
pattern = r"/Users/jack.mccormick/Documents/federal_spending_files/*.csv"

# The pattern contains no "**", so recursive matching is not needed.
for item in glob.iglob(pattern):
    os.remove(item)  # delete stale file

# retrieve all agency codes
endpoint = '/api/v2/bulk_download/list_agencies/'
url = f'https://api.usaspending.gov{endpoint}'

payload = {
    "type": "award_agencies",
    "agency": 0
}

# Fail fast: the timeout keeps a hung connection from blocking the notebook,
# and raise_for_status() reports an HTTP error here instead of letting it show
# up below as a JSON decoding error or a KeyError.
response = requests.post(url, json=payload, timeout=60)
response.raise_for_status()
agency_json = response.json()

# Keep only the ids of the agencies listed under "cfo_agencies".
agency_list = [agency['toptier_agency_id'] for agency in agency_json['agencies']['cfo_agencies']]

# download all files by agency -- settings used by the download step

# Folder the extracted CSV files are written to.
file_location = '/Users/jack.mccormick/Documents/federal_spending_files'

# Endpoint that lists the monthly archive files, and its full URL.
download_endpoint = '/api/v2/bulk_download/list_monthly_files/'
download_url = f'https://api.usaspending.gov{download_endpoint}'

# Award categories requested for every agency.
award_types = ['assistance', 'contracts']

# Base request body, filled with the current calendar year.
# NOTE(review): the field is named "fiscal_year"; confirm that the calendar
# year is the intended value late in the year.
download_payload = dict(fiscal_year=datetime.today().year)

def download_files(x, y):
    """Download and extract this year's monthly archives for one agency and award type.

    Posts to ``download_url``, then for every listed file whose ``fiscal_year``
    equals the requested one, downloads the zip and extracts it into
    ``file_location``.

    Parameters
    ----------
    x
        Agency id sent as the ``"agency"`` field of the request.
    y
        Award type sent as the ``"type"`` field of the request.

    Returns
    -------
    None
        Any exception is caught and printed, so one failing agency does not
        stop the other downloads.
    """
    try:
        # Build a payload per call. This function runs on several worker
        # threads at once; writing "agency"/"type" into the shared
        # module-level dict let one thread overwrite another thread's values
        # before its request was sent.
        payload = {**download_payload, "agency": x, "type": y}
        with requests.Session() as session:
            response = session.post(download_url, json=payload)
            # Report an HTTP failure as such rather than as a JSON/KeyError.
            response.raise_for_status()
            data = response.json()
            for entry in data['monthly_files']:
                # Only keep files for the fiscal year that was requested.
                if entry['fiscal_year'] != payload['fiscal_year']:
                    continue
                with urlopen(entry['url']) as zipresp:
                    # The archive is read fully into memory before extraction.
                    with ZipFile(BytesIO(zipresp.read())) as zfile:
                        zfile.extractall(file_location)
                        print('Downloaded', x, y,'.csv files.')
    except Exception as e:
        print(f"Error occurred while processing {x} {y}: {e}")

# Submit one download task per (agency, award type) pair to a pool of five
# worker threads; leaving the with-block waits for the submitted tasks.
with ThreadPoolExecutor(max_workers=5) as pool:
    for agency_id, award_type in ((a, t) for a in agency_list for t in award_types):
        pool.submit(download_files, agency_id, award_type)

# concat all assistance and contract files
def create_files(type, directory=None) -> pd.DataFrame:
    """Load every matching CSV in a folder and stack them into one DataFrame.

    Parameters
    ----------
    type : str
        Text that must appear in the file name; files are matched with the
        glob ``*{type}*.csv``. (The name shadows the builtin but is kept so
        existing keyword callers keep working.)
    directory : str or Path, optional
        Folder to search. Defaults to the module-level ``file_location``.

    Returns
    -------
    pd.DataFrame
        All matching files concatenated row-wise with a fresh integer index.

    Raises
    ------
    ValueError
        If no file matches, with a message naming the pattern and the folder.
    """
    folder = Path(file_location if directory is None else directory)
    # Sort so the row order of the result is the same on every run.
    csv_paths = sorted(folder.glob(f'*{type}*.csv'))
    if not csv_paths:
        raise ValueError(f"No files matching '*{type}*.csv' found in {folder}")
    frames = [pd.read_csv(csv_path, index_col=None, header=0, low_memory=False) for csv_path in csv_paths]
    return pd.concat(frames, axis=0, ignore_index=True)

# Build one combined frame per award category; create_files matches the
# argument against file names as "*{type}*.csv".
assistance_df = create_files(type='Assistance')
contract_df = create_files(type='Contracts')

In [None]:
# Keep assistance rows that have a recipient UEI, a recipient state name and
# a positive federal_action_obligation.
# .copy() makes the result an independent frame: later cells assign columns
# on prime_award_df, and doing that on a filtered slice of assistance_df
# triggers pandas' SettingWithCopyWarning.
prime_award_df = assistance_df[
    assistance_df['recipient_uei'].notna()
    & assistance_df['recipient_state_name'].notna()
    & (assistance_df['federal_action_obligation'] > 0)
].copy()

In [None]:
client_df = pd.read_csv('/Users/Jack/Desktop/databricks_sfdc_accounts.csv')


def _clean_org_name(names):
    """Normalise organisation names so both sides of the join are spelled alike.

    Parameters
    ----------
    names : pd.Series
        String column of organisation names; missing values stay missing.

    Returns
    -------
    pd.Series
        Lower-cased names with a standalone "inc" token, "!" and a
        ", the" / ".the" token removed, "charities [arch]diocese" expanded,
        a doubled "of of" collapsed, and trailing commas/spaces stripped.
    """
    return (
        names.str.lower()
        # \b limits the match to the standalone token, so names that merely
        # contain the letters (e.g. "princeton", "lincoln") are left intact.
        .str.replace(r'[,.]?\binc\b\.?', '', regex=True)
        # Same reason for \b here: ", theater" and "st. theresa" must survive.
        .str.replace(r'!|, the\b', '', regex=True)
        .str.replace(r'[.,]\s*the\b', '', regex=True)
        .str.replace(r'charities\s+archdiocese', 'charities of the archdiocese of', regex=True)
        .str.replace(r'charities\s+diocese of', 'charities of the diocese of', regex=True)
        # Runs after the substitutions above because the archdiocese one turns
        # "archdiocese of x" into "archdiocese of of x". \b and \s+ keep
        # phrases such as "roof of" from being treated as a doubled "of".
        .str.replace(r'\bof\s+of\b', 'of', regex=True)
        .str.rstrip(', ')
    )


# clean org names -- one shared routine so all three columns are treated alike
for column in ('recipient_name', 'recipient_parent_name'):
    prime_award_df[column] = _clean_org_name(prime_award_df[column])

client_df['name'] = _clean_org_name(client_df['name'])

# convert dates to datetime
for date_column in ('period_of_performance_start_date', 'period_of_performance_current_end_date'):
    prime_award_df[date_column] = pd.to_datetime(prime_award_df[date_column])


# clean city/state names and clean zip codes

def _zero_pad_zip(zip_values):
    """Turn a zip-code column into integer-based strings left-padded with "0" to width 5.

    Values that cannot be parsed as numbers become 0 and therefore "00000".
    """
    as_int = pd.to_numeric(zip_values, errors='coerce').fillna(0).astype(int)
    return as_int.apply(lambda code: '{0:0>5}'.format(code))


# NOTE(review): the target names below ('client_state', 'client_city',
# 'client_zip_code') are not the ones used further down ('zip_code_id') or in
# the merge cells ('billing_state', 'billing_city') -- confirm the CSV headers.
client_df = client_df.rename(columns={
    'Billing State/Province' : 'client_state',
    'Billing City' : 'client_city',
    'Unique ID' : 'id',
    'Account Name' : 'account_name',
    'Zip Code': 'client_zip_code'
})

# Title-case the award-side city and state.
for place_column in ('recipient_city_name', 'recipient_state_name'):
    prime_award_df[place_column] = prime_award_df[place_column].str.title()

# Apply the same zip formatting to both data sets.
prime_award_df['recipient_zip_code'] = _zero_pad_zip(prime_award_df['recipient_zip_code'])
client_df['zip_code_id'] = _zero_pad_zip(client_df['zip_code_id'])

In [None]:
# Join SFDC accounts onto USA Spending Award data
# Inner join on name + state + city, so only accounts present in both
# frames are kept. (The commas after left_on/right_on were missing, which
# made this cell a SyntaxError.)

prime_award_merged_df = client_df.merge(
    prime_award_df,
    left_on = ['name', 'billing_state', 'billing_city'],
    right_on = ['recipient_name', 'recipient_state_name', 'recipient_city_name'],
    suffixes = ['_left', '_right'],
    how = 'inner'
)

# Map each account id to the award-side UEI; when an id appears on several
# rows the dict keeps the value from the last one.
# NOTE(review): 'recipient_uei_right' only exists if client_df also has a
# 'recipient_uei' column (suffixes are applied to overlapping names only) --
# confirm against the CSV.
client_mapping_dict = dict(zip(prime_award_merged_df.id, prime_award_merged_df.recipient_uei_right))
client_mapping_df = pd.DataFrame.from_dict(client_mapping_dict, orient='index').reset_index()

In [None]:
# Join SFDC accounts onto USA Spending contract data (inner join on
# name + state + city).
# NOTE(review): contract_df is merged as loaded -- its name/state/city
# columns do not get the lower-casing, clean-up and title-casing that were
# applied to prime_award_df and client_df. Confirm the keys can match.

contract_merged = pd.merge(
    client_df,
    contract_df,
    how='inner',
    left_on=['name', 'billing_state', 'billing_city'],
    right_on=['recipient_name', 'recipient_state_name', 'recipient_city_name'],
    suffixes=['_left', '_right'],
)

In [None]:
# Report the number of matched rows and the number of distinct UEIs in the
# '_left' and '_right' columns.
# NOTE(review): both UEI lines print the same label although they count
# different columns -- confirm which side each one describes.
print(f'Using Name, City, State, matching current clients onto USA spending produced {contract_merged.shape[0]} matches')
for uei_column in ('recipient_uei_left', 'recipient_uei_right'):
    unique_count = len(contract_merged[[uei_column]].drop_duplicates())
    print(f'USA Spending returns {unique_count} unique accounts')