# Capital Spending Project Feasibility Assessment: Exploratory Analysis

## Step 0: Setting up and importing data

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import re
import datetime
import matplotlib.font_manager
import time
from thefuzz import fuzz
from thefuzz import process
import fuzzymatcher
from matplotlib import ticker as ticker
from ftfy import fix_text 
from sklearn.feature_extraction.text import TfidfVectorizer
#import nmslib
from scipy.sparse import csr_matrix
from scipy.sparse import rand
import sparse_dot_topn.sparse_dot_topn as ct

In [26]:
# The Checkbook NYC citywide-agency spending export is split across four CSV parts.
file_names = [
    'checkbooknyc/Citywide-Agencies/spending_29351_05312023_040004_part_0.csv',
    'checkbooknyc/Citywide-Agencies/spending_29351_05312023_040004_part_1.csv',
    'checkbooknyc/Citywide-Agencies/spending_29351_05312023_040004_part_2.csv',
    'checkbooknyc/Citywide-Agencies/spending_29351_05312023_040004_part_3.csv',
]

# Read the parts in order and stack them into a single spending table.
# Each part keeps its own row labels, so labels may repeat in `df`.
df0, df1, df2, df3 = (pd.read_csv(part_path) for part_path in file_names)
frames = [df0, df1, df2, df3]
df = pd.concat(frames)

# Facility records that the spending rows are later linked against.
facdb = pd.read_csv('facilities.csv')

## Step 1: Data Cleaning

In [27]:
# Parse the date-like columns and upper-case the text columns used for matching.
df = df.assign(**{
    'Issue Date': pd.to_datetime(df['Issue Date']),
    'Fiscal year': pd.to_datetime(df['Fiscal year'], format='%Y'),
    'Agency': df['Agency'].str.upper(),
    'Budget Code': df['Budget Code'].str.upper(),
    'Contract Purpose': df['Contract Purpose'].str.upper(),
    'Spending Category': df['Spending Category'].str.upper(),
})

# Keep only rows that have a budget code, whose amount is not the 99999999
# value, and whose amount is non-negative.
df = df[
    df['Budget Code'].notna()
    & df['Check Amount'].ne(99999999)
    & df['Check Amount'].ge(0)
]

In [28]:
# Columns that are dropped before matching on facility name and address.
unimportant_cols = [
    'addressnum', 'streetname', 'city', 'zipcode',
    'boro', 'borocode', 'bin', 'bbl', 'cd', 'nta2010', 'nta2020', 'council',
    'schooldist', 'policeprct', 'ct2010', 'ct2020', 'servarea', 'opname',
    'opabbrev', 'optype', 'overabbrev', 'overlevel', 'capacity', 'captype',
]

# Drop those columns, upper-case the two match keys, and discard facilities
# without a name or geometry. Rows with a missing address are kept on purpose:
# the facility name is the more important key.
facdb = (
    facdb
    .drop(columns=unimportant_cols)
    .assign(
        facname=lambda fac: fac['facname'].str.upper(),
        address=lambda fac: fac['address'].str.upper(),
    )
    .dropna(subset=['facname', 'geom'])
)

## Fuzzy String Matching

In [29]:
# bpcl = facdb[facdb['facname']=='BATTERY PARK CITY LIBRARY']
# bpcl_df = pd.concat([df.iloc[[129994]], df.iloc[[243370]]])

# df_joined = fuzzymatcher.fuzzy_left_join(bpcl_df, bpcl, left_on = "Budget Code", right_on="facname")
# num_records = len(df_joined)
# correct_binary = (df_joined["facname"] == df_joined["Budget Code"])
# perc_correct = correct_binary.sum()/num_records

In [30]:
# brute force regex
# regex_pattern = '(\d+\s+AVE|\d+\s+STREET|BLOCK|LOT|\d+\s+ST|\d+\s+AVENUE)+'
# pattern_results = df['Budget Code'].apply(lambda x: re.findall(regex_pattern, str(x))[0] if re.findall(regex_pattern, str(x)) != [] else None)

# df['Street Info'] = pattern_results

In [31]:
# num_mentions = df[df['Street Info'].notnull()].shape[0]
# print("Proportion of all rows from Checkbook NYC with street mentions: {}".format(num_mentions/df.shape[0]))

# Department of Education: A very bad example of record linkage

In [32]:
# Identify facilities: spending rows issued by the Department of Education ...
doe_df = df.loc[df['Agency'] == 'DEPARTMENT OF EDUCATION']

# ... and the facility categories treated as education-related.
overagency = ['NYC Department of Education']
facgroup = [
    'DAY CARE AND PRE-KINDERGARTEN', 'YOUTH SERVICES', 'CAMPS',
    'VOCATIONAL AND PROPRIETARY SCHOOLS', 'SCHOOLS (K-12)', 'LIBRARIES',
    'HIGHER EDUCATION',
]
facsubgrp = [
    'AFTER-SCHOOL PROGRAMS', 'PROPRIETARY SCHOOLS', 'CHARTER K-12 SCHOOLS',
    'PUBLIC K-12 SCHOOLS', 'NON-PUBLIC K-12 SCHOOLS',
    'PUBLIC AND PRIVATE SPECIAL EDUCATION SCHOOLS',
    'GED AND ALTERNATIVE HIGH SCHOOL EQUIVALENCY',
    'PRESCHOOLS FOR STUDENTS WITH DISABILITIES',
    'HEAD START', 'COLLEGES OR UNIVERSITIES', 'DOE UNIVERSAL PRE-KINDERGARTEN',
]
facdomain = ['EDUCATION, CHILD WELFARE, AND YOUTH']

# A facility is selected when ANY of its four classification fields matches;
# only the name and address columns are carried forward.
doe_facdb = facdb.loc[
    facdb['overagency'].isin(overagency)
    | facdb['facgroup'].isin(facgroup)
    | facdb['facsubgrp'].isin(facsubgrp)
    | facdb['facdomain'].isin(facdomain),
    ['facname', 'address'],
]

In [33]:
start = time.time()

# Fuzzy left join of DOE budget codes against facility names; keep rows whose
# best match score is above 0.3.
matched_results_facname = fuzzymatcher.fuzzy_left_join(
    doe_df, doe_facdb, left_on='Budget Code', right_on='facname'
)
matched_results_facname = matched_results_facname.loc[
    lambda joined: joined['best_match_score'] > 0.3
]

# Same join and threshold, this time against facility addresses.
matched_results_address = fuzzymatcher.fuzzy_left_join(
    doe_df, doe_facdb, left_on='Budget Code', right_on='address'
)
matched_results_address = matched_results_address.loc[
    lambda joined: joined['best_match_score'] > 0.3
]

# Stack the name-based and address-based matches into one table.
combined_results = pd.concat([matched_results_facname, matched_results_address])

end = time.time()
print("Time elapsed: {}".format(end-start))

Time elapsed: 27.471531629562378


In [52]:
# Summary statistics for the DOE record-linkage attempt.
# The "Percent ..." figures are printed as fractions of 1 (0.22 means 22%).
# NOTE(review): combined_results stacks the name-based and the address-based
# matches, so a check that matched on both appears (and is counted) twice.

# Vectorized Series.sum() instead of the Python builtin sum(), which walks the
# 1.7M-row column one element at a time.
doe_check_total = doe_df['Check Amount'].sum()
all_check_total = df['Check Amount'].sum()
doe_matched_total = combined_results['Check Amount'].sum()

print("Number of checks by DOE: {}".format(doe_df.shape[0]))
print("Percent of total $ disbursed by DOE: {}".format(doe_check_total/all_check_total))
print("Number of matches on DOE checks using fuzzy string matching: {}".format(combined_results.shape[0]))
print("Percent of records matched: {}".format(combined_results.shape[0]/doe_df.shape[0]))
# Matched dollars divided by DOE dollars (the original divided each element by
# the total inside the sum because of a misplaced parenthesis).
print("Percent of total money disbursed by DOE matched: {}".format(doe_matched_total/doe_check_total))

# Fixed seed so a Restart & Run All shows the same two example rows.
doe_sample = combined_results.sample(2, random_state=0)
doe_sample.loc[:, ['best_match_score', 'Agency', 'Budget Code', 'facname', 'address', 'Check Amount', 'Issue Date']]

Number of checks by DOE: 1500
Percent of total $ disbursed by DOE: 0.2235927005877458
Number of matches on DOE checks using fuzzy string matching: 36
Percent of records matched: 0.024
Percent of total money disbursed by DOE matched: 0.00286363600113958


Unnamed: 0,best_match_score,Agency,Budget Code,facname,address,Check Amount,Issue Date
211349,0.511739,DEPARTMENT OF EDUCATION,"CS04 (BRONX CHARTER SCHOOL FOR THE ARTS, 950-9)",BRONX CHARTER SCHOOL FOR THE ARTS,950 LONGFELLOW AVENUE,494000.0,2012-01-05
214229,0.664189,DEPARTMENT OF EDUCATION,"CS14 (SUCCESS ACADEMY JAMAICA CHARTER SCHOOL,)",SUCCESS ACADEMY CHARTER SCHOOL - SOUTH JAMAICA,120-27 141 STREET,300000.0,2016-01-06


# another department

In [None]:
# Identify facilities.
# NOTE(review): this cell is unfinished -- it still selects the Department of
# Education rows and the DOE overseeing-agency list rather than another agency.
doe_df = df.loc[df['Agency'] == 'DEPARTMENT OF EDUCATION']

overagency = ['NYC Department of Education']

# SCA as a bad example 


In [35]:
# sca_df = df[df['Agency']=='SCHOOL CONSTRUCTION AUTHORITY']

# # Perform fuzzy matching and left join on 'facname'
# sca_matched_results_facname = fuzzymatcher.fuzzy_left_join(sca_df, doe_facdb, left_on='Budget Code', right_on='facname')
# sca_matched_results_facname = sca_matched_results_facname[sca_matched_results_facname['best_match_score'] > 0.3]

# # Perform fuzzy matching and left join on 'address'
# sca_matched_results_address = fuzzymatcher.fuzzy_left_join(sca_df, doe_facdb, left_on='Budget Code', right_on='address')
# sca_matched_results_address = sca_matched_results_address[sca_matched_results_address['best_match_score'] > 0.3]


# # Combine the matched results from both joins
# sca_combined_results = pd.concat([sca_matched_results_facname, sca_matched_results_address])

In [36]:
# Group individual checks into one row per (capital project, fiscal year).
print(df.shape[0])
print(df.columns)

projects = (
    df
    .groupby(['Capital Project', 'Fiscal year'])
    .agg({'Check Amount': 'sum', 'Agency': 'first', 'Budget Code': 'first'})
)

# Per-agency view of the grouped projects, keyed by agency name.
agencies = df['Agency'].unique()
agency_dict = {a: projects[projects['Agency'] == a] for a in agencies}

1742833
Index(['Agency', 'Associated Prime Vendor', 'Budget Code', 'Capital Project',
       'Check Amount', 'Contract ID', 'Contract Purpose', 'Department',
       'Document ID', 'Emerging Business', 'Expense Category', 'Fiscal year',
       'Industry', 'Issue Date', 'M/WBE Category', 'Payee Name',
       'Spending Category', 'Sub Contract Reference ID', 'Sub Vendor',
       'Woman Owned Business'],
      dtype='object')


In [68]:
# Distinct values of each facility classification level.
all_facdomains = facdb['facdomain'].unique()
all_facgroups = facdb['facgroup'].unique()
all_facsubgrps = facdb['facsubgrp'].unique()

# Domains and groups are short enough to print in full; sub-groups are only counted.
print(all_facdomains)
print(all_facgroups)
print(len(all_facsubgrps))

['EDUCATION, CHILD WELFARE, AND YOUTH' 'HEALTH AND HUMAN SERVICES'
 'PUBLIC SAFETY, EMERGENCY SERVICES, AND ADMINISTRATION OF JUSTICE'
 'PARKS, GARDENS, AND HISTORICAL SITES'
 'CORE INFRASTRUCTURE AND TRANSPORTATION'
 'LIBRARIES AND CULTURAL PROGRAMS' 'ADMINISTRATION OF GOVERNMENT']
['CHILD SERVICES AND WELFARE' 'YOUTH SERVICES'
 'DAY CARE AND PRE-KINDERGARTEN' 'CAMPS' 'HUMAN SERVICES' 'HEALTH CARE'
 'ADULT SERVICES' 'JUSTICE AND CORRECTIONS' 'HISTORICAL SITES'
 'TRANSPORTATION' 'VOCATIONAL AND PROPRIETARY SCHOOLS' 'PARKS AND PLAZAS'
 'CULTURAL INSTITUTIONS' 'SOLID WASTE' 'SCHOOLS (K-12)' 'LIBRARIES'
 'PUBLIC SAFETY' 'OFFICES, TRAINING, AND TESTING' 'HIGHER EDUCATION'
 'CITY AGENCY PARKING, MAINTENANCE, AND STORAGE' 'WATER AND WASTEWATER'
 'EMERGENCY SERVICES' 'MATERIAL SUPPLIES AND MARKETS' 'OTHER PROPERTY'
 'TELECOMMUNICATIONS']
71


In [60]:
# Grouped projects belonging to the Department of Parks and Recreation.
parks_df = agency_dict['DEPARTMENT OF PARKS AND RECREATION']

# Facility categories treated as parks-related.
parks_groups = ['HISTORICAL SITES', 'PARKS AND PLAZAS',
                'CULTURAL INSTITUTIONS', 'CITY AGENCY PARKING, MAINTENANCE, AND STORAGE']
parks_domains = ['PARKS, GARDENS, AND HISTORICAL SITES']
parks_subgroups = ['PARKS', 'RECREATION AND WATERFRONT SITES', 'OTHER CULTURAL INSTITUTIONS',
                   'GARDENS', 'MAINTENANCE AND GARAGES', 'PRESERVES AND CONSERVATION AREAS']
parks_overagencies = [
    'NYC Department of Parks and Recreation',
    'The New York State Office of Parks, Recreation, and Historic Preservation',
]

# A facility is selected when its overseeing agency is one of the two parks
# agencies OR any of its classification fields is parks-related.
# The earlier throw-away assignment to parks_facdb was removed: it was
# overwritten immediately, and its unparenthesised `a | s == '...'` parsed as
# `(a | s) == '...'` because `|` binds tighter than `==`.
parks_facdb = facdb.loc[
    facdb['overagency'].isin(parks_overagencies)
    | facdb['facgroup'].isin(parks_groups)
    | facdb['facsubgrp'].isin(parks_subgroups)
    | facdb['facdomain'].isin(parks_domains),
    ['facname', 'address', 'geom'],
]

In [61]:
# Row counts: parks projects, then candidate parks facilities.
print(len(parks_df))
print(len(parks_facdb))

17686
5987


In [54]:
start = time.time()

# Fuzzy left join of parks budget codes against facility names; keep rows
# whose best match score is above 0.3.
parks_matched_results_facname = fuzzymatcher.fuzzy_left_join(
    parks_df, parks_facdb, left_on='Budget Code', right_on='facname'
)
parks_matched_results_facname = parks_matched_results_facname.loc[
    lambda joined: joined['best_match_score'] > 0.3
]

# Same join and threshold against facility addresses.
parks_matched_results_address = fuzzymatcher.fuzzy_left_join(
    parks_df, parks_facdb, left_on='Budget Code', right_on='address'
)
parks_matched_results_address = parks_matched_results_address.loc[
    lambda joined: joined['best_match_score'] > 0.3
]

# Stack the name-based and address-based matches into one table.
parks_combined_results = pd.concat([parks_matched_results_facname, parks_matched_results_address])

end = time.time()

print("time elapsed: {}".format(end-start))

time elapsed: 204.24222993850708


In [63]:
# Summary statistics for the parks record-linkage attempt.
# The "Percent ..." figures are printed as fractions of 1 (0.04 means 4%).
# NOTE(review): parks_combined_results stacks the name-based and the
# address-based matches, so a row that matched on both is counted twice.

# Vectorized Series.sum() instead of the Python builtin sum(), which walks the
# columns one element at a time.
parks_check_total = parks_df['Check Amount'].sum()
all_check_total = df['Check Amount'].sum()
parks_matched_total = parks_combined_results['Check Amount'].sum()

print("Sum of checks issued by Department of Parks and Recreation: {}".format(parks_check_total))
print("Percent of total $ disbursed by Department of Parks and Recreation: {}".format(parks_check_total/all_check_total))
print("Number of matches on Department of Parks and Recreation checks using fuzzy string matching: {}".format(parks_combined_results.shape[0]))
print("Percent of money matched: {}".format(parks_matched_total/parks_check_total))

# Fixed seed so a Restart & Run All shows the same two example rows.
parks_sample = parks_combined_results.sample(2, random_state=0)
parks_sample.loc[:, ['best_match_score', 'Agency', 'Budget Code', 'facname', 'address', 'Check Amount']]

Sum of checks issued by Department of Parks and Recreation: 5151897130.099985
Percent of total $ disbursed by Department of Parks and Recreation: 0.038550327739470175
Number of matches on Department of Parks and Recreation checks using fuzzy string matching: 583
Percent of money matched: 0.035950712939876894


Unnamed: 0,best_match_score,Agency,Budget Code,facname,address,Check Amount
842539,0.421625,DEPARTMENT OF PARKS AND RECREATION,"MEA3 (FLUSHING MEADOWS CORONA PARK, QNS: HABIT)",FLUSHING MEADOWS CORONA PARK,,5435.68
872733,0.332607,DEPARTMENT OF PARKS AND RECREATION,"JVBC (JUNIPER VALLEY PARK, QNS: RECONST EXISTI)",JUNIPER VALLEY PARK,71-01 JUNIPER BLVD SOUTH,6275.0


# how can we make this faster?

Ideas to try: smarter deduplication, n-grams, grouping, and k-nearest-neighbours (KNN) search.

In [64]:
def ngrams(string, n=10):
    """Normalise ``string`` and split it into overlapping character n-grams.

    Normalisation steps, in order: repair the text with ``fix_text``, drop
    non-ASCII characters, remove bracket and apostrophe characters, turn commas
    and hyphens into spaces, upper-case, collapse runs of spaces, pad with one
    space on each side, and finally delete any remaining ``, - . /`` characters
    as well as a whitespace character followed by ``BD``.

    Parameters
    ----------
    string : str
        Text to tokenise.
    n : int, default 10
        Length of each n-gram.

    Returns
    -------
    list of str
        Every window of ``n`` consecutive characters of the normalised text,
        left to right; empty when the text is shorter than ``n``.
    """
    cleaned = fix_text(string).encode('ascii', errors='ignore').decode()

    # Strip bracket/quote characters (the pattern is case-independent).
    chars_to_remove = [")", "(", "[", "]", "{", "}", "'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    cleaned = re.sub(rx, '', cleaned)

    # Separators become spaces; the text is ASCII-only here, so one upper()
    # gives the final casing.
    cleaned = cleaned.replace(',', ' ').replace('-', ' ').upper()

    # Single spaces only, with exactly one space of padding on each side.
    cleaned = ' ' + re.sub(' +', ' ', cleaned).strip() + ' '
    cleaned = re.sub(r'[,-./]|\sBD', r'', cleaned)

    if n <= 0:
        return []
    window_count = len(cleaned) - n + 1
    return [cleaned[offset:offset + n] for offset in range(window_count)]

In [65]:
# Distinct, non-missing facility names and addresses to vectorise.
facdb_facnames = list(facdb['facname'].dropna().unique())
facdb_addresses = list(facdb['address'].dropna().unique())

# Each corpus gets its own vectorizer. Calling fit_transform twice on a single
# vectorizer re-learns the vocabulary, which would leave the address matrix in
# a feature space the vectorizer can no longer reproduce.
address_vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
facdb_addresses_tf_idf_matrix = address_vectorizer.fit_transform(facdb_addresses)

# `vectorizer` stays fitted on the facility names, as it was after the original
# cell ran, so code that calls vectorizer.transform(...) behaves as before.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
facdb_facnames_tf_idf_matrix = vectorizer.fit_transform(facdb_facnames)

print(facdb_addresses_tf_idf_matrix.shape)
print(facdb_facnames_tf_idf_matrix.shape)

(17092, 79955)
(24979, 226195)


In [67]:
# Project every distinct budget code with the already-fitted `vectorizer`.
df_budgetcodes = df['Budget Code'].unique().tolist()
df_tf_idf_matrix = vectorizer.transform(df_budgetcodes)

In [69]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Sparse product ``A @ B`` computed by ``ct.sparse_dot_topn``.

    Both operands are converted to CSR, output buffers for at most ``ntop``
    entries per row of ``A`` are allocated, and ``ct.sparse_dot_topn`` fills
    them in place.

    Parameters
    ----------
    A : scipy sparse matrix, shape (M, K)
    B : scipy sparse matrix, shape (K, N)
    ntop : int
        Forwarded to ``sparse_dot_topn``; the output buffers hold ``M * ntop``
        entries. Presumably the number of best results kept per row -- see the
        sparse_dot_topn documentation.
    lower_bound : float, default 0
        Forwarded to ``sparse_dot_topn``. Presumably the minimum score an
        entry needs in order to be kept -- see the sparse_dot_topn documentation.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (M, N)

    Raises
    ------
    ValueError
        If the inner dimensions of ``A`` and ``B`` differ.
    """
    A = A.tocsr()
    B = B.tocsr()
    M, inner_a = A.shape
    inner_b, N = B.shape
    # Guard before handing raw index arrays to the compiled routine.
    if inner_a != inner_b:
        raise ValueError(
            "dimension mismatch: A is {} but B is {}".format(A.shape, B.shape)
        )

    idx_dtype = np.int32
    nnz_max = M*ntop
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        # B's own column indices; the original passed A.indices here.
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))

In [70]:
# ntop=10 and lower_bound=0.85 are forwarded to sparse_dot_topn.
# NOTE(review): both operands are the budget-code matrix, so this scores budget
# codes against each other, not against the facility names -- confirm intent.
matches = awesome_cossim_top(df_tf_idf_matrix, df_tf_idf_matrix.T, ntop=10, lower_bound=0.85)

: 

: 

In [47]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn the non-zero entries of a sparse score matrix into a table of pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Entry ``(i, j)`` is the score between ``name_vector[i]`` and
        ``name_vector[j]``.
    name_vector : sequence
        Labels looked up by position for both the row and the column index.
    top : int or None, default 100
        Maximum number of pairs to return, taken in the matrix's stored order.
        A falsy value (``None`` or ``0``) returns every pair. Values larger
        than the number of available pairs are clamped instead of raising
        ``IndexError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similarity``.
    """
    # Read rows, columns and scores from one COO view and drop explicitly
    # stored zeros from all three together, so the scores stay aligned with
    # their coordinates.
    coo = sparse_matrix.tocoo()
    is_nonzero = coo.data != 0
    sparserows = coo.row[is_nonzero]
    sparsecols = coo.col[is_nonzero]
    scores = coo.data[is_nonzero]

    nr_matches = min(top, sparsecols.size) if top else sparsecols.size

    # Positional lookup of all labels at once instead of a Python loop.
    names = np.asarray(name_vector, dtype=object)
    return pd.DataFrame({
        'left_side': names[sparserows[:nr_matches]],
        'right_side': names[sparsecols[:nr_matches]],
        # The key was 'similarity:' (stray colon), which made the column
        # unreachable under the name 'similarity'.
        'similarity': np.asarray(scores[:nr_matches], dtype=float),
    })

In [48]:
# matches_df = get_matches_df(matches, facdb_facnames, top=1000)
# #matches_df = matches_df[matches_df['similarity']<0.99999]
# matches_df.sample(20)