In [1]:
# Capital Spending Project: Joining on CPDB
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import re
import datetime
import matplotlib.font_manager
import time

In [2]:
# --- checkbook nyc data
# The Citywide-Agencies spending export is split across four numbered part files.
file_names = [
    '../data/checkbooknyc/Citywide-Agencies/spending_29351_05312023_040004_part_0.csv',
    '../data/checkbooknyc/Citywide-Agencies/spending_29351_05312023_040004_part_1.csv',
    '../data/checkbooknyc/Citywide-Agencies/spending_29351_05312023_040004_part_2.csv',
    '../data/checkbooknyc/Citywide-Agencies/spending_29351_05312023_040004_part_3.csv',
]

# Read each part in listed order; the per-part frames stay reachable by name.
frames = [pd.read_csv(part_path) for part_path in file_names]
df0, df1, df2, df3 = frames

# Stack the parts row-wise. Each part keeps its own row labels, so labels
# are not unique in the combined frame.
df = pd.concat(frames)

In [3]:
# Preview the first rows of the concatenated spending data to sanity-check the load.
df.head()

Unnamed: 0,Agency,Associated Prime Vendor,Budget Code,Capital Project,Check Amount,Contract ID,Contract Purpose,Department,Document ID,Emerging Business,Expense Category,Fiscal year,Industry,Issue Date,M/WBE Category,Payee Name,Spending Category,Sub Contract Reference ID,Sub Vendor,Woman Owned Business
0,Transit Authority,,"BUSB (MTA, NYCT: PURCHASE 126 HYBRID BUSES)",998CAP2024 005,99999999.95,CT199820231408687,Bus Purch Track Replace Switch Replace Rail Su...,400-998-169,20230231715-1-DSB-EFT,No,CAPITAL PURCHASED EQUIPMENT,2023,Standardized Services,2022-12-21,Individuals and Others,METROPOLITAN TRANSPORTATION AUTHORITY,Capital Contracts,,No,No
1,Transit Authority,,"SAPS (MTA / NYCT: SUBWAY ACTION PLAN, SIGNAL A)",998CAPSAP 003,99999999.59,CT199820191409664,Subway Action Plan Signal Improvement & Modern...,400-998-169,20190025861-1-015-MD1,No,IOTB CONSTRUCTION,2019,Standardized Services,2018-10-30,Individuals and Others,METROPOLITAN TRANSPORTATION AUTHORITY,Capital Contracts,,No,No
2,Department of Education,,E706 (IMPLEMENTATION OF THE SEVENTH FIVE-YEAR),040SCA22 030,99999999.0,PON1040SCA20-24MC,,IMPLEMENTATION OF THE SEVENTH FIVE-YEAR,20220448629-1-DSB-EFT,No,ADMINISTRATIVE EXPENSES,2022,,2022-05-27,Individuals and Others,NYC SCHOOL CONSTRUCTION AUTHORITY,Capital Contracts,,No,No
3,Department of Education,,E706 (IMPLEMENTATION OF THE SEVENTH FIVE-YEAR),040SCA21 030,99999999.0,PON1040SCA20-24MC,,IMPLEMENTATION OF THE SEVENTH FIVE-YEAR,20220448626-1-DSB-EFT,No,CONSTRUCTION-BUILDINGS,2022,,2022-05-27,Individuals and Others,NYC SCHOOL CONSTRUCTION AUTHORITY,Capital Contracts,,No,No
4,Department of Education,,E706 (IMPLEMENTATION OF THE SEVENTH FIVE-YEAR),040SCA21 030,99999999.0,PON1040SCA20-24MC,,IMPLEMENTATION OF THE SEVENTH FIVE-YEAR,20220354644-1-DSB-EFT,No,CONSTRUCTION-BUILDINGS,2022,,2022-03-25,Individuals and Others,NYC SCHOOL CONSTRUCTION AUTHORITY,Capital Contracts,,No,No


In [4]:
# limited data cleaning
# Keep checks in the range [0, 99 million): negative checks and checks of
# 99 million or more are excluded, zero-amount checks are kept. Rows with a
# missing amount fail both comparisons and are dropped as well.
check_amount_in_range = (df['Check Amount'] >= 0) & (df['Check Amount'] < 99000000)

# .copy() makes the filtered frame independent of the original so that adding
# a column below is not an assignment on a slice.
df = df[check_amount_in_range].copy()

# New column for joining on CPDB: the capital project code with its trailing
# run of digits (and any whitespace before it) removed,
# e.g. '040SCA13 010' -> '040SCA13'.
# regex=True is required: without it, pandas >= 2.0 treats the pattern as a
# literal string and nothing is stripped.
df['FMS ID'] = df['Capital Project'].str.replace(r'\s*\d+$', '', regex=True)

  df['FMS ID'] = df['Capital Project'].str.replace(r'\s*\d+$','') # new column for joining on CPDB


In [5]:
# Preview the filtered data, which now carries the derived 'FMS ID' join column.
df.head()

Unnamed: 0,Agency,Associated Prime Vendor,Budget Code,Capital Project,Check Amount,Contract ID,Contract Purpose,Department,Document ID,Emerging Business,...,Fiscal year,Industry,Issue Date,M/WBE Category,Payee Name,Spending Category,Sub Contract Reference ID,Sub Vendor,Woman Owned Business,FMS ID
176,Department of Education,,E704 (IMPLEMENTATATION OF THE FIFTH FIVE-YEAR),040SCA13 010,98999000.0,PON1040SCA10-14MC,,IMPLEMENTATION OF THE FIFTH FIVE-YEAR ED,20140165270-1-DSB-EFT,No,...,2014,,2013-11-04,Individuals and Others,NYC SCHOOL CONSTRUCTION AUTHORITY,Capital Contracts,,No,No,040SCA13
177,Department of Education,,E704 (IMPLEMENTATATION OF THE FIFTH FIVE-YEAR),040SCA10 010,98261555.0,PON1040SCA10-14MC,,IMPLEMENTATION OF THE FIFTH FIVE-YEAR ED,20120044911-1-DSB-AD,No,...,2012,,2011-08-04,Individuals and Others,SCHOOL CONSTRUCTION AUTHORITY,Capital Contracts,,No,No,040SCA10
178,SCHOOL CONSTRUCTION AUTHORITY,,E704 (402044E704),040WRAP2010 020,98175810.0,PON104420141435313,,IMPLEMENTATATION OF THE FIFTH FIVE-YEAR,20140149753-1-DSB-AD,No,...,2014,,2014-02-05,Non-M/WBE,ACE AMERICAN INSURANCE CO.,Capital Contracts,,No,No,040WRAP2010
179,SCHOOL CONSTRUCTION AUTHORITY,,E705 (6TH PLAN - BL 2364),040WRAP2015 020,98000000.0,PON104420171733408,,IMPLEMENTATION OF THE SIXTH FIVE-YEAR ED,20170126575-1-DSB-AD,No,...,2017,,2017-01-30,Non-M/WBE,ACE AMERICAN INSURANCE CO.,Capital Contracts,,No,No,040WRAP2015
180,Department of Education,,E704 (IMPLEMENTATATION OF THE FIFTH FIVE-YEAR),040SCA12 010,98000000.0,PON1040SCA10-14MC,,IMPLEMENTATION OF THE FIFTH FIVE-YEAR ED,20130085631-1-DSB-AD,No,...,2013,,2012-11-13,Individuals and Others,SCHOOL CONSTRUCTION AUTHORITY,Capital Contracts,,No,No,040SCA12


In [43]:
# Columns whose combination identifies one aggregated row.
temp_cols_for_grouping = ['FMS ID', 'Agency', 'Fiscal year']

# Grouping keys followed by the columns that will be aggregated.
temp_cols_for_limiting = [
    *temp_cols_for_grouping,
    'Contract Purpose',
    'Budget Code',
    'Check Amount',
]

# Keep every row but only the columns needed for the aggregation.
df_limited_cols = df.loc[:, temp_cols_for_limiting]

In [44]:
def fn_join_vals(x):
    """Join the non-missing entries of ``x`` into a single ';'-separated string.

    Entries for which ``pd.notna`` is false (``None``, ``NaN``) are skipped.
    Order is preserved and repeated values are kept. The remaining entries
    must be strings, since they are passed straight to ``str.join``.

    Parameters
    ----------
    x : iterable
        Values to combine, e.g. one column of a groupby group.

    Returns
    -------
    str
        The joined values, or ``''`` if nothing remains after skipping
        missing entries.
    """
    present_values = []
    for value in x:
        if pd.notna(value):
            present_values.append(value)
    return ';'.join(present_values)

# Per-column reducers: total the check amounts, and concatenate the
# non-missing text values of the two descriptive columns with fn_join_vals.
agg_dict = {
    'Check Amount': 'sum',
    **dict.fromkeys(('Contract Purpose', 'Budget Code'), fn_join_vals),
}

# One row per combination of the grouping columns; as_index=False keeps the
# grouping keys as ordinary columns instead of moving them into the index.
temp_projects_grouped = (
    df_limited_cols
    .groupby(temp_cols_for_grouping, as_index=False)
    .agg(agg_dict)
)
temp_projects_grouped

Unnamed: 0,FMS ID,Agency,Fiscal year,Check Amount,Contract Purpose,Budget Code
0,002FINC01,Department of Citywide Administrative Services,2010,2224671.23,,
1,002FINC01,Department of Citywide Administrative Services,2011,3683303.41,,DUMY (INITIAL COI AUTHORIZATION; FY 2004C)
2,002FINC01,Department of Citywide Administrative Services,2012,4798011.14,,DUMY (INITIAL COI AUTHORIZATION; FY 2004C);DUM...
3,002FINC01,Department of Citywide Administrative Services,2013,3905556.72,,DUMY (INITIAL COI AUTHORIZATION; FY 2004C);DUM...
4,002FINC01,Department of Citywide Administrative Services,2014,5337676.18,,DUMY (INITIAL COI AUTHORIZATION; FY 2004C);DUM...
...,...,...,...,...,...,...
53984,998SITRACK,Transit Authority,2015,584020.59,NYCTA PROJECTS;PURCHASE OF 171 STANDARD BUSES ...,TST3 (ACQUISITION AND/OR CONSTRUCTIO OF CLASS)...
53985,998SITRACK,Transit Authority,2016,2700000.00,PURCHASE OF 324 STANDARD BUSES AND SIRTOA PROJ...,TST3 (ACQUISITION AND/OR CONSTRUCTIO OF CLASS)...
53986,998SITRACK,Transit Authority,2019,6375000.00,2017 MAINLINE TRACK REPLACEMENT AT VARIOUS LOC...,SUB1 (MTA /NYCT: CREATE NEW ELECTICAL POWER SU...
53987,998TCMISC,Transit Authority,2016,346000.00,PURCHASE OF 276 STANDARD BUSES;PURCHASE OF 276...,"BUS8 (MTA BUS COMPANY: BUS PURCHASES, 40' LOW-..."


In [39]:
# check that each group has consistent values
# NOTE(review): `temp_projects_df` is not defined in any visible cell and this
# cell's execution count is out of order; presumably it is a groupby object
# built from `df` in another cell -- confirm before relying on Restart & Run All.
# For every group, count the distinct non-null values in each column of `df`.
check_distinct_values_after_grouping = temp_projects_df.agg({col: 'nunique' for col in df.columns})

# The mean is kept for reporting only. It cannot be used to classify a column:
# groups holding 0 or 1 distinct value average below 1 even though no group is
# inconsistent, and a mix of 0s and 2s can average exactly 1.
mean_result = check_distinct_values_after_grouping.mean()
# A column is consistent within groups iff no group holds more than one
# distinct value, i.e. the largest per-group count is at most 1
# (0 means every value in the group is missing).
max_result = check_distinct_values_after_grouping.max()

distinct = []      # (column, mean distinct count) for columns that vary within some group
nondistinct = []   # columns with at most one distinct value in every group
for col in check_distinct_values_after_grouping.columns:
    if max_result[col] <= 1:
        nondistinct.append(col)
    else:
        distinct.append((col, mean_result[col]))

print("Cols with nondistinct values across groups:")
print(nondistinct)
print("Cols with distinct values across groups and their mean:")
print(distinct)

Cols with nondistinct values across groups:
['Associated Prime Vendor', 'Budget Code', 'Fiscal year', 'Spending Category', 'Sub Contract Reference ID', 'Sub Vendor', 'FMS ID']
Cols with distinct values across groups and their mean:
[('Agency', 1.0001240882957163), ('Capital Project', 2.3687904148685353), ('Check Amount', 17.861076259151513), ('Contract ID', 13.55184822622675), ('Contract Purpose', 1.1645410801196763), ('Department', 1.0032814460422728), ('Document ID', 10.775027919866536), ('Emerging Business', 1.0000413627652387), ('Expense Category', 1.1157330171379725), ('Industry', 0.8636407505963132), ('Issue Date', 5.886914199837307), ('M/WBE Category', 1.2021260461332708), ('Payee Name', 2.252243930014201), ('Woman Owned Business', 1.080767692922831)]
