In [26]:
import pandas as pd
import re

In [3]:
# Source dataset (NYC Open Data, Capital Project Schedules and Budgets):
#   https://data.cityofnewyork.us/Housing-Development/Capital-Project-Schedules-and-Budgets/2xh6-psuq
# Snapshot as of: 02/09/2018
budgets = pd.read_csv(filepath_or_buffer='data/Capital_Project_Schedules_and_Budgets.csv')

In [12]:
# List the distinct project types. The column label as loaded really is
# 'Project Type ' -- with a trailing space -- so it must be spelled that way.
pd.unique(budgets['Project Type '])

array(['SCA CIP', 'SCA CIP RESOA', 'DIIT - RESOA', 'DOE- RESOA',
       'SCA Furniture & Equi', 'Trust For Public Lan', 'SCA Capacity',
       'DOE - Skilled Trades', 'SCA IEH', 'DOE - Lead Paint', 'PRE-K',
       'DIIT - PBX Telephone', 'DIIT - Project Conne',
       'Fast Track Projects', 'DCAS/NYPA', 'SCA Lease Site Impro',
       'SCA Emergency Lighti', 'RCT CIP', 'SCA Emergency Respon',
       'DOE Managed PREK'], dtype=object)

In [54]:
# Narrow the budget table to school-capacity projects that are still under way.
# ('Project Type ' carries a trailing space in the loaded column label.)
is_capacity = budgets['Project Type '] == 'SCA Capacity'
is_in_progress = budgets['Project Status Name'] == 'In-Progress'
budgets = budgets[is_capacity & is_in_progress]

In [55]:
# Eyeball five randomly chosen rows of the filtered table (unseeded, so the
# rows shown differ from run to run).
budgets.sample(n=5)

Unnamed: 0,Project Geographic District,Project Building Identifier,Project School Name,Project Type,Project Description,Project Phase Name,Project Status Name,Project Phase Actual Start Date,Project Phase Planned End Date,Project Phase Actual End Date,Project Budget Amount,Final Estimate of Actual Costs Through End of Phase Amount,Total Phase Actual Spending Amount,DSF Number(s)
4157,24,Q247,P.S. 128 - QUEENS,SCA Capacity,ADDITION,Construction,In-Progress,07/17/2017,11/03/2019,,35703162,$32807349.00,$1614843.00,DSF: 0000821576
1033,15,K338,P.S./I.S. 338 - BROOKLYN,SCA Capacity,NEW SCHOOL/DEMOLITION,"CM,Art,F&E",In-Progress,06/23/2014,03/08/2017,,10121335,$8596661.00,$7673583.00,DSF: 0000425868
5569,27,Q066,P.S. 66 - QUEENS,SCA Capacity,ADDITION,Construction,In-Progress,06/21/2017,08/20/2020,,31121781,$28209948.00,$1357797.00,DSF: 0000801039
3769,30,Q292,P.S. 92 - QUEENS,SCA Capacity,ADDITION/WINDOWS,Construction,In-Progress,08/13/2014,05/08/2017,,52303500,$49688287.00,$47172573.00,"DSF: 0000745712, 0000752430"
6480,28,Q144,P.S. 144 - QUEENS,SCA Capacity,ADDITION,Construction,In-Progress,07/13/2017,07/01/2020,,58763373,$53285051.00,$2156855.00,DSF: 0000798209


In [71]:
# Inspect column dtypes. Only the district column is numeric; the date and
# dollar-amount columns are loaded as object (string) columns.
budgets.dtypes

Project Geographic District                                    int64
Project Building Identifier                                   object
Project School Name                                           object
Project Type                                                  object
Project Description                                           object
Project Phase Name                                            object
Project Status Name                                           object
Project Phase Actual Start Date                               object
Project Phase Planned End Date                                object
Project Phase Actual End Date                                 object
Project Budget Amount                                         object
Final Estimate of Actual Costs Through End of Phase Amount    object
Total Phase Actual Spending Amount                            object
DSF Number(s)                                                 object
dtype: object

In [65]:
# Create one record per DSF number, even if a row lists more than one
# (e.g. "DSF: 0000745712, 0000752430"). Each run of digits becomes its own
# record keyed as "DSF<digits>", carrying the building id, school name and
# phase dates of the row it came from.
dsf_pattern = re.compile(r'\d+')  # raw string: '\d' is an invalid escape in a plain literal

b = []
for _, row in budgets.iterrows():
    dsf_field = row['DSF Number(s)']
    # A missing DSF cell is read as NaN (a float), which re.findall cannot
    # scan; such a row has no DSF to emit, so skip it.
    if pd.isna(dsf_field):
        continue
    for dsf in dsf_pattern.findall(str(dsf_field)):
        b.append([
            "DSF%s" % dsf,
            row['Project Building Identifier'],
            row['Project School Name'],
            row['Project Phase Actual Start Date'],
            row['Project Phase Planned End Date'],
        ])

budgetsClean = pd.DataFrame(data=b, columns=['dsf', 'bldg_id', 'name', 'start_date', 'planned_end_date'])

In [101]:
# Several phases of one project share a DSF; keep only the first record seen
# for each DSF so the key is unique.
budgetsClean = budgetsClean.drop_duplicates(subset=['dsf'], keep='first')

In [5]:
# Source dataset (NYC Open Data, Capacity Projects by schools):
#   https://data.cityofnewyork.us/Education/Capacity-Projects-by-schools/a94k-kjys
# Snapshot as of: 03/21/2018
projects = pd.read_csv(filepath_or_buffer='data/Capacity_Projects__by_schools.csv')

In [79]:
# Inspect column dtypes: the project number and completion date are object
# (string) columns, the forecast capacity is an integer.
projects.dtypes

Project #            object
Forecast Capacity     int64
Actual /Est.Compl    object
dtype: object

In [80]:
# Keep only the columns used downstream and give them short names.
# Plain label selection replaces `.ix`, which was deprecated in pandas 0.20
# and removed in 1.0. Selecting and renaming (instead of rebuilding the frame
# from `.values`) also keeps each column's dtype rather than collapsing every
# column to object. Re-running this cell after the rename raises a KeyError
# instead of quietly producing a wrong frame.
projects = (
    projects[['Project #', 'Forecast Capacity', 'Actual /Est.Compl']]
    .rename(columns={
        'Project #': 'dsf',
        'Forecast Capacity': 'capacity',
        'Actual /Est.Compl': 'est_completion',
    })
    .reset_index(drop=True)  # fresh 0..n-1 index, as the original rebuild produced
)

In [107]:
# Make 'dsf' a unique key: keep the first row for each project number.
projects = projects.drop_duplicates(subset=['dsf'], keep='first')

In [84]:
# Source dataset (NYC Open Data, Active Projects Under Construction):
#   https://data.cityofnewyork.us/Housing-Development/Active-Projects-Under-Construction/8586-3zfm
# Snapshot as of: 02/22/2018
locations = pd.read_csv(filepath_or_buffer='data/Active_Projects_Under_Construction.csv')

In [93]:
# Keep only the building id, coordinates and BBL, under short column names.
# Plain label selection replaces `.ix`, which was deprecated in pandas 0.20
# and removed in 1.0. Selecting and renaming (instead of rebuilding the frame
# from `.values`) also keeps the numeric dtypes of lat/lng/bbl rather than
# collapsing every column to object. Re-running this cell after the rename
# raises a KeyError instead of quietly producing a wrong frame.
locations = (
    locations[['Building ID', 'Latitude', 'Longitude', 'BBL']]
    .rename(columns={
        'Building ID': 'bldg_id',
        'Latitude': 'lat',
        'Longitude': 'lng',
        'BBL': 'bbl',
    })
    .reset_index(drop=True)  # fresh 0..n-1 index, as the original rebuild produced
)

In [117]:
# One row per building, and only buildings that have a BBL.
# Rows without a BBL are dropped *before* de-duplicating: if duplicates were
# removed first, a building whose first row lacks a BBL would be discarded
# entirely even when a later row for the same building has one.
locations = locations.dropna(subset=['bbl']).drop_duplicates(subset=['bldg_id'])

In [120]:
# Join the three tables with inner joins: budgets to capacity forecasts on the
# DSF number, then to building locations on the building id. Only projects
# present in all three tables survive.
budgets_with_capacity = pd.merge(budgetsClean, projects, how='inner', on='dsf')
df = pd.merge(budgets_with_capacity, locations, how='inner', on='bldg_id')

In [121]:
# Display the merged table: one row per DSF with its building, phase dates,
# forecast capacity, estimated completion and location.
df

Unnamed: 0,dsf,bldg_id,name,start_date,planned_end_date,capacity,est_completion,lat,lng,bbl
0,DSF0000798178,X097,P.S. 97 - BRONX,07/31/2017,02/12/2018,548,Apr-21,40.8628,-73.8468,2044740000.0
1,DSF0000730952,M342,THE RIVERSIDE SCHOOL - M,06/17/2016,09/01/2017,692,Sep-17,40.7725,-73.9898,1011720000.0
2,DSF0000798185,K032,P.S. 32 - BROOKLYN,07/12/2017,07/10/2021,436,Jul-20,40.6804,-73.9919,3004370000.0
3,DSF0000798182,K557,P.S. @ 4302 4TH AVE - BROOKLYN,11/17/2017,06/12/2018,332,Jun-22,40.6503,-74.0086,3007280000.0
4,DSF0000798233,X014,P.S. 14 - BRONX,07/07/2016,10/09/2019,344,Sep-18,40.8358,-73.8257,2053130000.0
5,DSF0000798175,X046,P.S. 46 - BRONX,01/03/2017,06/25/2020,500,Jun-19,40.8668,-73.8907,2032950000.0
6,DSF0000798235,M323,I.S. 323 - MANHATTAN,03/09/2015,06/15/2017,1016,May-18,40.731,-74.0072,1006030000.0
7,DSF0000798210,Q398,P.S. 398 - QUEENS,03/06/2017,06/03/2019,476,Jun-19,40.7525,-73.8969,4012420000.0
8,DSF0000798199,Q125,I.S. 125 - QUEENS,12/23/2015,07/22/2018,728,Sep-17,40.7411,-73.9189,4022840000.0
9,DSF0000798195,Q143,P.S. 143 - QUEENS,07/18/2017,03/17/2021,980,Sep-20,40.7555,-73.8552,4017560000.0


In [126]:
# Parse the date columns into datetimes.
# The phase dates are written MM/DD/YYYY (e.g. "07/31/2017"). The format is
# given explicitly so parsing does not depend on format inference, which could
# read an ambiguous value such as "07/12/2017" day-first, and so that a value
# in any other layout raises instead of being guessed at.
for date_col in ('start_date', 'planned_end_date'):
    df[date_col] = pd.to_datetime(df[date_col], format='%m/%d/%Y')

# Estimated completion is month-year only (e.g. "Apr-21"); it parses to the
# first day of that month.
df['est_completion'] = pd.to_datetime(df['est_completion'], format="%b-%y")

In [129]:
# Write the merged table to disk. With the default settings the DataFrame
# index is written too, as the first (unnamed) column.
df.to_csv(path_or_buf='export/sca-capital-projects.csv')