# Collects 'Property Tax Bill' and 'New Property Value' PDFs for NYC

In this notebook, I have made the following adjustments:
    - You can now choose which statements you want (the latest period, the period before that, etc.).
    - You can specify all the streets and house-number ranges you want to scrape.

### Import Libraries

In [8]:
#Import libs
import requests
import urllib
from urllib2 import urlopen
import re
import os
from datetime import datetime
import time

In [9]:
# Folder (relative to the current working directory) that the download
# functions below write their PDFs into. It is created only when nothing
# with that name exists yet, so re-running this cell is harmless.
newpath = 'All'
if not os.path.exists(newpath):
    os.makedirs(newpath)

### Define spider functions

In [10]:
def get_PTBtaxPDF(number,street,area='1',latest = 0):
    '''
    Downloads a Quarterly Property Tax Bill PDF from NYC for a certain address.

    Enter number and street as strings
    Use area '1' for Manhattan (Default)
    Use area '2' for Bronx
    Use area '3' for Brooklyn
    Use area '4' for Queens
    Use area '5' for Staten Island
    latest selects the statement: 0 (default) is the latest statement,
    1 the second latest, 2 the third latest, ...

    The PDF is stored as 'All/PTB<number> <street>.pdf' below your current
    working dir; the 'All' folder is created if it does not exist yet.

    Returns the URL the PDF was downloaded from.

    Raises IndexError if the address lookup page does not carry the expected
    hidden form fields, if no bbl is found for the address, or if latest is
    out of range for the statement list. Raises ValueError if the text at
    the requested position is not a date like 'June 5, 2015'.
    '''
    # Step 1: look the address up; the answer is an HTML form whose hidden
    # inputs have to be posted on to the statement list page.
    params = {'FBORO':area,'FHOUSENUM':number,'FSTNAME' :street}
    request1 = requests.post("HTTP://webapps.nyc.gov:8084/CICS/fin1/find001i",data=params)
    # Drop the text in front of the first <input> and the very last <input>.
    request1InputList = request1.text.split('<input type=')[1:-1]

    # NOTE(review): 22 is the number of hidden inputs this scraper has always
    # read -- presumably the complete form; confirm against the live page.
    hiddenFieldCount = 22
    if len(request1InputList) < hiddenFieldCount:
        raise IndexError('Address lookup for "%s %s" (area %s) returned %d form fields, expected %d'
                         % (number, street, area, len(request1InputList), hiddenFieldCount))
    request1Dict = {}
    for field in request1InputList[:hiddenFieldCount]:
        key = field.split('"\r\n')[0].split('"hidden" name="')[1]
        value = field.split('value="')[1].split('">\r\n')[0]
        request1Dict[key] = value

    # Step 2: post the hidden fields to obtain the list of statements.
    request2 = requests.post("HTTP://NYCPROP.NYC.GOV/nycproperty/nynav/jsp/stmtassesslst.jsp",data=request1Dict)
    request2Text = request2.text
    bblMatch = re.search(r'bbl=(\d+)', request2Text)
    if bblMatch is None:
        raise IndexError('No bbl found for "%s %s" (area %s)' % (number, street, area))
    bbl = bblMatch.group(1)

    # The statement date is the last tab-separated token in front of the
    # selected ' - Quarterly Property Tax Bill' label.
    dateTemp1 = request2Text.split(' - Quarterly Property Tax Bill')[latest]
    dateTemp2 = dateTemp1.split('\t')[-1]
    stmtDateLast = datetime.strptime(dateTemp2,'%B %d, %Y').strftime('%Y%m%d')

    # Step 3: download the PDF; make sure the target folder is there first so
    # the function does not depend on another cell having created it.
    path = 'All'
    if not os.path.isdir(path):
        os.makedirs(path)
    pathName = os.path.join(path, 'PTB' + number + " " + street + '.pdf')
    url = 'http://nycprop.nyc.gov/nycproperty/StatementSearch?bbl=' + bbl + '&stmtDate=' + stmtDateLast + '&stmtType=SOA'
    urllib.urlretrieve(url, pathName)
    return url

In [11]:
def get_NPVtaxPDF(number,street,area='1',latest = 0):
    '''
    Downloads a Notice of Property Value PDF from NYC for a certain address.

    Enter number and street as strings
    Use area '1' for Manhattan (Default)
    Use area '2' for Bronx
    Use area '3' for Brooklyn
    Use area '4' for Queens
    Use area '5' for Staten Island
    latest selects the statement: 0 (default) is the latest statement,
    1 the second latest, 2 the third latest, ...

    The PDF is stored as 'All/NPV<number> <street>.pdf' below your current
    working dir; the 'All' folder is created if it does not exist yet.

    Returns the URL the PDF was downloaded from.

    Raises IndexError if the address lookup page does not carry the expected
    hidden form fields, if no bbl is found for the address, or if latest is
    out of range for the statement list. Raises ValueError if the text at
    the requested position is not a date like 'June 5, 2015'.
    '''
    # Step 1: look the address up; the answer is an HTML form whose hidden
    # inputs have to be posted on to the statement list page.
    params = {'FBORO':area,'FHOUSENUM':number,'FSTNAME' :street}
    request1 = requests.post("HTTP://webapps.nyc.gov:8084/CICS/fin1/find001i",data=params)
    # Drop the text in front of the first <input> and the very last <input>.
    request1InputList = request1.text.split('<input type=')[1:-1]

    # NOTE(review): 22 is the number of hidden inputs this scraper has always
    # read -- presumably the complete form; confirm against the live page.
    hiddenFieldCount = 22
    if len(request1InputList) < hiddenFieldCount:
        raise IndexError('Address lookup for "%s %s" (area %s) returned %d form fields, expected %d'
                         % (number, street, area, len(request1InputList), hiddenFieldCount))
    request1Dict = {}
    for field in request1InputList[:hiddenFieldCount]:
        key = field.split('"\r\n')[0].split('"hidden" name="')[1]
        value = field.split('value="')[1].split('">\r\n')[0]
        request1Dict[key] = value

    # Step 2: post the hidden fields to obtain the list of statements.
    request2 = requests.post("HTTP://NYCPROP.NYC.GOV/nycproperty/nynav/jsp/stmtassesslst.jsp",data=request1Dict)
    request2Text = request2.text
    bblMatch = re.search(r'bbl=(\d+)', request2Text)
    if bblMatch is None:
        raise IndexError('No bbl found for "%s %s" (area %s)' % (number, street, area))
    bbl = bblMatch.group(1)

    # The statement date is the last tab-separated token in front of the
    # selected ' - Notice of Property Value' label.
    dateTemp1 = request2Text.split(' - Notice of Property Value')[latest]
    dateTemp2 = dateTemp1.split('\t')[-1]
    stmtDateLast = datetime.strptime(dateTemp2,'%B %d, %Y').strftime('%Y%m%d')

    # Step 3: download the PDF; make sure the target folder is there first so
    # the function does not depend on another cell having created it.
    path = 'All'
    if not os.path.isdir(path):
        os.makedirs(path)
    pathName = os.path.join(path, 'NPV' + number + " " + street + '.pdf')
    url = 'http://nycprop.nyc.gov/nycproperty/StatementSearch?bbl=' + bbl + '&stmtDate=' + stmtDateLast + '&stmtType=NPV'
    urllib.urlretrieve(url, pathName)
    return url

### Define area to be crawled

In [5]:
# Streets to crawl, each mapped to its (start, stop) house-number bounds.
# The crawl loop hands the pair to range(), so the stop value itself is
# not requested.
areaManhatten = {
    # Avenues
    '7 Avenue':           (580, 800),
    'Avenue of Americas': (1000, 1300),
    '5 Avenue':           (500, 700),
    'Madison Avenue':     (280, 520),
    'Park Avenue':        (100, 400),
    'Lexington Avenue':   (300, 700),
    '3 Avenue':           (600, 1000),
    '2 Avenue':           (700, 1000),

    # E 42nd St through E 52nd St
    'E 42nd St': (0, 300),
    'E 43rd St': (0, 300),
    'E 44th St': (0, 300),
    'E 45th St': (0, 300),
    'E 46th St': (0, 300),
    'E 47th St': (0, 300),
    'E 48th St': (0, 300),
    'E 49th St': (0, 300),
    'E 50th St': (0, 300),
    'E 51st St': (0, 300),
    'E 52nd St': (0, 300),

    # W 42nd St through W 52nd St
    'W 42nd St': (0, 200),
    'W 43rd St': (0, 200),
    'W 44th St': (0, 200),
    'W 45th St': (0, 200),
    'W 46th St': (0, 200),
    'W 47th St': (0, 200),
    'W 48th St': (0, 200),
    'W 49th St': (0, 200),
    'W 50th St': (0, 200),
    'W 51st St': (0, 200),
    'W 52nd St': (0, 200),
}

In [12]:
# Small trial set: run this cell to try the crawler on a handful of
# addresses first. It re-binds areaManhatten, so the full street list
# is replaced until that cell is run again.
areaManhatten = {
    'W 48th St':   (7, 9),
    'W 49th St':   (12, 14),
    'W 42nd St':   (25, 27),
    'W 51st St':   (59, 61),
    'Park Avenue': (109, 111),
}

### Crawl each street to collect PDFs (the optional 5-minute pause between streets is commented out)

In [13]:
# Walk every configured street, try to download both PDFs for each house
# number, and log start/end times per street.
for street, (firstNumber, stopNumber) in areaManhatten.items():
    startStreet = datetime.now()
    startStreetString = startStreet.strftime("%I:%M%p on %B %d, %Y")
    # stopNumber is exclusive: range() stops one short of it.
    numbers = range(firstNumber, stopNumber)
    print('Starting %s at %s' % (street, startStreetString))
    skipped = 0
    for n in numbers:
        number = str(n)
        try:
            get_PTBtaxPDF(number, street, area='1', latest=1)
            get_NPVtaxPDF(number, street, area='1', latest=1)
        except Exception:
            # Best effort: a house number that fails at any step is skipped
            # and counted. Catching Exception rather than everything keeps
            # KeyboardInterrupt/SystemExit working, so the crawl can still
            # be interrupted.
            skipped += 1
    endStreet = datetime.now()
    endStreetString = endStreet.strftime("%I:%M%p on %B %d, %Y")
    if skipped:
        print('Skipped %d of %d house numbers on %s' % (skipped, len(numbers), street))
    print('End %s at %s' % (street, endStreetString))
    # Uncomment the two lines below to pause 5 minutes between streets.
    #print 'Going to sleep now for 5 minutes'
    #time.sleep(5*60)

Starting W 49th St at 08:23AM on May 28, 2016
End W 49th St at 08:23AM on May 28, 2016
Starting Park Avenue at 08:23AM on May 28, 2016
End Park Avenue at 08:24AM on May 28, 2016
Starting W 51st St at 08:24AM on May 28, 2016
End W 51st St at 08:24AM on May 28, 2016
Starting W 42nd St at 08:24AM on May 28, 2016
End W 42nd St at 08:24AM on May 28, 2016
Starting W 48th St at 08:24AM on May 28, 2016
End W 48th St at 08:25AM on May 28, 2016
