# Import Libraries

In [1]:
import pandas as pd
import urllib2
from bs4 import BeautifulSoup
import time
import warnings
warnings.filterwarnings("ignore")
from random import *

# Read in and Filter Data 

In [2]:
# Load the Brooklyn sales training data.
train = pd.read_csv('Brooklyn_House_Train.csv')

# Keep only rows whose sale price is below 100000.
# NOTE(review): confirm that "<" (rather than ">") is the intended direction.
mask_one = train['sale_price'] < 100000
train = train[mask_one]

# Restrict to one-, two- and three-family homes.
mask_two = train['building_class_category'].isin([
    '01 ONE FAMILY HOMES',
    '02 TWO FAMILY HOMES',
    '03 THREE FAMILY HOMES',
])
train = train[mask_two]

# Peek at the data to make sure it looks good

In [3]:
# Show the first five rows of the filtered frame as a quick sanity check.
train.head()

Unnamed: 0.1,Unnamed: 0,borough,neighborhood,building_class_category,tax_class,block,lot,easement,building_class,address,...,EDesigNum,APPBBL,APPDate,PLUTOMapID,FIRM07_FLA,PFIRM15_FL,Version,MAPPLUTO_F,SHAPE_Leng,SHAPE_Area
3,184874,3,BOROUGH PARK,02 TWO FAMILY HOMES,1,5736,16,,B1,924 63RD,...,,0.0,,1.0,,,17V1.1,0.0,256.640505,2667.781551
5,272403,3,CROWN HEIGHTS,03 THREE FAMILY HOMES,1,1148,11,,C0,782 BERGEN STREET,...,,0.0,,1.0,,,17V1.1,0.0,209.947537,1689.715147
6,341324,3,EAST NEW YORK,01 ONE FAMILY HOMES,1,4502,22,,A1,582 MILFORD STREET,...,,0.0,,1.0,,,17V1.1,0.0,261.693791,2646.177776
7,352630,3,OCEAN PARKWAY-SOUTH,01 ONE FAMILY HOMES,1,7108,159,,A1,1956 OCEAN PARKWAY,...,,0.0,,1.0,,,17V1.1,0.0,396.863102,7252.060556
11,256949,3,MIDWOOD,02 TWO FAMILY HOMES,1,6757,90,,B2,1439 EAST 19TH STREET,...,,0.0,,1.0,,,17V1.1,0.0,312.369491,3693.837663


In [4]:
# (rows, columns) remaining after the two filters above.
train.shape

(67092, 111)

# Useful functions

In [5]:
def find_between(s, first, last):
    """Return the text of ``s`` found between ``first`` and ``last``.

    ``first`` is located at its earliest occurrence in ``s``; ``last`` is then
    searched for starting right after it. If either marker cannot be found,
    an empty string is returned.
    """
    head = s.find(first)
    if head == -1:
        return ""

    begin = head + len(first)
    stop = s.find(last, begin)
    if stop == -1:
        return ""

    return s[begin:stop]


#Scrapes price from zillow for addresses with no apartment numbers
def scraper_no_apt(address):
    """Scrape the price Zillow displays for an address without an apartment.

    The address is converted to a string, its spaces are replaced by dashes
    and it is upper-cased to build a ``https://www.zillow.com/homes/...-nyc_rb/``
    URL. The page is fetched with a ``Mozilla/5.0`` User-Agent header and the
    text between the first ``$`` and the following ``</div>`` inside the
    ``div`` elements of class ``zestimate primary-quote`` is extracted.

    Parameters
    ----------
    address : object
        Street address; converted with ``str()`` before use.

    Returns
    -------
    str or int
        The extracted price text with commas removed, ``''`` when no price
        text is found in the page, or the integer ``0`` when fetching or
        parsing raises an exception.
    """
    try:
        slug = str(address).replace(' ', '-').upper()
        my_url = 'https://www.zillow.com/homes/' + slug + '-nyc_rb/'

        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        response = opener.open(my_url)
        try:
            soup = BeautifulSoup(response, 'html.parser')
        finally:
            # The page has been read by now; release the connection.
            response.close()

        quote_markup = str(soup.find_all('div', {'class': 'zestimate primary-quote'}))
        price = str(find_between(quote_markup, "$", "</div>"))
        return price.replace(',', '')
    except Exception:
        # Best-effort scrape: any request/parse failure is reported as 0.
        # Catching ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate, so a long scraping run
        # can still be stopped while a request is in flight.
        return int(0)

#Scrapes price from zillow for addresses with apartment numbers
def scraper_w_apt(address, apt):
    """Scrape the price Zillow displays for an address with an apartment.

    Both ``address`` and ``apt`` are converted to strings, have their spaces
    replaced by dashes and are upper-cased, then combined into a
    ``https://www.zillow.com/homes/<ADDRESS>-APT-<APT>-nyc_rb/`` URL. The page
    is fetched with a ``Mozilla/5.0`` User-Agent header. The price text is
    looked up first in ``div`` elements of class ``zestimate`` and, if that
    yields nothing, in those of class ``zestimate primary-quote``.

    Parameters
    ----------
    address : object
        Street address; converted with ``str()`` before use.
    apt : object
        Apartment identifier; converted with ``str()`` before use.

    Returns
    -------
    str or int
        The extracted price text with commas removed, ``''`` when neither
        lookup finds a price, or the integer ``0`` when fetching or parsing
        raises an exception.
    """
    try:
        address2 = str(address).replace(' ', '-').upper()
        apt2 = str(apt).replace(' ', '-').upper()
        # Build the URL from the normalised ``apt2``. Using the raw ``apt``
        # here raised TypeError for non-string values (silently turned into a
        # 0 result by the handler below) and skipped the normalisation.
        my_url = 'https://www.zillow.com/homes/' + address2 + '-APT-' + apt2 + '-nyc_rb/'

        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        response = opener.open(my_url)
        try:
            soup = BeautifulSoup(response, 'html.parser')
        finally:
            # The page has been read by now; release the connection.
            response.close()

        # Try the plain "zestimate" class first, then fall back to the
        # "zestimate primary-quote" class used by scraper_no_apt.
        price = ''
        for css_class in ('zestimate', 'zestimate primary-quote'):
            markup = str(soup.find_all('div', {'class': css_class}))
            price = str(find_between(markup, "$", "</div>")).replace(',', '')
            if price != '':
                break
        return price
    except Exception:
        # Best-effort scrape: any request/parse failure is reported as 0.
        # Catching ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate, so a long scraping run
        # can still be stopped while a request is in flight.
        return int(0)

# Run this code as a test

In [6]:
# Smoke test: scrape only the first two rows and print what came back.
test_prices = []
for i in range(0, 2):
    row = train.iloc[i]
    # Pick the scraper depending on whether the row has an apartment number.
    if pd.isnull(row['apartment_number']):
        p = scraper_no_apt(row['address'])
    else:
        p = scraper_w_apt(row['address'], row['apartment_number'])
    # An empty string means no price text was found; record it as 0.
    test_prices.append(int(0) if p == '' else int(p))
    # Random pause of up to two seconds between requests.
    time.sleep(2*random())
print(test_prices)

[1472076, 2344426]


In [7]:
#If the above code does not print out [1472076, 2344426], something is wrong

# If the above code works, run the following

In [None]:
# Scrape prices for rows 22634 (inclusive) to 44728 (exclusive) of the
# filtered frame, addressed by position.
prices = []
for i in range(22634, 44728):
    row = train.iloc[i]
    # Pick the scraper depending on whether the row has an apartment number.
    if pd.isnull(row['apartment_number']):
        p = scraper_no_apt(row['address'])
    else:
        p = scraper_w_apt(row['address'], row['apartment_number'])
    # An empty string means no price text was found; record it as 0.
    prices.append(int(0) if p == '' else int(p))
    # Random pause of up to two seconds between requests.
    time.sleep(2*random())

In [None]:
# Wrap the scraped prices in a Series so they can be concatenated with the addresses.
prices_=pd.Series(prices)

In [None]:
# Pair each scraped price with its address. The address slice uses the same
# 22634:44728 bounds as the scraping loop, and its index is reset so both
# columns line up on 0..n-1.
casino_night = pd.concat(
    [
        pd.Series(train['address'])[22634:44728].reset_index(drop=True),
        prices_,
    ],
    axis=1,
)

In [None]:
# Give both columns readable labels: the unnamed price Series comes out of
# the concat as column 0.
casino_night = casino_night.rename(
    columns={
        'address': 'Address',
        0: 'Price',
    }
)

In [None]:
# Save the address/price table to casino_night.csv in the working directory.
casino_night.to_csv('casino_night.csv')