# Import Libraries

In [None]:
import pandas as pd
import urllib2
from bs4 import BeautifulSoup
import time
import warnings
warnings.filterwarnings("ignore")
from random import *

# Read in and Filter Data 

In [None]:
# Load the training data and keep only rows whose sale_price is below 100000.
train = pd.read_csv('Brooklyn_House_Train.csv')
mask_one = train['sale_price'].lt(100000)
train = train[mask_one]

# Of the remaining rows, keep the one-, two- and three-family home categories.
mask_two = train['building_class_category'].isin([
    '01 ONE FAMILY HOMES',
    '02 TWO FAMILY HOMES',
    '03 THREE FAMILY HOMES',
])
train = train[mask_two]

# Peek at the data to make sure it looks good

In [None]:
# Display the first rows of train as a quick visual sanity check.
train.head()

In [None]:
# (rows, columns) of train.
train.shape

# Useful functions

In [None]:
def find_between(s, first, last):
    """Return the part of ``s`` between ``first`` and the next ``last``.

    ``last`` is searched for only after the end of the first occurrence of
    ``first``.  If either marker cannot be found, an empty string is
    returned instead.
    """
    head = s.find(first)
    if head == -1:
        return ""

    begin = head + len(first)
    stop = s.find(last, begin)
    if stop == -1:
        return ""

    return s[begin:stop]


# Scrapes price from Zillow for addresses with no apartment numbers
def scraper_no_apt(address):
    """Fetch the Zestimate text Zillow shows for a street address.

    The address is converted with ``str()``, its spaces are replaced by
    hyphens and it is upper-cased, then embedded in a
    ``https://www.zillow.com/homes/<ADDRESS>-nyc_rb/`` URL that is requested
    with a ``Mozilla/5.0`` User-Agent header.

    Args:
        address: Street address of the property.

    Returns:
        The text found between ``$`` and ``</div>`` in the
        ``zestimate primary-quote`` div with commas removed, ``''`` when no
        such text is present, or the integer ``0`` if any step raised an
        exception (best-effort lookup; errors are never propagated).
    """
    try:
        slug = str(address).replace(' ', '-').upper()
        my_url = 'https://www.zillow.com/homes/' + slug + '-nyc_rb/'

        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        response = opener.open(my_url)
        try:
            soup = BeautifulSoup(response, 'html.parser')
        finally:
            # The page has been parsed (or parsing failed); release the
            # connection either way.
            response.close()

        temp = str(soup.find_all('div', {'class': 'zestimate primary-quote'}))
        return find_between(temp, "$", "</div>").replace(',', '')
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C / SystemExit
        # can still stop a long scraping run.
        return 0

# Scrapes price from Zillow for addresses with apartment numbers
def scraper_w_apt(address, apt):
    """Fetch the Zestimate text Zillow shows for an address plus apartment.

    Both values are converted with ``str()``, have their spaces replaced by
    hyphens and are upper-cased, then embedded in a
    ``https://www.zillow.com/homes/<ADDRESS>-APT-<APT>-nyc_rb/`` URL that is
    requested with a ``Mozilla/5.0`` User-Agent header.

    Args:
        address: Street address of the property.
        apt: Apartment identifier; may be any value ``str()`` accepts.

    Returns:
        The text found between ``$`` and ``</div>`` with commas removed,
        taken from the ``zestimate`` div or, if that yields nothing, from
        the ``zestimate primary-quote`` div; ``''`` when neither yields
        text; or the integer ``0`` if any step raised an exception
        (best-effort lookup; errors are never propagated).
    """
    try:
        address2 = str(address).replace(' ', '-').upper()
        apt2 = str(apt).replace(' ', '-').upper()
        # Build the URL from the normalised apt2.  Concatenating the raw
        # argument raised TypeError for non-string apartment values (which
        # was then reported as a price of 0) and skipped the normalisation.
        my_url = ('https://www.zillow.com/homes/' + address2
                  + '-APT-' + apt2 + '-nyc_rb/')

        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        response = opener.open(my_url)
        try:
            soup = BeautifulSoup(response, 'html.parser')
        finally:
            # The page has been parsed (or parsing failed); release the
            # connection either way.
            response.close()

        # Try the plain 'zestimate' div first, then the primary-quote variant.
        for css_class in ('zestimate', 'zestimate primary-quote'):
            temp = str(soup.find_all('div', {'class': css_class}))
            price = find_between(temp, "$", "</div>").replace(',', '')
            if price != '':
                return price
        return ''
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C / SystemExit
        # can still stop a long scraping run.
        return 0

# Run this code as a test

In [None]:
# Smoke test: scrape prices for the first two rows of train only.
test_prices = []
for i in range(0, 2):
    row = train.iloc[i]
    # Rows without an apartment number use the address-only lookup.
    if pd.isnull(row['apartment_number']):
        p = scraper_no_apt(row['address'])
    else:
        p = scraper_w_apt(row['address'], row['apartment_number'])
    # The scrapers return '' when no price text was found and 0 on errors.
    # int('') and any other non-numeric text raise ValueError, so every
    # unparsable result is recorded as 0 instead of aborting the cell.
    try:
        test_prices.append(int(p))
    except ValueError:
        test_prices.append(0)
    # Random pause of up to 2 seconds between requests (presumably to
    # throttle traffic to the site).
    time.sleep(2*random())
# Parenthesised form prints the same list under Python 2 and also parses
# under Python 3.
print(test_prices)

In [None]:
#If the above code does not print out [1472076, 2344426], something is wrong

# If the above code works, run the following

In [None]:
# Scrape a price for every row of train at positions 44728..67090.  The same
# 44728:67091 bounds are used later when the addresses are joined to prices.
prices = []
for i in range(44728, 67091):
    row = train.iloc[i]
    # Rows without an apartment number use the address-only lookup.
    if pd.isnull(row['apartment_number']):
        p = scraper_no_apt(row['address'])
    else:
        p = scraper_w_apt(row['address'], row['apartment_number'])
    # The scrapers return '' when no price text was found and 0 on errors.
    # int('') and any other non-numeric text raise ValueError, so every
    # unparsable result is recorded as 0 rather than aborting the whole run
    # part-way through.
    try:
        prices.append(int(p))
    except ValueError:
        prices.append(0)
    # Random pause of up to 2 seconds between requests (presumably to
    # throttle traffic to the site).
    time.sleep(2*random())

In [None]:
# Wrap the scraped prices list in a pandas Series.
prices_=pd.Series(prices)

In [None]:
# Take the same positional slice of addresses that the scraping loop walked
# and renumber it from 0 so it lines up row-for-row with prices_.
address_slice = train['address'].iloc[44728:67091].reset_index(drop=True)
the_injury = pd.concat([address_slice, prices_], axis=1)

In [None]:
# Relabel the columns: 'address' becomes 'Address' and the column labelled 0
# becomes 'Price'.
the_injury = the_injury.rename(columns={'address': 'Address', 0: 'Price'})

In [None]:
# Write the_injury to the_injury.csv in the working directory.
the_injury.to_csv('the_injury.csv')