## Web scraping
Scrape sold-house listings separately for each of five cities: **EL CERRITO, EL SOBRANTE, HERCULES, PINOLE, SAN PABLO**
1. Append the URL of every results page to a list called `page`.
2. Use the function `webscraping` to scrape every URL in `page`.
3. Save the results to `<city>_sold.csv`, for example `sanpablo_sold.csv`.

In [2]:
import numpy as np
import pandas as pd
import requests
import time

import re

from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from contextlib import closing

from requests.exceptions import MissingSchema #requests.exceptions

In [142]:
def _fetch_lot_size(detail_url, timeout):
    """Return the lot-size text from a listing's detail page, or None if absent.

    Raises requests.HTTPError (via raise_for_status) when the detail page
    responds with an error status.
    """
    res_inner = requests.get(detail_url, timeout=timeout)
    res_inner.raise_for_status()
    soup_inner = BeautifulSoup(res_inner.content, 'lxml')

    # Start from None for every listing so a page without a lot-size row
    # never inherits the value scraped for a previous house.
    lotsize = None
    for item in soup_inner.find_all('div', {'class': 'listing-info-item'}):
        if 'Lot Size in Sq. Ft.:' in item.text:
            # Keep the text that follows the label's colon.
            lotsize = item.text.split(':')[1]
    return lotsize


def webscraping(url, img_folder, timeout=30):
    """Scrape one results page and return its houses as a DataFrame.

    For every listing on the page that has a main photo URL, the photo is
    downloaded to ``./origion-img/<img_folder>/<image file name>``, the
    listing's detail page is fetched to read the lot size, and one row is
    collected. Listings without a photo container or with an empty photo
    URL are skipped.

    Parameters
    ----------
    url : str
        URL of the results page to scrape.
    img_folder : str
        Sub-folder of ``./origion-img/`` that receives the photos. The
        folder is not created here and must already exist.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up, so a
        stalled connection cannot hang the scrape forever.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``adress``, ``bed``, ``bath``, ``house size``,
        ``lot size`` and ``price``. ``lot size`` is None when the detail
        page has no lot-size entry.

    Raises
    ------
    requests.HTTPError
        If the results page or a detail page returns an error status.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')

    chunk_size = 128  # bytes written to disk per iteration of the download
    houses = []
    for listing in soup.find_all('div', {'class': 'ihf-grid-result-container well'}):
        photo = listing.find('div', {'class': 'ihf-results-grid-photo'})
        # A missing photo container or attribute is treated like an empty URL.
        img_url = photo.get('data-ihf-main-source', '') if photo is not None else ''

        if img_url:
            # The image file name doubles as the house id.
            img_name = img_url.split('/')[-1]
            img_path = './origion-img/' + img_folder + '/' + img_name

            # Stream the photo to disk in small chunks.
            with closing(requests.get(img_url, stream=True, timeout=timeout)) as r, \
                    open(img_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size):
                    f.write(chunk)

            # The lot size is only shown on the listing's own detail page.
            lotsize = _fetch_lot_size(photo.find('a').get('href'), timeout)

            house = {
                'price': listing.find('span', {'class': 'ihf-sold-price-grid'}).text.strip(),
                'id': img_name,
                'lot size': lotsize,
                'adress': listing.find('span', {'class': 'ihf-grid-result-address'}).text.strip(),
                'bed': listing.find('div', {'class': 'ihf-grid-result-basic-info-item1'}).text.strip(),
                'bath': listing.find('div', {'class': 'ihf-grid-result-basic-info-item2'}).text.strip(),
                'house size': listing.find('div', {'class': 'ihf-grid-result-basic-info-item3'}).text.strip(),
            }
            houses.append(house)

        # Pause 3 seconds after every listing so the site is not overwhelmed.
        time.sleep(3)

    print('completed %s houses' % len(houses))

    df = pd.DataFrame(houses, columns=['id', 'adress', 'bed', 'bath', 'house size', 'lot size', 'price'])
    return df

In [171]:
# Build the list of result-page URLs to scrape. Only the page number varies
# between URLs, so the fixed pieces are defined once outside the loop.
left = 'https://www.grubbco.com/homes-for-sale-results/?bedrooms=0&propertyType=SFR%2CCND&maxListPrice=&pg='
right = '&squareFeet=&cityId=834&minListPrice=400000&status=sold&bathCount=0'

page = []
for i in range(24, 25):
    url = '{}{}{}'.format(left, i, right)
    page.append(url)
    print(url)
    

https://www.grubbco.com/homes-for-sale-results/?bedrooms=0&propertyType=SFR%2CCND&maxListPrice=&pg=24&squareFeet=&cityId=834&minListPrice=400000&status=sold&bathCount=0


In [161]:
# Scrape every URL in `page` and write the combined result to
# ./dataset/<csv_name>.csv. `city` is also used as the image sub-folder
# passed to webscraping.
city = 'sanpablo_sold'
csv_name = 'sanpablo_sold'
houses = []
for num, url in enumerate(page):
    print(city + '-page' + str(num))
    t0 = time.time()
    try:
        output = webscraping(url, city)
    except requests.exceptions.RequestException as err:
        # RequestException is the base class of MissingSchema, so this also
        # covers HTTP and connection errors. A single failing page is
        # reported and skipped instead of aborting the run and losing the
        # pages already scraped.
        print('ERROR:')
        print(url, city)
        print(err)
        print('')
    else:
        houses.append(output)
    t1 = time.time()
    print('Total time = %.2f seconds' % (t1 - t0))
    print()

if houses:
    pd.concat(houses).to_csv('./dataset/' + csv_name + '.csv', index=False)
else:
    # pd.concat raises ValueError on an empty list; report instead of crashing.
    print('No pages were scraped; nothing written to ./dataset/' + csv_name + '.csv')

print('***********************************************************')
print('***********************************************************')


sanpablo_sold-page0
completed 12 houses
Total time = 68.99 seconds

sanpablo_sold-page1
completed 12 houses
Total time = 71.18 seconds

sanpablo_sold-page2
completed 12 houses
Total time = 80.38 seconds

sanpablo_sold-page3
completed 12 houses
Total time = 79.39 seconds

sanpablo_sold-page4
completed 12 houses
Total time = 76.55 seconds

sanpablo_sold-page5
completed 12 houses
Total time = 78.85 seconds

sanpablo_sold-page6
completed 12 houses
Total time = 83.49 seconds

sanpablo_sold-page7
completed 12 houses
Total time = 82.09 seconds

sanpablo_sold-page8
completed 12 houses
Total time = 77.11 seconds

sanpablo_sold-page9
completed 12 houses
Total time = 76.72 seconds

sanpablo_sold-page10
completed 12 houses
Total time = 79.54 seconds

sanpablo_sold-page11
completed 12 houses
Total time = 76.08 seconds

sanpablo_sold-page12
completed 12 houses
Total time = 83.46 seconds

sanpablo_sold-page13
completed 12 houses
Total time = 79.76 seconds

sanpablo_sold-page14
completed 12 houses
Tot

In [4]:
# Load the scraped San Pablo listings back from disk, report the table's
# dimensions, and preview the first rows.
sanpablo_csv = './dataset/sanpablo_sold.csv'
sanpablo = pd.read_csv(sanpablo_csv)
print(sanpablo.shape)
sanpablo.head()

(296, 7)


Unnamed: 0,id,adress,bed,bath,house size,lot size,price
0,40833860.JPG,"5665 Shasta Ave SAN PABLO, CA 94806",4 Beds,2 Baths,"1,550 SqFt","3, 600","SOLD: $565,000"
1,40834112.JPG,"108 Padua SAN PABLO, CA 94806",4 Beds,2 | 1 Baths,"1,615 SqFt","1, 839","SOLD: $525,000"
2,40824385.JPG,"2121 Vale Road #3 SAN PABLO, CA 94806",3 Beds,3 Baths,"1,858 SqFt",747,"SOLD: $425,000"
3,40835001.JPG,"2455 Mahan Way SAN PABLO, CA 94806",4 Beds,2 Baths,"1,331 SqFt","10, 660","SOLD: $615,000"
4,40830917.JPG,"2943 14Th St SAN PABLO, CA 94806",3 Beds,2 Baths,"1,200 SqFt","5, 000","SOLD: $440,000"
