In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame, Series
import re
import dateutil.parser

In [2]:
# URL prefix of the paginated auction-results listing; create_urls() appends
# the page number directly after "page=".
baseurl = 'https://www.interest.co.nz/property/residential-auction-results?&page='

In [3]:
def create_urls(baseurl, total_items=31364, page_size=25):
    """Build the list of page URLs to scrape.

    Page numbers start at 1 and are appended to ``baseurl`` as plain text.

    Parameters
    ----------
    baseurl : str
        URL prefix ending where the page number belongs.
    total_items : int, optional
        Numerator of the page-count formula; presumably the number of
        listings on the site when the notebook was written (TODO confirm).
    page_size : int, optional
        Denominator of the page-count formula; presumably the number of
        listings shown per page (TODO confirm).

    Returns
    -------
    list of str
        ``total_items // page_size + 1`` URLs, for pages 1, 2, ...
    """
    # Floor division keeps the count an int on Python 3; plain "/" yields a
    # float there and range() rejects it with TypeError.
    num_pages = total_items // page_size + 1
    return [baseurl + str(page) for page in range(1, num_pages + 1)]

In [4]:
# Build the full list of page URLs up front; scrapdata() iterates over it.
allurls = create_urls(baseurl)

In [5]:
def scrapdata(allurls):
    """Download every page in ``allurls`` and extract one record per property card.

    Each page is fetched with ``requests`` and parsed with BeautifulSoup
    (lxml parser). Every ``<div id="padb-property-card">`` on the page becomes
    one dict whose values are the stripped text of the card's field divs.

    Parameters
    ----------
    allurls : iterable of str
        Page URLs to fetch, in order.

    Returns
    -------
    list of dict
        One dict per card with the keys ``property_id``, ``property_region``,
        ``property_address``, ``property_details``, ``property_agency``,
        ``property_agents``, ``property_auctiondetails`` and ``property_value``.
        A field whose div is absent from the card is stored as ``''``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    # Output key -> id attribute of the div that holds the field inside a card.
    field_div_ids = (
        ('property_id', 'padb-property-id'),
        ('property_region', 'padb-property-region'),
        ('property_address', 'padb-property-address'),
        ('property_details', 'padb-property-details'),
        ('property_agency', 'padb-property-agency'),
        ('property_agents', 'padb-property-agents'),
        ('property_auctiondetails', 'padb-property-auction-details'),
        ('property_value', 'padb-property-value'),
    )

    def field_text(card, div_id):
        # find() returns None when the div is missing; fall back to an empty
        # string so one incomplete card cannot abort the whole scrape.
        node = card.find('div', id=div_id)
        return node.get_text().strip() if node is not None else ''

    out = []
    for url in allurls:
        # Without a timeout a single stalled connection blocks forever.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, "lxml")
        for listing in soup.find_all('div', id="padb-property-card"):
            out.append({key: field_text(listing, div_id)
                        for key, div_id in field_div_ids})
    return out

In [6]:
# Run the scrape: scrapdata() issues one HTTP request per URL in allurls.
out = scrapdata(allurls)

In [7]:
# Turn the list of per-card dicts into a frame: one row per card, one column
# per dict key.
scrapped_data = pd.DataFrame.from_dict(out, orient='columns')

In [8]:
# Persist the raw scrape to disk; the next cell reloads it from this file.
scrapped_data.to_pickle('scrapped_data_30thjuly.pkl')

In [9]:
# Reload the raw scrape from the pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that this notebook itself produced.
scrapped_data = pd.read_pickle('scrapped_data_30thjuly.pkl')

In [14]:
# Preview the first rows of the frame.
scrapped_data.head()

In [10]:
# Cleaning agent names
def get_agent_names(row):
    """Return the row's ``property_agents`` text without its boilerplate.

    Every occurrence of the label ``"Agent(s): "`` and of the marker
    ``"(contact)"`` is deleted; surrounding whitespace is left untouched.
    """
    agents_text = row['property_agents']
    for boilerplate in ("Agent(s): ", "(contact)"):
        agents_text = agents_text.replace(boilerplate, "")
    return agents_text

In [11]:
# Add a cleaned agents column by applying get_agent_names to every row.
# NOTE(review): the output recorded for this cell is "ValueError: Wrong number
# of items passed 0, placement implies 1". Presumably scrapped_data had no rows
# when it ran, so apply() returned a frame rather than a Series — confirm.
scrapped_data['property_agents_cleaned'] = scrapped_data.apply(get_agent_names, axis=1)

ValueError: Wrong number of items passed 0, placement implies 1

In [12]:
#Cleaning property id
def get_property_id(row):
    """Extract the listing id from the row's ``property_id`` text.

    The text is expected to start with ``"Listing ID:"``; everything after
    the colon up to the end of the line is returned unchanged, including any
    whitespace that follows the colon.

    Returns
    -------
    str or None
        The captured id text, or ``None`` when the text does not start with
        the ``"Listing ID:"`` label.
    """
    s = row['property_id']
    found = re.match(r'Listing ID:(.*)', s)
    # re.match returns None on a non-matching value; report that as a missing
    # id instead of raising AttributeError and aborting the whole apply().
    if found is None:
        return None
    return found.group(1)

In [13]:
# Add a cleaned id column by applying get_property_id to every row.
# NOTE(review): the output recorded for this cell is "ValueError: Wrong number
# of items passed 0, placement implies 1". Presumably scrapped_data had no rows
# when it ran, so apply() returned a frame rather than a Series — confirm.
scrapped_data['property_id_cleaned'] = scrapped_data.apply(get_property_id, axis=1)

ValueError: Wrong number of items passed 0, placement implies 1

In [14]:
# Compiled once at definition time instead of being rebuilt for every row.
# The template is a raw string: "\s" in a plain literal is an invalid escape
# sequence and triggers a SyntaxWarning on recent Python versions.
_AUCTION_DATE_RX = re.compile(
    r"((?:{smon}\s+{sday1}|{sday1}\s+{smon}))(?:{sdummy}({syear1}))?".format(
        smon=r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?)',
        sday1=r'\d{1,2}(?:st|nd|rd|th)?',
        sdummy=r'.*',
        syear1=r'\b\d{2}(?:\d{2})?\b',
    )
)


def get_auction_date(row):
    """Pull a "day month year" string out of the row's ``property_auctiondetails``.

    The first "<month> <day>" or "<day> <month>" pair in the stripped text is
    taken as the date. The year is the LAST standalone 2- or 4-digit number
    after that pair, because the gap before it is matched greedily; an
    unrelated trailing number can therefore be picked up as the year.

    Returns
    -------
    str
        ``"<day-month text> <year>"``. When no year is found the year part is
        the literal text ``"None"`` (e.g. ``"February 20 None"``). When no
        day/month pair is found the result is ``"NO MATCH"``.
    """
    s = row['property_auctiondetails'].strip()
    m = _AUCTION_DATE_RX.search(s)
    if m:
        # group(2) is None when the optional year part did not match, which
        # format() renders as the text "None".
        return "{} {}".format(m.group(1), m.group(2))
    return "NO MATCH"

In [15]:
# Store the date text extracted by get_auction_date for every row.
scrapped_data['auction_date'] = scrapped_data.apply(get_auction_date, axis=1)

In [16]:
def sold_or_not(row):
    """Classify a listing as ``'Sold'`` or ``'Not_Sold'`` from ``property_value``.

    The stripped value counts as sold when it is non-empty and its first
    character is a digit or a capital ``'S'``. Anything else, including a
    blank value, is reported as not sold.
    """
    value_text = row['property_value'].strip()
    if not value_text:
        return 'Not_Sold'
    first_char = value_text[0]
    looks_sold = first_char.isdigit() or first_char == 'S'
    return 'Sold' if looks_sold else 'Not_Sold'

In [17]:
# Label every row 'Sold' or 'Not_Sold' via sold_or_not.
scrapped_data['sold_or_not'] = scrapped_data.apply(sold_or_not, axis=1)

In [18]:
def parse_date(row):
    """Convert the row's ``auction_date`` text into a parsed date.

    A handful of known-bad extracted strings are first replaced, on the row
    itself, by hand-corrected dates. The sentinel ``'NO MATCH'`` yields an
    empty string; any other value is passed to ``dateutil.parser.parse``.
    """
    # Known-bad extraction results and their manual corrections.
    manual_corrections = {
        'February 20 None': 'February 20 2017',
        '13 October 34': '13 October 2016',
        '29 September 34': '29 September 2016',
        '2 February 34': '2 February 2016',
        '7th September 34': '7 September 2016',
        '28 September 34': '28 September 2016',
    }
    # Unlisted values are written back unchanged.
    row.auction_date = manual_corrections.get(row.auction_date, row.auction_date)
    if row.auction_date == 'NO MATCH':
        return ''
    return dateutil.parser.parse(row.auction_date)

In [19]:
# Overwrite the extracted date text with parse_date's result.
# NOTE(review): this rewrites the same column it reads, so re-running this
# cell without first re-running the extraction cell feeds already-parsed
# values back into parse_date.
scrapped_data['auction_date'] = scrapped_data.apply(parse_date, axis=1)

In [20]:
# Number words -> integers, used to normalise the extracted room counts.
# "one" must map to 1; mapping it to 2 would silently double every
# single-room listing written as a word.
di = {"zero": 0 , "one": 1, "two" : 2, "three" : 3, "four" : 4,
      "five" : 5, "six" : 6, "seven" : 7, "eight" : 8, "nine" : 9, "ten" : 10, "eleven" : 11}

In [21]:
def getbedRoomInfo(row):
    """Return the bedroom count token found in the row's ``property_details``.

    Two layouts are recognised, case-insensitively: a word or number written
    before "bed"/"bedroom(s)" (optionally with "double" in between), or a
    number written after "bed"/"bedroom(s)" and an optional colon. The token
    is returned lower-cased, with any trailing "+" kept. ``None`` is returned
    when neither layout is found.
    """
    details = row['property_details'].strip()
    bedroom_rx = r'(\w+\+?) *(?:double +)?bed(?:room)?s?|bed(?:room)?s?:? *(\d+\+?)'
    found = re.search(bedroom_rx, details, re.IGNORECASE)
    if not found:
        return None
    count_before, count_after = found.groups()
    if count_before is not None:
        return count_before.lower()
    if count_after is not None:
        return count_after.lower()
    return None

In [22]:
# Extract the bedroom token per row, drop any "+" and surrounding whitespace,
# then turn number words into integers via di; tokens not in di are kept as-is.
bedroom_tokens = scrapped_data.apply(getbedRoomInfo, axis=1)
bedroom_tokens = bedroom_tokens.str.replace('+','').str.strip()
scrapped_data['num_bedrooms'] = bedroom_tokens.map(di).fillna(bedroom_tokens)

In [23]:
def getbathRoomInfo(row):
    """Return the bathroom count token found in the row's ``property_details``.

    Two layouts are recognised, case-insensitively: a word or number written
    before "bath"/"bathroom(s)" (optionally with "double" in between), or a
    number written after "bath"/"bathroom(s)" and an optional colon. The token
    is returned lower-cased, with any trailing "+" kept. ``None`` is returned
    when neither layout is found.
    """
    details = row['property_details'].strip()
    bathroom_rx = r'(\w+\+?) *(?:double +)?bath(?:room)?s?|bath(?:room)?s?:? *(\d+\+?)'
    found = re.search(bathroom_rx, details, re.IGNORECASE)
    if not found:
        return None
    count_before, count_after = found.groups()
    if count_before is not None:
        return count_before.lower()
    if count_after is not None:
        return count_after.lower()
    return None

In [24]:
# Extract the bathroom token per row, drop any "+" and surrounding whitespace,
# then turn number words into integers via di; tokens not in di are kept as-is.
bathroom_tokens = scrapped_data.apply(getbathRoomInfo, axis=1)
bathroom_tokens = bathroom_tokens.str.replace('+','').str.strip()
scrapped_data['num_bathrooms'] = bathroom_tokens.map(di).fillna(bathroom_tokens)

In [25]:
# Export the processed frame as UTF-8 CSV, leaving out the index column.
scrapped_data.to_csv('scrapped_data_30thjuly.csv', index = False,encoding='utf-8')