In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame, Series
import re
import dateutil.parser
from word2number import w2n

In [2]:
baseurl = 'https://www.interest.co.nz/property/residential-auction-results?&page='

In [8]:
def create_urls(baseurl):
    num_pages = (31039/25) + 1
    urls = []
    for i in range(num_pages):
        i = i + 1
        url = baseurl+str(i)
        urls.append(url)
    return urls

In [9]:
allurls = create_urls(baseurl)

In [10]:
#testurls = ["https://www.interest.co.nz/property/residential-auction-results?region=Auckland&district=Waitakere%20City&area=Hobsonville&agency=-&status=Sold&page=1"]
#testurls

In [11]:
def scrapdata(allurls):
    out = []
    for url in allurls:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "lxml")
        for listing in soup.find_all('div', id="padb-property-card"):
            property_id = listing.find('div', id="padb-property-id").get_text().strip()
            property_region = listing.find('div', id="padb-property-region").get_text().strip()
            property_address = listing.find('div', id="padb-property-address").get_text().strip()
            property_details = listing.find('div', id="padb-property-details").get_text().strip()
            property_agency = listing.find('div', id="padb-property-agency").get_text().strip()
            property_agents = listing.find('div', id="padb-property-agents").get_text().strip()
            property_auctiondetails = listing.find('div', id="padb-property-auction-details").get_text().strip()
            property_value = listing.find('div', id="padb-property-value").get_text().strip()
            df_dict = dict({'property_id' : property_id,'property_region' : property_region,
                    'property_address' : property_address, 'property_details' : property_details,
                     'property_agency' : property_agency, 'property_agents' : property_agents,
                    'property_auctiondetails' : property_auctiondetails,'property_value' : property_value})
            out.append(df_dict)
    return out

In [12]:
out = scrapdata(allurls)

In [13]:
scrapped_data = pd.DataFrame.from_dict(out, orient='columns')

In [14]:
scrapped_data.to_pickle('scrapped_data.pkl')

In [2]:
scrapped_data = pd.read_pickle('scrapped_data.pkl')

In [3]:
#Cleaning agent names
def get_agent_names(row):
    s = row['property_agents']
    return s.replace("Agent(s): ","").replace("(contact)","")


scrapped_data['property_agents_cleaned'] = scrapped_data.apply(get_agent_names, axis=1)

In [4]:
#Cleaning property id
def get_property_id(row):
    s = row['property_id']
    return re.match(r'Listing ID:(.*)', s).group(1)

scrapped_data['property_id_cleaned'] = scrapped_data.apply(get_property_id, axis=1)

In [5]:
#Getting auction date
def get_auction_date_old(row):
    s = row['property_auctiondetails'].strip()
    matched = re.match(r'Auction details:\s(.*)', s)
    if matched:
        if '(On site)' in matched.group(1):
            return matched.group(1).replace("(On site)","")
        else:
            return matched.group(1)
    else:
        return ''

def get_auction_date(row):
    s = row['property_auctiondetails'].strip()
    return re.match(r'Auction details:\s*(.*)', s).group(1)

def get_auction_date_new(row):
    s = row['property_auctiondetails'].strip()
    mon = r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?)'
    day1 = r'\d{1,2}(?:st|nd|rd|th)?'
    year1 = r'\b\d{2}(?:\d{2})?\b'
    dummy = r'.*'
    rx = "((?:{smon}\s+{sday1}|{sday1}\s+{smon}))(?:{sdummy}({syear1}))?".format(smon=mon, sday1=day1, sdummy=dummy, syear1=year1)
    m = re.search(rx, s)
    if m:
        return("{} {}".format(m.group(1), m.group(2)))
    else:
        return("NO MATCH")

scrapped_data['auction_date'] = scrapped_data.apply(get_auction_date_new, axis=1)

In [6]:
def sold_or_not(row):
    s = row['property_value'].strip()
    if 'Passed in' in s:
        out = 'Not_Sold'
    elif 'Withdrawn' in s:
        out = 'Not_Sold'
    elif s == '':
        out = 'Not_Sold'
    else:
        out = 'Sold'
    return out

def sold_or_notv1(row):
    s = row['property_value'].strip()
    if s:
        if s[0].isdigit():
            out = 'Sold'
        elif s[0] == 'S':
            out = 'Sold'
        else:
            out = 'Not_Sold'
    else:
        out = 'Not_Sold'
    return out
 
#scrapped_test = scrapped_data.tail()
#scrapped_test['sold_or_not'] = scrapped_test.apply(sold_or_notv1, axis=1)

scrapped_data['sold_or_not'] = scrapped_data.apply(sold_or_notv1, axis=1)

In [7]:
def sold_or_not_test(s):
    s = s.strip()
    if s:
        if s[0].isdigit():
            out = 'Sold'
        elif s[0] is 'S':
            out = 'Sold'
        else:
            out = 'Not_Sold'
    else:
        out = 'Not_Sold'
    return out

In [8]:
sold_or_not_test('Sold for:$500,000')

'Sold'

In [9]:
def parse_date(row):
    if row.auction_date == 'February 20 None':
        row.auction_date = 'February 20 2017'
    elif row.auction_date == '13 October 34':
        row.auction_date = '13 October 2016'
    elif row.auction_date == '29 September 34':
        row.auction_date = '29 September 2016'
    elif row.auction_date == '2 February 34':
        row.auction_date = '2 February 2016'
    elif row.auction_date == '7th September 34':
        row.auction_date = '7 September 2016'
    elif row.auction_date == '28 September 34':
        row.auction_date = '28 September 2016'
    else:
        row.auction_date = row.auction_date
    if row.auction_date != 'NO MATCH':
        return dateutil.parser.parse(row.auction_date)
    else:
        return ''

In [10]:
scrapped_data['auction_date'] = scrapped_data.apply(parse_date, axis=1)

In [11]:
def get_num_bed(row):
    s = row['property_details'].strip()
    rgx = r'(\w+) *(?:double +)?bed(room)?s?|bed(?:room)?s?:? *(\d+)'
    m = re.search(rgx, s.lower())
    if m :
        return filter(lambda x: x.isdigit(), m.group())
    
def get_num_bath(row):
    s = row['property_details'].strip()
    rgx = r'(\w+) *(?:double +)?bath(room)?s?|bath(?:room)?s?:? *(\d+)'
    m = re.search(rgx, s.lower())
    if m :
        return filter(lambda x: x.isdigit(), m.group())

In [12]:
def getbedRoomInfo(row):
    s = row['property_details'].strip()
    numRooms = re.search(r'(\w+\+?) *(?:double +)?bed(?:room)?s?|bed(?:room)?s?:? *(\d+\+?)', s, re.IGNORECASE)
    if (numRooms):
        if (numRooms.group(1) is not None):
            return numRooms.group(1).lower()
        elif (numRooms.group(2) is not None):
            return numRooms.group(2).lower()

In [13]:
def getbathRoomInfo(row):
    s = row['property_details'].strip()
    numRooms = re.search(r'(\w+\+?) *(?:double +)?bath(?:room)?s?|bath(?:room)?s?:? *(\d+\+?)', s, re.IGNORECASE)
    if (numRooms):
        if (numRooms.group(1) is not None):
            return numRooms.group(1).lower()
        elif (numRooms.group(2) is not None):
            return numRooms.group(2).lower()

In [14]:
#test = scrapped_data.head(10)
#test['num_bedrooms'] = test.apply(getbedRoomInfo, axis=1)

In [15]:
di = {"zero": 0 , "one": 2, "two" : 2, "three" : 3, "four" : 4,
      "five" : 5, "six" : 6, "seven" : 7, "eight" : 8, "nine" : 9, "ten" : 10, "eleven" : 11}

In [23]:
scrapped_data['num_bedrooms'] = scrapped_data.apply(getbedRoomInfo, axis=1)

In [25]:
scrapped_data['num_bedrooms'] = scrapped_data['num_bedrooms'].str.replace('+','')
scrapped_data['num_bedrooms'] = scrapped_data['num_bedrooms'].str.strip()
scrapped_data['num_bedrooms'] = scrapped_data['num_bedrooms'].map(di).fillna(scrapped_data['num_bedrooms'])

In [17]:
#scrapped_data['num_bedrooms'] = scrapped_data.apply(getbedRoomInfo, axis=1)
#scrapped_data['num_bedrooms'] = scrapped_data.apply(bed_convert_to_num, axis=1)

In [27]:
scrapped_data['num_bathrooms'] = scrapped_data.apply(getbathRoomInfo, axis=1)
scrapped_data['num_bathrooms'] = scrapped_data['num_bathrooms'].str.replace('+','')
scrapped_data['num_bathrooms'] = scrapped_data['num_bathrooms'].str.strip()
scrapped_data['num_bathrooms'] = scrapped_data['num_bathrooms'].map(di).fillna(scrapped_data['num_bathrooms'])

In [28]:
scrapped_data.to_csv('scrapped_data.csv', index = False,encoding='utf-8')

In [26]:
scrapped_data[scrapped_data.property_id_cleaned.isin(['ebt3177','ech1146','578663'])]

Unnamed: 0,property_address,property_agency,property_agents,property_auctiondetails,property_details,property_id,property_region,property_value,property_agents_cleaned,property_id_cleaned,auction_date,sold_or_not,num_bedrooms
208,"81 Lynley Park Drive, Omokoroa",Eves,"Agent(s): Ron Beccard (contact), Debi Bennett ...","Auction details: Thursday, 28th February '19","3+ bedroom(s), 2 bathroom(s), 3 garage(s)",Listing ID:ebt3177,"Bay of Plenty, Western Bay Of Plenty",Passed In,"Ron Beccard , Debi Bennett",ebt3177,2019-02-28,Not_Sold,3
211,"23 Emmerdale Place, Ohauiti",Eves,Agent(s): Chris Royal (contact),"Auction details: Thursday, 28th February '19","3+ bedroom(s), 2 bathroom(s), 2 garage(s)",Listing ID:ech1146,"Bay of Plenty, Western Bay Of Plenty",Passed In,Chris Royal,ech1146,2019-02-28,Not_Sold,3
27249,"8 Reipae Street, Stonefields, Auckland",Barfoot & Thompson,Agent(s): Paul Neshausen,"Auction details: 14 September 2016, Shortland ...","Four bedroom home , double garage",Listing ID:578663,"Auckland, Auckland City",Passed in,Paul Neshausen,578663,2016-09-14,Not_Sold,4


In [75]:
def bed_convert_to_num(row):
    mylist = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen"]
    s = row.num_bedrooms
    if (s is not None):
        print s
        print 'Number'
        s = str(s.replace('+',''))
        if not s.isdigit():
            if s in mylist:
                out = w2n.word_to_num(s)
            else:
                out = None
    else:
        out = None
        print 'text'
    return out

def bath_convert_to_num(row):
    s = row.num_bathrooms
    if (s is not None):
        s = str(s.replace('+',''))
        if not s.isdigit():
            try:
                return w2n.word_to_num(s)
            except ValueError as e:
                return None
    else:
        return s

In [26]:
#scrapped_data[scrapped_data.auction_date.isnull()]

In [27]:
#scrapped_data.tail()

In [34]:
scrapped_data[scrapped_data.num_bedrooms.isnull()].head()

Unnamed: 0,property_address,property_agency,property_agents,property_auctiondetails,property_details,property_id,property_region,property_value,property_agents_cleaned,property_id_cleaned,auction_date,sold_or_not,num_bedrooms,num_bathrooms
0,"38 Main Road, Tirau",Bayleys,"Agent(s): Mike Swanson (contact), Alex Ten Hov...","Auction details: Thursday, 7th March '19",,Listing ID:2310304,"Waikato, South Waikato","Sold for:$518,000Rating Valuation:(July '18)$3...","Mike Swanson , Alex Ten Hove",2310304,2019-03-07,Sold,,
1,"Ground floor 118 Rostrevor Street, Hamilton Ce...",Bayleys,"Agent(s): David Cashmore (contact), Sarah Liu ...","Auction details: Thursday, 7th March '19",,Listing ID:2310273,"Waikato, Hamilton City",Passed In,"David Cashmore , Sarah Liu (Hamilton)",2310273,2019-03-07,Not_Sold,,
2,"195 Te Kawana Road, Te Aroha",Bayleys,"Agent(s): Josh Smith (contact), David Cashmore...","Auction details: Thursday, 7th March '19","365m²,",Listing ID:2310206,"Waikato, Matamata-Piako",Withdrawn,"Josh Smith , David Cashmore",2310206,2019-03-07,Not_Sold,,
26,"A/29 Birdwood Crescent, Parnell",Bayleys,"Agent(s): Kate Kirby (contact), Stephen Scott ...","Auction details: Wednesday, 6th March '19","Block of seven 1960s-era units, each on its ow...",Listing ID:1687521,"Auckland, Auckland City","Sold for:$4,025,000","Kate Kirby , Stephen Scott",1687521,2019-03-06,Sold,,
69,"92 Pleasant Road, Titirangi",Barfoot & Thompson,Agent(s): Alan Hyslop and Robyn Rule,"Auction details: Saturday, 2nd March '19",,Listing ID:770178,"Auckland, Waitakere City","Sold for:$850,000Rating Valuation:(July '17)$9...",Alan Hyslop and Robyn Rule,770178,2019-03-02,Sold,,


In [63]:
test = scrapped_data[scrapped_data.property_id_cleaned.isin(['579087','578396','578663'])]
#scrapped_data[scrapped_data.index.isin([579087])]

In [64]:
test.head()

Unnamed: 0,property_address,property_agency,property_agents,property_auctiondetails,property_details,property_id,property_region,property_value,property_agents_cleaned,property_id_cleaned,auction_date,sold_or_not,num_bedrooms,num_bathrooms
27229,"5A/22 Emily Place, Auckland City, Auckland",Barfoot & Thompson,Agent(s): Aken Yuan and Tony Jiang,"Auction details: 14 September 2016, Shortland ...",Two bedroom apartment with views,Listing ID:579087,"Auckland, Auckland City",Passed in,Aken Yuan and Tony Jiang,579087,2016-09-14,Not_Sold,Two,
27232,"1/7 Sprott Road, Kohimarama, Auckland",Barfoot & Thompson,Agent(s): Matt O'Brien,"Auction details: 14 September 2016, Shortland ...",Four bedroom home,Listing ID:578396,"Auckland, Auckland City",Passed in,Matt O'Brien,578396,2016-09-14,Not_Sold,Four,
27249,"8 Reipae Street, Stonefields, Auckland",Barfoot & Thompson,Agent(s): Paul Neshausen,"Auction details: 14 September 2016, Shortland ...","Four bedroom home , double garage",Listing ID:578663,"Auckland, Auckland City",Passed in,Paul Neshausen,578663,2016-09-14,Not_Sold,Four,


In [44]:
import re

def getRoomInfo(s):
    s = s.strip()
    numRooms = re.search(r'(\w+\+?) *(?:double +)?bed(?:room)?s?|bed(?:room)?s?:? *(\d+\+?)', s, re.IGNORECASE)
    if (numRooms):
        if (numRooms.group(1) is not None):
            return numRooms.group(1)
        elif (numRooms.group(2) is not None):
            return numRooms.group(2)


#arr = ['Two bedroom apartment with views','4 bedrooms 2 bathrooms 3 carparks','3 bedroom house','Bedrooms 2, ','beds 5,','Bedrooms 1, ','2 bedrooms, 1 bathroom, ','Four bedrooms home, double garage','Four bedrooms home','Three double bedrooms home, garage','Three bedrooms home,','2 bedroom home unit with single carport.','Garage car spaces: 2, Bathrooms: 4, Bedrooms: 7,\\\\', 'Three bedroom bungalow with conservatory and", "One bedroom unit","4+ bedroom(s), 2 bathroom(s), 2 garage(s)']
arr = list(test.property_details)
for s in arr:
    print(s, ' --> ', getRoomInfo(s))

(u'Two bedroom apartment with views', ' --> ', u'Two')
(u'Four bedroom home', ' --> ', u'Four')
(u'Four bedroom home , double garage', ' --> ', u'Four')


In [7]:
#scrapped_data = scrapped_data.join(scrapped_data['auction_date'].str.split(' ', expand=True).add_prefix('actiondate').fillna(np.nan))

In [8]:
print(len(scrapped_data[scrapped_data.actiondate3.notnull()]))

34


In [23]:
#date1 = "February 20 2017"
#dateutil.parser.parse(date1)

datetime.datetime(2017, 2, 20, 0, 0)

In [10]:
scrapped_data[scrapped_data.actiondate3.notnull()]

Unnamed: 0,property_address,property_agency,property_agents,property_auctiondetails,property_details,property_id,property_region,property_value,property_agents_cleaned,property_id_cleaned,auction_date,sold_or_not,actiondate0,actiondate1,actiondate2,actiondate3
25357,"15 Mural Place, Greenhithe,Auckland",Barfoot & Thompson,Agent(s): Analia Gentile and Titus Lu,"Auction details: 6 October 2016, Bruce Mason ...","Four bedrooms home, double garage",Listing ID:580625,"Auckland, North Shore City","Sold for:$1,210,000",Analia Gentile and Titus Lu,580625,6 October 2016,Sold,6,,October,2016
25358,"51 Park Road, Glenfield, Auckland",Barfoot & Thompson,Agent(s): Jacky Yu and Jane Wang,"Auction details: 6 October 2016, Bruce Mason ...",Four bedrooms home,Listing ID:578732,"Auckland, North Shore City",Passed in,Jacky Yu and Jane Wang,578732,6 October 2016,Not_Sold,6,,October,2016
25359,"575 Laurie Southwick Parade, Gulf Harbour, Auc...",Barfoot & Thompson,Agent(s): Maxwell Zhu and Wendy Wang,"Auction details: 6 October 2016, Bruce Mason ...",Water front section,Listing ID:579806,"Auckland, Rodney",Passed in,Maxwell Zhu and Wendy Wang,579806,6 October 2016,Not_Sold,6,,October,2016
25360,"177 Metcalfe Road, Ranui, Auckland",Barfoot & Thompson,Agent(s): Cleo Zhang,"Auction details: 6 October 2016, Bruce Mason ...","1950's home, three double bedrooms, set on a 8...",Listing ID:579928,"Auckland, Waitakere City",Passed in,Cleo Zhang,579928,6 October 2016,Not_Sold,6,,October,2016
25361,"6 Phillips Road, Warkworth, Auckland",Barfoot & Thompson,Agent(s): David Dai,"Auction details: 6 October 2016, Bruce Mason ...","Character home, four bedrooms, set on a 12797sqm",Listing ID:579297,"Auckland, Rodney","Sold for:$880,000",David Dai,579297,6 October 2016,Sold,6,,October,2016
25362,"9 Steamer Road, Silverdale, Auckland",Barfoot & Thompson,Agent(s): Anna Langdon,"Auction details: 6 October 2016, Bruce Mason ...","Five double bedrooms home, four living areas",Listing ID:575146,"Auckland, Rodney",Passed in,Anna Langdon,575146,6 October 2016,Not_Sold,6,,October,2016
25363,"10 Woodall Place, Glenfield, Auckland",Barfoot & Thompson,Agent(s): Brian Li,"Auction details: 6 October 2016, Bruce Mason ...",Home set in a cul de sac,Listing ID:579832,"Auckland, North Shore City",Passed in,Brian Li,579832,6 October 2016,Not_Sold,6,,October,2016
25364,"16A Bay Street, Red Beach, Auckland",Barfoot & Thompson,Agent(s): Debbie Donovan,"Auction details: 6 October 2016, Bruce Mason ...","Two level home , three bedrooms",Listing ID:579314,"Auckland, Rodney","Sold for:$697,000",Debbie Donovan,579314,6 October 2016,Sold,6,,October,2016
25365,"30 Hornbill Drive, Albany, Auckland",Barfoot & Thompson,Agent(s): Raymond Li,"Auction details: 6 October 2016, Bruce Mason ...","Modern home, five double bedrooms, double inte...",Listing ID:579611,"Auckland, North Shore City","Sold for:$1,500,000",Raymond Li,579611,6 October 2016,Sold,6,,October,2016
25366,"5/242 Great North Road, Henderson, Auckland",Barfoot & Thompson,Agent(s): Rakhi Walecha and Richard Pearce,"Auction details: 6 October 2016, Bruce Mason ...","Single level home, two double bedrooms",Listing ID:580125,"Auckland, Waitakere City",Passed in,Rakhi Walecha and Richard Pearce,580125,6 October 2016,Not_Sold,6,,October,2016


In [20]:
scrapped_data[scrapped_data.index.isin([21707])]['auction_date']

21707    February 20 None
Name: auction_date, dtype: object

In [26]:
scrapped_data[scrapped_data.index.isin([129])]

Unnamed: 0,property_address,property_agency,property_agents,property_auctiondetails,property_details,property_id,property_region,property_value,property_agents_cleaned,property_id_cleaned,auction_date
129,"1/49 Namata Road, One Tree Hill",Barfoot & Thompson,Agent(s): Kelly Fan,"Auction details: Thursday, 28th February '19","2 bed, 1 bath",Listing ID:770076,"Auckland, Auckland City",,Kelly Fan,770076,28th February 19


In [22]:
scrapped_data.iloc[np.r_[21700:21800]]
#scrapped_data.iloc[np.r_[1000:1100]]

Unnamed: 0,property_address,property_agency,property_agents,property_auctiondetails,property_details,property_id,property_region,property_value,property_agents_cleaned,property_id_cleaned,auction_date,sold_or_not,actiondate0,actiondate1,actiondate2,actiondate3
21700,"54 Jandell Cres, Bucklands Beach, Auckland.",Barfoot & Thompson,Agent(s): Jessie Lu,"Auction details: 31 January 2017, Manukau Spor...","5 Bedroom, 4 bathroom house with dbl garage on...",Listing ID:585961,"Auckland, Manukau City",Passed in,Jessie Lu,585961,31 January 2017,Not_Sold,31,January,2017,
21701,"29 Annalong Rd, Dannemora, Auckland.",Barfoot & Thompson,"Agent(s): Raylene Yang, Sam Zhang","Auction details: 31 January 2017, Manukau Spor...",,Listing ID:587189,"Auckland, Manukau City","Sold for:$1,590,000","Raylene Yang, Sam Zhang",587189,31 January 2017,Sold,31,January,2017,
21702,"11 Bexley Place, Papakura, Auckland.",Barfoot & Thompson,"Agent(s): Pamela Cox, Nadia Ansell","Auction details: 31 January 2017, Manukau Spor...",,Listing ID:586510,"Auckland, Papakura","Sold for:$764,000","Pamela Cox, Nadia Ansell",586510,31 January 2017,Sold,31,January,2017,
21703,"3 Yearsley Place, Manurewa, Auckland",Barfoot & Thompson,Agent(s): Justin Coleman,Auction details: 31 January 2017. Manukau Spor...,,Listing ID:587350,"Auckland, Manukau City","Sold for:$532,000",Justin Coleman,587350,31 January 2017,Sold,31,January,2017,
21704,"14 Tarapiroe Ave, Takanini, Auckland.",Barfoot & Thompson,Agent(s): Alex Shin,Auction details: 31 January 2017. Manukau Spor...,"4 bedroom, 2 bathroom house with double garage...",Listing ID:587339,"Auckland, Papakura",Passed in,Alex Shin,587339,31 January 2017,Not_Sold,31,January,2017,
21705,"9/38 MacLeod Rd, Henderson, Auckland.",Barfoot & Thompson,"Agent(s): Michael Benns, Denzil Pi\nto","Auction details: 4 February, On site(2017)",2 bedroom home unit with single carport.,Listing ID:585663,"Auckland, Waitakere City",Passed in,"Michael Benns, Denzil Pi\nto",585663,4 February 2017,Not_Sold,4,February,2017,
21706,"30 Sturges Rd, Henderson Heights, Auckland.",Barfoot & Thompson,"Agent(s): Joseph Lee, Kasey Wayne","Auction details: 4 February 2017, On site","4 bedroom, 2 bathroom, brick and tile with dou...",Listing ID:587067,"Auckland, Waitakere City","Sold for:$1,060,000","Joseph Lee, Kasey Wayne",587067,4 February 2017,Sold,4,February,2017,
21707,"3/14 The Esplanade, Castor Bay, Auckland.",Barfoot & Thompson,Agent(s): Tim Mahon,"Auction details: February 2017, On site.","4 bedroom, 2 bathroom with double garage and s...",Listing ID:586480,"Auckland, North Shore City",Passed in,Tim Mahon,586480,February 20 None,Not_Sold,February,20,,
21708,"9 Ponderosa Drive, Oteha, Auckland.",Barfoot & Thompson,"Agent(s): Charlotte Conroy, Kathlyn Xavier","Auction details: February 2017, On site.",,Listing ID:587069,"Auckland, North Shore City","Sold for:$1,150,000","Charlotte Conroy, Kathlyn Xavier",587069,February 20 None,Sold,February,20,,
21709,"13 Rising Parade, Fairview Heights, Auckland.",Barfoot & Thompson,"Agent(s): Cici Yang, Ritz, Zhao",Auction details: On site. February 2017,,Listing ID:587333,"Auckland, North Shore City",Sold for:$Sold prior,"Cici Yang, Ritz, Zhao",587333,February 20 None,Sold,February,20,,


In [68]:
#Pring top 3 and bottom 3 rows
scrapped_data.iloc[np.r_[0:3, -3:0]]

Unnamed: 0,property_address,property_agency,property_agents,property_auctiondetails,property_details,property_id,property_region,property_value,property_agents_cleaned,property_id_cleaned,auction_date,auction_location
0,"6C/55-57 High St, CBD.",Ray White City Apartments,"Agent(s): Krister Samuel (contact), Damian Pig...","Auction details: Thursday, 28th February '19","2 bedrooms, 1 bathroom, 83sqm.",Listing ID:2045071,"Auckland, Auckland City","Sold for:$Sold post $625,000","Krister Samuel , Damian Piggin",2045071,"Thursday, 28th February '19",Offsite
1,"617/145 Nelson St, CBD. Sugartree Centro build...",Ray White City Apartments,Agent(s): Victor Liu (contact),"Auction details: Thursday, 28th February '19","1 bedroom, 1 bathroom, 57sqm.",Listing ID:2045089,"Auckland, Auckland City","Sold for:$480,000",Victor Liu,2045089,"Thursday, 28th February '19",Offsite
2,"71/8 Thompson Park Rd, Mt Wellington. Thompson...",Ray White City Apartments,Agent(s): Lucia Gao (contact),"Auction details: Thursday, 28th February '19","2 bedrooms, 2 bathrooms, 1 carpark, 76sqm.",Listing ID:2041156,"Auckland, Auckland City",Passed in,Lucia Gao,2041156,"Thursday, 28th February '19",Offsite
28315,"Belmont, 17 Creamer Ave",Harcourts,"Agent(s): Jemma Glancy (contact), Victoria Mul...",Auction details: Sunday 24 Apr 12:00 p.m. (On ...,"Off street car spaces: 3, Carport: 1, Garage: ...",Listing ID:DP8575,"Auckland, North Shore City","Sold for:$1,675,000","Jemma Glancy , Victoria Mules",DP8575,Sunday 24 Apr 12:00 p.m. (2016),Onsite
28316,"Northcote Point, 3/68 Richmond Ave",Harcourts,Agent(s): David Hibbins (contact),Auction details: Sunday 24 Apr 12:00 p.m. (On ...,"Garage: 1, Study: 1, Lounge: 1, Bathroom: 1, B...",Listing ID:BI34397,"Auckland, North Shore City","Sold for:$940,000",David Hibbins,BI34397,Sunday 24 Apr 12:00 p.m. (2016),Onsite
28317,"Devonport, 24 Niccol Avenue",Harcourts,Agent(s): Maria Stevens (contact),Auction details: Sunday 24 Apr 11:00 a.m. (On ...,"Dining Room: 1, Total lounges: 2, Total toil...",Listing ID:DP8567,"Auckland, North Shore City","Sold for:$1,560,000",Maria Stevens,DP8567,Sunday 24 Apr 11:00 a.m. (2016),Onsite


In [57]:
#Getting auction location
def get_auction_loc(row):
    s = row['property_auctiondetails']
    if '(On site)' in s:
        return 'Onsite'
    else:
        return 'Offsite'

scrapped_data['auction_location'] = scrapped_data.apply(get_auction_loc, axis=1)

In [52]:
print(len(scrapped_data))

28318


In [35]:
#response = requests.get("https://www.interest.co.nz/property/residential-auction-results?region=Auckland&district=Waitakere%20City&area=Hobsonville&agency=-&status=Sold&page=1")
#soup = BeautifulSoup(response.text, "html.parser")
#soup = BeautifulSoup(response.text, "lxml")
#soup.find_all('div', id="padb-property-card")

In [93]:
#soup.findAll('a')
#soup.find_all('div', id="padb-property-card")

In [None]:
<div id='padb-property-region'>Auckland, Auckland City </div>
<div id='padb-property-address'>6C/55-57 High St, CBD.</div>
<div id='padb-property-details'>2 bedrooms, 1 bathroom, 83sqm.</div>
<div id='padb-property-other-info'>
    <div id='padb-property-id'><span>Listing ID:<br></span>2045071</div>
    <div id='padb-property-value'><span>Sold for:<br></span>$Sold post $625,000</div>
    <div id='padb-property-value'><span style='text-transform:capitalize;'>Passed in</span></div>
    <div id='padb-property-value'><span>Sold for:<br></span>$Sold Prior<span class='padb-rv-span'>Rating Valuation:</span><br>(July '18)<br>$760,000</div>
</div>
<div id='padb-property-agency'>Ray White City Apartments</div>
<div id='padb-property-agents'><span>Agent(s): </span><a target='_blank' href='https://rwcityapartments.co.nz/agents/krister-samuel/43638'>Krister Samuel (contact)</a>,&nbsp;<a target='_blank' href='https://rwcityapartments.co.nz/agents/krister-samuel/43638'>Damian Piggin (contact)</a>&nbsp;</div>
<div id='padb-property-auction-details'><span>Auction details: </span>Thursday, 28th February '19</div>
---------------------------------------------------
 <div class='padb-wo-left'>
            <div id='padb-property-region'>Auckland, Auckland City</div>
            <div id='padb-property-address'>6C/55-57 High St, CBD.</div><br><div id='padb-property-details'>2 bedrooms, 1 bathroom, 83sqm.</div><br>
            <div id='padb-property-description'><p>Large, liveable layout (83m2). Two bedrooms. Super high stud height. Beautiful casement windows framing a quintessential Auckland outlook, over Queen Street. Chunky construction ensures structural integrity, privacy and over the top sound insulation. Complete peace of mind on offer here. The residence is a part of a boutique complex of only 27 apartments and occupies the top floor. To say opportunities like this are rare is a total understatement. This versatile property has been in the same family for years and has provided solutions as their requirements have evolved. As a lock n leave city pad for the Parents; as a University flat for the kids and most recently as a low maintenance high yielding investment property: It has been the answer.</p>
</div>
          </div>
            <div class='padb-wo-right'>
              <b>Listing ID</b><br>2045071<br><br><b>Auction details: </b><br>Thursday, 28th February '19<br><br>
              <b>Property value</b><br>$Sold post $625,000 <br><br>
              <b>Agent(s):</b><br><a style='color:#25006D;' target='_blank' href='https://rwcityapartments.co.nz/agents/krister-samuel/43638'>Krister Samuel</a><br><a style='color:#25006D;' target='_blank' href='https://rwcityapartments.co.nz/agents/krister-samuel/43638'>Damian Piggin</a><br><br><br><img style='width:100%;' src='/sites/default/files/property-images/ray-white-logo.jpg'><br>
           </div>
          <div style='clear:both'></div>
        </div>


In [51]:
#Creating the required dataframe
listings_df = pd.DataFrame()
column_names = ['listing_id','property_address','property_region','num_bedroom',
                'num_bathroom','num_garage','selling_agency','selling_agents',
                'auction_day','auction_date','sold_or_not',
                'sale_price','property_valuation_date','property_value']
listings_df = listings_df.reindex(columns=column_names)  

In [52]:
listings_df.head()

Unnamed: 0,listing_id,property_address,property_region,property_details,selling_agency,selling_agents,auction_day,auction_date,sold_or_not,sale_price,property_valuation_date,property_value


In [16]:
#Getting listing id
x = "Listing ID:etl1388"
print re.match(r'Listing ID:(.*)', x).group(1)

etl1388


In [57]:
print text2int ('four bedrooms home, double garage')
print text2int ('three double bedrooms home, garage')

4 bedrooms home, double garage 
3 double bedrooms home, garage 


In [75]:
strlist = ['4 bedrooms 2 bathrooms 3 carparks','3 bedroom house',
'Bedrooms 2','beds 5','Bedrooms 1', 'Four bedrooms home, double garage',
'Four bedrooms home','Three double bedrooms home, garage','Three bedrooms home',
'Garage car spaces: 2, Bathrooms: 4, Bedrooms: 7']

In [44]:
from word2number import w2n
print w2n.word_to_num("Four bedrooms home, double garage")

4


In [28]:
s = "4 bedroom 2 bathrooms 3 carparks"
s = "Bedrooms 1,"
#print re.match(r'(.*)bedrooms|bedroom', s).group(1)
print re.search(r'(.*)(?:bedrooms|bedroom)', s,re.I).group(1)

AttributeError: 'NoneType' object has no attribute 'group'

4 bedrooms
3 bedroom
bedrooms 2
beds 5
bedrooms 1
four bedrooms
four bedrooms
three double bedrooms
three bedrooms
bedrooms: 7


In [59]:
def get_bedroom_num(s):
    if ':' in s:
        out = re.search(r'(?:Bedrooms:|Bedroom:)(.*)', s,re.I).group(1)
    elif ',' in s:
        out = re.search(r'(?:bedrooms|bedroom|beds)(.*)', s,re.I).group(1)
    else:
        out = re.search(r'(.*)(?:bedrooms|bedroom).*', s,re.I).group(1)
    out = filter(lambda x: x.isdigit(), out)
    return out

In [61]:
s = "Garage car spaces: 2, Bathrooms: 4, Bedrooms: 7,"
#s = "Bedrooms 1,"
s = "4 bedroom 2 bathrooms 3 carparks"
#s = '4 bedrooms home, double garage'
get_bedroom_num(s)

'4'

In [21]:
x = "28th February 19"
print re.match(r'(.*)?:st|nd|rd|th', x).group(1)

AttributeError: 'NoneType' object has no attribute 'group'

In [74]:
#Getting agent names
y = "Agent(s): Anna Swift (contact), Mark Robinson (contact)"
print re.match(r'Agent\(s\):\s(.*)', y).group(1)

Anna Swift (contact), Mark Robinson (contact)


In [53]:
#Getting auction day, date
z = "Auction details: Sunday, 24th February '19"
#z = 'Auction details:'
print re.match(r'Auction details:\s(.*)', z).group(1)

Sunday, 24th February '19


In [54]:
xy = 'Sunday 24 Apr 12:00 p.m. (On site)(2016)'
if '(On site)' in xy:
    print('yes')
else:
    print('No')

yes


In [92]:
#Getting all the sales details
s = "Sold for:$980,000Rating Valuation:(July '17)$1,050,000"
print re.search(r'Sold for:(.*)Rating', s).group(1)
print re.search(r'\((.*)\)',s).group(1)
print re.search(r'\)(.*)', s).group(1)

$980,000
July '17
$1,050,000


In [None]:
s = "Thursday, 28th February '19"
s = "Thursday, 9th March '17"
s = "Sunday 24 Apr 12:00 p.m. (2016)"
s = "18 February 2017, On site"
s = "17 February 2017, Shortland Street, Auckland CBD"
s = "16 Feb 2017, 12:30pm at 2 Lorne Street, Auckland"
s = "November 16 Wednesday 2:00pm at 4 Viaduct Harbour Ave, Auckland(2016)"
s = "17 November 2016, Bruce Mason Centre, Takapuna"
s = "Friday 25 Nov 3:00 p.m. (On Site)(2016)"
s = "Auction details: 14 December 2016, Pukekohe Park"
s = "Auction details:  15th February 2017
 12.30pm, 
                    City Sales House,
                    445 Karangahape Road, Newton,
                    Auckland City 1010"
s = "Auction details: Friday 17 Feb 11:00 a.m. (98 Moorhouse Avenue)(2017)"
s = "Auction details: Friday 17 Feb 10:00 a.m. (Gold Auction Rooms, 471 Papanui Rd)(2017)"
s = "Auction details: Wednesday 27 Apr 1:00 p.m. (17/175 Millwater Parkway, Harcourts)(2016)"
s = "Auction details: Wednesday 27 Apr 1:00 p.m. (On site)(2016)"
s = "Auction details: Wednesday 27 Apr 1:00 p.m. (In Rooms - 923 Whangaparaoa Rd, Man)(2016)"
s = "Auction details: Thu 28 Apr 10:00 a.m., 50 Remuera Rd, Remuera, (2016)"
s = "Auction details: Wed 27 Apr 2:00 p.m., 4 Viaduct Harbour Ave, Auckland, (2016)"
s = "Auction details: Wednesday 1 Jun 6:00 p.m. (95 Manukau Rd Epsom (USP))(2016)"

In [26]:
#Date 'MON DD, YYYY' extraction
mon = ' (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(Nov|Dec)(?:ember)?) '
day1 = r'\d{1,2}' 
year1 = r'\d{4}'
year2 = r'\(\d{4}\)'
dummy = r'.*'

In [27]:
match = re.search(day1 + mon + year1, "17 February 2017, Shortland Street, Auckland CBD")
print match.group()

17 February 2017


In [21]:
match = re.search(day1 + mon + year1, "16 Feb 2017, 12:30pm at 2 Lorne Street, Auckland")
print match.group()

16 Feb 2017


In [22]:
match = re.search(day1 + mon + dummy + year2, "Friday 25 Nov 3:00 p.m. (On Site)(2016)")
print match.group()

25 Nov 3:00 p.m. (On Site)(2016)


In [38]:
match_pattern = r"\d{4}|\(\d{4}\)|\'\d{2}" 
match = re.search(match_pattern, "Nov 4, '219")
print match.group()

'2


In [48]:
s = "17 Feb"

from dateutil.parser import parse

parse(s, fuzzy=True).year

2019

In [None]:
import re
from datetime import datetime

#Date 'DD/M/YY', 'DD/MM/YYYY', 'D/MM/YYYY' extraction
match = re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', "11/8/16")
print match.group()

#Date 'DD/MM' extraction
match = re.search(r'\d{1,2}/\d{1,2}', "11/21")
print match.group()


#Date 'DD MM YYYY' extraction
match = re.search(r'\d{1,2} \d{1,2} \d{2,4}', "1 8 2016")
print match.group()

#Date 'MON DD, YYYY' extraction
mon = '(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(Nov|Dec)(?:ember)?)'
match_pattern = r' \d{1,2}, \d{2,4}' 
match = re.search(mon + match_pattern, "Nov 4, 2016")
print match.group()

#Date 'MON. DD. YYYY' extraction
match_pattern = r'. \d{1,2}. \d{2,4}' 
match = re.search(mon + match_pattern, "Nov. 18. 2016")
print match.group()

#Date 'MON. DD. YYYY' extraction
match_pattern = r' \d{1,2} \d{2,4}' 
match = re.search(mon + match_pattern, "Nov 18 2016")
print match.group()

In [20]:
df = pd.DataFrame({'col2': {0: 'a', 1: 2, 2: np.nan}, 'col1': {0: 'w', 1: 'A', 2: 'B'}})

In [21]:
df.head()

Unnamed: 0,col1,col2
0,w,a
1,A,2
2,B,


In [22]:
dit = {"A":1,"B":2}
df['col1'].map(dit).fillna(df['col1'])

0    w
1    1
2    2
Name: col1, dtype: object