In [40]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import urllib.request
import re

In [313]:
# Attribute names used as the base columns of every scraped property record.
# extract_property_data pre-fills each record with 0 for all of these keys, so every
# row shares the same columns. Kept in alphabetical order, grouped by first letter.
PROPERTY_ATTRIBS = [
    'Age', 'Area',
    'Backup Water', 'Bathroom', 'Bathroom 1', 'Bathroom 2', 'Bathrooms',
    'Bedroom', 'Bedroom 1', 'Bedroom 2', 'Bedroom 3', 'Bedroom 4', 'Bedrooms',
    'Deposit Requirements', 'Description', 'Dining Rooms',
    'Erf Size',
    'Facing', 'Floor Number', 'Floor Size', 'Furnished',
    'Garage', 'Garages', 'Garden', 'Gardens', 'Generator',
    'Internet Access',
    'Kitchen', 'Kitchens',
    'Lease Period', 'Levies', 'Lifestyle', 'Lounge', 'Lounges',
    'Nearby Public Transport', 'Number of floors',
    'Occupation Date', 'Outbuilding',
    'Parking', 'Pets Allowed', 'Pool', 'Pools',
    'Rates and Taxes', 'Reception Rooms', 'Roof',
    'Secure Parking', 'Security', 'Security 1', 'Security 2',
    'Special Feature', 'Special Feature 1', 'Special Feature 2', 'Special Feature 3',
    'Special Features', 'Special Levy',
    'Standalone Building', 'Street Address', 'Style',
    'Type of Property',
    'Wall', 'Wheelchair Accessible', 'Window',
]

In [160]:
def load_scraped_data(links_list, timeout=30):
    '''
    Description: Downloads each suburb page on property24 and collects its yearly price data
    Arguments: links_list - site-relative suburb links of the form '/for-sale/suburb-name/...'
               timeout - seconds to wait for each HTTP response before giving up
    Returns:   a dictionary with the price data by suburb, shaped {Suburb: {Year: Price}}
    '''
    suburb_price_data = {}
    for link in links_list:
        # The links already start with '/', so strip it to avoid '//' after the host
        page_url = 'https://www.property24.com/' + str(link).lstrip('/')

        # Without a timeout a single unresponsive page would hang the whole scrape
        response = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the year and price data using the custom function
        year, price = extract_prices(soup)

        # The suburb slug is the path segment right after 'for-sale'
        suburb = link.split('/')[2].replace('-', ' ').title()

        # zip() pairs years and prices by position and stops at the shorter list
        suburb_price_data[suburb] = dict(zip(year, price))
    return suburb_price_data


In [138]:
# Collect the unique links that point at suburb pages
suburb_links = set()
for link in links:
    address = link.get('href')

    # Anchors without an href give None, which re.search cannot process
    if not address:
        continue

    # This filters out the links so we're only looking at suburb links (and some houses, which are filtered out below)
    if re.search(r'^/for-sale', address) and re.search(r'[0-9]{4,5}$', address):
        # This removes links to individual properties, whose final path segment is 6+ characters long
        if len(address.split('/')[-1]) < 6:
            suburb_links.add(address)

# Sorted so that the scrape order is the same on every run
link_list = sorted(suburb_links)


In [170]:
# Scrape the yearly price data for every suburb link; the function builds and returns
# its own dictionary, so no empty placeholder is needed beforehand
suburb_price_data = load_scraped_data(link_list)

In [172]:
# The outer dictionary keys (suburbs) become columns first; transposing turns each suburb into a row
suburb_df = pd.DataFrame(suburb_price_data).T

In [179]:
# Display the suburb names that make up the row index
suburb_df.index

Index(['Walmer Estate', 'Marina Da Gama', 'Kenwyn', 'Rosebank', 'Belhar',
       'Newfields', 'Valhalla Park', 'Kirstenhof', 'Newlands Upper', 'Montana',
       ...
       'Kenilworth Upper', 'Higgovale', 'Heathfield', 'Bergvliet', 'Woodstock',
       'Sybrand Park', 'Clifton', 'Cape Farms', 'Lansdowne', 'Surrey Estate'],
      dtype='object', length=136)

In [173]:
# Write the suburb price table to a CSV file in the working directory
suburb_df.to_csv(path_or_buf="property_prices.csv")

In [141]:
def extract_prices(soup_data):
    '''
    Description: Extracts the years and rand values found in the script tags of a page
    Arguments: soup_data - a document processed with BeautifulSoup
    Returns:   two lists - the years (as strings) and the prices (as ints), in page order
    '''
    script_tags = soup_data.find_all('script')
    year, price = [], []

    # Splitting on double quotes isolates each quoted value and the text between quoted values
    for word in str(script_tags).split('"'):
        # Find the date: a short token starting with ':20' (presumably of the form ':2015,');
        # the first and last characters are dropped to leave the year
        if word.startswith(':20') and len(word) < 7:
            year.append(word[1:-1])

        # Find the rand amount ('R ' followed by a digit) and keep only its digits
        if re.match(r'R [0-9]', word):
            price.append(int(re.sub(r'\D', '', word[2:])))

    return year, price


In [457]:
def scrape_pages(num_pages, timeout=30):
    '''
    Description: Function that scrapes webpages on property24 to create a list of links to search further
    Arguments: num_pages - the number of pages to scrape
               timeout - seconds to wait for each HTTP response before giving up
    Returns:   a list with the links for all of the properties
    '''

    base_url = 'https://www.property24.com/apartments-to-rent/cape-town/western-cape/432'
    search_terms = '?sp=pf%3d3500%26pt%3d7000'
    properties = []

    for it in range(num_pages):
        # The first page has no suffix; later pages are addressed as /p2, /p3, ...
        page_var = '' if it == 0 else '/p' + str(it + 1)
        scrape_url = base_url + page_var + search_terms

        # Stop paging when a request fails. Only request errors are caught, so that
        # an interrupt from the user and genuine programming errors still surface.
        try:
            response = requests.get(scrape_url, timeout=timeout)
        except requests.RequestException:
            break

        # Extract the text from the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the links and filter out ones that aren't useful
        for link in soup.find_all('a'):
            address = link.get('href')

            # Anchors without an href give None, which cannot be sliced
            if not address:
                continue

            if address[1:8] == 'to-rent' and '%' not in address and len(address) > 8:
                # NOTE: address already starts with '/', so the link contains '//' after the
                # host. This is kept on purpose: extract_property_data reads the area from
                # link.split('/')[5], which depends on that extra empty segment.
                properties.append('https://www.property24.com/' + str(address))
    return properties

def extract_property_data(properties, property_dict, property_attribs, timeout=30):
    '''
    Description: function that extracts key data points from the property links
    Arguments: properties - a list containing all of the links to the properties scraped from the site
               property_dict - records extracted so far, keyed by a running integer; it is
                               updated in place, so repeated calls accumulate records
               property_attribs - attribute names every record starts with (each set to 0)
               timeout - seconds to wait for each HTTP response before giving up
    Returns:   property_dict, with one new record added per successfully parsed link
    '''
    # Continue numbering after any records that are already present
    i = len(property_dict)

    for link in properties:
        # setup a blank dictionary with the same attributes
        props = dict((attr, 0) for attr in property_attribs)

        # Extract the text from the page
        response = requests.get(link, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        try:
            # Takes the area from part of the link
            props['Area'] = link.split('/')[5]

            # Finds all the attributes data and stores it in a dictionary
            for item in soup.find_all('div', attrs={'class': 'row p24_propertyOverviewRow'}):
                items = item.text.strip().replace('\n\n', '|').split('|')
                # A row with a name but no value must not discard the whole property
                if len(items) < 2:
                    continue
                props[items[0]] = items[1]

            write_up = soup.find('span', attrs={'class': 'p24_dPL js_readMoreText'}).text.replace('\n', '')
            props['Description'] = write_up
            props['Link'] = link

            price = soup.find('span', attrs={'class': 'p24_price'}).text.strip()

            # Cleans the price data; replaces a text field called POA with 0
            if price == "POA":
                price = 0
            else:
                # The parsed text holds the decoded non-breaking space character rather
                # than the '&#160;' entity, so both forms are removed
                price = price.replace('R ', '').replace('&#160;', '').replace('\xa0', '')
            props['Price'] = price
        except (AttributeError, IndexError):
            # Skip pages that lack the description or price element (find() returned None)
            # and links with too few path segments, instead of aborting the whole run
            continue

        property_dict[i] = props
        i += 1

    return property_dict
    

In [458]:
# Start with an empty record store; extract_property_data fills it in place and returns it.
# NOTE(review): `properties` is not assigned in any cell visible here - presumably it is the
# list returned by scrape_pages(); confirm it is defined before this cell runs.
property_dict = {}
property_dict = extract_property_data(properties, property_dict, PROPERTY_ATTRIBS)

In [439]:
#property_dict

In [465]:
# Create a dataframe with one row per property, then remove columns where more than 80% of the entries are 0
df = pd.DataFrame(property_dict, columns=property_dict.keys()).transpose()

# Integer arithmetic (zeros * 10 > rows * 8) keeps the 80% threshold exact
mostly_zero_columns = [
    column for column in df.columns
    if np.count_nonzero(df[column].values == 0) * 10 > 8 * len(df[column].values)
]
df = df.drop(columns=mostly_zero_columns)

# Prices are either already an int or text that may contain non-breaking spaces,
# which have to be removed before conversion. The builtin int is used because the
# np.int alias was removed in NumPy 1.24.
df['Price'] = [
    item if isinstance(item, int) else int(item.replace('\xa0', ''))
    for item in df['Price']
]


In [466]:
# Save the cleaned listings; to_csv returns None when given a path, so the result is not assigned
df.to_csv('Houses.csv')

