In [6]:
import re
from bs4 import BeautifulSoup
import csv
from time import sleep
import requests
import pandas as pd

In [15]:
def scrape_suburb(url, need_sub):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the main unordered list with class 'pclist'
    pclist = soup.find('ul', class_='pclist')
    
    if not pclist:
        print("Could not find the pclist")
        return
    
    # Find all list items within the pclist
    postcode_items = pclist.find_all('li', recursive=False)
    print(f"Total postcode items found: {len(postcode_items)}")
    
    # Prepare data for CSV
    data = []
    
    for index, item in enumerate(postcode_items):
        print(f"\nProcessing item {index}:")
        print(item)  # Print the entire item for debugging
        
        postcode_link = item.find('a')
        if postcode_link:
            postcode = postcode_link.text.strip()
            print(f"Postcode found: {postcode}")
            
            suburbs_ul = item.find('ul')
            if suburbs_ul:
                suburbs = suburbs_ul.find_all('li')
                print(f"Suburbs found: {len(suburbs)}")
                
                for suburb in suburbs:
                    suburb_name = suburb.text.strip()
                    data.append([suburb_name, postcode])
                    print(f"Added: {suburb_name}, {postcode}")
            else:
                print("No suburbs found for this postcode")
        else:
            print(f"Warning: No <a> tag found in item {index}")
    
    print(f"\nTotal entries extracted: {len(data)}")
    
    
    df = pd.DataFrame(data, columns=['Suburb', 'Postcode'])
    
    # If a suburb_list is provided, filter the DataFrame
    if need_sub:
        filtered_df = df[df['Suburb'].isin(need_sub)]
        print(f"\nFiltered DataFrame shape: {filtered_df.shape}")
    else:
        filtered_df = df
        print("\nNo filtering applied. Using full DataFrame.")
    
    # Write filtered data to CSV file
    if not filtered_df.empty:
        filtered_df['Postcode'] = filtered_df['Postcode'].astype(int)
        filtered_df = filtered_df[filtered_df['Postcode'] <= 4000]
        filtered_df.to_csv('../data/landing/victoria_suburbs_postcodes.csv', index=False)
        print("Filtered CSV file written successfully")
    else:
        print("No data to write, CSV file not created")
    
    # Print the first few entries for verification
    print("\nFirst 5 entries of the filtered data:")
    print(filtered_df.head())
    
    return filtered_df

In [16]:
need_sub = ['Albert Park', 'St Kilda', 'Armadale', 'Carlton North', 'Carlton', 'Parkville', 'Collingwood',
            'Abbotsford', 'Docklands', 'East Melbourne', 'St Kilda East', 'Elwood', 'Fitzroy', 
            'Fitzroy North', 'Clifton Hill', 'Flemington', 'Kensington', 'North Melbourne', 'West Melbourne', 
            'Port Melbourne','Prahran','Windsor','Richmond','Burnley','South Melbourne','South Yarra', 'Southbank', 
            'St Kilda', 'Toorak', 'Balwyn','Blackburn','Box Hill','Bulleen','Templestowe','Doncaster', 'Burwood',
            'Ashburton', 'Camberwell','Glen Iris','Canterbury','Surrey Hills','Mont Albert','Chadstone','Oakleigh',
            'Clayton','Doncaster East','Donvale East','Hawthorn','Glen Waverley','Mulgrave','Hawthorn','Kew','Mount Waverley',
            'Nunawading','Mitcham', 'Melbourne',
            'Vermont','Forest Hill','Burwood East','Aspendale','Chelsea','Carrum', 'Bentleigh', 'Brighton', 'Brighton East',
            'Carnegie','Caulfield','Cheltenham','Elsternwick','Hampton','Beaumaris','Malvern','Malvern East','Mentone','Parkdale','Mordialloc'
            ,'Murrumbeena','Hughesdale','Altona','Footscray','Keilor East','Avondale Heights','Melton','Newport','Spotswood', 
            'St Albans','Deer Park','Sunshine','Sydenham','Werribee','Footscray West','Williamstown','Yarraville','Seddon','Broadmeadows','Brunswick','Coburg' 
            ,'Brunswick East','Essendon','Tullamarine','Keilor Moonee Ponds', 'Ascot Vale', 
            'Oak Park','Glenroy','Fawkner','Pascoe Vale','Coburg North','Sunbury','Brunswick West','Bundoora','Greensborough','Hurstbridge',
            'Eltham','Research','Montmorency','Fairfield','Alphington','Heidelberg','Ivanhoe', 
           'Mill Park','Epping Northcote', 'Thomastown','Lalor','Thornbury','Whittlesea','Bayswater','Boronia', 
            'Croydon','Lilydale','Ferntree Gully','Ringwood','Rowville','Wantirna','Berwick','Cranbourne',
            'Dandenong','Dandenong North','Endeavour Hills','Narre Warren','Hampton Park','Noble Park','Pakenham','Springvale',
            'Dromana','Portsea','Frankston' ,'Hastings','Flinders','Mt Eliza','Mornington','Mt Martha','Seaford','Carrum Downs', 'Belmont','Grovedale' 
            'Corio' ,'Geelong','Newcombe' ,'Herne Hill','Geelong West', 'Lara', 'Newtown', 'North Geelong', 'Ballarat', 'Buninyong' ,
            'Sebastopol','Delacombe', 'Wendouree','Alfredton','Bendigo','Flora Hill','East Bendigo','Golden Square','Kangaroo Flat',
            'North Bendigo','Bairnsdale','Benalla','Castlemaine','Echuca','Hamilton','Horsham','Mildura','Moe','Newborough',
            'Morwell','Ocean Grove','Barwon Heads','Portland','Sale','Maffra','Seymour','Shepparton','Swan Hill','Torquay','Traralgon',
            'Wanagaratta','Warragul','Warrnambool','Wodonga']
            
            

In [17]:
url = 'https://postcodes-australia.com/state-postcodes/vic'
scrape_suburb(url, need_sub)

Total postcode items found: 761

Processing item 0:
<li><a href="https://postcodes-australia.com/postcodes/3000" title="Postcode 3000, Victoria">3000</a>
<ul>
<li>Melbourne</li>
</ul>
</li>
Postcode found: 3000
Suburbs found: 1
Added: Melbourne, 3000

Processing item 1:
<li><a href="https://postcodes-australia.com/postcodes/3001" title="Postcode 3001, Victoria">3001</a>
<ul>
<li>Melbourne</li>
</ul>
</li>
Postcode found: 3001
Suburbs found: 1
Added: Melbourne, 3001

Processing item 2:
<li><a href="https://postcodes-australia.com/postcodes/3002" title="Postcode 3002, Victoria">3002</a>
<ul>
<li>East Melbourne</li>
</ul>
</li>
Postcode found: 3002
Suburbs found: 1
Added: East Melbourne, 3002

Processing item 3:
<li><a href="https://postcodes-australia.com/postcodes/3003" title="Postcode 3003, Victoria">3003</a>
<ul>
<li>West Melbourne</li>
</ul>
</li>
Postcode found: 3003
Suburbs found: 1
Added: West Melbourne, 3003

Processing item 4:
<li><a href="https://postcodes-australia.com/postcod

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Postcode'] = filtered_df['Postcode'].astype(int)


Unnamed: 0,Suburb,Postcode
0,Melbourne,3000
1,Melbourne,3001
2,East Melbourne,3002
3,West Melbourne,3003
4,Melbourne,3004
...,...,...
2841,Mornington,3931
2845,Dromana,3936
2860,Portsea,3944
2964,Hampton Park,3976


In [27]:
url = 'https://www.domain.com.au/508-408-lonsdale-street-melbourne-vic-3000-17166328'

In [28]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

In [29]:
print(soup.prettify())

<!DOCTYPE html>
<html data-build-git-hash="4135e2bc2e79557ea1debc6c6a2b2e220ba80dae" data-build-id="master-6261" data-build-time="Wed Aug 28 2024 14:18:38 GMT+1000 (Australian Eastern Standard Time)" data-version="9.126.0" lang="en-AU">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   508/408 Lonsdale Street, Melbourne VIC 3000 - Apartment For Rent | Domain
  </title>
  <meta content="View this 1 bathroom rental apartment at 508/408 Lonsdale Street, Melbourne VIC 3000. Available from Friday, 10 February 2023. Contact agent for price." name="description"/>
  <meta content="508/408 Lonsdale Street, Melbourne VIC 3000 - Apartment For Rent | Domain" property="og:title"/>
  <meta content="View this 1 bathroom rental apartment at 508/408 Lonsdale Street, Melbourne VIC 3000. Available from Friday, 10 February 2023. Contact agent for price." property="og:description"/>
  <meta content="product" property="og:type"/>
  <meta c