In [5]:
import os
import re
import requests
from bs4 import BeautifulSoup
import csv
from geopy.geocoders import Nominatim
import time
import pandas as pd

In [6]:
# Create the folder hierarchy that the scraped dataset is written to:
#   <base_dir>/landing/<subfolder>
base_dir = '../../data/'
landing_dir = os.path.join(base_dir, 'landing')
subfolder = 'Supermarkets'

# os.makedirs creates every missing intermediate directory, and exist_ok=True
# makes the call a no-op when the folder is already there, so a single call
# replaces the separate exists()/makedirs() pairs and is safe to re-run.
os.makedirs(os.path.join(landing_dir, subfolder), exist_ok=True)


In [7]:
# Extract links to different regions from the homepage
def get_region_urls(home_url):
    """Collect the URLs of the region pages linked from ``home_url``.

    Args:
        home_url: URL of the page that lists the regions.

    Returns:
        list[str]: One absolute URL per ``<a class="stretched-link">`` element
        that carries an ``href``. An empty list is returned when the page
        cannot be fetched (network error or non-200 status).
    """
    from urllib.parse import urljoin

    try:
        # A timeout keeps the notebook from hanging forever on a stalled server.
        response = requests.get(home_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Could not access {home_url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Could not access {home_url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # href=True skips anchors without a target, which would otherwise raise
    # KeyError on link['href'].
    region_links = soup.find_all('a', class_='stretched-link', href=True)

    base_url = "https://www.travelvictoria.com.au"
    # urljoin gives the same result as concatenation for root-relative hrefs
    # ("/melbourne/") but also copes with hrefs that are already absolute.
    return [urljoin(base_url, link['href']) for link in region_links]

# Get the supermarket page URL for each region
def get_supermarket_url(region_url):
    """Find the supermarket listing page linked from a region page.

    Args:
        region_url: URL of a region page.

    Returns:
        str | None: Absolute URL built from the first link on the page whose
        ``href`` contains ``"supermarkets"``, or ``None`` when the page cannot
        be fetched or has no such link.
    """
    from urllib.parse import urljoin

    try:
        # A timeout keeps the notebook from hanging forever on a stalled server.
        response = requests.get(region_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Could not access {region_url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Could not access {region_url}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Look for the first link that contains 'supermarkets'
    for link in soup.find_all('a', href=True):
        href = link['href']
        if "supermarkets" in href:
            # urljoin matches plain concatenation for root-relative hrefs and
            # leaves already-absolute hrefs intact.
            return urljoin("https://www.travelvictoria.com.au", href)

    return None

# Pull the suburb out of a comma-separated address string
def extract_suburb_from_address(address):
    """Return the text after the last comma of ``address``, stripped.

    The suburb is assumed to be the final comma-separated component.
    ``None`` is returned when the address has no comma followed by at least
    one character.
    """
    trailing_part = re.search(r',\s*([^,]+)$', address)
    return trailing_part.group(1).strip() if trailing_part else None

# Scrape supermarket names, addresses, and extract suburb from the address
def get_supermarkets_info(supermarket_url):
    """Scrape every supermarket listed on a supermarket page.

    Args:
        supermarket_url: URL of a supermarket listing page.

    Returns:
        list[tuple]: ``(name, address, suburb)`` tuples, one per address string
        found under each entry. ``suburb`` comes from
        ``extract_suburb_from_address`` and may be ``None``. An empty list is
        returned when the page cannot be fetched or has no entries.
    """
    try:
        # A timeout keeps the notebook from hanging forever on a stalled server.
        response = requests.get(supermarket_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Could not access {supermarket_url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Could not access {supermarket_url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    supermarkets = []

    # Print the page title to ensure we are on the correct page. The title is
    # purely informational, so a missing or empty <title> must not abort the
    # scrape (soup.title / soup.title.string can both be None).
    print(f"Visiting supermarket page: {supermarket_url}")
    title_text = soup.title.string if soup.title else None
    page_title = title_text.strip() if title_text else "(no title)"
    print(f"Page Title: {page_title}")

    # Find all supermarket entries
    supermarket_entries = soup.find_all('div', class_='row')

    if not supermarket_entries:
        print(f"No supermarket entries found on {supermarket_url}")
        return supermarkets

    # Skip the first entry, assuming it's always the region name
    for entry in supermarket_entries[1:]:
        # Only entries with an <h5> tag carry a supermarket name
        name_element = entry.find('h5')
        if name_element is None:
            continue

        name = name_element.get_text(strip=True)

        # Find the supermarket address (in div class="col-sm-8 col-md-6 col-lg-9")
        address_element = entry.find('div', class_='col-sm-8 col-md-6 col-lg-9')

        # Skip entries without an address
        if address_element is None:
            print(f"Skipping entry without address: {name}")
            continue

        # Every remaining text fragment is treated as one address; the
        # "Show on map" link text is dropped.
        addresses = [
            text.strip()
            for text in address_element.stripped_strings
            if 'Show on map' not in text
        ]

        # An entry with an empty name is ignored entirely
        if not name:
            continue

        for address in addresses:
            suburb = extract_suburb_from_address(address)  # Extract Suburb from the address
            print(f"Found Supermarket: Name={name}, Address={address}, Suburb={suburb}")
            supermarkets.append((name, address, suburb))

    return supermarkets

# Visit each region in turn and gather its supermarket records
def scrape_all_regions(region_urls):
    """Scrape the supermarket page of every region in ``region_urls``.

    Regions for which no supermarket page is found are reported and skipped.
    Returns a single flat list with the records of all regions, in input order.
    """
    collected = []

    for url in region_urls:
        listing_url = get_supermarket_url(url)

        # Guard clause: nothing to scrape for this region
        if not listing_url:
            print(f"No supermarket page for {url}")
            continue

        print(f"Found supermarket page: {listing_url}")
        collected.extend(get_supermarkets_info(listing_url))

    return collected

# Save all supermarket information to a Parquet file in 'data/landing/Supermarkets/'
def save_supermarkets_to_parquet(supermarkets_info, filename, folder_path="../../data/landing/Supermarkets"):
    """Write the scraped supermarket records to a Parquet file.

    Args:
        supermarkets_info: Iterable of ``(name, address, suburb)`` records.
        filename: Name of the Parquet file to create inside ``folder_path``.
        folder_path: Destination folder. Defaults to the landing folder used by
            this notebook; it is created if it does not exist yet.
    """
    # Make the function usable on its own, without relying on an earlier cell
    # having created the folder.
    os.makedirs(folder_path, exist_ok=True)

    # Define the full path to the file
    file_path = os.path.join(folder_path, filename)

    # Convert the list of supermarkets into a DataFrame
    df = pd.DataFrame(supermarkets_info, columns=["Supermarket Name", "Address", "Suburb"])

    # Save the DataFrame as a Parquet file
    df.to_parquet(file_path, index=False, engine='pyarrow')

    print(f"Supermarkets information has been saved to '{file_path}'")

# Entry page that links to every region
home_url = "https://www.travelvictoria.com.au/regions/melbourne/"

# Step 1: discover the region pages
region_urls = get_region_urls(home_url=home_url)

# Step 2: scrape the supermarket listing of each region
all_supermarkets_info = scrape_all_regions(region_urls=region_urls)

# Step 3: persist the combined records in the landing folder
save_supermarkets_to_parquet(
    supermarkets_info=all_supermarkets_info,
    filename="supermarkets_info.parquet",
)


Found supermarket page: https://www.travelvictoria.com.au/melbourne/supermarkets/
Visiting supermarket page: https://www.travelvictoria.com.au/melbourne/supermarkets/
Page Title: Melbourne CBD supermarkets - Travel Victoria: accommodation & visitor guide
Found Supermarket: Name=Aldi, Address=8 Franklin Street, Melbourne, Suburb=Melbourne
Found Supermarket: Name=Coles, Address=2 Elizabeth Street, Melbourne, Suburb=Melbourne
Found Supermarket: Name=Coles, Address=Melbourne Central, 183-201 La Trobe Street, Melbourne, Suburb=Melbourne
Found Supermarket: Name=Friendly Grocer, Address=Shop 1, 360 Collins Street, Melbourne, Suburb=Melbourne
Found Supermarket: Name=IGA, Address=470 Collins Street, Melbourne, Suburb=Melbourne
Found Supermarket: Name=IGA, Address=19 Commercial Road, Melbourne, Suburb=Melbourne
Found Supermarket: Name=IGA, Address=333 Exhibition Street, Melbourne, Suburb=Melbourne
Found Supermarket: Name=IGA, Address=84 Flinders Street, Melbourne, Suburb=Melbourne
Found Supermar

In [8]:
# Shared Nominatim client used by the postcode lookup functions below
geolocator = Nominatim(
    user_agent="supermarket_postcode_lookup",
)

# Drop the leading component of an address
def simplify_address(address):
    """Return ``address`` without its first comma-separated component.

    Everything up to and including the first comma (plus any whitespace after
    it) is removed; this is usually a mall name or shop number. An address
    that does not start with "<text>," is returned unchanged.
    """
    leading_component = r'^[^,]+,\s*'
    return re.sub(leading_component, '', address)

# Use geopy to get the postcode from the address
def get_postcode_from_geopy(address, retries=3, retry_delay=2):
    """Look up the postcode of ``address`` with the module-level ``geolocator``.

    The address is geocoded to coordinates, and those coordinates are then
    reverse-geocoded to read the ``postcode`` field of the result.

    Args:
        address: Free-text address to look up.
        retries: Maximum number of attempts when a lookup raises.
        retry_delay: Seconds to wait after a failed attempt.

    Returns:
        The postcode reported by the reverse lookup; the string
        ``'No postcode found'`` when the reverse result has no postcode field;
        ``None`` when the address cannot be resolved or every attempt raised.
    """
    for attempt in range(retries):
        try:
            print(f"Fetching address: {address}")
            location = geolocator.geocode(address)
            if not location:
                print(f"Postal code not found for address: {address}")
                return None

            location_details = geolocator.reverse((location.latitude, location.longitude), exactly_one=True)
            # reverse() may yield nothing; treat that as "not found" instead of
            # letting an AttributeError trigger pointless retries.
            if location_details is None:
                print(f"Postal code not found for address: {address}")
                return None

            address_info = location_details.raw.get('address', {})
            return address_info.get('postcode', 'No postcode found')
        except Exception as e:
            # Best-effort lookup: keep going, but report what actually failed.
            print(f"Error fetching data for address: {address}. Attempt {attempt + 1} of {retries}: {e}")
            time.sleep(retry_delay)  # Wait before retrying
    return None

# Read the Parquet file and query the postcode for each address
def add_postcode_to_parquet(input_file, output_file):
    """Add a ``Postcode`` column to a supermarket Parquet file.

    Each value of the ``Address`` column is looked up with
    ``get_postcode_from_geopy``. When that yields nothing (or the
    ``"No postcode found"`` marker), the lookup is retried once with the
    address shortened by ``simplify_address``.

    Args:
        input_file: Path of the Parquet file to read; must have an
            ``Address`` column.
        output_file: Path of the Parquet file to write.
    """
    # Read the original Parquet file into a DataFrame
    df = pd.read_parquet(input_file)

    # Results per distinct address, so repeated addresses cost one lookup
    # (and one pause) instead of one per row.
    postcode_by_address = {}
    postcodes = []

    for address in df['Address']:
        if address not in postcode_by_address:
            # Step 1: Use geopy to query the postcode
            postcode = get_postcode_from_geopy(address)

            # Step 2: If the original address query fails, simplify the address and retry
            if not postcode or postcode == "No postcode found":
                simplified_address = simplify_address(address)
                # Re-sending an identical query cannot give a different answer.
                if simplified_address != address:
                    print(f"Original address failed, trying simplified address: {simplified_address}")
                    postcode = get_postcode_from_geopy(simplified_address)

            postcode_by_address[address] = postcode
            time.sleep(1)  # Wait 1 second after each query to avoid sending requests too quickly

        postcodes.append(postcode_by_address[address])

    # Build the column once instead of mutating the frame row by row
    df['Postcode'] = postcodes

    # Save the updated data to a new Parquet file
    df.to_parquet(output_file, index=False, engine='pyarrow')

# Source file written by the scraping cell, and the enriched file to create
input_file = '../../data/landing/Supermarkets/supermarkets_info.parquet'
output_file = '../../data/landing/Supermarkets/supermarkets_with_postcode.parquet'

# Look up a postcode for every address and write the enriched dataset
add_postcode_to_parquet(input_file=input_file, output_file=output_file)

print(f"Updated Parquet file with postcodes has been saved to {output_file}")

Fetching address: 8 Franklin Street, Melbourne
Fetching address: 2 Elizabeth Street, Melbourne
Fetching address: Melbourne Central, 183-201 La Trobe Street, Melbourne
Postal code not found for address: Melbourne Central, 183-201 La Trobe Street, Melbourne
Original address failed, trying simplified address: 183-201 La Trobe Street, Melbourne
Fetching address: 183-201 La Trobe Street, Melbourne
Fetching address: Shop 1, 360 Collins Street, Melbourne
Fetching address: 470 Collins Street, Melbourne
Fetching address: 19 Commercial Road, Melbourne
Fetching address: 333 Exhibition Street, Melbourne
Fetching address: 84 Flinders Street, Melbourne
Fetching address: 35-41 Lonsdale Street, Melbourne
Fetching address: 85 Queen Street, Melbourne
Fetching address: 600 Bourke Street, Melbourne
Fetching address: 60 Elizabeth Street, Melbourne
Fetching address: 388 Elizabeth Street, Melbourne
Fetching address: 611 Elizabeth Street, Melbourne
Fetching address: 222 Exhibition Street, Melbourne
Fetching a