In [91]:
import os
import pandas as pd
import csv
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
import re
import geopandas as gpd
from shapely.geometry import Point
import folium
from folium.plugins import MarkerCluster

In [38]:
# Create folder to save dataset
base_dir = '../../data/'
landing_dir = os.path.join(base_dir, 'landing')
raw_dir = os.path.join(base_dir, 'raw')

if not os.path.exists(base_dir):
    os.makedirs(base_dir)

if not os.path.exists(raw_dir):
    os.makedirs(raw_dir)

subfolder = 'Coles_WWS'

if not os.path.exists(os.path.join(raw_dir, subfolder)):
    os.makedirs(os.path.join(raw_dir, subfolder))

In [39]:
# read data
df_wws = pd.read_csv(f"{landing_dir}/{subfolder}/woolworths_stores.csv")

In [40]:
# Function to simplify the address by removing unnecessary parts and fixing abbreviations
def simplify_address(address):
    """
    Simplifies the address by removing unnecessary parts (like mall names) and correcting abbreviations.
    """
    # Remove extra quotes
    address = address.replace('"""', '').replace('"', '')

    # Replace common abbreviations
    address = re.sub(r'\bCnr\b', 'corner', address, flags=re.IGNORECASE)
    address = address.replace('&', 'and')

    # Remove mall or building names before the first comma
    address = re.sub(r'^[^,]+,\s*', '', address).strip()

    # If no street remains after simplification, return the original
    if not address:
        return address

    return address

# Function to try using a single street if the address is an intersection
def use_single_street(address):
    """
    Removes the second street from intersection addresses to focus on one street for geocoding.
    """
    # Split at "corner" and keep only the first street
    if 'corner' in address:
        return address.split('corner')[0].strip()
    return address

# Function to get latitude and longitude from the address
def get_lat_long_from_geopy(address, retries=3):
    """
    Fetches the latitude and longitude for a given address using geopy.

    Parameters:
    address (str): The address to geocode.
    retries (int): Number of retries if a request fails.

    Returns:
    tuple: (latitude, longitude)
    """
    for attempt in range(retries):
        try:
            print(f"Fetching address: {address}")
            # Get the latitude and longitude of the address
            location = geolocator.geocode(address)
            if location:
                return location.latitude, location.longitude  # Return lat, long
            print(f"Location not found for address: {address}")
            return None, None
        except GeocoderTimedOut as e:
            print(f"Timeout for address: {address}. Attempt {attempt + 1} of {retries}")
            time.sleep(2)  # Wait for 2 seconds before retrying
        except Exception as e:
            print(f"Error fetching data for address: {address}. Error: {e}. Attempt {attempt + 1} of {retries}")
            time.sleep(2)
    return None, None

# Function to add latitude and longitude to a DataFrame
def add_lat_long_to_dataframe(df):
    """
    Adds latitude and longitude to each row in the DataFrame.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing address information.

    Returns:
    pd.DataFrame: The updated DataFrame with latitude and longitude columns.
    """
    # Add new 'Latitude' and 'Longitude' columns
    df['Latitude'] = None
    df['Longitude'] = None

    # Iterate through each row of data
    for index, row in df.iterrows():
        address = row['Address']

        # Step 1: Try fetching lat and long using the full address
        latitude, longitude = get_lat_long_from_geopy(address)

        # Step 2: If the original address query fails, simplify the address and retry
        if not latitude and not longitude:
            simplified_address = simplify_address(address)
            print(f"Original address failed, trying simplified address: {simplified_address}")
            latitude, longitude = get_lat_long_from_geopy(simplified_address)

        # Step 3: If still failing, try using only one street (remove intersection)
        if not latitude and not longitude:
            single_street_address = use_single_street(simplified_address)
            print(f"Trying single street address: {single_street_address}")
            latitude, longitude = get_lat_long_from_geopy(single_street_address)
        
        # Update the DataFrame with the results
        df.at[index, 'Latitude'] = latitude
        df.at[index, 'Longitude'] = longitude

        # Sleep to avoid sending requests too quickly
        time.sleep(1)

    return df  # Return the updated DataFrame

# preprocess Woolworths location data

In [41]:
df_wws.head(5)

Unnamed: 0,Store Name,"Address, City, State Zip Country"
0,Woolworths Store #3195 - Abbotsford,"313 Victoria Street, Abbotsford, Victoria 3067 Australia"
1,Woolworths Store #3291 - Airport West,"""Westfield Airport West, 25-39 Louis Street"", Airport West, Victoria 3042 Australia"
2,Woolworths Store #3066 - Lucas (Alfredton),"Cnr Dyson & Remembrance Drv, Alfredton, Victoria 3350 Australia"
3,Woolworths Store #3194 - Altona North,"2-32 Borrack Square, Altona North, Victoria 3025 Australia"
4,Woolworths Store #3164 - Ararat,"3 Ingor Street, Ararat, Victoria 3377 Australia"


In [42]:
# check missing value
df_wws.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 2 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Store Name                        244 non-null    object
 1   Address, City, State Zip Country  244 non-null    object
dtypes: object(2)
memory usage: 3.9+ KB


In [43]:
# check duplicate data
print(df_wws.duplicated().sum())


0


In [44]:
# rename column
df_wws = df_wws.rename(columns={'Address, City, State Zip Country': 'Address'})

In [45]:
# get the longtitude and latitude of each supermarket

# Initialize the geolocator using Nominatim
geolocator = Nominatim(user_agent="store_locator", timeout=10)

processed_wws = add_lat_long_to_dataframe(df_wws)

Fetching address: 313 Victoria Street, Abbotsford, Victoria 3067 Australia
Fetching address: "Westfield Airport West, 25-39 Louis Street", Airport West, Victoria 3042 Australia
Location not found for address: "Westfield Airport West, 25-39 Louis Street", Airport West, Victoria 3042 Australia
Original address failed, trying simplified address: 25-39 Louis Street, Airport West, Victoria 3042 Australia
Fetching address: 25-39 Louis Street, Airport West, Victoria 3042 Australia
Fetching address: Cnr Dyson & Remembrance Drv, Alfredton, Victoria 3350 Australia
Location not found for address: Cnr Dyson & Remembrance Drv, Alfredton, Victoria 3350 Australia
Original address failed, trying simplified address: Alfredton, Victoria 3350 Australia
Fetching address: Alfredton, Victoria 3350 Australia
Fetching address: 2-32 Borrack Square, Altona North, Victoria 3025 Australia
Fetching address: 3 Ingor Street, Ararat, Victoria 3377 Australia
Fetching address: 551-557 Warrigal Road, Ashwood, Victoria 3

In [46]:
# check missing value
processed_wws.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Store Name  244 non-null    object
 1   Address     244 non-null    object
 2   Latitude    234 non-null    object
 3   Longitude   234 non-null    object
dtypes: object(4)
memory usage: 7.8+ KB


In [47]:
processed_wws.head(5)

Unnamed: 0,Store Name,Address,Latitude,Longitude
0,Woolworths Store #3195 - Abbotsford,"313 Victoria Street, Abbotsford, Victoria 3067 Australia",-37.809771,144.995362
1,Woolworths Store #3291 - Airport West,"""Westfield Airport West, 25-39 Louis Street"", Airport West, Victoria 3042 Australia",-37.717503,144.888754
2,Woolworths Store #3066 - Lucas (Alfredton),"Cnr Dyson & Remembrance Drv, Alfredton, Victoria 3350 Australia",-37.558214,143.799815
3,Woolworths Store #3194 - Altona North,"2-32 Borrack Square, Altona North, Victoria 3025 Australia",-37.834887,144.845816
4,Woolworths Store #3164 - Ararat,"3 Ingor Street, Ararat, Victoria 3377 Australia",-37.283822,142.928223


In [48]:
pd.set_option('display.max_colwidth', 100)

missing_data = processed_wws[processed_wws['Latitude'].isnull()]
missing_data

Unnamed: 0,Store Name,Address,Latitude,Longitude
64,Woolworths Store #3349 - Dandenong,"""Dandenong Plaza, Cnr Mccrae & Foster Street"", Dandenong, Victoria 3175 Australia",,
74,Woolworths Store #3301 - The Pines (Doncaster East),"""The Pines Shopping Centre, Cnr Reynolds And Blackburn Road"", Doncaster East, Victoria 3109 Aust...",,
80,Woolworths Store #3292 - Northland (East Preston),"""Northland Shopping Centre, 50 Murray Road"", East Preston, Victoria 3072 Australia",,
84,Woolworths Store #3168 - Endeavour Hills,"""Endeavour Hills Shopping Centre, Cnr Heatherton Road & Matthew Flinders Dr"", Endeavour Hills, V...",,
113,Woolworths Store #3244 - Karingal (P.O. End),"""Karingal Hub, 330 Cranbourne Road & Karingal Drive P.O. End"", Karingal, Victoria 3199 Australia",,
159,Woolworths Store #3142 - Mornington East,"Cnr Bentons And Dunns Roads, Mornington East, Victoria 3931 Australia",,
168,Woolworths Store #3145 - Niddrie,"""Niddrie Central Shopping Centre, Cnr Hoffmans & Keilor Road"", Niddrie, Victoria 3042 Australia",,
170,Woolworths Store #3368 - Oakleigh,"""Oakleigh Central Shopping Centre, Station Square, Station Street"", Oakleigh, Victoria 3166 Aust...",,
174,Woolworths Store #3395 - Pakenham Market Place,"""55 Henry Street, Pakenham Market Place"", Pakenham, Victoria 3810 Australia",,
237,Woolworths Safeway Store #3363 - Wodonga Plaza,"""Shop 30, Wodonga Plaza, Hume Highway"", Wodonga, Victoria 3690 Australia",,


In [49]:
# Manually fill in missing value
manual_lat_lon = {
    64 : (-37.9873098,145.2171451),
    74: (-37.7625969,145.1669369),
    80: (-37.7407108,145.030207),
    84: (-37.9752774,145.2761512),
    113: (-38.1513198,145.1653291),
    159: (-38.2463947,145.047232),
    168: (-37.7377383,144.8929298),
    170: (-37.8996402,145.0881917),
    174: (-38.0778465,145.4858093),
    237: (-36.120870, 146.886822)
}

In [50]:
# Iterate over the rows of the DataFrame
for index, row in processed_wws.iterrows():
    # Check if latitude or longitude is missing
    if pd.isnull(row['Latitude']) or pd.isnull(row['Longitude']):
        # If the index exists in the manual_lat_lon_by_index dictionary, fill the missing value
        if index in manual_lat_lon:
            lat, lon = manual_lat_lon[index]
            processed_wws.at[index, 'Latitude'] = lat
            processed_wws.at[index, 'Longitude'] = lon

In [51]:
processed_wws.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Store Name  244 non-null    object
 1   Address     244 non-null    object
 2   Latitude    244 non-null    object
 3   Longitude   244 non-null    object
dtypes: object(4)
memory usage: 7.8+ KB


# Preprocess Coles location data

In [70]:
df_coles = pd.read_csv(f"{landing_dir}/{subfolder}/LocationsData.csv")

In [71]:
df_coles.head(5)

Unnamed: 0,Brand,Name,Number,Address,Store Type,Contact,Unnamed: 6
0,COLES,RICHMOND SOUTH,482,188-196 Swan Street Richmond 3121 VIC,Retail,03 8520 6700,
1,COLES,BELMONT,501,65 High Street Belmont 3216 VIC,Retail,03 5243 3644,
2,COLES,PINEWOOD,504,Pinewood Centreway Shopping Centre Blackburn Road Mt Waverley 3149 VIC,Retail,03 9802 0254,
3,COLES,DIAMOND CREEK,506,Main Hurstbridge Road Diamond Creek 3089 VIC,Retail,03 9438 1999,
4,COLES,TOORONGA VILLAGE,507,Tooronga Road Glen Iris 3146 VIC,Retail,03 8823 6300,


In [72]:
# check missing value
df_coles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Brand       234 non-null    object 
 1   Name        234 non-null    object 
 2   Number      234 non-null    int64  
 3   Address     234 non-null    object 
 4   Store Type  234 non-null    object 
 5   Contact     234 non-null    object 
 6   Unnamed: 6  0 non-null      float64
dtypes: float64(1), int64(1), object(5)
memory usage: 12.9+ KB


In [73]:
# check duplicate data
print(df_coles.duplicated().sum())


0


In [74]:
# Add comma to 'double space' to split adddress
df_coles['Address'] = df_coles['Address'].str.replace('  ', ', ', regex=False)

In [75]:
# get the longtitude and latitude of each supermarket

# Initialize the geolocator using Nominatim
geolocator = Nominatim(user_agent="store_locator", timeout=10)

processed_coles = add_lat_long_to_dataframe(df_coles)

Fetching address: 188-196 Swan Street, Richmond 3121 VIC 
Fetching address: 65 High Street, Belmont 3216 VIC 
Fetching address: Pinewood Centreway Shopping Centre, Blackburn Road, Mt Waverley 3149 VIC 
Location not found for address: Pinewood Centreway Shopping Centre, Blackburn Road, Mt Waverley 3149 VIC 
Original address failed, trying simplified address: Blackburn Road, Mt Waverley 3149 VIC
Fetching address: Blackburn Road, Mt Waverley 3149 VIC
Fetching address: Main Hurstbridge Road, Diamond Creek 3089 VIC 
Fetching address: Tooronga Road, Glen Iris 3146 VIC 
Fetching address: The Glen Shopping Centre Cnr High Street & Springvale Road, Glen Waverley 3150 VIC 
Location not found for address: The Glen Shopping Centre Cnr High Street & Springvale Road, Glen Waverley 3150 VIC 
Original address failed, trying simplified address: Glen Waverley 3150 VIC
Fetching address: Glen Waverley 3150 VIC
Fetching address: Cnr Exeter Road & Maroondah Highway, Croydon North 3136 VIC 
Location not foun

In [76]:
# check missing value
processed_coles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Brand       234 non-null    object 
 1   Name        234 non-null    object 
 2   Number      234 non-null    int64  
 3   Address     234 non-null    object 
 4   Store Type  234 non-null    object 
 5   Contact     234 non-null    object 
 6   Unnamed: 6  0 non-null      float64
 7   Latitude    222 non-null    object 
 8   Longitude   222 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 16.6+ KB


In [77]:
pd.set_option('display.max_colwidth', 1000)

missing_coles = processed_coles[processed_coles['Latitude'].isnull()]
missing_coles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12 entries, 10 to 216
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Brand       12 non-null     object 
 1   Name        12 non-null     object 
 2   Number      12 non-null     int64  
 3   Address     12 non-null     object 
 4   Store Type  12 non-null     object 
 5   Contact     12 non-null     object 
 6   Unnamed: 6  0 non-null      float64
 7   Latitude    0 non-null      object 
 8   Longitude   0 non-null      object 
dtypes: float64(1), int64(1), object(7)
memory usage: 960.0+ bytes


In [78]:
missing_coles['Address']

10                     Sanctuary Lakes Shopping Centre, Shop 1Cnr Point Cook Road & Jamieson Way, Point Cook 3030 VIC 
16                                                          Cnr Britannia Street & Victoria Street, MIitcham 3132 VIC 
28                                        Warringal Mall Shopping Centre, Shop 1 Burgundy Street, Heidleberg 3084 VIC 
61                                              Northpoint Shopping Centre Hopkins Highway, North Warnambool 3280 VIC 
62                                                 Ivanhoe Plaza Shopping Centre, Livingston Street, Ivanhoe 3079 VIC 
65                                                                                  Latrobe Street, Melborne 3000 VIC 
111                                                Cnr Frankston/Cranbourne Road & Southgate Way, Langwarren 3910 VIC 
118                                                         Gippsland Shopping Centre, Cunningham Road, Sale 3850 VIC 
159    Casey Central Shopping Centre, Cnr Little

In [79]:
# Manually fill in missing value
manual_lat_lon_coles = {
    10: (-37.895479, 144.7522644),
    16: (-37.7625969, 145.1669369),
    28: (-37.755627, 145.0692596),
    61: (-38.3684998, 142.4947968),
    62: (-37.767898, 145.041290),
    65: (-37.809830, 144.963289),
    111: (-38.150985, 145.196710),
    118: (-38.106773, 147.064516),
    159: (-38.11342, 145.28326),
    183: (-37.621240, 145.006748), 
    210: (-36.548702, 145.984955),
    216: (-37.722511, 144.670048) 
}

In [80]:
# Iterate over the rows of the DataFrame
for index, row in processed_coles.iterrows():
    # Check if latitude or longitude is missing
    if pd.isnull(row['Latitude']) or pd.isnull(row['Longitude']):
        # If the index exists in the manual_lat_lon_coles dictionary, fill the missing value
        if index in manual_lat_lon_coles:
            lat, lon = manual_lat_lon_coles[index]
            processed_coles.at[index, 'Latitude'] = lat
            processed_coles.at[index, 'Longitude'] = lon

In [81]:
processed_coles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Brand       234 non-null    object 
 1   Name        234 non-null    object 
 2   Number      234 non-null    int64  
 3   Address     234 non-null    object 
 4   Store Type  234 non-null    object 
 5   Contact     234 non-null    object 
 6   Unnamed: 6  0 non-null      float64
 7   Latitude    234 non-null    object 
 8   Longitude   234 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 16.6+ KB


# Union data and visualization

In [85]:
# For coles dataset, merge 'Brand', 'Name' and 'Number' to a new feature 'Store Name'
processed_coles['Number'] = processed_coles['Number'].astype(str)
processed_coles['Store Name'] = processed_coles['Brand'] + ' #' + processed_coles['Number'] + ' - ' + processed_coles['Name']

# select 'Store Name', 'Address', 'Latitude' and 'Longitude' from both dataset
df_coles_selected = processed_coles[['Store Name', 'Address', 'Latitude', 'Longitude']]
df_wws_selected = processed_wws[['Store Name', 'Address', 'Latitude', 'Longitude']]

# Union these 2 dataset
df_union = pd.concat([df_wws_selected, df_coles_selected], ignore_index=True)

df_union.head(5)


Unnamed: 0,Store Name,Address,Latitude,Longitude
0,Woolworths Store #3195 - Abbotsford,"313 Victoria Street, Abbotsford, Victoria 3067 Australia",-37.809771,144.995362
1,Woolworths Store #3291 - Airport West,"""Westfield Airport West, 25-39 Louis Street"", Airport West, Victoria 3042 Australia",-37.717503,144.888754
2,Woolworths Store #3066 - Lucas (Alfredton),"Cnr Dyson & Remembrance Drv, Alfredton, Victoria 3350 Australia",-37.558214,143.799815
3,Woolworths Store #3194 - Altona North,"2-32 Borrack Square, Altona North, Victoria 3025 Australia",-37.834887,144.845816
4,Woolworths Store #3164 - Ararat,"3 Ingor Street, Ararat, Victoria 3377 Australia",-37.283822,142.928223


In [86]:
# save data
output_path = f"{raw_dir}/{subfolder}/coles_wws_data.csv"
df_union.to_csv(output_path, index=False)

In [92]:
# Visualize data

# Create a geometry column from latitude and longitude
geometry = [Point(xy) for xy in zip(df_union['Longitude'], df_union['Latitude'])]

# Create a GeoDataFrame
gdf = gpd.GeoDataFrame(df_union, geometry=geometry)

# Create a folium map object, centered at a location (e.g., Melbourne)
m = folium.Map(location=[-37.8136, 144.9631], zoom_start=12)  # Adjust the zoom level as needed

# Create a MarkerCluster object
marker_cluster = MarkerCluster().add_to(m)

# Add markers to the cluster
for _, row in gdf.iterrows():
    folium.Marker(
        location=[row['Latitude'], row['Longitude']],
        popup=f"{row['Store Name']}",
        tooltip=row['Store Name']
    ).add_to(marker_cluster)

m

# Save the map to an HTML file for viewing in a web browser
# m.save("hospital_map_with_cluster.html")
