Created by Wanyu Xu, Sep 3

In [1]:
import os
import pandas as pd
import csv
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
import re
import geopandas as gpd
from shapely.geometry import Point
import folium
from folium.plugins import MarkerCluster

In [2]:
# Create the folders used to load and save the datasets
base_dir = '../../data/'
landing_dir = os.path.join(base_dir, 'landing')  # input CSVs are read from here
raw_dir = os.path.join(base_dir, 'raw')          # processed output is written here

subfolder = 'Coles_WWS'

# define the plot output dir
plot_dir = '../../plots'

# os.makedirs creates any missing parent folders as well, and exist_ok=True
# makes re-running this cell a no-op instead of a check-then-create race.
for output_dir in (os.path.join(raw_dir, subfolder), os.path.join(plot_dir, subfolder)):
    os.makedirs(output_dir, exist_ok=True)


In [3]:
# Load the Woolworths store list from the landing folder
wws_csv_path = f"{landing_dir}/{subfolder}/woolworths_stores.csv"
df_wws = pd.read_csv(wws_csv_path)

In [4]:
# Function to simplify the address by removing unnecessary parts and fixing abbreviations
def simplify_address(address):
    """
    Simplifies the address by removing unnecessary parts (like mall names) and correcting abbreviations.

    Parameters:
    address (str): The address to simplify.

    Returns:
    str: The address with double quotes removed, 'Cnr' expanded to 'corner',
         '&' replaced by 'and', and everything up to and including the first
         comma dropped. If dropping that leading part would leave nothing, the
         normalised address is returned with the leading part kept, so the
         caller never receives an empty query.
    """
    # Remove every double quote (this also covers runs of triple quotes)
    cleaned = address.replace('"', '')

    # Replace common abbreviations
    cleaned = re.sub(r'\bCnr\b', 'corner', cleaned, flags=re.IGNORECASE)
    cleaned = cleaned.replace('&', 'and')

    # Remove mall or building names before the first comma
    simplified = re.sub(r'^[^,]+,\s*', '', cleaned).strip()

    # If nothing remains after simplification, fall back to the normalised address
    return simplified or cleaned.strip()

# Function to try using a single street if the address is an intersection
def use_single_street(address):
    """
    Removes the second street from intersection addresses to focus on one street for geocoding.

    Parameters:
    address (str): The address to reduce, typically the output of simplify_address.

    Returns:
    str: - the address unchanged when it does not contain the word 'corner';
         - the text before 'corner' when there is any
           (e.g. '12 Main St corner Second St' -> '12 Main St');
         - otherwise the first street named after 'corner', followed by the rest
           of the address (e.g. 'corner A St and B Rd, Town' -> 'A St, Town').
    """
    # Match 'corner' as a whole word so names that merely contain it are left alone
    match = re.search(r'\bcorner\b', address)
    if match is None:
        return address

    # Text in front of 'corner' is already a single street: keep only that
    leading = address[:match.start()].strip()
    if leading:
        return leading

    # The address starts with 'corner', so the first street comes after the keyword.
    # Splitting on 'corner' and taking element 0 would return an empty string here.
    remainder = address[match.end():].strip(' ,')
    intersection, _, locality = remainder.partition(',')
    first_street = re.split(
        r'\s+(?:and|&)\s+', intersection.strip(), maxsplit=1, flags=re.IGNORECASE
    )[0].strip()
    locality = locality.strip()

    if first_street and locality:
        return f"{first_street}, {locality}"
    return first_street or locality

# Function to get latitude and longitude from the address
def get_lat_long_from_geopy(address, retries=3):
    """
    Fetches the latitude and longitude for a given address using geopy.

    The request is sent through the module-level `geolocator` object, which
    must be created before this function is called.

    Parameters:
    address (str): The address to geocode.
    retries (int): Number of attempts made when a request fails.

    Returns:
    tuple: (latitude, longitude), or (None, None) when the address is empty or
           not a string, when the geocoder finds no match, or when every
           attempt raised an error.
    """
    # Nothing to geocode: skip the request (and the retry sleeps) entirely
    if not isinstance(address, str) or not address.strip():
        print(f"Location not found for address: {address}")
        return None, None

    for attempt in range(retries):
        print(f"Fetching address: {address}")
        try:
            # Get the latitude and longitude of the address
            location = geolocator.geocode(address)
        except GeocoderTimedOut:
            print(f"Timeout for address: {address}. Attempt {attempt + 1} of {retries}")
        except Exception as e:
            # Best effort: report the error and retry instead of aborting the whole run
            print(f"Error fetching data for address: {address}. Error: {e}. Attempt {attempt + 1} of {retries}")
        else:
            if location:
                return location.latitude, location.longitude  # Return lat, long
            # A definite "no match" answer will not change on retry
            print(f"Location not found for address: {address}")
            return None, None

        # Wait for 2 seconds before retrying; there is nothing to wait for after the last attempt
        if attempt < retries - 1:
            time.sleep(2)
    return None, None

# Helper: list the queries to try for one address, most specific first
def _candidate_addresses(address):
    """
    Yields (log message, query) pairs for one address: the full address, then the
    simplified address, then the single-street address. Empty queries and queries
    identical to an earlier one are skipped. Each fallback is only computed when
    the caller asks for it, i.e. after the previous query failed.
    """
    if not isinstance(address, str) or not address.strip():
        return  # missing address: nothing to geocode

    tried = {address}
    yield None, address

    simplified = simplify_address(address)
    if simplified and simplified not in tried:
        tried.add(simplified)
        yield f"Original address failed, trying simplified address: {simplified}", simplified

    single_street = use_single_street(simplified)
    if single_street and single_street not in tried:
        yield f"Trying single street address: {single_street}", single_street


# Function to add latitude and longitude to a DataFrame
def add_lat_long_to_dataframe(df, request_delay=1):
    """
    Adds latitude and longitude to each row in the DataFrame.

    For every row the full address is geocoded first; if that fails the
    simplified address is tried, and then the single-street address.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing address information in an
                       'Address' column. It is modified in place.
    request_delay (float): Seconds to wait after each geocoding request.

    Returns:
    pd.DataFrame: The same DataFrame with 'Latitude' and 'Longitude' columns;
                  rows that could not be geocoded keep None in both.
    """
    # Add new 'Latitude' and 'Longitude' columns
    df['Latitude'] = None
    df['Longitude'] = None

    # Iterate through each address
    for index, address in df['Address'].items():
        latitude = longitude = None

        for message, candidate in _candidate_addresses(address):
            if message:
                print(message)
            latitude, longitude = get_lat_long_from_geopy(candidate)

            # Sleep after every request, not once per row, so fallback queries
            # are not sent back-to-back
            time.sleep(request_delay)

            # Compare against None explicitly so a 0.0 coordinate counts as found
            if latitude is not None and longitude is not None:
                break

        # Update the DataFrame with the results
        df.at[index, 'Latitude'] = latitude
        df.at[index, 'Longitude'] = longitude

    return df  # Return the updated DataFrame

# Preprocess Woolworths location data

In [None]:
# Preview the first five Woolworths rows
df_wws.head(n=5)

In [None]:
# check missing values: info() reports the non-null count and dtype of each column
df_wws.info()

In [None]:
# Count the rows that are exact duplicates of an earlier row
wws_duplicate_count = df_wws.duplicated().sum()
print(wws_duplicate_count)


In [8]:
# Rename the combined address column to 'Address', the name the geocoding helper reads
wws_column_names = {'Address, City, State Zip Country': 'Address'}
df_wws = df_wws.rename(columns=wws_column_names)

In [None]:
# Get the longitude and latitude of each Woolworths supermarket

# Nominatim client; get_lat_long_from_geopy reads this module-level name
geolocator = Nominatim(timeout=10, user_agent="store_locator")

# Geocode every address; the helper adds 'Latitude' and 'Longitude' to df_wws and returns it
processed_wws = add_lat_long_to_dataframe(df_wws)

In [None]:
# check missing values: the non-null counts of 'Latitude' and 'Longitude' show how many stores were geocoded
processed_wws.info()

In [None]:
# Preview the first five geocoded Woolworths rows
processed_wws.head(n=5)

In [None]:
# Show up to 100 characters per column when displaying the addresses
pd.set_option('display.max_colwidth', 100)

# Woolworths rows that still have no latitude after geocoding
wws_latitude_missing = processed_wws['Latitude'].isnull()
missing_data = processed_wws[wws_latitude_missing]
missing_data

In [13]:
# Hand-collected coordinates for the Woolworths rows that could not be geocoded.
# Key: row index in processed_wws; value: (latitude, longitude).
manual_lat_lon = {
    64: (-37.9873098, 145.2171451),
    74: (-37.7625969, 145.1669369),
    80: (-37.7407108, 145.030207),
    84: (-37.9752774, 145.2761512),
    113: (-38.1513198, 145.1653291),
    159: (-38.2463947, 145.047232),
    168: (-37.7377383, 144.8929298),
    170: (-37.8996402, 145.0881917),
    174: (-38.0778465, 145.4858093),
    237: (-36.120870, 146.886822),
}

In [14]:
# Fill rows that are missing a coordinate from the manual_lat_lon dictionary
for row_index, store in processed_wws.iterrows():
    already_located = pd.notnull(store['Latitude']) and pd.notnull(store['Longitude'])

    # Leave rows alone when they already have both coordinates or have no manual entry
    if already_located or row_index not in manual_lat_lon:
        continue

    manual_lat, manual_lon = manual_lat_lon[row_index]
    processed_wws.at[row_index, 'Latitude'] = manual_lat
    processed_wws.at[row_index, 'Longitude'] = manual_lon

In [None]:
# Re-check the non-null counts after the manual fill
processed_wws.info()

# Preprocess Coles location data

In [16]:
# Load the Coles store list from the landing folder
coles_csv_path = f"{landing_dir}/{subfolder}/LocationsData.csv"
df_coles = pd.read_csv(coles_csv_path)

In [None]:
# Preview the first five Coles rows
df_coles.head(n=5)

In [None]:
# check missing values: info() reports the non-null count and dtype of each column
df_coles.info()

In [None]:
# Count the rows that are exact duplicates of an earlier row
coles_duplicate_count = df_coles.duplicated().sum()
print(coles_duplicate_count)


In [20]:
# Replace each double space in 'Address' with ', ' so the address parts are comma-separated
address_with_commas = df_coles['Address'].str.replace('  ', ', ', regex=False)
df_coles['Address'] = address_with_commas

In [None]:
# Get the longitude and latitude of each Coles supermarket

# Nominatim client; get_lat_long_from_geopy reads this module-level name
geolocator = Nominatim(timeout=10, user_agent="store_locator")

# Geocode every address; the helper adds 'Latitude' and 'Longitude' to df_coles and returns it
processed_coles = add_lat_long_to_dataframe(df_coles)

In [None]:
# check missing values: the non-null counts of 'Latitude' and 'Longitude' show how many stores were geocoded
processed_coles.info()

In [None]:
# Show up to 1000 characters per column when displaying the addresses
pd.set_option('display.max_colwidth', 1000)

# Coles rows that still have no latitude after geocoding
coles_latitude_missing = processed_coles['Latitude'].isnull()
missing_coles = processed_coles[coles_latitude_missing]
missing_coles.info()

In [None]:
# Show the addresses of the Coles rows that have no latitude
missing_coles['Address']

In [25]:
# Hand-collected coordinates for the Coles rows that could not be geocoded.
# Key: row index in processed_coles; value: (latitude, longitude).
manual_lat_lon_coles = {
    10: (-37.895479, 144.7522644),
    16: (-37.7625969, 145.1669369),
    28: (-37.755627, 145.0692596),
    61: (-38.3684998, 142.4947968),
    62: (-37.767898, 145.041290),
    65: (-37.809830, 144.963289),
    111: (-38.150985, 145.196710),
    118: (-38.106773, 147.064516),
    159: (-38.11342, 145.28326),
    183: (-37.621240, 145.006748),
    210: (-36.548702, 145.984955),
    216: (-37.722511, 144.670048),
}

In [26]:
# Fill rows that are missing a coordinate from the manual_lat_lon_coles dictionary
for row_index, store in processed_coles.iterrows():
    already_located = pd.notnull(store['Latitude']) and pd.notnull(store['Longitude'])

    # Leave rows alone when they already have both coordinates or have no manual entry
    if already_located or row_index not in manual_lat_lon_coles:
        continue

    manual_lat, manual_lon = manual_lat_lon_coles[row_index]
    processed_coles.at[row_index, 'Latitude'] = manual_lat
    processed_coles.at[row_index, 'Longitude'] = manual_lon

In [None]:
# Re-check the non-null counts after the manual fill
processed_coles.info()

# Union data and visualization

In [None]:
# For the Coles dataset, combine 'Brand', 'Number' and 'Name' into a new 'Store Name' feature
processed_coles['Number'] = processed_coles['Number'].astype(str)
processed_coles['Store Name'] = (
    processed_coles['Brand']
    + ' #'
    + processed_coles['Number']
    + ' - '
    + processed_coles['Name']
)

# Columns kept from both datasets
shared_columns = ['Store Name', 'Address', 'Latitude', 'Longitude']
df_coles_selected = processed_coles[shared_columns]
df_wws_selected = processed_wws[shared_columns]

# Stack Woolworths first, then Coles, with a fresh 0..n-1 index
df_union = pd.concat([df_wws_selected, df_coles_selected], ignore_index=True)

df_union.head(5)


In [29]:
# save data: write the combined store table to the raw folder, without the index column
output_path = f"{raw_dir}/{subfolder}/coles_wws_data.csv"
df_union.to_csv(output_path, index=False)

In [5]:
# Visualize data

# Reload the combined store table saved earlier
df = pd.read_csv(f"{raw_dir}/{subfolder}/coles_wws_data.csv")

# Create a geometry column from latitude and longitude
geometry = [Point(xy) for xy in zip(df['Longitude'], df['Latitude'])]

# Create a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry=geometry)

# folium.Marker rejects NaN coordinates, so a single store that was never
# geocoded would abort the whole map. Plot only the rows that have both values.
mappable = gdf.dropna(subset=['Latitude', 'Longitude'])
skipped = len(gdf) - len(mappable)
if skipped:
    print(f"Skipping {skipped} store(s) without coordinates")

# Create a folium map object, centered at a location (e.g., Melbourne)
m = folium.Map(location=[-37.8136, 144.9631], zoom_start=12)  # Adjust the zoom level as needed

# Create a MarkerCluster object
marker_cluster = MarkerCluster().add_to(m)

# Add one marker per store to the cluster
for latitude, longitude, store_name in zip(
    mappable['Latitude'], mappable['Longitude'], mappable['Store Name']
):
    label = str(store_name)  # same text for the popup and the hover tooltip
    folium.Marker(
        location=[latitude, longitude],
        popup=label,
        tooltip=label
    ).add_to(marker_cluster)

# Save the map to an HTML file for viewing in a web browser
output_path = f"{plot_dir}/{subfolder}/Coles_WWS.html"
m.save(output_path)

m