In [3]:
# Import libraries/packages
import pandas as pd
import googlemaps, polyline
import os
import pickle
import math
from pathlib import Path

In [18]:
# Load the original dataset.
# ROOT is the parent of the current working directory; the raw CSV is read
# from data/raw beneath it.
ROOT = Path.cwd().parent
fuel_data_raw = pd.read_csv(ROOT.joinpath('data/raw/fuel_data_raw.csv'))

In [19]:
def _build_label_mapping(values, prefix):
    """Return {original value: '<prefix>_<n>'} for the distinct non-null entries of `values`.

    Labels are numbered from 1 in order of first appearance. Nulls are left out
    of the mapping so that Series.map() keeps them as NaN instead of handing
    missing data a fabricated label.
    """
    return {old: f"{prefix}_{i}" for i, old in enumerate(values.dropna().unique(), start=1)}


# Drop every column whose name contains 'Origin' or 'Destination', except the
# ID and coordinate columns (the coordinates are read later to look up distances).
kept_location_cols = {
    'Origin_ID', 'Origin_Latitude', 'Origin_Longitude',
    'Destination_ID', 'Destination_Latitude', 'Destination_Longitude',
}
location_cols_to_drop = [
    col for col in fuel_data_raw.columns
    if ('Origin' in col or 'Destination' in col) and col not in kept_location_cols
]
fuel_data_raw = fuel_data_raw.drop(columns=location_cols_to_drop)

# Anonymize origin names uniquely
if 'Origin_ID' in fuel_data_raw.columns:
    origin_mapping = _build_label_mapping(fuel_data_raw['Origin_ID'], "Source")
    fuel_data_raw['Origin_ID'] = fuel_data_raw['Origin_ID'].map(origin_mapping)

# Rename Destination_IDs consistently (nulls stay null, same as the other ID columns)
destination_mapping = _build_label_mapping(fuel_data_raw['Destination_ID'], "Station")
fuel_data_raw['Destination_ID'] = fuel_data_raw['Destination_ID'].map(destination_mapping)

# Remove 'Truck' column
if 'Truck' in fuel_data_raw.columns:
    fuel_data_raw = fuel_data_raw.drop(columns='Truck')

# Anonymize product names uniquely (nulls stay null, same as the other ID columns)
product_mapping = _build_label_mapping(fuel_data_raw['Product'], "Product")
fuel_data_raw['Product'] = fuel_data_raw['Product'].map(product_mapping)

# Anonymize Tractor and Trailer IDs
if 'Tractor' in fuel_data_raw.columns:
    tractor_mapping = _build_label_mapping(fuel_data_raw['Tractor'], "Tractor")
    fuel_data_raw['Tractor'] = fuel_data_raw['Tractor'].map(tractor_mapping)

if 'Trailer' in fuel_data_raw.columns:
    trailer_mapping = _build_label_mapping(fuel_data_raw['Trailer'], "Trailer")
    fuel_data_raw['Trailer'] = fuel_data_raw['Trailer'].map(trailer_mapping)

# Filter out rows with null Product_Storage_Capacity_Liters.
# .copy() makes the result an independent frame, so adding or dropping columns
# on it later does not act on a filtered selection of the previous frame.
fuel_data_raw = fuel_data_raw[fuel_data_raw['Product_Storage_Capacity_Liters'].notnull()].copy()

In [None]:
# 1.  CONFIGURATION
# The Google Maps key is read from the environment so it never has to be pasted
# into (and saved with) the notebook. When the variable is unset the key is
# empty, no client is created, and only already-cached distances are used.
API_KEY     = os.environ.get("GOOGLE_MAPS_API_KEY", "")
CACHE_FILE  = "distance_cache.pkl"   # relative path: resolved against the current working directory

gmaps = googlemaps.Client(key=API_KEY) if API_KEY else None
use_api = gmaps is not None

# 2.  LOAD or INITIALISE the distance cache
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load a cache file that this notebook produced itself.
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "rb") as f:
        distance_cache = pickle.load(f)
    print(f"Loaded {len(distance_cache):,} cached distances")
else:
    distance_cache = {}
    print("Starting with an empty distance cache")

# ------------------------------------------------------------------
# 3.  Helper that first checks the cache, then calls Google if allowed
def get_road_distance(row):
    """Return the driving distance in kilometres between a row's origin and destination.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "Origin_Latitude", "Origin_Longitude",
        "Destination_Latitude" and "Destination_Longitude".

    Returns
    -------
    float or None
        The cached distance when the coordinate pair is already in
        ``distance_cache``; otherwise, when ``use_api`` is true, the Distance
        Matrix result converted from metres to kilometres (and stored in the
        cache). None when a coordinate is missing, when running offline with
        no cached value, or when the lookup fails.
    """
    origin = (row["Origin_Latitude"], row["Origin_Longitude"])
    dest   = (row["Destination_Latitude"], row["Destination_Longitude"])

    # A pair with a missing coordinate cannot be routed, and NaN is not equal
    # to itself, so it is neither a dependable cache key nor worth an API call.
    if any(pd.isna(value) for value in origin + dest):
        return None

    key = (origin, dest)

    # 3a. Use cached value if we have it
    if key in distance_cache:
        return distance_cache[key]

    # 3b. No cache → call API if we can
    if use_api:
        try:
            result  = gmaps.distance_matrix(origin, dest, mode="driving")
            element = result["rows"][0]["elements"][0]

            # The request can succeed while this particular pair does not
            # (e.g. no route found); such an element carries no "distance".
            status = element.get("status", "OK")
            if status != "OK":
                print(f"Google API error for {key}: element status {status}")
                return None

            km = element["distance"]["value"] / 1000   # value is read as metres
            distance_cache[key] = km                   # store for next run
            return km
        except Exception as e:
            # Deliberately broad: one failed lookup must not abort the whole
            # DataFrame.apply run; the row simply gets no distance.
            print(f"Google API error for {key}: {e}")

    # 3c. Offline or API failed → return None (becomes NaN in DataFrame)
    return None

# ------------------------------------------------------------------
# 4.  Apply to your DataFrame
# Evaluated row by row; repeated origin/destination pairs are answered from
# distance_cache, so each distinct pair is looked up at most once per success.
fuel_data_raw["Distance_km"] = fuel_data_raw.apply(get_road_distance, axis=1)

# ------------------------------------------------------------------
# 5.  SAVE the updated cache (only if we actually queried the API)
if use_api:
    with open(CACHE_FILE, "wb") as f:
        pickle.dump(distance_cache, f)
    # Confirm the write so the cell output records how many distances were persisted.
    print(f"Saved {len(distance_cache):,} distances to {CACHE_FILE}")

# ------------------------------------------------------------------
# 6.  Finish anonymisation: the coordinates were only needed to look up
#     distances, so remove them before the data is inspected or saved.
fuel_data_raw = fuel_data_raw.drop(columns=[
    "Origin_Latitude", "Origin_Longitude",
    "Destination_Latitude", "Destination_Longitude"
])

Starting with an empty distance cache
Saved 49 distances to distance_cache.pkl


In [21]:
# Preview the first five rows of the processed frame.
fuel_data_raw.head(n=5)

Unnamed: 0,Delivery_ID,Origin_ID,Destination_ID,Truck_Tank_Capacity_Liters,Product,Delivered_Volume_Liters,Product_Storage_Capacity_Liters,Previous_Delivery_Date,Delivery_Date,Tractor,Trailer,Distance_km
0,56660,Source_1,Station_1,11365.0,Product_1,6781.0,20441.38,2022-12-24,2023-01-02,,Trailer_1,52.544
1,56852,Source_1,Station_1,12728.8,Product_1,4912.0,20441.38,2023-01-02,2023-01-05,,Trailer_2,52.544
2,56995,Source_1,Station_1,20003.0,Product_1,7467.0,20441.38,2023-01-05,2023-01-07,,Trailer_3,52.544
3,57252,Source_1,Station_1,12728.8,Product_1,4941.0,20441.38,2023-01-07,2023-01-11,,Trailer_2,52.544
4,57352,Source_1,Station_1,11365.0,Product_1,4498.0,20441.38,2023-01-11,2023-01-13,,Trailer_1,52.544


In [None]:
# Write the anonymized dataset to data/raw under ROOT, leaving out the
# DataFrame index so the file contains only the data columns.
anon_path = ROOT.joinpath('data/raw/fuel_data_raw_anon.csv')
fuel_data_raw.to_csv(path_or_buf=anon_path, index=False)