In [1]:
# Import libraries/packages
import os

import googlemaps
import pandas as pd

In [None]:
# Read the raw (not yet anonymized) fuel dataset into the working frame
raw_path = r'C:\Users\marius\OneDrive\UWI - Postgraduate - Data Science\Thesis\thesis_code\data\raw\fuel_data_raw.csv'
df = pd.read_csv(raw_path)

In [None]:
# Remove every column whose name mentions 'Origin' or 'Destination', except the
# ID and latitude/longitude columns listed below, which are kept.
origin_keep = ['Origin_ID', 'Origin_Latitude', 'Origin_Longitude']
destination_keep = ['Destination_ID', 'Destination_Latitude', 'Destination_Longitude']

columns_to_drop = []
for column_name in df.columns:
    is_extra_origin = 'Origin' in column_name and column_name not in origin_keep
    is_extra_destination = 'Destination' in column_name and column_name not in destination_keep
    if is_extra_origin or is_extra_destination:
        columns_to_drop.append(column_name)

df.drop(columns=columns_to_drop, inplace=True)

def _build_label_mapping(values, prefix):
    """Map each distinct non-null value to an anonymous ``<prefix>_<n>`` label.

    Labels are numbered from 1 in order of first appearance. Missing values
    are left out of the mapping so they stay missing after ``Series.map``
    instead of being replaced by a fabricated label.

    Args:
        values: pandas Series holding the original identifiers.
        prefix: Text placed before the running number, e.g. ``"Station"``.

    Returns:
        dict mapping each original value to its anonymized label.
    """
    distinct = values.dropna().unique()
    return {old: f"{prefix}_{i}" for i, old in enumerate(distinct, start=1)}


# Anonymize origin names uniquely
if 'Origin_ID' in df.columns:
    origin_mapping = _build_label_mapping(df['Origin_ID'], 'Source')
    df['Origin_ID'] = df['Origin_ID'].map(origin_mapping)

# Rename Destination_IDs consistently.
# The column is accessed unconditionally, so a missing column raises KeyError.
destination_mapping = _build_label_mapping(df['Destination_ID'], 'Station')
df['Destination_ID'] = df['Destination_ID'].map(destination_mapping)

# Remove 'Truck' column
if 'Truck' in df.columns:
    df.drop(columns='Truck', inplace=True)

# Anonymize product names uniquely (column accessed unconditionally, like Destination_ID)
product_mapping = _build_label_mapping(df['Product'], 'Product')
df['Product'] = df['Product'].map(product_mapping)

# Anonymize Tractor and Trailer IDs
if 'Tractor' in df.columns:
    tractor_mapping = _build_label_mapping(df['Tractor'], 'Tractor')
    df['Tractor'] = df['Tractor'].map(tractor_mapping)

if 'Trailer' in df.columns:
    trailer_mapping = _build_label_mapping(df['Trailer'], 'Trailer')
    df['Trailer'] = df['Trailer'].map(trailer_mapping)

# Keep only the rows that have a Product_Storage_Capacity_Liters value
has_capacity = df['Product_Storage_Capacity_Liters'].notna()
df = df[has_capacity]

# Calculate road distance using Google Maps, with caching to avoid duplicates.
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so it is
# never stored in the notebook. When the variable is unset the key falls back to
# the empty string, exactly as before (googlemaps.Client rejects an empty key).
gmaps = googlemaps.Client(key=os.environ.get('GOOGLE_MAPS_API_KEY', ''))

# Memo of distances already requested, filled in by get_road_distance
distance_cache = {}

def get_road_distance(row):
    """Return the driving distance in kilometres between a row's origin and destination.

    Results are memoized in the module-level ``distance_cache``, keyed by the
    ``((origin_lat, origin_lon), (dest_lat, dest_lon))`` pair, so each distinct
    route is requested through ``gmaps.distance_matrix`` at most once. Failed
    lookups are cached as ``None`` as well, so a failing route is not retried.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``Origin_Latitude``, ``Origin_Longitude``, ``Destination_Latitude``
            and ``Destination_Longitude``.

    Returns:
        The distance in km (metres reported by the API divided by 1000), or
        ``None`` when a coordinate is missing or the lookup fails.
    """
    origin = (row['Origin_Latitude'], row['Origin_Longitude'])
    dest = (row['Destination_Latitude'], row['Destination_Longitude'])

    # A missing coordinate cannot resolve to a route, and NaN does not compare
    # equal to itself, so such keys would generally miss the cache on every row.
    # Skip the API call and report "no distance" directly.
    if any(pd.isna(value) for value in origin + dest):
        return None

    key = (origin, dest)
    if key not in distance_cache:
        try:
            result = gmaps.distance_matrix(origin, dest, mode='driving')
            element = result['rows'][0]['elements'][0]
            # Surface a non-OK element status by name instead of as an opaque
            # KeyError on the absent 'distance' field.
            status = element.get('status', 'OK')
            if status != 'OK':
                raise ValueError(f"element status {status}")
            meters = element['distance']['value']
            distance_cache[key] = meters / 1000
        except Exception as e:
            # Best effort: log the failure and remember it so the same route
            # is not requested again.
            print(f"Error retrieving distance for {key}: {e}")
            distance_cache[key] = None
    return distance_cache[key]

# Look up the road distance for every row; get_road_distance is called once per row
df['Distance_km'] = df.apply(get_road_distance, axis=1)

# Remove the raw coordinate columns now that the distance has been derived from them
coordinate_columns = [
    'Origin_Latitude', 'Origin_Longitude',
    'Destination_Latitude', 'Destination_Longitude',
]
df.drop(columns=coordinate_columns, inplace=True)

In [None]:
# Write the anonymized frame to CSV, leaving out the DataFrame index
anon_path = r'C:\Users\marius\OneDrive\UWI - Postgraduate - Data Science\Thesis\thesis_code\data\raw\fuel_data_raw_anon.csv'
df.to_csv(path_or_buf=anon_path, index=False)