In [None]:
import numpy as np
import pandas as pd

# Data Cleaning and Visualisation

In [23]:
data = pd.read_csv("data/fuelPrice_NSW.csv", parse_dates=['PriceUpdatedDate'])

print(f"Dataset size before cleaning: {data.shape[0]}")

# Replace whitespace with NaN
data.replace(r'^\s*$', np.nan, regex=True, inplace=True)

# Convert Price to numeric
data['Price'] = pd.to_numeric(data['Price'], errors='coerce')

# Create separate date and time columns
data['Date'] = data['PriceUpdatedDate'].dt.date
data['Time'] = data['PriceUpdatedDate'].dt.time

# Drop all rows with missing or NaN values
data.dropna(inplace=True)

# Drop all duplicate rows
data.drop_duplicates(inplace=True)

print(f"Dataset size after cleaning: {data.shape[0]}")

data.head(n=-1)

Dataset size before cleaning: 98925
Dataset size after cleaning: 96854


Unnamed: 0,ServiceStationName,FuelCode,PriceUpdatedDate,Price,Latitude,Longitude,Date,Time
0,7-Eleven Minchinbury,E10,2025-08-31 22:10:00,159.9,-33.778213,150.808089,2025-08-31,22:10:00
1,7-Eleven Minchinbury,U91,2025-08-31 22:10:00,163.9,-33.778213,150.808089,2025-08-31,22:10:00
2,7-Eleven Minchinbury,P95,2025-08-31 22:10:00,178.9,-33.778213,150.808089,2025-08-31,22:10:00
3,7-Eleven Minchinbury,P98,2025-08-31 22:10:00,185.9,-33.778213,150.808089,2025-08-31,22:10:00
4,7-Eleven Blacktown,P98,2025-08-31 18:44:00,188.9,-33.754838,150.891467,2025-08-31,18:44:00
...,...,...,...,...,...,...,...,...
98919,7-Eleven Croydon Park,E10,2016-08-01 00:28:00,103.9,-33.894219,151.111471,2016-08-01,00:28:00
98920,7-Eleven Croydon Park,P98,2016-08-01 00:28:00,121.9,-33.894219,151.111471,2016-08-01,00:28:00
98921,7-Eleven Croydon Park,U91,2016-08-01 00:28:00,105.9,-33.897687,151.099818,2016-08-01,00:28:00
98922,7-Eleven Croydon Park,P98,2016-08-01 00:28:00,121.9,-33.897687,151.099818,2016-08-01,00:28:00


In [24]:
# Filter outliers using IQR method
Q1 = data['Price'].quantile(0.25)
Q3 = data['Price'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

print(f"Dataset size after removing price outliers: {data.shape[0]}")

Dataset size after removing price outliers: 96522
