# **Weather Data Lookup**

### **Finding the Nearest Weather Station to all Traffic Stations**

1. Read the trafficData.csv and trafficStations.csv files

In [31]:
import pandas as pd

# load the cleaned traffic measurements and the station metadata,
# one DataFrame per CSV file (unpacked in the order the paths are listed)
trafficData, trafficStations = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets_cleaned/trafficData.csv',
        'datasets_cleaned/trafficStations.csv',
    )
)

In [32]:
# sanity check: (rows, columns) of the traffic measurements as loaded,
# before any columns are dropped in the following steps
print(trafficData.shape)

(3465422, 35)


2. Merge station coordinates information with traffic data measurements

In [33]:
# only these columns are needed downstream; dropping the rest early keeps
# memory use down on the multi-million-row measurement table
traffic_columns = ['station_key', 'year', 'month', 'day']
location_columns = ['station_key', 'wgs84_latitude', 'wgs84_longitude']

# station coordinates, keyed by station_key
stationLocations = trafficStations[location_columns]

# attach latitude/longitude to every measurement row; a left join keeps all
# measurements even when a station has no matching location record
trafficData = pd.merge(
    trafficData[traffic_columns],
    stationLocations,
    how='left',
    on='station_key',
)

3. Round latitude and longitude values to the nearest 0.05 degrees

In [34]:
# spacing, in degrees, of the grid the station coordinates are snapped to
grid_step = 0.05

# snap each coordinate to its nearest grid point (divide, round to an integer
# number of steps, multiply back), then discard the raw coordinates.
# NOTE(review): the multiply-back can leave binary floating-point residue
# (e.g. 3 * 0.05 == 0.15000000000000002), so these values should only be
# compared against coordinates produced by this same formula.
trafficData = trafficData.assign(
    lat_rounded=trafficData['wgs84_latitude'].div(grid_step).round().mul(grid_step),
    lon_rounded=trafficData['wgs84_longitude'].div(grid_step).round().mul(grid_step),
).drop(columns=['wgs84_latitude', 'wgs84_longitude'])

4. Convert year, month, and day columns to a single datetime column

In [35]:
# the three integer columns that together identify a calendar day
date_parts = ['year', 'month', 'day']

# build a single datetime column from the parts, then drop the parts
trafficData = (
    trafficData
    .assign(date=pd.to_datetime(trafficData[date_parts]))
    .drop(columns=date_parts)
)

5. Group rows by coordinates

In [36]:
# columns that identify a grid location
location_keys = ['lat_rounded', 'lon_rounded']

# one row per distinct (location, date) pair: several stations can share a
# grid cell, and each pair only needs to be requested once
# (trafficData itself is left untouched for the later merge)
unique_requests = trafficData.loc[:, location_keys + ['date']].drop_duplicates()

# bucket the distinct dates by grid location
grouped = unique_requests.groupby(location_keys)

6. Determine all unique API requests to make

In [37]:
def _consecutive_date_runs(dates):
    """Yield (first_day, last_day) for every run of consecutive calendar days in `dates`."""
    ordered = dates.sort_values()
    # a new run starts wherever the gap to the previous date is not exactly one
    # day; the first date has no predecessor (NaN gap) so it always opens a run
    run_id = ordered.diff().dt.days.ne(1).cumsum()
    for _, run in ordered.groupby(run_id):
        yield run.iloc[0], run.iloc[-1]


# one API request per (location, unbroken date range)
api_requests = []
for (lat, lon), group in grouped:
    api_requests.extend(
        {
            'lat_rounded': lat,
            'lon_rounded': lon,
            'start_date': first_day,
            'end_date': last_day,
        }
        for first_day, last_day in _consecutive_date_runs(group['date'])
    )

# Create a DataFrame of API requests
request_df = pd.DataFrame(api_requests)

# format both range boundaries as YYYYMMDD integers
for boundary in ('start_date', 'end_date'):
    request_df[boundary] = request_df[boundary].dt.strftime('%Y%m%d').astype(int)


7. Create function to retrieve API response

In [38]:
import requests

def get_weather_data(start, finish, lat_rounded, lon_rounded,
                     username=None, password=None, timeout=60):
    """Fetch daily rainfall and min/max temperature from the SILO DataDrill API.

    Parameters
    ----------
    start, finish : str or int
        First and last day of the requested range, formatted YYYYMMDD.
    lat_rounded, lon_rounded : float
        Coordinates of the point to query. They are echoed back unchanged in
        every returned record so the records can be joined on them.
    username, password : str, optional
        API credentials. When omitted, the ``SILO_USERNAME`` / ``SILO_PASSWORD``
        environment variables are used, falling back to the values this
        notebook previously embedded in the URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    list of dict
        One record per day returned by the API, with keys ``date``,
        ``lat_rounded``, ``lon_rounded``, ``daily_rain``, ``max_temp`` and
        ``min_temp``. A variable missing from the response is ``None``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.RequestException
        For connection problems, including a timeout.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # NOTE(review): the fallback values are kept only so existing runs keep
    # working; prefer setting the environment variables and removing the
    # literals from the notebook before sharing it.
    if username is None:
        username = os.environ.get("SILO_USERNAME", "danielalexanderchung@outlook.com")
    if password is None:
        password = os.environ.get("SILO_PASSWORD", "apirequest")

    url = "https://www.longpaddock.qld.gov.au/cgi-bin/silo/DataDrillDataset.php"

    # let requests build and escape the query string instead of concatenating it
    params = {
        "lat": lat_rounded,
        "lon": lon_rounded,
        "start": start,
        "finish": finish,
        "format": "json",
        # presumably selects the rain / max temp / min temp variables read below
        # -- TODO confirm against the API documentation
        "comment": "RXN",
        "username": username,
        "password": password,
    }

    # a timeout stops one stalled connection from hanging the whole collection run
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    # treat a missing or null "data" entry as "no days returned"
    daily_data = response.json().get("data") or []

    # extract the weather variables for each date in the range
    weather_list = []
    for row in daily_data:
        variables = {
            var['variable_code']: var['value']
            for var in (row.get('variables') or [])
        }
        weather_list.append({
            'date': row.get("date"),
            'lat_rounded': lat_rounded,
            'lon_rounded': lon_rounded,
            'daily_rain': variables.get("daily_rain"),
            'max_temp': variables.get("max_temp"),
            'min_temp': variables.get("min_temp"),
        })

    return weather_list


8. Collect all API responses

In [39]:
import time
import sys

# daily weather records accumulated across all successful requests
all_API_responses = []

# requests that raised, kept so they can be inspected or retried afterwards
failed_requests = []

# pause between calls, in seconds, to avoid overloading the server (0 = no pause)
request_delay = 0

# keys of the records returned by get_weather_data; passing them explicitly
# means weatherData has these columns even when no request succeeded
weather_columns = ['date', 'lat_rounded', 'lon_rounded', 'daily_rain', 'max_temp', 'min_temp']

total_requests = len(request_df)

# itertuples keeps each column's own dtype (iterrows would upcast the integer
# dates to float because the row also holds float coordinates)
for done, request in enumerate(request_df.itertuples(index=False), start=1):
    lat = request.lat_rounded
    lon = request.lon_rounded
    start = str(int(request.start_date))
    end = str(int(request.end_date))

    try:
        API_response = get_weather_data(start, end, lat, lon)
        all_API_responses.extend(API_response)
    except Exception as e:
        # best effort: one failed range must not abort the remaining requests
        failed_requests.append({
            'lat_rounded': lat,
            'lon_rounded': lon,
            'start_date': start,
            'end_date': end,
            'error': str(e),
        })
        print(f"Failed request for {lat}, {lon} from {start} to {end}: {e}")

    # display progress on a single, continuously overwritten line
    sys.stdout.write(f"\rProcessed {done}/{total_requests} requests.")
    sys.stdout.flush()

    if request_delay:
        time.sleep(request_delay)

# save as DataFrame
weatherData = pd.DataFrame(all_API_responses, columns=weather_columns)


Processed 450/29080 requests.

KeyboardInterrupt: 

In [None]:
# columns the two frames are joined on, and the weather values carried over
merge_keys = ['lat_rounded', 'lon_rounded', 'date']
weather_values = ['daily_rain', 'max_temp', 'min_temp']

# both sides need a datetime `date` column for the join keys to compare equal
for frame in (trafficData, weatherData):
    frame['date'] = pd.to_datetime(frame['date'])

# left join: every trafficData row is kept, with missing weather values where
# no weather record matches its location and date
weatherData_merged = pd.merge(
    trafficData,
    weatherData[['date', 'lat_rounded', 'lon_rounded'] + weather_values],
    on=merge_keys,
    how='left',
)

# keep only the join keys and weather values (this drops station_key)
weatherData_merged = weatherData_merged.loc[:, merge_keys + weather_values]

# write the result to a CSV file
weatherData_merged.to_csv("datasets_cleaned/weatherData.csv", index=False)

In [None]:
# report (rows, columns) of each frame to check the merge kept every traffic row
size_report = (
    ("trafficData size:", trafficData),
    ("weatherData size:", weatherData),
    ("weatherData_merged size:", weatherData_merged),
    ("request_df size:", request_df),
)
for label, frame in size_report:
    print(label, frame.shape)

trafficData size: (50, 4)
weatherData size: (38, 6)
weatherData_merged size: (50, 6)
request_df size: (38, 4)
