Merging the e.coli and weather data 

Loading the data

In [39]:
import pandas as pd

# Load data
beach_data = pd.read_parquet("..\\data\\cleaned\\cleaned_hanlans_point_beach.parquet")
weather_data = pd.read_parquet("..\\data\\cleaned\\cleaned_toronto_city.parquet")

print(f"beach data shape: {beach_data.shape}")
print(f"beach data col. names: {beach_data.columns.to_list()}")

print(f"weather data shape: {weather_data.shape}")
print(f"weather data col. names: {weather_data.columns.to_list()}")

print(beach_data['e_coli_cfu'].head(5))
print(weather_data.head(5))


beach data shape: (1567, 5)
beach data col. names: ['beach_id', 'beach_name', 'e_coli_cfu', 'water_quality_advisory', 'safety_status']
weather data shape: (6423, 3)
weather data col. names: ['station_name', 'precip_mm', 'date']
date
2008-08-23     10.0
2008-08-24     10.0
2008-08-25     70.0
2008-08-26    159.0
2008-08-27     23.0
Name: e_coli_cfu, dtype: float64
            station_name  precip_mm       date
Date/Time                                     
2007-01-01  TORONTO CITY        0.0 2007-01-01
2007-01-02  TORONTO CITY        0.0 2007-01-02
2007-01-03  TORONTO CITY        0.0 2007-01-03
2007-01-04  TORONTO CITY        3.2 2007-01-04
2007-01-05  TORONTO CITY        6.4 2007-01-05


Split beach data by year

In [35]:
annual_beach = {year: group for year, group in beach_data.groupby(beach_data['date'].dt.year)}

print(annual_beach.keys())
print(annual_beach[2009])

KeyError: 'date'

Align weather data with 7-day lookback 

In [22]:
annual_weather = {}
for year, beach_df in annual_beach.items():
    # Calculate lookback window for THIS YEAR
    start_date = beach_df['date'].min() - pd.Timedelta(days=7)  # 7-day lookback
    end_date = beach_df['date'].max()
    
    # Filter weather data
    mask = (weather_data['date'] >= start_date) & (weather_data['date'] <= end_date)
    annual_weather[year] = weather_data.loc[mask].copy()
    
    print(f"{year}: Weather from {start_date.date()} to {end_date.date()} ({len(annual_weather[year])} rows)")

2008: Weather from 2008-08-16 to 2008-08-31 (16 rows)
2009: Weather from 2009-05-24 to 2009-09-06 (106 rows)
2010: Weather from 2010-05-23 to 2010-09-07 (108 rows)
2011: Weather from 2011-05-05 to 2011-09-05 (124 rows)
2012: Weather from 2012-05-15 to 2012-09-03 (110 rows)
2013: Weather from 2013-05-14 to 2013-09-02 (110 rows)
2014: Weather from 2014-05-13 to 2014-09-01 (105 rows)
2015: Weather from 2015-05-12 to 2015-09-07 (116 rows)
2016: Weather from 2016-05-17 to 2016-09-06 (110 rows)
2017: Weather from 2017-05-16 to 2017-09-04 (102 rows)
2018: Weather from 2018-05-15 to 2018-09-03 (105 rows)
2019: Weather from 2019-05-14 to 2019-09-01 (110 rows)
2020: Weather from 2020-06-09 to 2020-09-07 (90 rows)
2021: Weather from 2021-05-18 to 2021-09-06 (111 rows)
2022: Weather from 2022-05-17 to 2022-09-05 (112 rows)
2023: Weather from 2023-05-16 to 2023-09-10 (118 rows)
2024: Weather from 2024-05-14 to 2024-09-02 (110 rows)


Merge data

In [None]:
merged = beach_data.merge(
    weather_data,
    how="inner",
    left_index=True,
    right_index=True
)

merged.info()