Merging the e.coli and weather data 

Loading the data

In [24]:
import pandas as pd

# Load data
beach_data = pd.read_parquet("..\\data\\cleaned\\cleaned_hanlans_point_beach.parquet")
weather_data = pd.read_parquet("..\\data\\cleaned\\cleaned_toronto_city.parquet")

print(beach_data.shape)
print(beach_data.columns)

print(weather_data.shape)
print(weather_data.columns)

print(beach_data.head)


(1567, 6)
Index(['date', 'beach_id', 'beach_name', 'e_coli_cfu',
       'water_quality_advisory', 'safety_status'],
      dtype='object')
(6423, 3)
Index(['station_name', 'precip_mm', 'date'], dtype='object')
<bound method NDFrame.head of            date  beach_id            beach_name  e_coli_cfu  \
5    2024-09-02       3.0  Hanlan's Point Beach       162.0   
6    2024-09-01       3.0  Hanlan's Point Beach        25.0   
7    2024-08-31       3.0  Hanlan's Point Beach        13.0   
8    2024-08-30       3.0  Hanlan's Point Beach        26.0   
9    2024-08-29       3.0  Hanlan's Point Beach        68.0   
...         ...       ...                   ...         ...   
1666 2008-08-27       3.0  Hanlan's Point Beach        23.0   
1667 2008-08-26       3.0  Hanlan's Point Beach       159.0   
1668 2008-08-25       3.0  Hanlan's Point Beach        70.0   
1669 2008-08-24       3.0  Hanlan's Point Beach        10.0   
1670 2008-08-23       3.0  Hanlan's Point Beach        10.0   

    

Split beach data by year

In [23]:
annual_beach = {year: group for year, group in beach_data.groupby(beach_data['date'].dt.year)}

print(annual_beach.keys())
print(annual_beach[2009])

dict_keys([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024])
           date  beach_id            beach_name  e_coli_cfu  \
1563 2009-09-06       3.0  Hanlan's Point Beach        10.0   
1564 2009-09-05       3.0  Hanlan's Point Beach        10.0   
1565 2009-09-04       3.0  Hanlan's Point Beach        12.0   
1566 2009-09-03       3.0  Hanlan's Point Beach        19.0   
1567 2009-09-02       3.0  Hanlan's Point Beach        22.0   
...         ...       ...                   ...         ...   
1657 2009-06-04       3.0  Hanlan's Point Beach        10.0   
1658 2009-06-03       3.0  Hanlan's Point Beach        11.0   
1659 2009-06-02       3.0  Hanlan's Point Beach        11.0   
1660 2009-06-01       3.0  Hanlan's Point Beach        10.0   
1661 2009-05-31       3.0  Hanlan's Point Beach        10.0   

                                 water_quality_advisory safety_status  
1563  Beach safe. E.coli levels at or below 100 per ...  

Align weather data with 7-day lookback 

In [22]:
annual_weather = {}
for year, beach_df in annual_beach.items():
    # Calculate lookback window for THIS YEAR
    start_date = beach_df['date'].min() - pd.Timedelta(days=7)  # 7-day lookback
    end_date = beach_df['date'].max()
    
    # Filter weather data
    mask = (weather_data['date'] >= start_date) & (weather_data['date'] <= end_date)
    annual_weather[year] = weather_data.loc[mask].copy()
    
    print(f"{year}: Weather from {start_date.date()} to {end_date.date()} ({len(annual_weather[year])} rows)")

2008: Weather from 2008-08-16 to 2008-08-31 (16 rows)
2009: Weather from 2009-05-24 to 2009-09-06 (106 rows)
2010: Weather from 2010-05-23 to 2010-09-07 (108 rows)
2011: Weather from 2011-05-05 to 2011-09-05 (124 rows)
2012: Weather from 2012-05-15 to 2012-09-03 (110 rows)
2013: Weather from 2013-05-14 to 2013-09-02 (110 rows)
2014: Weather from 2014-05-13 to 2014-09-01 (105 rows)
2015: Weather from 2015-05-12 to 2015-09-07 (116 rows)
2016: Weather from 2016-05-17 to 2016-09-06 (110 rows)
2017: Weather from 2017-05-16 to 2017-09-04 (102 rows)
2018: Weather from 2018-05-15 to 2018-09-03 (105 rows)
2019: Weather from 2019-05-14 to 2019-09-01 (110 rows)
2020: Weather from 2020-06-09 to 2020-09-07 (90 rows)
2021: Weather from 2021-05-18 to 2021-09-06 (111 rows)
2022: Weather from 2022-05-17 to 2022-09-05 (112 rows)
2023: Weather from 2023-05-16 to 2023-09-10 (118 rows)
2024: Weather from 2024-05-14 to 2024-09-02 (110 rows)


Merge data

In [None]:
merged = beach_data.merge(
    weather_data,
    how="inner",
    left_index=True,
    right_index=True
)

merged.info()