# Data import and feature selection

Import only the station name, date/time, and total precipitation columns

In [90]:
import pandas as pd

# Read just the three columns we need. The timestamp is kept as text so that
# duplicate detection compares the raw strings.
raw_csv_path = "../data/raw/weather_data/climate_data/daily/31688.csv"
wanted_columns = ['Station Name', 'Date/Time', 'Total Precip (mm)']
weather_data = pd.read_csv(raw_csv_path, usecols=wanted_columns, dtype={'Date/Time': str})

# Normalise the timestamp text: trim surrounding whitespace, then keep only
# the first 10 characters (the YYYY-MM-DD part, dropping any time/timezone).
trimmed_dates = weather_data['Date/Time'].str.strip()
weather_data['Date/Time'] = trimmed_dates.str[:10]

# Collapse to one row per (station, date), keeping the first occurrence,
# and order the result by the same key.
dedup_key = ['Station Name', 'Date/Time']
weather_clean = weather_data.drop_duplicates(dedup_key, keep='first')
weather_clean = weather_clean.sort_values(dedup_key)

# Parse the dates only after deduplication; unparseable strings become NaT.
weather_clean['date'] = pd.to_datetime(weather_clean['Date/Time'], errors='coerce')

# Sanity check: no (station, parsed date) pair may appear twice.
assert not weather_clean.duplicated(['Station Name', 'date']).any()
print(f"Cleaned data: {len(weather_clean)} rows ({len(weather_data)-len(weather_clean)} duplicates removed)")

Cleaned data: 6575 rows (72325 duplicates removed)


Check for duplicates

In [91]:
# Rows that share both station and timestamp are genuine duplicates.
station_date_key = ['Station Name', 'Date/Time']
dupes = weather_clean.duplicated(subset=station_date_key, keep=False)
n_true_dupes = dupes.sum()
print(f"Found {n_true_dupes} true duplicates:")
true_dupe_rows = weather_clean[dupes].sort_values(station_date_key)
print(true_dupe_rows.head(10))

# Rows that share a timestamp regardless of station (date/time duplicates).
dt_dupes = weather_clean.duplicated(subset=['Date/Time'], keep=False)
n_dt_dupes = dt_dupes.sum()
print(f"Found {n_dt_dupes} date/time duplicates:")
dt_dupe_rows = weather_clean[dt_dupes].sort_values(station_date_key)
print(dt_dupe_rows.head(10))

Found 0 true duplicates:
Empty DataFrame
Columns: [Station Name, Date/Time, Total Precip (mm), date]
Index: []
Found 0 date/time duplicates:
Empty DataFrame
Columns: [Station Name, Date/Time, Total Precip (mm), date]
Index: []


Set date/time as index

In [93]:
# Discard rows without a timestamp, then use the timestamp string as a
# sorted index.
weather_clean = weather_clean.dropna(subset=['Date/Time'])
weather_clean = weather_clean.set_index('Date/Time').sort_index()

Data Cleaning 

In [95]:
# Drop rows with no precipitation reading. Reassigning (rather than using
# inplace=True) avoids mutating a frame that earlier cells already displayed.
weather_clean = weather_clean.dropna(subset=["Total Precip (mm)"])

In [96]:
# Summarise the cleaned frame: index range, column dtypes and non-null counts.
weather_clean.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6423 entries, 2007-01-01 to 2024-12-31
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Station Name       6423 non-null   object        
 1   Total Precip (mm)  6423 non-null   float64       
 2   date               6423 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 200.7+ KB


Write to file

In [98]:
import re

# Build a filesystem-safe slug from the station name.
# Assumes the frame holds a single station, so the first row is representative.
stn_name = weather_clean.iloc[0]['Station Name']
stn_name = stn_name.lower().replace(" ", "_")
# Raw string (not an f-string): strip everything except a-z, 0-9 and underscore.
stn_name = re.sub(r'[^a-z0-9_]', "", stn_name)

print(stn_name)
# Write to file. Forward slashes, as used for the CSV read path, resolve on
# both Windows and POSIX; backslashes would become part of the file name on POSIX.
weather_clean.to_parquet(f"../data/cleaned/cleaned_{stn_name}.parquet")

toronto_city


# Data Exploration

Rainfall exploration

In [None]:
# Boolean mask of days with measurable rainfall (11892 entries in Toronto City)
precip_mm = weather_data['Total Precip (mm)']
has_precip = precip_mm.gt(0)
weather_data[has_precip]

Plot date vs. precipitation

In [None]:
# 'Date/Time' is loaded as text, so parse it once up front; the .dt accessor
# only works on datetime-like values. Unparseable strings become NaT and are
# excluded by the year mask below.
obs_dates = pd.to_datetime(weather_data['Date/Time'], errors='coerce')
precip_mm = weather_data['Total Precip (mm)']

# One scatter plot of daily precipitation per calendar year.
# NOTE(review): the cleaned index starts at 2007-01-01, so 2007 is not
# plotted here; confirm that starting at 2008 is intended.
for i in range(2008, 2025):
    plt.figure(figsize=(10,5))
    mask = obs_dates.dt.year == i
    plt.scatter(obs_dates.loc[mask], precip_mm.loc[mask])
    plt.xlabel('Date')
    plt.ylabel('Precipitation (mm)')
    plt.title(f'{i}')
    plt.show()

In [5]:
# Inspect ecoli anomaly 