# Data import and feature selection

Original import: all selected weather features

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Open the daily climate .csv for one station as a DataFrame.
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
filename = "31688.csv"
weather_data = pd.read_csv(f"../data/raw/weather_data/climate_data/daily/{filename}")

# Select only relevant features, by name rather than by position so the
# selection does not depend on the file's column order. Year/Month/Day are
# not selected because 'Date/Time' already carries that information.
feature_columns = [
    "Station Name",
    "Date/Time",
    "Max Temp (°C)",
    "Min Temp (°C)",
    "Mean Temp (°C)",
    "Heat Deg Days (°C)",
    "Cool Deg Days (°C)",
    "Total Rain (mm)",
    "Total Precip (mm)",
    "Spd of Max Gust (km/h)",
]
# .copy() gives an independent frame, so the column assignment below never
# writes through to (or warns about) the full raw frame.
weather_data = weather_data.loc[:, feature_columns].copy()

# Convert datetime str column to datetime object
weather_data['Date/Time'] = pd.to_datetime(weather_data['Date/Time'])

weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78900 entries, 0 to 78899
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Station Name            78900 non-null  object        
 1   Date/Time               78900 non-null  datetime64[ns]
 2   Max Temp (°C)           78072 non-null  float64       
 3   Min Temp (°C)           78348 non-null  float64       
 4   Mean Temp (°C)          77976 non-null  float64       
 5   Heat Deg Days (°C)      77976 non-null  float64       
 6   Cool Deg Days (°C)      77976 non-null  float64       
 7   Total Rain (mm)         792 non-null    float64       
 8   Total Precip (mm)       77076 non-null  float64       
 9   Spd of Max Gust (km/h)  0 non-null      float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 6.0+ MB


  weather_data = pd.read_csv(f"..\\data\\raw\\weather_data\\climate_data\\daily\\{filename}")


Import only datetime and precipitation

In [78]:
import pandas as pd

# Read just the station, date and precipitation columns. The date column is
# kept as plain text so duplicates are compared on the raw strings.
weather = pd.read_csv(
    "../data/raw/weather_data/climate_data/daily/31688.csv",
    usecols=['Station Name', 'Date/Time', 'Total Precip (mm)'],
    dtype={'Date/Time': str},
)

# Normalise the date text: trim surrounding whitespace, then keep the first
# ten characters (the YYYY-MM-DD part, dropping any trailing time component).
trimmed_dates = weather['Date/Time'].str.strip()
weather['Date/Time'] = trimmed_dates.str[:10]

# One row per (station, date): keep the first occurrence and order the result.
dedupe_keys = ['Station Name', 'Date/Time']
deduplicated = weather.drop_duplicates(dedupe_keys, keep='first')
weather_clean = deduplicated.sort_values(dedupe_keys)

# Parse the dates only after de-duplication; unparseable values become NaT.
weather_clean['date'] = pd.to_datetime(weather_clean['Date/Time'], errors='coerce')

# Sanity check: no (station, parsed date) pair may remain more than once.
assert not weather_clean.duplicated(['Station Name', 'date']).any()
print(f"Cleaned data: {len(weather_clean)} rows ({len(weather)-len(weather_clean)} duplicates removed)")

Cleaned data: 6575 rows (72325 duplicates removed)


Check for duplicates

In [80]:
# Flag rows that share both station and date string. keep=False marks every
# member of a repeated group, not only the later occurrences.
dupe_keys = ['Station Name', 'Date/Time']
dupes = weather_clean.duplicated(subset=dupe_keys, keep=False)
print(f"Found {dupes.sum()} true duplicates:")
# Show up to ten of the flagged rows, ordered so repeated keys sit together.
flagged_rows = weather_clean.loc[dupes].sort_values(by=dupe_keys)
print(flagged_rows.head(10))


Found 0 true duplicates:
Empty DataFrame
Columns: [Station Name, Date/Time, Total Precip (mm), date]
Index: []


Data Cleaning 

In [38]:
# Keep only the rows that have a precipitation reading. Reassigning instead of
# using inplace=True makes the data lineage explicit and the cell safe to re-run.
# NOTE(review): the duplicate check in this notebook reports many repeated
# station/date rows in the same raw file (31688.csv); this step does not remove
# them - confirm whether weather_data should be de-duplicated before it is saved.
weather_data = weather_data.dropna(subset=["Total Precip (mm)"])

In [39]:
# Print the column names, non-null counts and dtypes of weather_data.
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 77076 entries, 0 to 78899
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Station Name            77076 non-null  object        
 1   Date/Time               77076 non-null  datetime64[ns]
 2   Max Temp (°C)           77040 non-null  float64       
 3   Min Temp (°C)           77064 non-null  float64       
 4   Mean Temp (°C)          77040 non-null  float64       
 5   Heat Deg Days (°C)      77040 non-null  float64       
 6   Cool Deg Days (°C)      77040 non-null  float64       
 7   Total Rain (mm)         792 non-null    float64       
 8   Total Precip (mm)       77076 non-null  float64       
 9   Spd of Max Gust (km/h)  0 non-null      float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 6.5+ MB


Write to file

In [40]:
import re

# Get and format the station name for use in the output file name.
# Assumes the frame holds a single station, so the first row is representative.
stn_name = weather_data['Station Name'].iloc[0]
stn_name = stn_name.lower().replace(" ", "_")
# Strip everything except lowercase letters, digits and underscores
# (raw string: the pattern has no placeholders, so an f-string was misleading).
stn_name = re.sub(r'[^a-z0-9_]', "", stn_name)

print(stn_name)
# Write to file; forward slashes keep the relative path valid on every OS.
weather_data.to_parquet(f"../data/cleaned/cleaned_{stn_name}.parquet")

toronto_city


# Data Exploration

Rainfall exploration

In [None]:
# Boolean mask of the days with a positive precipitation total
# (author's note: 11892 entries in Toronto City).
has_precip = weather_data['Total Precip (mm)'].gt(0)
# Last expression of the cell: display the rows selected by the mask.
weather_data[has_precip]

Plot date vs. precipitation

In [None]:
# Draw one scatter plot of daily precipitation per calendar year.
# The years come from the data itself instead of a hard-coded range(2008, 2025),
# so every year present in the frame is plotted (including any after 2024) and
# years without rows are skipped. The year series is computed once, not per plot.
obs_year = weather_data['Date/Time'].dt.year
for year, yearly in weather_data.groupby(obs_year):
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(yearly['Date/Time'], yearly['Total Precip (mm)'])
    ax.set_xlabel('Date')
    ax.set_ylabel('Precipitation (mm)')
    # int() keeps the title as e.g. "2008" even if the year key is a float.
    ax.set_title(f'{int(year)}')
    plt.show()
    # Release the figure so a long run of years does not accumulate open figures.
    plt.close(fig)

In [5]:
# Inspect ecoli anomaly 