## 1. Load Data and Explore Variables

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Read the daily parquet file into a DataFrame.
data_path = Path('../data/processed/daily/all_locations_daily.parquet')
daily = pd.read_parquet(data_path)

# Quick orientation: dimensions first, then the full list of column names.
column_names = daily.columns.tolist()
print(f"Dataset shape: {daily.shape}")
print(f"Total variables: {len(column_names)}")
print(f"\nAll columns:")
print(column_names)

Dataset shape: (10965, 57)
Total variables: 57

All columns:
['date', 'location_code', 'location_name', 'weather_code', 'temperature_2m_mean', 'temperature_2m_max', 'temperature_2m_min', 'apparent_temperature_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'sunshine_duration', 'daylight_duration', 'sunset', 'sunrise', 'precipitation_sum', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 'et0_fao_evapotranspiration', 'shortwave_radiation_sum', 'wind_direction_10m_dominant', 'wind_gusts_10m_max', 'wind_speed_10m_max', 'cloud_cover_mean', 'cloud_cover_max', 'cloud_cover_min', 'dew_point_2m_mean', 'dew_point_2m_max', 'dew_point_2m_min', 'pressure_msl_min', 'pressure_msl_max', 'pressure_msl_mean', 'snowfall_water_equivalent_sum', 'relative_humidity_2m_min', 'relative_humidity_2m_max', 'et0_fao_evapotranspiration_sum', 'relative_humidity_2m_mean', 'surface_pressure_mean', 'surface_pressure_max', 'surface_pressure_min', 'winddirection_10m_dominant', 'wind_gusts_10m_mean', 'wi

## 2. Categorize Variables by Relevance

In [2]:
# Variable catalogue for the daily dataset, grouped by what we intend to do
# with each column. Element order inside every list is significant: later
# cells build the output column order from these lists.

# CATEGORY 1: Essential daily summaries - KEEP
essential_vars = [
    'weather_code',  # daily weather condition
    # Actual and feels-like temperature, each as max / min / mean
    # (max matters for hot days, min for cold nights, feels-like is key).
    *(
        f'{family}_{stat}'
        for family in ('temperature_2m', 'apparent_temperature')
        for stat in ('max', 'min', 'mean')
    ),
    # Precipitation: totals and number of wet hours (tourists avoid rainy days).
    'precipitation_sum',
    'rain_sum',
    'precipitation_hours',
    # Wind: peak speed and gusts (safety) plus the dominant direction.
    'wind_speed_10m_max',
    'wind_gusts_10m_max',
    'wind_direction_10m_dominant',
    # Sunshine and daylight, including sunrise / sunset times.
    'sunshine_duration',
    'daylight_duration',
    'sunrise',
    'sunset',
    'uv_index_max',  # UV index (sun protection)
    # Daily means describing general atmospheric conditions.
    'cloud_cover_mean',
    'relative_humidity_2m_mean',
    'pressure_msl_mean',
    'visibility_mean',
]

# CATEGORY 2: Redundant variables - REMOVE
# A representative of each group is already in the essential list, so the
# remaining statistics of that group carry little extra information.
redundant_vars = [
    # Humidity: the mean is kept, max/min are not needed.
    'relative_humidity_2m_max',
    'relative_humidity_2m_min',
    # Dew point: redundant with humidity and temperature.
    *(f'dew_point_2m_{stat}' for stat in ('mean', 'max', 'min')),
    # Pressure: the mean sea-level pressure is enough.
    'pressure_msl_max',
    'pressure_msl_min',
    *(f'surface_pressure_{stat}' for stat in ('mean', 'max', 'min')),
    # Cloud cover: the mean is enough.
    'cloud_cover_max',
    'cloud_cover_min',
    # Wind: the max is what matters, mean/min are dropped.
    'wind_speed_10m_mean',
    'wind_speed_10m_min',
    'wind_gusts_10m_mean',
    'wind_gusts_10m_min',
    'winddirection_10m_dominant',  # duplicate of wind_direction_10m_dominant
    # Visibility: the mean is enough.
    'visibility_max',
    'visibility_min',
    # Wet bulb temperature: redundant with temperature and humidity.
    *(f'wet_bulb_temperature_2m_{stat}' for stat in ('mean', 'max', 'min')),
]

# CATEGORY 3: Rare / not applicable in SA - REMOVE
rare_vars = [
    'snowfall_sum',                   # rare in most SA locations
    'showers_sum',                    # similar to rain_sum
    'snowfall_water_equivalent_sum',  # rare
]

# CATEGORY 4: Optional / advanced - EVALUATE
optional_vars = [
    # Precipitation probability: might be useful for planning.
    *(f'precipitation_probability_{stat}' for stat in ('max', 'min', 'mean')),
    'shortwave_radiation_sum',         # related to solar energy / heat
    'et0_fao_evapotranspiration',      # evapotranspiration
    'et0_fao_evapotranspiration_sum',
    'vapour_pressure_deficit_max',     # humidity-related
    'uv_index_clear_sky_max',          # UV on clear days
    # CAPE: atmospheric instability (storms).
    *(f'cape_{stat}' for stat in ('mean', 'max', 'min')),
    'updraft_max',                     # storm-related
    'leaf_wetness_probability_mean',   # agricultural
    'growing_degree_days_base_0_limit_50',  # agricultural
]

# CATEGORY 5: Metadata - always keep
metadata_vars = ['date', 'location_code', 'location_name']

# Report the size of every category.
print(f"Essential variables: {len(essential_vars)}")
print(f"Redundant variables (to remove): {len(redundant_vars)}")
print(f"Rare variables (to remove): {len(rare_vars)}")
print(f"Optional variables: {len(optional_vars)}")
print(f"Metadata variables: {len(metadata_vars)}")

Essential variables: 22
Redundant variables (to remove): 22
Rare variables (to remove): 3
Optional variables: 14
Metadata variables: 3


## 3. Check Which Variables Exist in Our Data

In [3]:
# Split the essential list into columns that are present in `daily` and
# columns that are absent, keeping the original ordering in both halves.
available_essential = []
missing_essential = []
for name in essential_vars:
    if name in daily.columns:
        available_essential.append(name)
    else:
        missing_essential.append(name)

print(f"Available essential vars: {len(available_essential)}")
print(available_essential)
print(f"\nMissing essential vars: {len(missing_essential)}")
print(missing_essential)

Available essential vars: 20
['weather_code', 'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean', 'precipitation_sum', 'rain_sum', 'precipitation_hours', 'wind_speed_10m_max', 'wind_gusts_10m_max', 'wind_direction_10m_dominant', 'sunshine_duration', 'daylight_duration', 'sunrise', 'sunset', 'cloud_cover_mean', 'relative_humidity_2m_mean', 'pressure_msl_mean']

Missing essential vars: 2
['uv_index_max', 'visibility_mean']


## 4. Select Final Variables

In [4]:
# Final selection = metadata columns, then the essential columns that exist,
# then a short list of optional columns when the data provides them.
selected_vars = [*metadata_vars, *available_essential]

useful_optional = [
    'precipitation_probability_max',
    'shortwave_radiation_sum',
    'uv_index_clear_sky_max',
    'cape_max'  # Storm indicator
]

for candidate in useful_optional:
    # Skip anything the dataset lacks or that is already selected.
    if candidate not in daily.columns or candidate in selected_vars:
        continue
    selected_vars.append(candidate)

# Independent copy restricted to the chosen columns.
daily_filtered = daily.loc[:, selected_vars].copy()

# Report what was kept and how much was dropped.
n_original = len(daily.columns)
n_selected = len(selected_vars)

print(f"\n{'='*80}")
print(f"FINAL DAILY VARIABLE SELECTION")
print(f"{'='*80}")
print(f"Original variables: {n_original}")
print(f"Selected variables: {n_selected}")
print(f"Removed variables: {n_original - n_selected}")
print(f"\nSelected variables:")
for position, name in enumerate(selected_vars, 1):
    print(f"  {position:2d}. {name}")


FINAL DAILY VARIABLE SELECTION
Original variables: 57
Selected variables: 24
Removed variables: 33

Selected variables:
   1. date
   2. location_code
   3. location_name
   4. weather_code
   5. temperature_2m_max
   6. temperature_2m_min
   7. temperature_2m_mean
   8. apparent_temperature_max
   9. apparent_temperature_min
  10. apparent_temperature_mean
  11. precipitation_sum
  12. rain_sum
  13. precipitation_hours
  14. wind_speed_10m_max
  15. wind_gusts_10m_max
  16. wind_direction_10m_dominant
  17. sunshine_duration
  18. daylight_duration
  19. sunrise
  20. sunset
  21. cloud_cover_mean
  22. relative_humidity_2m_mean
  23. pressure_msl_mean
  24. shortwave_radiation_sum


## 5. Check for Missing Values

In [5]:
# Count nulls per selected column and keep only the columns that have any,
# largest count first.
null_counts = daily_filtered.isnull().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)

if missing.empty:
    print("\n✅ No missing values in selected variables!")
else:
    print("\nVariables with missing values:")
    print(missing)
    print(f"\nPercentage missing:")
    # Share of rows affected, as a percentage rounded to two decimals.
    print((missing / len(daily_filtered) * 100).round(2))


✅ No missing values in selected variables!


## 6. Summary Statistics

In [6]:
# Descriptive statistics for every numeric column of the filtered frame.
# NOTE(review): the numeric set includes weather_code, sunrise and sunset;
# their mean/std are probably not meaningful - consider excluding them.
numeric_vars = daily_filtered.select_dtypes(include="number").columns
daily_filtered.loc[:, numeric_vars].describe()

Unnamed: 0,weather_code,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,precipitation_sum,rain_sum,precipitation_hours,...,wind_gusts_10m_max,wind_direction_10m_dominant,sunshine_duration,daylight_duration,sunrise,sunset,cloud_cover_mean,relative_humidity_2m_mean,pressure_msl_mean,shortwave_radiation_sum
count,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0,...,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0,10965.0
mean,25.264751,23.178824,13.004907,17.683614,22.597709,11.67596,16.622628,2.076689,2.076516,3.37036,...,44.12221,175.840952,35114.343319,43586.471129,1609388000.0,1609432000.0,40.360108,67.925687,1017.87372,18.574563
std,26.575291,5.044426,4.621199,4.295161,6.498547,6.051134,5.693744,5.723801,5.723349,5.473236,...,15.454209,105.420096,10087.5797,5026.040621,18232690.0,18233050.0,30.948481,15.455983,5.533442,7.017364
min,0.0,7.019,-4.018,2.332,1.079331,-10.080518,-3.221737,0.0,0.0,0.0,...,14.04,1.9e-05,0.0,35460.52,1577848000.0,1577897000.0,0.0,9.982326,1001.9749,0.92
25%,2.0,19.7105,9.9655,14.644916,17.999456,7.608822,12.588473,0.0,0.0,0.0,...,33.12,82.959076,32554.273,38901.83,1593583000.0,1593618000.0,11.791667,59.004864,1013.84155,13.17
50%,3.0,23.0595,13.481999,17.887167,22.393183,11.974827,16.766405,0.0,0.0,0.0,...,41.399998,174.87569,36308.363,43449.2,1609385000.0,1609435000.0,37.208332,71.34394,1017.35834,17.95
75%,51.0,26.612,16.362,20.76525,27.165886,15.965113,20.614866,1.2,1.2,5.0,...,52.56,263.07663,41879.8,48268.91,1625202000.0,1625239000.0,65.75,79.36569,1021.45,24.02
max,73.0,41.0605,24.487999,30.695915,43.41577,29.272266,32.194813,91.9,91.9,24.0,...,138.59999,360.0,48653.832,52045.805,1640922000.0,1640974000.0,100.0,98.414635,1038.7334,34.46


## 7. Save Filtered Dataset

In [7]:
# Persist the filtered frame as parquet (without the index), then report
# where it went, its shape, and its size on disk.
output_path = Path('../data/processed/daily/daily_filtered.parquet')
daily_filtered.to_parquet(output_path, index=False)

size_mb = output_path.stat().st_size / (1024**2)

print(f"\n✅ Filtered daily data saved to: {output_path}")
print(f"   Shape: {daily_filtered.shape}")
print(f"   File size: {size_mb:.2f} MB")


✅ Filtered daily data saved to: ..\data\processed\daily\daily_filtered.parquet
   Shape: (10965, 24)
   File size: 1.15 MB


## 8. Export Variable List

In [8]:
# Save selected variable names for documentation.
#
# The category of each variable is derived from the list it belongs to rather
# than from its position: a positional rule (first 3 = metadata, rest =
# essential) mislabels the optional variables appended during selection and
# breaks silently if the number of metadata columns ever changes.
selected_vars_df = pd.DataFrame({
    'variable': selected_vars,
    'category': [
        'metadata' if name in metadata_vars
        else 'essential' if name in essential_vars
        else 'optional'  # came from the optional additions
        for name in selected_vars
    ],
})

selected_vars_df.to_csv('../docs/selected_daily_variables.csv', index=False)
print("\n✅ Variable list saved to: docs/selected_daily_variables.csv")


✅ Variable list saved to: docs/selected_daily_variables.csv
