In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import pandas

from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer, StandardScaler

from datetime import datetime
from scipy.io import readsav
import pyreadstat
from powderalert.ml_logic.preprocessor import define_X, preprocess
from powderalert.ml_logic.data import clean_data

# API request

In [2]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

# Open-Meteo client backed by an on-disk HTTP cache ('.cache') with automatic
# retries. expire_after=-1 is requests-cache's "never expire" setting, which
# suits an archive endpoint whose historical values are not expected to change.
cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Every hourly variable to download. The response returns the variables
# positionally, so the order of this list is what later code must index by.
hourly_variables = [
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "precipitation",
    "rain",
    "snowfall",
    "snow_depth",
    "weather_code",
    "pressure_msl",
    "surface_pressure",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
    "et0_fao_evapotranspiration",
    "vapour_pressure_deficit",
    "wind_speed_10m",
    "wind_speed_100m",
    "wind_direction_10m",
    "wind_direction_100m",
    "wind_gusts_10m",
    "soil_temperature_0_to_7cm",
    "soil_temperature_7_to_28cm",
    "soil_temperature_28_to_100cm",
    "soil_temperature_100_to_255cm",
    "soil_moisture_0_to_7cm",
    "soil_moisture_7_to_28cm",
    "soil_moisture_28_to_100cm",
    "soil_moisture_100_to_255cm",
    "sunshine_duration",
]

# Historical-archive endpoint and the query for a single location / date range.
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 47.26580883196723,
    "longitude": 11.84457426992035,
    "start_date": "2009-01-01",
    "end_date": "2024-01-01",
    "hourly": hourly_variables,
    "models": "best_match",
}
responses = openmeteo.weather_api(url, params=params)

# Only one location/model was requested, so the first response is the one we want.
# Iterate over `responses` instead if more locations or weather models are added.
response = responses[0]
for summary_line in (
    f"Coordinates {response.Latitude()}°N {response.Longitude()}°E",
    f"Elevation {response.Elevation()} m asl",
    f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}",
    f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s",
):
    print(summary_line)


Coordinates 47.27592086791992°N 12.058823585510254°E
Elevation 1818.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


In [3]:
# Process hourly data into a DataFrame with one row per timestamp.
hourly = response.Hourly()

# Timestamp column: evenly spaced UTC timestamps from the response's start time
# up to (but excluding) its end time, stepping by the response's interval.
hourly_data = {"date": pd.date_range(
    start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
    end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = hourly.Interval()),
    inclusive = "left"
)}

# The response exposes variables by position, in the order they were requested.
# Walking the request list itself (params["hourly"]) pairs every name with its
# index automatically, so the column names cannot drift out of sync with the
# request the way hand-maintained Variables(0) ... Variables(29) lines can.
for position, variable_name in enumerate(params["hourly"]):
    hourly_data[variable_name] = hourly.Variables(position).ValuesAsNumpy()

hourly_dataframe = pd.DataFrame(data = hourly_data)
hourly_dataframe

Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snowfall,snow_depth,weather_code,pressure_msl,...,wind_gusts_10m,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,sunshine_duration
0,2009-01-01 00:00:00+00:00,-10.842501,72.968170,-14.742500,0.0,0.0,0.00,0.92,3.0,1025.199951,...,29.879999,-0.1425,0.2575,1.1575,2.6575,0.21,0.214,0.209,0.269,0.0
1,2009-01-01 01:00:00+00:00,-10.642500,73.911522,-14.392500,0.0,0.0,0.00,0.92,3.0,1025.199951,...,25.559999,-0.1425,0.2575,1.1075,2.6575,0.21,0.214,0.209,0.269,0.0
2,2009-01-01 02:00:00+00:00,-10.492500,74.547844,-14.142500,0.1,0.0,0.07,0.92,71.0,1025.000000,...,20.160000,-0.1925,0.2575,1.1075,2.6575,0.21,0.214,0.209,0.269,0.0
3,2009-01-01 03:00:00+00:00,-10.442499,75.476273,-13.942499,0.1,0.0,0.07,0.92,71.0,1025.300049,...,15.119999,-0.1925,0.2575,1.1075,2.6575,0.21,0.214,0.209,0.269,0.0
4,2009-01-01 04:00:00+00:00,-10.542500,78.596596,-13.542500,0.2,0.0,0.14,0.93,71.0,1025.500000,...,14.759999,-0.1925,0.2575,1.1075,2.6575,0.21,0.214,0.209,0.269,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131491,2024-01-01 19:00:00+00:00,-13.821000,86.608582,-15.571000,0.0,0.0,0.00,1.23,3.0,1017.099976,...,26.639999,0.6790,0.9290,1.7790,3.6290,0.24,0.246,0.238,0.267,0.0
131492,2024-01-01 20:00:00+00:00,-14.571000,86.166550,-16.371000,0.0,0.0,0.00,1.23,1.0,1017.400024,...,25.919998,0.6790,0.9290,1.7790,3.6290,0.24,0.246,0.238,0.267,0.0
131493,2024-01-01 21:00:00+00:00,-14.571000,84.035316,-16.671001,0.0,0.0,0.00,1.23,3.0,1017.099976,...,25.559999,0.6790,0.9290,1.7790,3.6290,0.24,0.246,0.238,0.267,0.0
131494,2024-01-01 22:00:00+00:00,-13.721000,80.381676,-16.371000,0.0,0.0,0.00,1.23,3.0,1016.900024,...,26.280001,0.6790,0.9290,1.7790,3.6290,0.24,0.246,0.238,0.267,0.0


In [4]:
# Column(s) the model should predict; handed to define_X so they are kept out of the features.
target = ["snowfall"]

In [5]:
# train_df is just another name for the raw hourly frame (no copy is made).
# NOTE(review): if clean_data mutates its argument in place, hourly_dataframe
# changes as well — confirm against the helper's implementation.
train_df = hourly_dataframe

# Project cleaning step, followed by feature selection: define_X receives the
# cleaned frame plus the target list and returns the feature matrix X.
clean_train_df = clean_data(hourly_dataframe)
X = define_X(clean_train_df, target)


✅ Data cleaned


In [6]:
# Summary statistics of the feature matrix before preprocessing; useful as the
# reference to compare against X_preprocessed.describe() further down.
X.describe()

Unnamed: 0,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snow_depth,weather_code,pressure_msl,surface_pressure,cloud_cover,...,wind_gusts_10m,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,sunshine_duration
count,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,...,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0
mean,2.178336,79.138184,-1.472008,0.196152,0.1138,0.482332,22.706781,1017.929688,816.054932,66.579666,...,27.618757,3.625005,3.60484,3.603981,3.628445,0.255772,0.255304,0.241331,0.282237,1213.196411
std,8.31859,17.163219,7.913401,0.503262,0.416948,0.532701,29.793755,7.943738,8.121846,36.923317,...,13.763321,5.212834,4.629919,3.761255,2.889373,0.045898,0.044622,0.048923,0.024311,1638.633545
min,-27.6425,7.492445,-33.471001,0.0,0.0,0.0,0.0,978.299988,776.987122,0.0,...,3.96,-6.821,-1.921,-0.5425,0.1575,0.121,0.135,0.116,0.241,0.0
25%,-3.771,67.887783,-7.071,0.0,0.0,0.0,1.0,1013.599976,811.240417,36.0,...,18.0,-0.371,-0.221,0.179,0.7575,0.216,0.216,0.197,0.262,0.0
50%,2.029,83.385448,-1.0425,0.0,0.0,0.22,3.0,1018.5,817.260681,84.0,...,24.48,0.729,1.229,2.079,2.9075,0.255,0.255,0.239,0.281,0.0
75%,8.2575,93.331192,5.0575,0.2,0.0,0.95,51.0,1022.799988,821.999466,100.0,...,33.839996,7.5075,7.679,7.229,6.4075,0.291,0.29,0.282,0.299,3600.0
max,27.328999,100.0,16.057501,11.2,11.2,1.9,75.0,1047.599976,837.25531,100.0,...,153.0,23.078999,16.7575,12.4575,9.379001,0.401,0.389,0.361,0.353,3600.0


In [7]:
# Run the project's preprocessing pipeline on the feature matrix.
# NOTE(review): the describe() output recorded for X_preprocessed does not line
# up with its column labels — e.g. `cloud_cover` reports the mean/min/max that
# X.describe() shows for `temperature_2m`, and `precipitation` reports the
# `pressure_msl` values. It looks like preprocess() attaches column names in a
# different order than the data columns; verify and fix in the preprocessor
# before trusting any labelled output.
X_preprocessed = preprocess(X)

# Select the label column(s) through `target` rather than repeating the literal
# 'snowfall', so y always matches what define_X was told to treat as the target.
y = clean_train_df[target]

✅ Processed data, with shape (131496, 29)


In [9]:
# Summary statistics after preprocessing.
# NOTE(review): compare with X.describe() — the recorded statistics appear
# shifted relative to the labels (`cloud_cover` carries the `temperature_2m`
# numbers, `precipitation` carries the `pressure_msl` numbers, ...), which
# suggests preprocess() mislabels its output columns. Confirm in the preprocessor.
X_preprocessed.describe()

Unnamed: 0,weather_code_encoded,cloud_cover,cloud_cover_high,cloud_cover_low,cloud_cover_mid,dew_point_2m,et0_fao_evapotranspiration,precipitation,pressure_msl,rain,...,soil_temperature_7_to_28cm,sunshine_duration,surface_pressure,temperature_2m,vapour_pressure_deficit,wind_direction_100m,wind_direction_10m,wind_gusts_10m,wind_speed_100m,wind_speed_10m
count,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,...,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0
mean,3.764244,2.178336,79.138184,-1.472008,0.196152,0.1138,0.482332,1017.929688,816.054905,66.579668,...,27.618758,3.625005,3.60484,3.603982,3.628445,0.255772,0.255304,0.241331,0.282237,1213.196306
std,3.595433,8.31859,17.16322,7.913401,0.503262,0.416948,0.532701,7.943738,8.121846,36.923318,...,13.763321,5.212834,4.629918,3.761255,2.889373,0.045898,0.044622,0.048923,0.024311,1638.633538
min,0.0,-27.6425,7.492445,-33.471001,0.0,0.0,0.0,978.299988,776.987122,0.0,...,3.96,-6.821,-1.921,-0.5425,0.1575,0.121,0.135,0.116,0.241,0.0
25%,1.0,-3.771,67.887783,-7.071,0.0,0.0,0.0,1013.599976,811.240417,36.0,...,18.0,-0.371,-0.221,0.179,0.7575,0.216,0.216,0.197,0.262,0.0
50%,3.0,2.029,83.385448,-1.0425,0.0,0.0,0.22,1018.5,817.260681,84.0,...,24.48,0.729,1.229,2.079,2.9075,0.255,0.255,0.239,0.281,0.0
75%,4.0,8.2575,93.331192,5.0575,0.2,0.0,0.95,1022.799988,821.999466,100.0,...,33.839996,7.5075,7.679,7.229,6.4075,0.291,0.29,0.282,0.299,3600.0
max,12.0,27.328999,100.0,16.057501,11.2,11.2,1.9,1047.599976,837.25531,100.0,...,153.0,23.078999,16.7575,12.4575,9.379001,0.401,0.389,0.361,0.353,3600.0


In [10]:
# Turn the date index into a regular column so y shares the plain positional
# index used for the join with X_preprocessed.
# Built from clean_train_df instead of `y = y.reset_index()`: the self-referential
# form is not idempotent — running the cell a second time would reset the new
# integer index as well and add a stray `index` column.
y = clean_train_df[target].reset_index()

In [11]:
# Quick look at the label frame: expect a `date` column next to `snowfall`.
y.head(2)

Unnamed: 0,date,snowfall
0,2009-01-01 00:00:00,0.0
1,2009-01-01 01:00:00,0.0


In [12]:
# Attach the date and label columns to the preprocessed features. The join is
# index-based (left join, pandas' default), so it relies on both frames sharing
# the same row index.
train_df_merged = X_preprocessed.join(y, how="left")

# Peek at the last rows to check that the final timestamps made it through.
train_df_merged.tail(2)

Unnamed: 0,weather_code_encoded,cloud_cover,cloud_cover_high,cloud_cover_low,cloud_cover_mid,dew_point_2m,et0_fao_evapotranspiration,precipitation,pressure_msl,rain,...,surface_pressure,temperature_2m,vapour_pressure_deficit,wind_direction_100m,wind_direction_10m,wind_gusts_10m,wind_speed_100m,wind_speed_10m,date,snowfall
131494,3.0,-13.721,80.381676,-16.371,0.0,0.0,1.23,1016.900024,804.648254,100.0,...,0.929,1.779,3.629,0.24,0.246,0.238,0.267,0.0,2024-01-01 22:00:00,0.0
131495,1.0,-13.421,78.443703,-16.371,0.0,0.0,1.23,1015.700012,803.911438,40.0,...,0.929,1.779,3.629,0.24,0.246,0.238,0.267,0.0,2024-01-01 23:00:00,0.0


In [13]:
# Column dtypes and non-null counts of the merged frame (features + date + label).
train_df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131496 entries, 0 to 131495
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   weather_code_encoded           131496 non-null  float64       
 1   cloud_cover                    131496 non-null  float64       
 2   cloud_cover_high               131496 non-null  float64       
 3   cloud_cover_low                131496 non-null  float64       
 4   cloud_cover_mid                131496 non-null  float64       
 5   dew_point_2m                   131496 non-null  float64       
 6   et0_fao_evapotranspiration     131496 non-null  float64       
 7   precipitation                  131496 non-null  float64       
 8   pressure_msl                   131496 non-null  float64       
 9   rain                           131496 non-null  float64       
 10  relative_humidity_2m           131496 non-null  float64       
 11  

In [14]:
# Persist the merged dataset next to the notebook. With the default arguments
# the row index is written too, as an unnamed first column.
# NOTE(review): resolve the column-label mismatch visible in
# X_preprocessed.describe() before relying on this file — the labels are
# written exactly as they appear in train_df_merged.
train_df_merged.to_csv('combined_dataset.csv')