In [67]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import pandas

from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer, StandardScaler

from datetime import datetime
from scipy.io import readsav
import pyreadstat
from powderalert.ml_logic.preprocessor import define_X, preprocess
from powderalert.ml_logic.data import clean_data, time_features

# Import CSV Dataset

In [54]:
current_dir = os.getcwd()
relative_path = os.path.dirname(current_dir)
file_name = "openmeteo_api_NEW_train_dataset.csv"

file_path = os.path.join(relative_path, "raw_data", file_name)

df_hist_openmeteo = pd.read_csv(file_path)
df_hist_openmeteo['date'] = pd.to_datetime(df_hist_openmeteo['date'])

# Preprocess data

In [55]:
df_hist_openmeteo.head(2)

Unnamed: 0.1,Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snowfall,snow_depth,weather_code,...,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,sunshine_duration
0,0,2009-01-01 00:00:00,-10.842501,72.96817,-14.7425,0.0,0.0,0.0,0.92,3.0,...,95.0,24.0,0.001225,0.072607,6.989935,11.27553,191.88864,196.69933,29.88,-0.1425
1,1,2009-01-01 01:00:00,-10.6425,73.91152,-14.3925,0.0,0.0,0.0,0.92,3.0,...,95.0,46.0,0.0,0.0712,5.860375,8.557102,190.6196,202.24907,25.56,-0.1425


In [56]:
df_hist_openmeteo = df_hist_openmeteo.drop(columns='Unnamed: 0')
df_hist_openmeteo.tail(2)

Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snowfall,snow_depth,weather_code,pressure_msl,...,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,sunshine_duration
131494,2024-01-01 22:00:00,-13.721,80.381676,-16.371,0.0,0.0,0.0,1.23,3.0,1016.9,...,2.0,100.0,0.0,0.041773,6.763786,12.429127,154.7989,190.00792,26.28,0.679
131495,2024-01-01 23:00:00,-13.421,78.4437,-16.371,0.0,0.0,0.0,1.23,1.0,1015.7,...,0.0,40.0,0.0,0.047036,6.439876,12.605142,153.43501,181.63654,27.0,0.679


In [57]:
df_hist_openmeteo = clean_data(df_hist_openmeteo)
df_hist_openmeteo.tail(3)

✅ Data cleaned


Unnamed: 0_level_0,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snowfall,snow_depth,weather_code,pressure_msl,surface_pressure,...,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,sunshine_duration
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-01 21:00:00,-14.571,84.03532,-16.671001,0.0,0.0,0.0,1.23,3.0,1017.1,804.2009,...,24.0,92.0,0.0,0.031705,6.618519,11.874544,157.61983,194.03627,25.56,0.679
2024-01-01 22:00:00,-13.721,80.381676,-16.371,0.0,0.0,0.0,1.23,3.0,1016.9,804.64825,...,2.0,100.0,0.0,0.041773,6.763786,12.429127,154.7989,190.00792,26.28,0.679
2024-01-01 23:00:00,-13.421,78.4437,-16.371,0.0,0.0,0.0,1.23,1.0,1015.7,803.91144,...,0.0,40.0,0.0,0.047036,6.439876,12.605142,153.43501,181.63654,27.0,0.679


In [58]:
target = "snowfall"
X = define_X(df_hist_openmeteo,target)
X.head(3)

Unnamed: 0_level_0,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snow_depth,weather_code,pressure_msl,surface_pressure,cloud_cover,...,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,sunshine_duration
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-01 00:00:00,-10.842501,72.96817,-14.7425,0.0,0.0,0.92,3.0,1025.2,813.2572,96.0,...,95.0,24.0,0.001225,0.072607,6.989935,11.27553,191.88864,196.69933,29.88,-0.1425
2009-01-01 01:00:00,-10.6425,73.91152,-14.3925,0.0,0.0,0.92,3.0,1025.2,813.3975,95.0,...,95.0,46.0,0.0,0.0712,5.860375,8.557102,190.6196,202.24907,25.56,-0.1425
2009-01-01 02:00:00,-10.4925,74.547844,-14.1425,0.1,0.0,0.92,71.0,1025.0,813.344,91.0,...,89.0,51.0,0.0,0.070298,5.154416,6.696387,192.09474,216.25392,20.16,-0.1925


In [61]:
X_preprocessed = preprocess(X)
y = df_hist_openmeteo[['snowfall']]
X_preprocessed.head(2)

✅ Processed data, with shape (131496, 21)


Unnamed: 0,weather_code_encoded,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snow_depth,pressure_msl,surface_pressure,cloud_cover,...,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,sunshine_duration
0,3.0,-1.565276,-0.359492,-1.676971,-0.389763,-0.272938,0.821605,0.915229,-0.344468,0.796798,...,1.525737,-0.266834,-0.609045,-0.468303,0.915814,0.615276,-0.038465,0.000227,0.164295,-0.722739
1,3.0,-1.541233,-0.304528,-1.632742,-0.389763,-0.272938,0.821605,0.915229,-0.327194,0.769715,...,1.525737,0.286362,-0.619531,-0.473445,0.465664,0.069724,-0.051303,0.052709,-0.149584,-0.722739


In [62]:
y = y.reset_index()
y.head(2)

Unnamed: 0,date,snowfall
0,2009-01-01 00:00:00,0.0
1,2009-01-01 01:00:00,0.0


In [63]:
df = y.join(X_preprocessed)
print(df.shape)
df.tail(2)

(131496, 23)


Unnamed: 0,date,snowfall,weather_code_encoded,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snow_depth,pressure_msl,...,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,sunshine_duration
131494,2024-01-01 22:00:00,0.0,3.0,-1.911309,0.072451,-1.882762,-0.389763,-0.272938,1.403548,-0.129623,...,-0.925955,1.644208,-0.619531,-0.580998,0.825689,0.846788,-0.41365,-0.063053,-0.09727,-0.565147
131495,2024-01-01 23:00:00,0.0,1.0,-1.875245,-0.040464,-1.882762,-0.389763,-0.272938,1.403548,-0.280686,...,-0.978679,0.13549,-0.619531,-0.561763,0.696605,0.882111,-0.427446,-0.142219,-0.044957,-0.565147


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131496 entries, 0 to 131495
Data columns (total 23 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   date                        131496 non-null  datetime64[ns]
 1   snowfall                    131496 non-null  float64       
 2   weather_code_encoded        131496 non-null  float64       
 3   temperature_2m              131496 non-null  float64       
 4   relative_humidity_2m        131496 non-null  float64       
 5   dew_point_2m                131496 non-null  float64       
 6   precipitation               131496 non-null  float64       
 7   rain                        131496 non-null  float64       
 8   snow_depth                  131496 non-null  float64       
 9   pressure_msl                131496 non-null  float64       
 10  surface_pressure            131496 non-null  float64       
 11  cloud_cover                 131496 non-

# Engineer additional time features

In [65]:
df.head(2)

Unnamed: 0,date,snowfall,weather_code_encoded,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snow_depth,pressure_msl,...,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,sunshine_duration
0,2009-01-01 00:00:00,0.0,3.0,-1.565276,-0.359492,-1.676971,-0.389763,-0.272938,0.821605,0.915229,...,1.525737,-0.266834,-0.609045,-0.468303,0.915814,0.615276,-0.038465,0.000227,0.164295,-0.722739
1,2009-01-01 01:00:00,0.0,3.0,-1.541233,-0.304528,-1.632742,-0.389763,-0.272938,0.821605,0.915229,...,1.525737,0.286362,-0.619531,-0.473445,0.465664,0.069724,-0.051303,0.052709,-0.149584,-0.722739


In [70]:
df['hour_sin'] = np.sin(2 * np.pi * df['date'].dt.hour / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['date'].dt.hour / 24)
df['day_of_week_sin'] = np.sin(2 * np.pi * df['date'].dt.dayofweek / 7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['date'].dt.dayofweek / 7)
df['month_sin'] = np.sin(2 * np.pi * (df['date'].dt.month - 1) / 12)
df['month_cos'] = np.cos(2 * np.pi * (df['date'].dt.month - 1) / 12)
df = df.reset_index()

In [None]:
df.head(2)

Unnamed: 0,date,snowfall,weather_code_encoded,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snow_depth,pressure_msl,...,wind_direction_10m,wind_direction_100m,wind_gusts_10m,sunshine_duration,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,month_sin,month_cos
0,2009-01-01 00:00:00,0.0,3.0,-1.565276,-0.359492,-1.676971,-0.389763,-0.272938,0.821605,0.915229,...,-0.038465,0.000227,0.164295,-0.722739,0.0,1.0,0.433884,-0.900969,0.0,1.0
1,2009-01-01 01:00:00,0.0,3.0,-1.541233,-0.304528,-1.632742,-0.389763,-0.272938,0.821605,0.915229,...,-0.051303,0.052709,-0.149584,-0.722739,0.258819,0.965926,0.433884,-0.900969,0.0,1.0


# Save preprocessed DF as CSV

In [76]:
current_dir = os.getcwd()
relative_path = os.path.dirname(current_dir)
relative_path

'/home/anita/code/MadMax1995bb/powder_alert2.0'

In [78]:
file_path = os.path.join(relative_path, "raw_data")

df.to_csv(os.path.join(file_path,'historical_data_preprocessed.csv'))

# API Request Forecast

In [3]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://api.open-meteo.com/v1/forecast"
params = {
	"latitude": 47.26580883196723,
	"longitude": 11.84457426992035,
	"hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation_probability", "precipitation", "rain", "showers", "snowfall", "snow_depth", "weather_code", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high", "visibility", "evapotranspiration", "et0_fao_evapotranspiration", "vapour_pressure_deficit", "wind_speed_10m", "wind_speed_80m", "wind_speed_120m", "wind_speed_180m", "wind_direction_10m", "wind_direction_80m", "wind_direction_120m", "wind_direction_180m", "wind_gusts_10m", "temperature_80m", "temperature_120m", "temperature_180m", "soil_temperature_0cm", "soil_temperature_6cm", "soil_temperature_18cm", "soil_temperature_54cm", "soil_moisture_0_to_1cm", "soil_moisture_1_to_3cm", "soil_moisture_3_to_9cm", "soil_moisture_9_to_27cm", "soil_moisture_27_to_81cm"],
	"past_days": 2,
	"forecast_days": 3
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

Coordinates 47.2400016784668°N 11.84000015258789°E
Elevation 1818.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


# API request

In [2]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
	"latitude": 47.26580883196723,
	"longitude": 11.84457426992035,
	"start_date": "2009-01-01",
	"end_date": "2024-01-01",
	"hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", "rain", "snowfall", "snow_depth", "weather_code", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high", "et0_fao_evapotranspiration", "vapour_pressure_deficit", "wind_speed_10m", "wind_speed_100m", "wind_direction_10m", "wind_direction_100m", "wind_gusts_10m", "soil_temperature_0_to_7cm", "soil_temperature_7_to_28cm", "soil_temperature_28_to_100cm", "soil_temperature_100_to_255cm", "soil_moisture_0_to_7cm", "soil_moisture_7_to_28cm", "soil_moisture_28_to_100cm", "soil_moisture_100_to_255cm", "sunshine_duration"],
	"models": "best_match"
}
responses = openmeteo.weather_api(url, params=params)


# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")


Coordinates 47.27592086791992°N 12.058823585510254°E
Elevation 1818.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


In [4]:
# Process hourly data. The order of variables needs to be the same as requested.
hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()
hourly_dew_point_2m = hourly.Variables(2).ValuesAsNumpy()
hourly_precipitation = hourly.Variables(3).ValuesAsNumpy()
hourly_rain = hourly.Variables(4).ValuesAsNumpy()
hourly_snowfall = hourly.Variables(5).ValuesAsNumpy()
hourly_snow_depth = hourly.Variables(6).ValuesAsNumpy()
hourly_weather_code = hourly.Variables(7).ValuesAsNumpy()
hourly_pressure_msl = hourly.Variables(8).ValuesAsNumpy()
hourly_surface_pressure = hourly.Variables(9).ValuesAsNumpy()
hourly_cloud_cover = hourly.Variables(10).ValuesAsNumpy()
hourly_cloud_cover_low = hourly.Variables(11).ValuesAsNumpy()
hourly_cloud_cover_mid = hourly.Variables(12).ValuesAsNumpy()
hourly_cloud_cover_high = hourly.Variables(13).ValuesAsNumpy()
hourly_et0_fao_evapotranspiration = hourly.Variables(14).ValuesAsNumpy()
hourly_vapour_pressure_deficit = hourly.Variables(15).ValuesAsNumpy()
hourly_wind_speed_10m = hourly.Variables(16).ValuesAsNumpy()
hourly_wind_speed_100m = hourly.Variables(17).ValuesAsNumpy()
hourly_wind_direction_10m = hourly.Variables(18).ValuesAsNumpy()
hourly_wind_direction_100m = hourly.Variables(19).ValuesAsNumpy()
hourly_wind_gusts_10m = hourly.Variables(20).ValuesAsNumpy()
hourly_soil_temperature_0_to_7cm = hourly.Variables(21).ValuesAsNumpy()
hourly_soil_temperature_7_to_28cm = hourly.Variables(22).ValuesAsNumpy()
hourly_soil_temperature_28_to_100cm = hourly.Variables(23).ValuesAsNumpy()
hourly_soil_temperature_100_to_255cm = hourly.Variables(24).ValuesAsNumpy()
hourly_soil_moisture_0_to_7cm = hourly.Variables(25).ValuesAsNumpy()
hourly_soil_moisture_7_to_28cm = hourly.Variables(26).ValuesAsNumpy()
hourly_soil_moisture_28_to_100cm = hourly.Variables(27).ValuesAsNumpy()
hourly_soil_moisture_100_to_255cm = hourly.Variables(28).ValuesAsNumpy()
hourly_sunshine_duration = hourly.Variables(29).ValuesAsNumpy()


hourly_data = {"date": pd.date_range(
	start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
	end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = hourly.Interval()),
	inclusive = "left"
)}
hourly_data["temperature_2m"] = hourly_temperature_2m
hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m
hourly_data["dew_point_2m"] = hourly_dew_point_2m
hourly_data["precipitation"] = hourly_precipitation
hourly_data["rain"] = hourly_rain
hourly_data["snowfall"] = hourly_snowfall
hourly_data["snow_depth"] = hourly_snow_depth
hourly_data["weather_code"] = hourly_weather_code
hourly_data["pressure_msl"] = hourly_pressure_msl
hourly_data["surface_pressure"] = hourly_surface_pressure
hourly_data["cloud_cover"] = hourly_cloud_cover
hourly_data["cloud_cover_low"] = hourly_cloud_cover_low
hourly_data["cloud_cover_mid"] = hourly_cloud_cover_mid
hourly_data["cloud_cover_high"] = hourly_cloud_cover_high
hourly_data["et0_fao_evapotranspiration"] = hourly_et0_fao_evapotranspiration
hourly_data["vapour_pressure_deficit"] = hourly_vapour_pressure_deficit
hourly_data["wind_speed_10m"] = hourly_wind_speed_10m
hourly_data["wind_speed_100m"] = hourly_wind_speed_100m
hourly_data["wind_direction_10m"] = hourly_wind_direction_10m
hourly_data["wind_direction_100m"] = hourly_wind_direction_100m
hourly_data["wind_gusts_10m"] = hourly_wind_gusts_10m
hourly_data["soil_temperature_0_to_7cm"] = hourly_soil_temperature_0_to_7cm
hourly_data["soil_temperature_7_to_28cm"] = hourly_soil_temperature_7_to_28cm
hourly_data["soil_temperature_28_to_100cm"] = hourly_soil_temperature_28_to_100cm
hourly_data["soil_temperature_100_to_255cm"] = hourly_soil_temperature_100_to_255cm
hourly_data["soil_moisture_0_to_7cm"] = hourly_soil_moisture_0_to_7cm
hourly_data["soil_moisture_7_to_28cm"] = hourly_soil_moisture_7_to_28cm
hourly_data["soil_moisture_28_to_100cm"] = hourly_soil_moisture_28_to_100cm
hourly_data["soil_moisture_100_to_255cm"] = hourly_soil_moisture_100_to_255cm
hourly_data["sunshine_duration"] = hourly_sunshine_duration

hourly_dataframe = pd.DataFrame(data = hourly_data)
hourly_dataframe

Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snowfall,snow_depth,weather_code,pressure_msl,...,wind_gusts_10m,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,sunshine_duration
0,2024-12-11 00:00:00+00:00,-5.0740,77.0,-8.479725,0.0,0.0,0.0,0.0,0.00,0.24,...,6.214563,3.219938,2.305125,1.938659,259.992096,243.435013,231.340164,201.801468,16.199999,-3.9240
1,2024-12-11 01:00:00+00:00,-5.0240,75.0,-8.768821,0.0,0.0,0.0,0.0,0.00,0.24,...,6.519877,3.415260,2.414953,1.527351,263.659912,251.564957,243.435013,224.999893,16.199999,-3.7240
2,2024-12-11 02:00:00+00:00,-4.5740,70.0,-9.216799,0.0,0.0,0.0,0.0,0.00,0.24,...,8.640000,5.315336,4.843305,4.510787,270.000000,241.699341,221.987137,208.610367,16.919998,-2.9240
3,2024-12-11 03:00:00+00:00,-4.8240,70.0,-9.457163,0.0,0.0,0.0,0.0,0.00,0.24,...,5.400000,1.835647,0.720000,0.000000,270.000000,281.309906,270.000000,180.000000,16.559999,-3.5240
4,2024-12-11 04:00:00+00:00,-5.3240,71.0,-9.757853,0.0,0.0,0.0,0.0,0.00,0.24,...,9.360000,4.802999,3.545589,2.545584,270.000000,257.005371,246.037506,224.999893,16.919998,-3.1240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2024-12-15 19:00:00+00:00,-6.0865,80.0,-8.975688,20.0,0.0,0.0,0.0,0.00,0.28,...,4.452954,4.452954,4.680000,5.991594,284.036255,284.036255,292.619904,302.735199,39.959999,-4.6865
116,2024-12-15 20:00:00+00:00,-5.6365,76.0,-9.193255,30.0,0.0,0.0,0.0,0.00,0.28,...,4.198285,4.394360,5.116561,6.369050,300.963684,304.992096,309.289368,317.290619,43.199997,-4.2865
117,2024-12-15 21:00:00+00:00,-5.1865,75.0,-8.926248,40.0,0.0,0.0,0.0,0.00,0.28,...,4.553679,5.014219,5.634891,6.489992,288.435028,291.037567,296.564972,303.690094,43.919998,-4.0865
118,2024-12-15 22:00:00+00:00,-5.1865,84.0,-7.467149,50.0,0.0,0.0,0.0,0.00,0.28,...,3.671294,4.379589,5.154416,6.151683,281.309906,279.462250,282.094727,290.556122,43.199997,-4.3865


In [5]:
target = ['snowfall']

In [8]:
train_df = hourly_dataframe
clean_train_df = clean_data(train_df)
X = define_X(clean_train_df,target)


NameError: name 'clean_data' is not defined

In [6]:
X.describe()

Unnamed: 0,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,snow_depth,weather_code,pressure_msl,surface_pressure,cloud_cover,...,wind_gusts_10m,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,sunshine_duration
count,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,...,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0
mean,2.178336,79.138184,-1.472008,0.196152,0.1138,0.482332,22.706781,1017.929688,816.054932,66.579666,...,27.618757,3.625005,3.60484,3.603981,3.628445,0.255772,0.255304,0.241331,0.282237,1213.196411
std,8.31859,17.163219,7.913401,0.503262,0.416948,0.532701,29.793755,7.943738,8.121846,36.923317,...,13.763321,5.212834,4.629919,3.761255,2.889373,0.045898,0.044622,0.048923,0.024311,1638.633545
min,-27.6425,7.492445,-33.471001,0.0,0.0,0.0,0.0,978.299988,776.987122,0.0,...,3.96,-6.821,-1.921,-0.5425,0.1575,0.121,0.135,0.116,0.241,0.0
25%,-3.771,67.887783,-7.071,0.0,0.0,0.0,1.0,1013.599976,811.240417,36.0,...,18.0,-0.371,-0.221,0.179,0.7575,0.216,0.216,0.197,0.262,0.0
50%,2.029,83.385448,-1.0425,0.0,0.0,0.22,3.0,1018.5,817.260681,84.0,...,24.48,0.729,1.229,2.079,2.9075,0.255,0.255,0.239,0.281,0.0
75%,8.2575,93.331192,5.0575,0.2,0.0,0.95,51.0,1022.799988,821.999466,100.0,...,33.839996,7.5075,7.679,7.229,6.4075,0.291,0.29,0.282,0.299,3600.0
max,27.328999,100.0,16.057501,11.2,11.2,1.9,75.0,1047.599976,837.25531,100.0,...,153.0,23.078999,16.7575,12.4575,9.379001,0.401,0.389,0.361,0.353,3600.0


In [7]:
X_preprocessed = preprocess(X)

y = clean_train_df[['snowfall']]

✅ Processed data, with shape (131496, 29)


In [9]:
X_preprocessed.describe()

Unnamed: 0,weather_code_encoded,cloud_cover,cloud_cover_high,cloud_cover_low,cloud_cover_mid,dew_point_2m,et0_fao_evapotranspiration,precipitation,pressure_msl,rain,...,soil_temperature_7_to_28cm,sunshine_duration,surface_pressure,temperature_2m,vapour_pressure_deficit,wind_direction_100m,wind_direction_10m,wind_gusts_10m,wind_speed_100m,wind_speed_10m
count,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,...,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0,131496.0
mean,3.764244,2.178336,79.138184,-1.472008,0.196152,0.1138,0.482332,1017.929688,816.054905,66.579668,...,27.618758,3.625005,3.60484,3.603982,3.628445,0.255772,0.255304,0.241331,0.282237,1213.196306
std,3.595433,8.31859,17.16322,7.913401,0.503262,0.416948,0.532701,7.943738,8.121846,36.923318,...,13.763321,5.212834,4.629918,3.761255,2.889373,0.045898,0.044622,0.048923,0.024311,1638.633538
min,0.0,-27.6425,7.492445,-33.471001,0.0,0.0,0.0,978.299988,776.987122,0.0,...,3.96,-6.821,-1.921,-0.5425,0.1575,0.121,0.135,0.116,0.241,0.0
25%,1.0,-3.771,67.887783,-7.071,0.0,0.0,0.0,1013.599976,811.240417,36.0,...,18.0,-0.371,-0.221,0.179,0.7575,0.216,0.216,0.197,0.262,0.0
50%,3.0,2.029,83.385448,-1.0425,0.0,0.0,0.22,1018.5,817.260681,84.0,...,24.48,0.729,1.229,2.079,2.9075,0.255,0.255,0.239,0.281,0.0
75%,4.0,8.2575,93.331192,5.0575,0.2,0.0,0.95,1022.799988,821.999466,100.0,...,33.839996,7.5075,7.679,7.229,6.4075,0.291,0.29,0.282,0.299,3600.0
max,12.0,27.328999,100.0,16.057501,11.2,11.2,1.9,1047.599976,837.25531,100.0,...,153.0,23.078999,16.7575,12.4575,9.379001,0.401,0.389,0.361,0.353,3600.0


In [10]:
y = y.reset_index()

In [11]:
y.head(2)

Unnamed: 0,date,snowfall
0,2009-01-01 00:00:00,0.0
1,2009-01-01 01:00:00,0.0


In [12]:
train_df_merged = X_preprocessed.join(y)
train_df_merged.tail(2)

Unnamed: 0,weather_code_encoded,cloud_cover,cloud_cover_high,cloud_cover_low,cloud_cover_mid,dew_point_2m,et0_fao_evapotranspiration,precipitation,pressure_msl,rain,...,surface_pressure,temperature_2m,vapour_pressure_deficit,wind_direction_100m,wind_direction_10m,wind_gusts_10m,wind_speed_100m,wind_speed_10m,date,snowfall
131494,3.0,-13.721,80.381676,-16.371,0.0,0.0,1.23,1016.900024,804.648254,100.0,...,0.929,1.779,3.629,0.24,0.246,0.238,0.267,0.0,2024-01-01 22:00:00,0.0
131495,1.0,-13.421,78.443703,-16.371,0.0,0.0,1.23,1015.700012,803.911438,40.0,...,0.929,1.779,3.629,0.24,0.246,0.238,0.267,0.0,2024-01-01 23:00:00,0.0


In [13]:
train_df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131496 entries, 0 to 131495
Data columns (total 31 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   weather_code_encoded           131496 non-null  float64       
 1   cloud_cover                    131496 non-null  float64       
 2   cloud_cover_high               131496 non-null  float64       
 3   cloud_cover_low                131496 non-null  float64       
 4   cloud_cover_mid                131496 non-null  float64       
 5   dew_point_2m                   131496 non-null  float64       
 6   et0_fao_evapotranspiration     131496 non-null  float64       
 7   precipitation                  131496 non-null  float64       
 8   pressure_msl                   131496 non-null  float64       
 9   rain                           131496 non-null  float64       
 10  relative_humidity_2m           131496 non-null  float64       
 11  

In [14]:
train_df_merged.to_csv('combined_dataset.csv')