In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import re
import os
from sklearn.preprocessing import StandardScaler
import pickle

In [None]:
# Root directory holding the raw CSV inputs (relative to the notebook's working directory).
DATA_ROOT = Path("./data")

# Registry of input files, grouped by data kind.
# Each measurement name maps to a pair: (CSV path, aggregation names applied to that measurement).
file_dict: dict[str, dict[str, tuple[Path, tuple[str, ...]]]] = {
    "scalar": {
        "temperature": (DATA_ROOT / "temperature.csv", ("min", "max")),
        "pressure": (DATA_ROOT / "pressure.csv", ("mean",)),
        "humidity": (DATA_ROOT / "humidity.csv", ("mean",)),
        "wind_speed": (DATA_ROOT / "wind_speed.csv", ("mean", "max")),
    },
}


In [None]:
def transform_scalar_data(df: pd.DataFrame, measurement_name: str, agg_types: tuple,
                         scaler_dir: str = 'scalers', is_training: bool = True) -> pd.DataFrame:
    """
    Aggregate a wide per-city measurement table to daily statistics and standardize them.

    The input is melted to long format, grouped by calendar date and city, and
    reduced with every aggregation in ``agg_types``. The resulting statistic
    columns are then passed through one ``StandardScaler`` object per measurement
    (scikit-learn standardizes each column with its own mean and variance).
    The scaler is stored as ``<scaler_dir>/<measurement_name>_scaler.pkl``.

    The caller's dataframe is not modified.

    Args:
        df: Input dataframe with a 'datetime' column; every other column is
            treated as one city's readings.
        measurement_name: Name of the measurement (e.g., 'temperature'); used as
            the prefix of the output columns and of the scaler file name.
        agg_types: Tuple of aggregation names (e.g., ('min', 'max', 'mean')).
        scaler_dir: Directory to save/load scalers (created if missing).
        is_training: If True, fit a new scaler on this data and save it;
            otherwise load the previously saved scaler and only transform.

    Returns:
        DataFrame with columns 'date', 'city' and one standardized
        '<measurement_name>_<agg_type>' column per aggregation.

    Raises:
        ValueError: If ``agg_types`` is empty.
        FileNotFoundError: If ``is_training`` is False and no saved scaler exists.
    """
    if not agg_types:
        raise ValueError("agg_types must contain at least one aggregation")

    # Create scaler directory if it doesn't exist
    os.makedirs(scaler_dir, exist_ok=True)

    # Parse timestamps into a new frame so the caller's dataframe stays untouched
    df = df.assign(datetime=pd.to_datetime(df['datetime']))

    # Get list of city columns (all columns except datetime)
    city_columns = [col for col in df.columns if col != 'datetime']

    # Melt the dataframe to convert from wide to long format
    df_melted = df.melt(
        id_vars=['datetime'],
        value_vars=city_columns,
        var_name='city',
        value_name=measurement_name
    )

    # Extract date from datetime
    df_melted['date'] = df_melted['datetime'].dt.date

    # Columns to produce and normalize, one per requested aggregation
    cols_to_normalize = [f'{measurement_name}_{agg_type}' for agg_type in agg_types]

    # Named aggregation yields flat, already-named columns (no MultiIndex to flatten)
    named_aggs = dict(zip(cols_to_normalize, agg_types))
    result = (
        df_melted.groupby(['date', 'city'])[measurement_name]
        .agg(**named_aggs)
        .reset_index()
    )

    # One scaler file per measurement, covering all of its statistic columns
    scaler_path = os.path.join(scaler_dir, f'{measurement_name}_scaler.pkl')

    if is_training:
        # Fit on all statistic columns together and persist for later inference
        scaler = StandardScaler()
        result[cols_to_normalize] = scaler.fit_transform(result[cols_to_normalize])
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler, f)
    else:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # scaler files that were written by this pipeline.
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
        result[cols_to_normalize] = scaler.transform(result[cols_to_normalize])

    return result


In [None]:
def transform_wind_direction(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert wide per-city wind directions (degrees) into daily mean direction vectors.

    Each reading is mapped to its unit-vector components (cos, sin) and the
    components are averaged per calendar date and city. Averaging components
    instead of raw angles avoids the wrap-around problem at 0/360 degrees.

    The caller's dataframe is not modified.

    Args:
        df: Input dataframe with a 'datetime' column; every other column is
            treated as one city's wind direction in degrees.

    Returns:
        DataFrame with columns 'date', 'city', 'wind_direction_x' and
        'wind_direction_y'.
    """
    # Parse timestamps into a new frame so the caller's dataframe stays untouched
    df = df.assign(datetime=pd.to_datetime(df['datetime']))

    # Get list of city columns (all columns except datetime)
    city_columns = [col for col in df.columns if col != 'datetime']

    # Melt the dataframe from wide to long format
    df_melted = df.melt(
        id_vars=['datetime'],
        value_vars=city_columns,
        var_name='city',
        value_name='wind_direction'
    )

    # Convert degrees to radians and calculate x,y components
    radians = np.deg2rad(df_melted['wind_direction'])
    df_melted['wind_direction_x'] = np.cos(radians)
    df_melted['wind_direction_y'] = np.sin(radians)

    # Extract date from datetime
    df_melted['date'] = df_melted['datetime'].dt.date

    # Group by date and city to get daily mean vectors
    result = df_melted.groupby(['date', 'city']).agg({
        'wind_direction_x': 'mean',
        'wind_direction_y': 'mean'
    }).reset_index()

    return result

In [None]:
def to_snake_case(text: str) -> str:
    """
    Normalize a free-text label into a snake_case identifier.

    The value is converted with ``str``, every character that is not an ASCII
    letter, digit or whitespace becomes a separator, the text is lower-cased,
    a fixed set of connecting words is dropped, and the remaining words are
    joined with underscores.
    """
    # Connecting words that carry no meaning for the label (can be expanded)
    stop_words = {'with', 'and', 'or', 'the', 'a', 'an', 'in', 'at', 'on'}

    # Special characters act as word separators
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', str(text)).lower()

    return '_'.join(token for token in cleaned.split() if token not in stop_words)

def get_weather_category_mapping():
    """
    Build the lookup from snake_case weather labels to simplified categories.

    Returns:
        dict mapping each known label to one of 'sky_clear', 'sky_partial',
        'sky_covered', 'precip_light', 'precip_heavy' or 'storm'.
    """
    # Category -> labels that belong to it; order matches sky, precipitation, storm
    labels_by_category = {
        'sky_clear': ['sky_is_clear'],
        'sky_partial': ['few_clouds', 'scattered_clouds', 'broken_clouds'],
        'sky_covered': [
            'overcast_clouds', 'mist', 'fog', 'haze', 'smoke', 'dust', 'sand',
            'volcanic_ash',
        ],
        'precip_light': [
            'light_rain', 'light_snow', 'light_intensity_drizzle', 'drizzle',
            'light_intensity_drizzle_rain', 'light_intensity_shower_rain',
            'light_rain_snow', 'light_shower_sleet', 'light_shower_snow',
        ],
        'precip_heavy': [
            'heavy_intensity_rain', 'very_heavy_rain', 'heavy_intensity_drizzle',
            'heavy_intensity_shower_rain', 'heavy_shower_snow', 'heavy_snow',
            'shower_rain', 'shower_drizzle', 'shower_snow', 'rain_snow',
            'moderate_rain', 'freezing_rain',
        ],
        'storm': [
            'thunderstorm', 'heavy_thunderstorm', 'ragged_thunderstorm',
            'thunderstorm_drizzle', 'thunderstorm_heavy_drizzle',
            'thunderstorm_heavy_rain', 'thunderstorm_light_drizzle',
            'thunderstorm_light_rain', 'thunderstorm_rain',
            'proximity_thunderstorm', 'proximity_thunderstorm_drizzle',
            'proximity_thunderstorm_rain', 'squalls', 'tornado',
        ],
    }

    # Invert the table into a flat label -> category dictionary
    return {
        label: category
        for category, labels in labels_by_category.items()
        for label in labels
    }

def transform_weather_description(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn wide per-city weather descriptions into daily category proportions.

    Each text label is normalized with ``to_snake_case``, mapped to a simplified
    category via ``get_weather_category_mapping``, one-hot encoded, and averaged
    per calendar date and city, so every output value is the share of that
    day's readings that fell into the category.

    Labels absent from the mapping (including missing values, which
    ``to_snake_case`` turns into the string 'nan') receive no category and
    therefore count as 0 in every category column.

    The caller's dataframe is not modified.

    Args:
        df: Input dataframe with a 'datetime' column; every other column is
            treated as one city's weather description.

    Returns:
        DataFrame with 'date', 'city' and one 'weather_description_<category>'
        column per category that occurs in the data.
    """
    # Parse timestamps into a new frame so the caller's dataframe stays untouched
    df = df.assign(datetime=pd.to_datetime(df['datetime']))

    # Get list of city columns (all columns except datetime)
    city_columns = [col for col in df.columns if col != 'datetime']

    # Melt the dataframe to convert from wide to long format
    df_melted = df.melt(
        id_vars=['datetime'],
        value_vars=city_columns,
        var_name='city',
        value_name='class_label'
    )

    # Extract date from datetime
    df_melted['date'] = df_melted['datetime'].dt.date

    # Convert class labels to snake case
    df_melted['class_label'] = df_melted['class_label'].apply(to_snake_case)

    # Map to simplified categories
    category_mapping = get_weather_category_mapping()
    df_melted['weather_category'] = df_melted['class_label'].map(category_mapping)

    # Create one-hot encoded columns for the simplified categories.
    # NOTE(review): get_dummies only emits columns for categories present in this
    # data, so the output schema can differ between datasets - confirm downstream
    # code tolerates that.
    one_hot = pd.get_dummies(df_melted['weather_category'], prefix="weather_description")

    # Add one-hot columns to the melted dataframe
    df_melted = pd.concat([df_melted[['date', 'city']], one_hot], axis=1)

    # Group by date and city to get daily proportions
    result = df_melted.groupby(['date', 'city']).mean().reset_index()

    return result

In [None]:
def encode_day_of_year(dates: pd.Series) -> pd.DataFrame:
    """
    Encode the day of the year as a point on the unit circle.

    The day number is scaled by a 365.25-day period and returned as sine and
    cosine columns so that the end of December sits next to the start of January.

    Args:
        dates: Series of values parseable by ``pd.to_datetime``.

    Returns:
        DataFrame with columns 'day_of_year_sin' and 'day_of_year_cos'.
    """
    ordinal_day = pd.to_datetime(dates).dt.dayofyear

    # Angle of the date within the yearly cycle, in radians
    phase = 2 * np.pi * ordinal_day / 365.25

    columns = {
        'day_of_year_sin': np.sin(phase),
        'day_of_year_cos': np.cos(phase),
    }
    return pd.DataFrame(columns)

In [None]:
def load_and_preprocess_data(file_dict: dict[str, tuple[Path, tuple]], is_training: bool = True) -> list[pd.DataFrame]:
    """
    Read each scalar measurement CSV and run it through ``transform_scalar_data``.

    Args:
        file_dict: Maps a measurement name to a (CSV path, aggregation types) pair.
        is_training: Forwarded to ``transform_scalar_data``.

    Returns:
        One transformed dataframe per measurement whose file exists, in the
        iteration order of ``file_dict``. Missing files are reported with a
        printed message and skipped.
    """
    frames = []
    for measurement_name, (file_path, agg_types) in file_dict.items():
        if file_path.exists():
            raw = pd.read_csv(file_path)
            frames.append(
                transform_scalar_data(raw, measurement_name, agg_types, is_training=is_training)
            )
        else:
            print(f"File responsible for {measurement_name} doesn't exist at {file_path.absolute()}")
    return frames

In [None]:
# Daily scalar features; is_training=True is forwarded to the loader
transformed_dfs = list(load_and_preprocess_data(file_dict["scalar"], is_training=True))

# Wind direction, transformed separately from the scalar measurements
df_wind_dir = pd.read_csv(DATA_ROOT / "wind_direction.csv")
transformed_dfs += [transform_wind_direction(df_wind_dir)]

# Weather descriptions, transformed separately from the scalar measurements
df_weather_class = pd.read_csv(DATA_ROOT / "weather_description.csv")
transformed_dfs += [transform_weather_description(df_weather_class)]

In [None]:
# Start from the first feature table and outer-join every remaining one on the
# (date, city) key, so rows present in only some tables are kept
result = transformed_dfs[0]
for df in transformed_dfs[1:]:
    result = result.merge(df, how='outer', on=['date', 'city'])

# Order rows chronologically, then by city
result = result.sort_values(by=['date', 'city'])



In [None]:
# Build a dictionary holding a separate dataframe for each city
city_dfs = {
    city_name: city_frame.reset_index(drop=True)
    for city_name, city_frame in result.groupby('city')
}

# Inspect one example city
print("\nPrzykładowa ramka danych dla Vancouver:")
print(city_dfs['Vancouver'].head())

In [None]:
# Definicja funkcji tworzącej okno przewidywania
def create_windows(df: pd.DataFrame, window_size=3, skip=1, scaler_dir: str = "scalers",
                   temp_target_col: str = "temperature_max", wind_speed_threshold: float = 6.0):
    """
    Build sliding-window samples for forecasting from one city's daily feature frame.

    For every position, the ``window_size`` preceding rows form the input and the
    row ``skip`` days after the end of the window provides the targets. A sample
    is kept only when the window covers consecutive days and the target lies
    exactly ``skip`` days after the last window day.

    A binary 'strong_wind' flag (scaled 'wind_speed_max' at or above the scaled
    threshold) is added to the features and used as the wind target. The caller's
    dataframe is not modified.

    Args:
        df: Frame with 'date', 'city', 'wind_speed_max' and ``temp_target_col``
            columns. Assumes rows are sorted by date with one row per date -
            TODO confirm against the caller.
        window_size: Number of consecutive days in each input window (>= 1).
        skip: Forecast horizon in days; 1 means the day right after the window (>= 1).
        scaler_dir: Directory containing 'wind_speed_scaler.pkl'.
        temp_target_col: Column used as the temperature target. The value is taken
            as stored in ``df`` (i.e. still scaled if the column was scaled).
            NOTE(review): the original code left the target unspecified; confirm
            that this default is the intended one.
        wind_speed_threshold: Strong-wind threshold in the original (unscaled)
            units of the wind-speed data.

    Returns:
        Tuple ``(X, y_temp, y_wind)`` of numpy arrays: flattened window features,
        temperature targets and strong-wind labels (0/1).

    Raises:
        ValueError: If ``window_size`` or ``skip`` is below 1, or if the saved
            scaler does not expose a 'wind_speed_max' feature.
    """
    if window_size < 1 or skip < 1:
        raise ValueError("window_size and skip must both be >= 1")

    # NOTE(security): pickle.load can execute arbitrary code; only load scaler
    # files that were written by this pipeline.
    with open(os.path.join(scaler_dir, "wind_speed_scaler.pkl"), 'rb') as f:
        wind_speed_scaler = pickle.load(f)  # expected to be a fitted StandardScaler

    # The scaler was fitted on several wind-speed statistics at once, so the
    # threshold has to be scaled with the parameters of the 'wind_speed_max' column.
    feature_names = getattr(wind_speed_scaler, "feature_names_in_", None)
    feature_names = [] if feature_names is None else list(feature_names)
    if "wind_speed_max" not in feature_names:
        raise ValueError(
            "wind speed scaler does not expose a 'wind_speed_max' feature; "
            "it must be fitted on a dataframe containing that column"
        )
    max_idx = feature_names.index("wind_speed_max")
    wind_speed_limit = (
        (wind_speed_threshold - wind_speed_scaler.mean_[max_idx])
        / wind_speed_scaler.scale_[max_idx]
    )

    strong_wind = (df["wind_speed_max"] >= wind_speed_limit).astype(int)

    # Feature matrix: everything except identifier columns, plus the wind flag.
    # errors="ignore" because 'wind_speed' may not exist as a column.
    feature_frame = df.assign(strong_wind=strong_wind).drop(
        columns=["date", "city", "wind_speed"], errors="ignore"
    )
    feature_values = feature_frame.to_numpy()

    # Normalize dates once so both date objects and timestamps are supported
    dates = pd.to_datetime(df["date"]).reset_index(drop=True)
    temp_values = df[temp_target_col].to_numpy()
    wind_values = strong_wind.to_numpy()

    window_span = pd.Timedelta(days=window_size - 1)
    horizon = pd.Timedelta(days=skip)

    X = []
    y_temp = []
    y_wind = []

    for i in range(window_size, len(df) - skip + 1):
        first, last, target_idx = i - window_size, i - 1, i + skip - 1

        # Check that the days inside the window are consecutive
        if dates.iloc[last] - dates.iloc[first] != window_span:
            continue
        # Check that the target is exactly `skip` days after the window
        if dates.iloc[target_idx] - dates.iloc[last] != horizon:
            continue

        # Prepare features: all window rows flattened into one vector
        X.append(feature_values[first:i].ravel())

        y_temp.append(temp_values[target_idx])
        y_wind.append(wind_values[target_idx])

    return np.array(X), np.array(y_temp), np.array(y_wind)

In [None]:
# TODO: SCALERS SHOULD BE CONFIGURED ON TRAINING DATA ONLY AND LOADED FOR TEST DATA
# TODO: SCALERS SHOULD BE SEPARATED FROM THE LOAD SCALER DATA FUNCTION
# TODO: WRITE AN EASY PIPELINE FOR INFERENCE
# TODO: Y_TEMP SHOULD BE PREDICTED WITHIN 2 DEG CELSIUS