## Meteo Bakery: Weather - Feature Engineering

### import packages

In [2]:
# data packages
import numpy as np
import pandas as pd

### load weather data

In [3]:
df = pd.read_csv('../data/neueFische_Wetter.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87673 entries, 0 to 87672
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   time        87673 non-null  object 
 1   temp        87673 non-null  float64
 2   feels_like  87673 non-null  float64
 3   pressure    87673 non-null  int64  
 4   humidity    87673 non-null  int64  
 5   clouds      87673 non-null  int64  
 6   visibility  84935 non-null  float64
 7   wind_speed  87673 non-null  float64
 8   wind_deg    87673 non-null  int64  
 9   wind_gust   30291 non-null  float64
 10  rain_1h     14246 non-null  float64
 11  snow_1h     1058 non-null   float64
 12  weather_id  87673 non-null  int64  
dtypes: float64(7), int64(5), object(1)
memory usage: 8.7+ MB


### Feature Engineering

#### extract x and y components from wind direction

In [5]:
# Turn wind data into sensible format: convert the direction from degrees to radians and
# store its cosine/sine as x/y components. Computed on the whole column at once instead of
# calling a Python lambda for every row; the arithmetic per value is unchanged.
wind_rad = df.wind_deg * np.pi / 180
df["wind_dir_x"] = np.cos(wind_rad)
df["wind_dir_y"] = np.sin(wind_rad)

In [6]:
# replace missings w/ zero
# Assign the result back to the frame: calling fillna(..., inplace=True) on `df.rain_1h`
# operates on a column object obtained by attribute access and is not guaranteed to
# write through to `df` (it does not under pandas Copy-on-Write).
df[["rain_1h", "snow_1h"]] = df[["rain_1h", "snow_1h"]].fillna(0)

#### Input Weather Codes

In [7]:
# Load the lookup table of weather codes and give its three columns fixed names;
# `weather_id` is the key it shares with the hourly weather data.
codes = pd.read_csv('../data/neueFische_Wettercodes.csv')
codes.columns = ['weather_id', 'WeatherMain', 'WeatherDescription']

In [8]:
# Attach the weather-code descriptions to every hourly record (left join keeps all records).
# `validate` makes the merge fail loudly if a weather_id occurs more than once in `codes`,
# which would otherwise silently duplicate hourly rows.
df = pd.merge(df, codes, how='left', on='weather_id', validate='many_to_one')

#### Extract date and hour from datetime

In [9]:
# Parse the timestamps as timezone-aware UTC, then derive the calendar date and the hour of
# day from the parsed values.
# NOTE(review): `date` and `hour` are therefore UTC values. If the 06:00-20:00 windows used
# further down are meant as local time, the timestamps need a timezone conversion first
# -- TODO confirm the intended timezone.
df['time'] = pd.to_datetime(df['time'],utc=True)
df['date'] = df.time.dt.date
df['hour'] = df.time.dt.hour


In [10]:
df.head()

Unnamed: 0,time,temp,feels_like,pressure,humidity,clouds,visibility,wind_speed,wind_deg,wind_gust,rain_1h,snow_1h,weather_id,wind_dir_x,wind_dir_y,WeatherMain,WeatherDescription,date,hour
0,2011-12-31 23:00:00+00:00,0.31,-2.76,1017,80,75,10000.0,2.6,200,,0.0,0.0,803,-0.9396926,-0.3420201,Clouds,broken clouds: 51-84%,2011-12-31,23
1,2012-01-01 00:00:00+00:00,2.47,2.47,1017,76,75,4200.0,1.0,90,,0.0,0.0,803,6.123234000000001e-17,1.0,Clouds,broken clouds: 51-84%,2012-01-01,0
2,2012-01-01 01:00:00+00:00,2.47,2.47,1017,76,75,4200.0,1.0,90,,0.13,0.0,500,6.123234000000001e-17,1.0,Rain,light rain,2012-01-01,1
3,2012-01-01 02:00:00+00:00,0.95,-2.84,1017,80,75,10000.0,3.6,180,,0.21,0.0,600,-1.0,1.224647e-16,Snow,light snow,2012-01-01,2
4,2012-01-01 03:00:00+00:00,2.14,2.14,1017,90,90,4700.0,1.0,220,,0.38,0.0,500,-0.7660444,-0.6427876,Rain,light rain,2012-01-01,3


#### Extract daily summary statistics

In [11]:
# utility function for extracting summary statistics from weather recordings
def extract_daily_statistics(df,columns):
    """Extract daily summary statistics from hourly weather data.

    For every column in ``columns`` the mean, min, max and std over the recordings
    between 06:00 and 20:00 are computed per date, plus the mean for three daily
    time frames: 06:00-10:00, 11:00-15:00 and 16:00-20:00 (hour bounds inclusive).

    Args:
        df (pd.DataFrame): Dataframe containing hourly weather recordings. Must contain
            the columns 'date' and 'hour' in addition to the requested weather variables.
        columns (list): A list of weather variable names contained in the input dataframe.

    Returns:
        pd.DataFrame: Dataframe indexed by 'date' with one row per unique date in ``df``.
        Columns are ``<col>_mean``, ``<col>_min``, ``<col>_max``, ``<col>_std``,
        ``<col>_06_10``, ``<col>_11_15`` and ``<col>_16_20`` for each requested column.
        Dates without recordings in a time frame hold NaN for that frame's statistics.
    """

    # initialize dataframe for summary statistics: one row per calendar date,
    # including dates that have no recordings inside the daytime window
    summary_stats = pd.DataFrame({'date': df.date.unique()}).set_index('date')

    # The hour filters do not depend on the weather variable, so the per-date groups
    # are built once here instead of being recomputed for every column in the loop.
    daytime_groups = df[df.hour.between(6, 20)].groupby('date')
    frame_groups = {
        '_06_10': df[df.hour.between(6, 10)].groupby('date'),
        '_11_15': df[df.hour.between(11, 15)].groupby('date'),
        '_16_20': df[df.hour.between(16, 20)].groupby('date'),
    }

    for col in columns:
        # calculate different summary statistics over complete daytime period
        summary_stats[[col+'_mean', col+'_min', col+'_max', col+'_std']] = (
            daytime_groups[col].agg(['mean', 'min', 'max', 'std'])
        )

        # calculate mean values for the specified time frames; the result is a Series
        # indexed by date, so the assignment aligns on the summary index
        for suffix, groups in frame_groups.items():
            summary_stats[col+suffix] = groups[col].mean()

    return summary_stats

In [12]:
df.columns

Index(['time', 'temp', 'feels_like', 'pressure', 'humidity', 'clouds',
       'visibility', 'wind_speed', 'wind_deg', 'wind_gust', 'rain_1h',
       'snow_1h', 'weather_id', 'wind_dir_x', 'wind_dir_y', 'WeatherMain',
       'WeatherDescription', 'date', 'hour'],
      dtype='object')

In [13]:
summary_stats = extract_daily_statistics(df, ['temp', 'feels_like', 'pressure', 'humidity', 'clouds',
       'wind_speed', 'wind_dir_x', 'wind_dir_y', 'rain_1h','snow_1h', 'visibility'])
summary_stats

Unnamed: 0_level_0,temp_mean,temp_min,temp_max,temp_std,temp_06_10,temp_11_15,temp_16_20,feels_like_mean,feels_like_min,feels_like_max,...,snow_1h_06_10,snow_1h_11_15,snow_1h_16_20,visibility_mean,visibility_min,visibility_max,visibility_std,visibility_06_10,visibility_11_15,visibility_16_20
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-12-31,,,,,,,,,,,...,,,,,,,,,,
2012-01-01,3.353333,1.98,5.84,1.343618,2.162,3.600,4.298,1.975333,-0.71,4.47,...,0.0,0.0,0.000,3380.000000,1900.0,6000.0,1000.142847,3560.0,3220.0,3360.000000
2012-01-02,2.420667,0.73,4.32,1.194060,0.950,2.996,3.316,-0.285333,-2.38,1.52,...,0.0,0.0,0.000,5326.666667,100.0,9000.0,2749.666646,2200.0,6800.0,6980.000000
2012-01-03,7.330000,4.38,9.28,1.637734,7.016,8.940,6.034,5.321333,0.76,8.31,...,0.0,0.0,0.000,10000.000000,10000.0,10000.0,0.000000,10000.0,10000.0,10000.000000
2012-01-04,6.797333,4.31,8.81,1.400752,6.602,7.730,6.060,3.930667,0.05,6.54,...,0.0,0.0,0.000,10000.000000,10000.0,10000.0,0.000000,10000.0,10000.0,10000.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,0.779333,-0.90,3.18,1.280984,-0.140,2.240,0.238,0.184000,-1.95,2.85,...,0.0,0.0,0.000,10000.000000,10000.0,10000.0,0.000000,10000.0,10000.0,10000.000000
2021-12-28,0.728667,-0.03,1.33,0.422271,0.366,0.884,0.936,-1.468667,-3.16,1.33,...,0.0,0.0,0.152,4133.333333,1100.0,10000.0,2852.985070,2180.0,6500.0,3720.000000
2021-12-29,6.506667,2.49,8.12,1.655763,4.564,7.282,7.674,5.810000,1.37,7.58,...,0.0,0.0,0.000,6700.000000,900.0,10000.0,2874.021573,6575.0,7840.0,4966.666667
2021-12-30,10.480000,5.72,12.97,2.487575,7.554,11.522,12.364,10.010000,5.72,12.57,...,0.0,0.0,0.000,7800.000000,3000.0,10000.0,2573.367875,5250.0,9400.0,10000.000000


In [14]:
# The 2011-12-31 row only exists because the first displayed record is stamped 23:00 UTC on
# that day (see df.head() above). It has no values inside the 06:00-20:00 windows, so the
# row is entirely NaN and can be dropped.
summary_stats.dropna(axis=0, how='all', inplace=True)

In [15]:
summary_stats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3653 entries, 2012-01-01 to 2021-12-31
Data columns (total 77 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   temp_mean         3653 non-null   float64
 1   temp_min          3653 non-null   float64
 2   temp_max          3653 non-null   float64
 3   temp_std          3653 non-null   float64
 4   temp_06_10        3653 non-null   float64
 5   temp_11_15        3653 non-null   float64
 6   temp_16_20        3653 non-null   float64
 7   feels_like_mean   3653 non-null   float64
 8   feels_like_min    3653 non-null   float64
 9   feels_like_max    3653 non-null   float64
 10  feels_like_std    3653 non-null   float64
 11  feels_like_06_10  3653 non-null   float64
 12  feels_like_11_15  3653 non-null   float64
 13  feels_like_16_20  3653 non-null   float64
 14  pressure_mean     3653 non-null   float64
 15  pressure_min      3653 non-null   float64
 16  pressure_max      3653 non-null 

In [16]:
df.date.nunique()

3654

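After dropping the all-NaN 2011-12-31 row, the summary-statistics df has 3653 rows, one fewer than the 3654 unique dates in `df`, because 2011-12-31 has no recordings in the 06:00-20:00 window. Every other day is covered, so the feature engineering seems to be successful.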

### Feature engineering on string features

In [17]:
df.WeatherMain.unique()

array(['Clouds', 'Rain', 'Snow', 'Mist', 'Drizzle', 'Fog', 'Clear',
       'Thunderstorm', 'Dust', 'Haze', 'Smoke', 'Tornado'], dtype=object)

In [18]:
df.WeatherMain.value_counts()

Clouds          42493
Clear           19991
Rain            16904
Mist             4495
Snow             1707
Fog              1436
Drizzle           399
Thunderstorm      208
Haze               33
Dust                3
Smoke               2
Tornado             2
Name: WeatherMain, dtype: int64

- convert weather main into broader categories (e.g. rain, drizzle == rainy)
- extract mode category as general weather condition for that day

- __broader weather categories__
    * 'Clouds': 'cloudy'
    * 'Snow': 'snowy'
    * 'Rain', 'Drizzle': 'rainy'
    * 'Mist', 'Fog', 'Haze': 'foggy'
    * 'Clear': 'clear'
    * 'Dust', 'Smoke': 'dusty'
    * 'Thunderstorm': 'thunderstorm'
    * 'Tornado': 'tornado'

In [19]:
# define dictionary to re-classify weather categories
# (named so that it does not shadow the built-in `dict` for the rest of the notebook)
weather_category_map = {'Clouds': 'cloudy', 'Snow': 'snowy', 'Rain': 'rainy', 'Mist': 'foggy', 'Drizzle':'rainy', 'Fog':'foggy', 'Clear':'clear',
       'Dust': 'dusty', 'Thunderstorm': 'thunderstorm', 'Haze': 'foggy', 'Smoke': 'dusty', 'Tornado': 'tornado'}


# Remap the values of the dataframe; a WeatherMain value missing from the mapping becomes NaN
df['weather_cats'] = df.WeatherMain.map(weather_category_map)

# count frequency per category
df.weather_cats.value_counts()

cloudy          42493
clear           19991
rainy           17303
foggy            5964
snowy            1707
thunderstorm      208
dusty               5
tornado             2
Name: weather_cats, dtype: int64

### extract mode as representative weather condition per time frame

In [20]:
def _first_mode(values):
    """Return a single most-frequent value of ``values`` (NaN if there is none).

    ``Series.mode`` returns every tied value; aggregating with it directly stores an
    array in the cell whenever two categories are equally frequent. Taking the first
    entry of its result keeps the column a plain, deterministic category label.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Most frequent weather category per date for the whole daytime window and the three
# time frames. Column names are kept exactly as before so existing consumers of the
# summary table keep working.
summary_stats['condition_total'] = df[df.hour.between(6, 20)].groupby('date').weather_cats.agg(_first_mode)
summary_stats['condition_6_10'] = df[df.hour.between(6, 10)].groupby('date').weather_cats.agg(_first_mode)
summary_stats['condition_11-15'] = df[df.hour.between(11, 15)].groupby('date').weather_cats.agg(_first_mode)
summary_stats['condition_16-20'] = df[df.hour.between(16, 20)].groupby('date').weather_cats.agg(_first_mode)

### convert weather categories into one-hot encodings and compute relative frequency per day
Additionally, each weather category is dummy-encoded as a separate feature. We will then sum up the hourly occurrences per time frame for each weather category and compute the relative frequency per time frame.

In [21]:
# Build one indicator column per weather category and attach those columns to the hourly
# frame under the category's own name.
weather_cats = pd.get_dummies(df.weather_cats)
df[weather_cats.columns] = weather_cats
df.head()

Unnamed: 0,time,temp,feels_like,pressure,humidity,clouds,visibility,wind_speed,wind_deg,wind_gust,...,hour,weather_cats,clear,cloudy,dusty,foggy,rainy,snowy,thunderstorm,tornado
0,2011-12-31 23:00:00+00:00,0.31,-2.76,1017,80,75,10000.0,2.6,200,,...,23,cloudy,0,1,0,0,0,0,0,0
1,2012-01-01 00:00:00+00:00,2.47,2.47,1017,76,75,4200.0,1.0,90,,...,0,cloudy,0,1,0,0,0,0,0,0
2,2012-01-01 01:00:00+00:00,2.47,2.47,1017,76,75,4200.0,1.0,90,,...,1,rainy,0,0,0,0,1,0,0,0
3,2012-01-01 02:00:00+00:00,0.95,-2.84,1017,80,75,10000.0,3.6,180,,...,2,snowy,0,0,0,0,0,1,0,0
4,2012-01-01 03:00:00+00:00,2.14,2.14,1017,90,90,4700.0,1.0,220,,...,3,rainy,0,0,0,0,1,0,0,0


In [22]:
# use mean to calculate the relative frequency per time frame
# The hour filters are the same for every category, so the per-date groups are built once.
# `.mean()` is called directly instead of passing `np.mean` to `agg`.
frame_groups = {
    '_total': df[df.hour.between(6, 20)].groupby('date'),
    '_06_10': df[df.hour.between(6, 10)].groupby('date'),
    '_11_15': df[df.hour.between(11, 15)].groupby('date'),
    '_16_20': df[df.hour.between(16, 20)].groupby('date'),
}
for cat in weather_cats.columns:
    for suffix, groups in frame_groups.items():
        summary_stats[cat+suffix] = groups[cat].mean()

### Assign 'climatologic days' according to the DWD
https://www.dwd.de/DE/service/lexikon/Functions/glossar.html;jsessionid=EB2D3A27D634826A0176255436956DA7.live21064?lv2=101334&lv3=101452

In [23]:
# Boolean day flags derived from the daytime (06:00-20:00) summary statistics.
summary_stats['day_icy']=summary_stats.temp_max<0
summary_stats['day_frosty']=summary_stats.temp_min<0
summary_stats['day_thunder']=summary_stats.thunderstorm_total>0
summary_stats['day_hot']=summary_stats.temp_max>=30
summary_stats['day_clear']=summary_stats.clouds_mean<20
summary_stats['day_hazy']=summary_stats.visibility_min<1000
# Use the hourly maximum: the previous `rain_1h_min` test was only True when every single
# hour of the window exceeded the threshold, so ordinary rainy days were never flagged.
summary_stats['day_rainy']=summary_stats.rain_1h_max>0.1
summary_stats['day_summer']=summary_stats.temp_max>=25
summary_stats['day_murky']=summary_stats.clouds_mean>80

# Climatological days that cannot be encoded based on current data: 'precipitation_day', 'hail_day', 'tropical_nights'

In [24]:
summary_stats.to_csv('../data/summary_stats.csv')

-----------------

## Seasonal decomposition: Deviation from seasonal and trend influences

In [27]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [28]:
def get_deviations(df=None, period=365, stat='_mean'):
    '''Gets residuals from seasonal and trend influences.

    Every column of ``df`` whose name contains ``stat`` is forward-filled and decomposed
    with an additive, one-sided ``seasonal_decompose``; the residual component is kept.

    Args:
        df (Pandas DataFrame, optional): A TimeSeries as DataFrame with time as an index and target value(s) as columns. If 2d, individual series are in columns. Must contain 2 complete cycles. Defaults to the notebook's current summary_stats (looked up when the function is called, not when it is defined).
        period (int, optional): Period of the series. Defaults to 365 (one year).
        stat (str, optional): Summary statistic to extract. Can be '_mean', '_min', '_max', '_std', '_06_10', '_11_15' or '_16_20'. Defaults to '_mean'.

    Returns:
        Pandas DataFrame: contains the residuals for each matching column, named '<column>_dev'. Empty if no column name contains ``stat``.
    '''
    if df is None:
        # resolve the default lazily so a re-created summary_stats is picked up
        df = summary_stats

    matching = [name for name in df.columns if stat in name]

    # Collect all residual series first and concatenate once, instead of growing a
    # DataFrame with pd.concat on every iteration.
    residuals = [
        seasonal_decompose(x=df[name].ffill(),
                           model='additive',
                           two_sided=False,
                           period=period
                           ).resid.rename(f'{name}_dev')  # public accessor, not `_resid`
        for name in matching
    ]

    if not residuals:
        # pd.concat raises on an empty list; keep returning an empty frame instead
        return pd.DataFrame()
    return pd.concat(residuals, axis=1, join='outer')

In [29]:
dev_df = get_deviations()

## Abrupt Weather Changes

"Changes in weather are primarily the result of a change in temperature, air pressure, and humidity in the atmosphere. When any of these three variables experience a substantial change, it can lead to a complete change in weather conditions."

In [34]:
# Collect the summary columns whose name contains one of the temperature, pressure or
# humidity prefixes (a name is listed once for every prefix it contains).
cats = [elm for elm in summary_stats.columns for cat in ['temp_', 'pressure_', 'humidity_'] if cat in elm]

In [36]:
cats_mean = [elm for elm in cats if '_mean' in elm]

In [91]:
def get_changes(df=None, cats=None, comp=1):
    '''calculates changes of columns in TimeSeries dataframe

    Args:
        df (Pandas DataFrame, optional): A DataFrame. Defaults to the notebook's current summary_stats (looked up when the function is called, not when it is defined).
        cats (iterable, optional): iterable column names of df. Defaults to the notebook's current cats_mean.
        comp (int, optional): shift which to compare to. Defaults to 1.

    Returns:
        DataFrame: Table of the changes of each category compared to the index comp steps before. Columns are named '<column>_change'; the first comp rows are NaN. Empty if cats is empty.
    '''
    if df is None:
        # resolve the defaults lazily so re-created notebook variables are picked up
        df = summary_stats
    if cats is None:
        cats = cats_mean

    columns = list(cats)
    if not columns:
        return pd.DataFrame()

    # One vectorized subtraction over all requested columns instead of building the
    # result column by column with repeated pd.concat calls.
    selected = df[columns]
    return (selected - selected.shift(comp)).add_suffix('_change')

In [94]:
changes = get_changes(summary_stats, cats_mean)