## Meteo Bakery: Weather - Feature Engineering

### import packages

In [None]:
# data packages
import numpy as np
import pandas as pd

### load weather data

In [None]:
# Load the raw weather recordings (relative path, resolved against the current working directory).
df = pd.read_csv('../data/neueFische_Wetter.csv')

In [None]:
# Print column dtypes and non-null counts to spot columns with missing values.
df.info()

### Feature Engineering

#### extract x and y components from wind direction

In [None]:
# Decompose the wind direction (in degrees) into x/y components so that
# directions near 0° and 360° end up with similar feature values.
# The conversion is applied to the whole column at once instead of calling
# a Python lambda for every row.
wind_rad = df["wind_deg"] * np.pi / 180
df["wind_dir_x"] = np.cos(wind_rad)
df["wind_dir_y"] = np.sin(wind_rad)

In [None]:
# Replace missing rain/snow readings with zero.
# The result is assigned back to df instead of calling fillna(inplace=True)
# on an attribute-accessed column: that is a chained in-place update, which
# emits a warning in pandas 2.x and leaves df unchanged under Copy-on-Write.
precip_cols = ["rain_1h", "snow_1h"]
df[precip_cols] = df[precip_cols].fillna(0)

#### Input Weather Codes

In [None]:
# Read the weather-code lookup table and relabel its columns with the names
# used in the rest of the notebook.
codes = pd.read_csv('../data/neueFische_Wettercodes.csv').set_axis(
    ['weather_id', 'WeatherMain', 'WeatherDescription'], axis='columns'
)

In [None]:
# Attach the weather-code labels to every recording; the left join keeps all
# rows of df even when a weather_id has no match in codes.
df = df.merge(codes, on='weather_id', how='left')

#### Extract date and hour from datetime

In [None]:
# Parse the timestamps as timezone-aware UTC, then split out the calendar date and the hour.
# NOTE(review): date and hour are therefore UTC values, not local time — confirm this is intended.
df['time'] = pd.to_datetime(df['time'], utc=True)
timestamps = df['time'].dt
df['date'] = timestamps.date
df['hour'] = timestamps.hour


In [None]:
# Preview the first rows to check the newly added date and hour columns.
df.head()

#### Extract daily summary statistics

In [None]:
# utility function for extracting summary statistics from weather recordings
def extract_daily_statistics(df, columns):
    """Extract daily summary statistics from hourly weather data.

    For every column in ``columns`` the result contains the mean, min, max and
    std of the recordings with ``hour`` between 6 and 20 (inclusive), plus the
    mean for three daily time frames: 06:00-10:00, 11:00-15:00 and 16:00-20:00.

    Args:
        df (pd.DataFrame): Dataframe containing hourly weather recordings.
            Must provide a ``date`` column, an ``hour`` column and every
            column listed in ``columns``.
        columns (list): Names of the weather variables to summarise.

    Returns:
        pd.DataFrame: One row per unique ``date`` in ``df`` (index named
        ``date``). For each variable the columns are ``<col>_mean``,
        ``<col>_min``, ``<col>_max``, ``<col>_std``, ``<col>_06-10``,
        ``<col>_11_15`` and ``<col>_16_20``. Dates without recordings in a
        time frame get NaN for that time frame.
    """
    # one row per calendar day; the groupby results below are aligned on this index
    summary_stats = pd.DataFrame({'date': df.date.unique()})
    summary_stats.set_index('date', inplace=True)

    # The hour filters do not depend on the weather variable, so each time
    # frame is filtered and grouped once here rather than inside the loop.
    # Series.between is inclusive on both ends.
    day_groups = df[df.hour.between(6, 20)].groupby('date')

    # Column suffix -> grouped recordings of that time frame. The suffixes
    # (including the hyphen in '_06-10') are kept exactly as they were so the
    # output column names stay unchanged.
    frame_groups = {
        '_06-10': df[df.hour.between(6, 10)].groupby('date'),
        '_11_15': df[df.hour.between(11, 15)].groupby('date'),
        '_16_20': df[df.hour.between(16, 20)].groupby('date'),
    }

    for col in columns:
        # summary statistics over the complete daytime period
        summary_stats[[col+'_mean', col+'_min', col+'_max', col+'_std']] = (
            day_groups[col].agg(['mean', 'min', 'max', 'std'])
        )

        # mean value for each of the three time frames
        for suffix, groups in frame_groups.items():
            summary_stats[col + suffix] = groups[col].mean()

    return summary_stats

In [None]:
# List the available column names (used to pick the variables summarised below).
df.columns

In [None]:
# Weather variables for which daily summary statistics are computed.
weather_variables = [
    'temp',
    'feels_like',
    'pressure',
    'humidity',
    'clouds',
    'wind_speed',
    'wind_dir_x',
    'wind_dir_y',
    'rain_1h',
    'snow_1h',
]
summary_stats = extract_daily_statistics(df, weather_variables)
summary_stats.head()

In [None]:
# Show the row count, dtypes and non-null counts of the daily summary table.
summary_stats.info()

In [None]:
# Number of distinct dates in the hourly data, for comparison with the row count of summary_stats.
df.date.nunique()

The summary-statistics dataframe has as many rows as there are unique days in the hourly data. Thus, the feature engineering seems to be successful.

### Feature engineering on string features

In [None]:
# Distinct WeatherMain labels present in the data.
df.WeatherMain.unique()

In [None]:
# Number of recordings per WeatherMain label.
df.WeatherMain.value_counts()

- convert weather main into broader categories (e.g. rain, drizzle == rainy)
- extract mode category as general weather condition for that day

- __broader weather categories__
    * 'Clouds': 'cloudy'
    * 'Snow': 'snowy'
    * 'Rain', 'Drizzle': 'rainy'
    * 'Mist', 'Fog', 'Haze': 'foggy'
    * 'Clear': 'clear'
    * 'Dust', 'Smoke': 'dusty'
    * 'Thunderstorm', 'Tornado': 'stormy'

In [None]:
# Mapping from the WeatherMain labels to broader weather categories.
# It is given its own name instead of `dict` so that the built-in dict type
# is not shadowed for the rest of the notebook session.
weather_category_map = {
    'Clouds': 'cloudy',
    'Snow': 'snowy',
    'Rain': 'rainy',
    'Mist': 'foggy',
    'Drizzle': 'rainy',
    'Fog': 'foggy',
    'Clear': 'clear',
    'Dust': 'dusty',
    'Thunderstorm': 'stormy',
    'Haze': 'foggy',
    'Smoke': 'dusty',
    'Tornado': 'stormy',
}

# Remap the values of the dataframe; labels that are not in the mapping become NaN
df['weather_cats'] = df.WeatherMain.map(weather_category_map)

# count frequency per category
df.weather_cats.value_counts()

### extract mode as representative weather condition per time frame

In [None]:
def _first_mode(values):
    """Return a single mode of ``values``, or NaN if there is no non-null value.

    ``Series.mode`` returns every tied value in sorted order; taking the first
    one keeps each cell a single category instead of an array of categories.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# (column label, first hour, last hour) of each time frame; hours are inclusive
for label, start, end in (('total', 6, 20), ('6-10', 6, 10), ('11-15', 11, 15), ('16-20', 16, 20)):
    in_window = df[df.hour.between(start, end)]
    # most frequent weather category per day within the time frame
    summary_stats['condition_' + label] = in_window.groupby('date').weather_cats.agg(_first_mode)

### convert weather categories into one-hot encodings and compute relative frequency per day
Additionally, each weather category is dummy-encoded as a separate feature. We will then sum up the hourly occurrences per time frame for each weather category and compute the relative frequency per time frame.

In [None]:
# Build one indicator column per weather category and add each of them to df
# under the category's name.
weather_cats = pd.get_dummies(df['weather_cats'])
for category in weather_cats.columns:
    df[category] = weather_cats[category]
df.head()

In [None]:
# Use the mean to calculate the relative frequency per time frame: the mean of
# an indicator column is the share of recordings that fall into the category.
# The hour filters do not depend on the category, so each time frame is
# filtered and grouped once up front. The built-in groupby mean replaces
# agg(np.mean), which recent pandas versions flag with a FutureWarning.
window_groups = {
    '_total': df[df.hour.between(6, 20)].groupby('date'),
    '_06_10': df[df.hour.between(6, 10)].groupby('date'),
    '_11_15': df[df.hour.between(11, 15)].groupby('date'),
    '_16_20': df[df.hour.between(16, 20)].groupby('date'),
}
for cat in weather_cats:
    for suffix, groups in window_groups.items():
        summary_stats[cat + suffix] = groups[cat].mean()

In [None]:
# Preview the daily summary table including the weather-category features added above.
summary_stats.head()

In [None]:
# Distinct WeatherDescription labels present in the data.
df.WeatherDescription.unique()

In [None]:
# Write the daily summary table to disk; the index is written as well (to_csv default).
summary_stats.to_csv('../data/summary_stats.csv')