# EDA Weather Data

## Setup and Packages

In [None]:
# Main data packages. 
import numpy as np
import pandas as pd

# Data Viz. 
import statsmodels.formula.api as smf
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.ndimage import gaussian_filter
from calendar import monthrange
from calendar import month_name

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
# Seaborn theme: 'darkgrid' style with the axes background overridden to white
# and light-grey grid lines.
# NOTE(review): later cells call sns.set(), which resets the seaborn theme -
# confirm whether the settings below are meant to survive those calls.
sns.set_style(
    style='darkgrid', 
    rc={'axes.facecolor': 'white', 'grid.color': '.8'}
)
# Colour constants and the default colour cycle used by the plots.
NF_ORANGE = '#ff5a36'
NF_BLUE = '#163251'
# NOTE(review): '#FF5A36' appears twice in this palette - confirm this is intended.
cmaps_hex = ['#193251','#FF5A36','#696969', '#7589A2','#FF5A36', '#DB6668']
sns.set_palette(palette=cmaps_hex)
sns_c = sns.color_palette(palette=cmaps_hex)
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Register pandas' datetime converters with matplotlib.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Default figure size and resolution for all matplotlib figures.
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100

## Import Dataset and First Look

In [None]:
# Import data: read the weather data set from the project's data directory
# (the path is relative to the notebook's working directory).
df = pd.read_csv('../data/neueFische_Wetter.csv')

In [None]:
# Summary of columns, dtypes and non-null counts; used to spot missing values.
df.info()

## Extreme Values and Distribution

We can see several missing values in visibility, wind_gust, rain_1h and snow_1h.

For the precipitation features, we know that missing values are equivalent to zero. wind_gust and visibility need further investigation - we will get to that later.

In [None]:
# Replace missing values with zeroes for rain and snow: a missing precipitation
# reading means that no precipitation was recorded.
# The result is assigned back to df instead of calling fillna(inplace=True) on
# the column attribute: that in-place call acts on an intermediate Series and
# no longer updates df under pandas' copy-on-write behaviour.
precipitation_cols = ['rain_1h', 'snow_1h']
df[precipitation_cols] = df[precipitation_cols].fillna(0)

# Candidate imputations for the remaining gaps; intentionally disabled until
# visibility and wind_gust have been investigated further.
#df['visibility'] = df['visibility'].fillna(10000)

#df['wind_gust'] = df['wind_gust'].fillna(df['wind_speed'])

In [None]:
# Scaled visibility readings over time, overlaid with a stacked histogram of
# the number of observations split by whether visibility is missing.
# The former ``x_bins`` argument was removed from the scatterplot call: seaborn
# documented it as non-functional and newer releases no longer accept it.
visibility_scaled = df.visibility / 100
sns.scatterplot(data=df, x='time', y=visibility_scaled)
sns.histplot(data=df, x='time', hue=df.visibility.isna(), bins=877, multiple='stack')

In [None]:
# Wind gusts as a percentage of the maximum gust over time, overlaid with a
# stacked histogram of the number of observations split by whether wind_gust
# is missing.
# The former ``x_bins`` argument was removed from the scatterplot call: seaborn
# documented it as non-functional and newer releases no longer accept it.
wind_gust_pct = (df.wind_gust / df.wind_gust.max()) * 100
sns.scatterplot(data=df, x='time', y=wind_gust_pct)
sns.histplot(data=df, x='time', hue=df.wind_gust.isna(), bins=877, multiple='stack')

Now let's see whether there are any extreme values in the data

In [None]:
# Descriptive statistics, transposed so that each feature is one row.
df.describe().T

In [None]:
# One box plot per weather feature on a 3x4 grid, to spot extreme values.
sns.set()
fig, axes = plt.subplots(3, 4)
box_features = [
    'temp', 'feels_like', 'pressure', 'humidity', 'clouds',
    'visibility', 'wind_speed', 'wind_deg', 'wind_gust', 'rain_1h',
    'snow_1h',
]
# axes.T.flat walks down each column in turn, so the grid is filled column by
# column and the bottom-right panel is the one left over.
for feature, panel in zip(box_features, axes.T.flat):
    sns.boxplot(data=df, y=feature, ax=panel)
fig.tight_layout()
# 11 features on 12 panels: hide the unused bottom-right one.
axes[-1, -1].axis('off')
plt.show()

Most features have at least some extreme values. However, at first glance they look like genuine values: they are neither completely out of range nor isolated from other values, and thus might represent unusual weather. We will keep these data points for now, but keep them in mind for future model building.

Next, we should investigate whether the data is normally distributed. Since we can expect the distribution to be different by year, we should look at the data yearwise.

In [None]:
# Convert time to datetime (UTC) and extract years, months, and so on.
# pd.to_datetime is applied to the whole column at once instead of row by row
# through df.apply, which called the parser once per row.
# NOTE(review): assumes all timestamps in 'time' share one format - confirm.
df['time'] = pd.to_datetime(df['time'], utc=True)

# Abbreviations for pandas' weekday numbers (0 = Monday ... 6 = Sunday).
weekday_labels = {0: 'Mo', 1: 'Tu', 2: 'We', 3: 'Th', 4: 'Fr', 5: 'Sa', 6: 'Su'}

df = df.assign(
    date=lambda x: x['time'].dt.date,
    year=lambda x: x['time'].dt.year,
    month=lambda x: x['time'].dt.month,
    week=lambda x: x['time'].dt.isocalendar().week,
    day=lambda x: x['time'].dt.day,
    # The labels are mapped while the column is created; the former
    # df.weekday.replace(..., inplace=True) mutated an intermediate Series,
    # which does not update df under pandas' copy-on-write behaviour.
    weekday=lambda x: x['time'].dt.weekday.map(weekday_labels),
    dayofyear=lambda x: x['time'].dt.dayofyear,
    hour=lambda x: x['time'].dt.hour,
)

In [None]:
# Distribution of every weather feature, one polygon per year, on a 3x4 grid.
sns.set()
fig, axes = plt.subplots(3, 4)
hist_features = [
    'temp', 'feels_like', 'pressure', 'humidity', 'clouds',
    'visibility', 'wind_speed', 'wind_deg', 'wind_gust', 'rain_1h',
    'snow_1h',
]
# axes.T.flat walks down each column in turn, so the grid is filled column by
# column and the bottom-right panel is the one left over.
for feature, panel in zip(hist_features, axes.T.flat):
    sns.histplot(
        data=df,
        x=feature,
        ax=panel,
        hue='year',
        legend=False,
        palette='Set2',
        element='poly',
    )
fig.tight_layout()
# 11 features on 12 panels: hide the unused bottom-right one.
axes[-1, -1].axis('off')
plt.show()

In [None]:
# For better readability, the temperature distribution is shown on its own,
# restricted to the years after 2018.
recent_years = df.loc[df['year'] > 2018]
sns.histplot(data=recent_years, x='temp', hue='year', element='poly')

We can see that most features are not normally distributed.

## Feature Engineering: Wind and Weather Codes

First, let's take a look at where the wind was most often coming from.

In [None]:
# Count how often each (wind_speed, wind_deg) combination occurs.
wind_pair_counts = df[['wind_speed', 'wind_deg']].value_counts()
winddata = pd.DataFrame(wind_pair_counts.reset_index())

In [None]:
# Name the columns explicitly; the third column holds the occurrence count.
winddata.columns = ['wind_speed', 'wind_deg', 'frequency']

In [None]:
import plotly.express as px

# Polar bar chart of the wind direction, restricted to (speed, direction)
# combinations that occur more than 60 times; the radial axis is logarithmic.
frequent_winds = winddata.query('frequency>60')
fig = px.bar_polar(
    frequent_winds,
    r='frequency',
    theta="wind_deg",
    color="wind_speed",
    template="plotly_dark",
    color_discrete_sequence=px.colors.sequential.Plasma_r,
    log_r=True,
)
fig.show()

Most of the time, the location seems to have eastern to north-easterly winds or wind coming from the south-west.

However, the wind data is not in a sensible format for most analyses, as it is measured in degrees ranging from 0 (North) to 360 (also North). So 360 and 0 are the same direction, not the two extreme ends of a scale. We thus translate the wind direction into x and y coordinates.

In [None]:
# Turn wind data into x-y-coordinates (cosine and sine of the direction), so
# that 0 and 360 degrees map to the same point.
# The whole column is converted at once; the former per-element Series.apply
# computed the same values one Python call at a time.
wind_rad = df['wind_deg'] * np.pi / 180
df["wind_dir_x"] = np.cos(wind_rad)
df["wind_dir_y"] = np.sin(wind_rad)

Next, we interpret the weather codes by linking them with metadata from a reference table.

In [None]:
# Load the weather-code reference table and attach its descriptions to every
# observation via weather_id (left join: all rows of df are kept).
codes = pd.read_csv('../data/neueFische_Wettercodes.csv')
codes.columns = ['weather_id', 'WeatherMain', 'WeatherDescription']
df = df.merge(codes, on='weather_id', how='left')

# Time Series EDA

To examine the distribution of weather data over time, we look at precipitation and temperature and compare them by years (from 2017 on, so we can actually see something on the graphs).

In [None]:
# Mean rain_1h per calendar month, one line per year (2017 onwards).
# Only rain_1h is averaged: a DataFrame-wide agg('mean') raises a TypeError in
# pandas >= 2.0 because df also holds non-numeric columns (date, weekday, ...).
monthly_rain = (
    df.loc[df['year'] > 2016]
    .groupby(['year', 'month'], as_index=False)['rain_1h']
    .mean()
)
sns.lineplot(data=monthly_rain, x='month', y='rain_1h', hue='year')

It seems to rain fairly evenly throughout the year, with an extreme peak in May 2018.

In [None]:
# Mean snow_1h per calendar month, one line per year (2017 onwards).
# Only snow_1h is averaged: a DataFrame-wide agg('mean') raises a TypeError in
# pandas >= 2.0 because df also holds non-numeric columns (date, weekday, ...).
monthly_snow = (
    df.loc[df['year'] > 2016]
    .groupby(['year', 'month'], as_index=False)['snow_1h']
    .mean()
)
sns.lineplot(data=monthly_snow, x='month', y='snow_1h', hue='year')

Snow is, not surprisingly, sparse and falls mostly in January of 2019 and 2020.

In [None]:
# Mean felt temperature per calendar month, one line per year (2017 onwards).
# Only feels_like is averaged: a DataFrame-wide agg('mean') raises a TypeError
# in pandas >= 2.0 because df also holds non-numeric columns (date, weekday, ...).
monthly_feels_like = (
    df.loc[df['year'] > 2016]
    .groupby(['year', 'month'], as_index=False)['feels_like']
    .mean()
)
sns.lineplot(data=monthly_feels_like, x='month', y='feels_like', hue='year')

In [None]:
# Polar plot for seasonality: felt temperature (radius) against the day of the
# year (angle), drawn on a polar axes.
ax = plt.subplot(111, projection='polar')

# Convert and plot data: the day of the year is scaled to an angle in radians
# (2*pi corresponds to 365.25 days) and drawn as a line plot coloured by month.
df \
    .assign(day_of_year_cyclic = lambda x: x['dayofyear'].transform(lambda x: 2*np.pi*x/365.25)) \
    .pipe((sns.lineplot, 'data'), 
        x='day_of_year_cyclic', 
        y='feels_like', 
        hue='month',
        palette=sns.color_palette("husl", 12),
        ax=ax
    )

fancy_plot=True     #purely cosmetic adjustments; set fancy_plot to False to see the data without them
if(fancy_plot): 
    days_per_month=[0] + [monthrange(2021, i)[1] for i in range(1,12)]      #leading 0 followed by the number of days of January..November 2021
    month_start=np.cumsum(days_per_month) +1                                #cumsum = days elapsed before each month; +1 gives each month's first day of the year (1-based)
    month_start_theta=[i *2 * np.pi / 365.25 for i in month_start]          #turn start day into an angle (in rad), using the same 365.25-day scaling as the data

    # Tick labels: month name plus the day of the year on which the month starts
    month_label=[month_name[i] for i in range(1,13)]
    month_label_long=[label+'\n(Day ' +str(month_start[ind]) +')' for ind,label in enumerate(month_label)]

    ax.set_title('Felt Temperature', va='bottom',pad=22);
    ax.spines.clear()
    
    # Angular axis: one tick at the start of every month
    ax.set_xlabel('')
    ax.set_xticks(month_start_theta)
    ax.set_xticklabels(month_label_long)
    
    # Radial axis: temperature ticks every 5 degrees, labelled in degrees Celsius
    ax.set_ylabel('')    
    ax.set_ylim(-5,28)
    ax.set_yticks(yt:=[0,5,10,15,20])
    ax.set_yticklabels([str(t)+'°C' for t in yt], rotation = 45)

    #Arrows / Annotations
    style = "Simple, tail_width=0.5, head_width=4, head_length=8"
    kw = dict(arrowstyle=style, color="dimgrey")
    ax.set_rlabel_position(1) 
    ax.text(13*2*np.pi/360,24,"Days",size=14,color='dimgrey',rotation=-80,va='center')
    ax.text(-3*2*np.pi/360,16,"Temperature",size=14,color='dimgrey',rotation=-0,va='center')
    # a1: arrow at a fixed angle (1 degree) along the radial (temperature) direction
    a1 = patches.FancyArrowPatch((1*np.pi/180, -5), (1*np.pi/180, 26), **kw)

    # a2: curved arrow at a fixed radius (24) along the angular (day) direction
    a2 = patches.FancyArrowPatch((1*np.pi/180, 24), (25*np.pi/180, 24),
                                connectionstyle=f"arc3,rad={0.105}", **kw)
    
    ax.add_patch(a1)
    ax.add_patch(a2)


    # Radial origin at -5, the same value as the lower radial limit set above
    ax.set_rorigin(-5)
    ax.xaxis.set_tick_params(which='major',pad=10)

    # Legend with the month names, placed outside the upper-right corner
    ax.legend(labels=month_label,ncol=2,facecolor='white',edgecolor='white',bbox_to_anchor=(1.1, 1.1), loc=1)

    ax.figure.set_figwidth(12)
    ax.figure.set_figheight(12)


# Save the figure; this runs regardless of fancy_plot.
ax.figure.savefig("../plots/Temp_polar.png",dpi=300)

In general, from June to September, we can expect temperatures to go past 20 degrees on average, while they regularly drop below zero in January and February.

In [None]:
# Pairwise relationships between the weather features (plus month).
pairplot_cols = [
    'temp', 'feels_like', 'pressure', 'humidity', 'clouds',
    'visibility', 'wind_speed', 'wind_gust', 'rain_1h',
    'snow_1h', 'month',
]
sns.pairplot(df[pairplot_cols])

In [None]:
# Features to plot over time below: the measured weather variables plus the
# two derived wind-direction components.
params = ['temp', 'feels_like', 'pressure', 'humidity', 'clouds',
       'visibility', 'wind_speed', 'wind_gust', 'rain_1h',
       'snow_1h', 'wind_dir_x', 'wind_dir_y']

In [None]:
# List all columns currently available in df.
df.columns

### Exploring Seasonality and Overall Trends

First, let's get an idea of how the different features behave over the course of a year by smoothing the data with a 30-day rolling average.

In [None]:
# Plot different weather features over the course of time

# Daily means of the numeric columns. numeric_only=True is needed because df
# also holds non-numeric columns (date, weekday, ...), for which a plain
# agg('mean') raises a TypeError in pandas >= 2.0.
smooth_df = df.groupby('date').mean(numeric_only=True)

# Smooth and plot: one panel per feature, all sharing the date axis
fig, ax = plt.subplots(len(params), 1, figsize=(12, 3*len(params)), constrained_layout=True, sharex=True)
plt.suptitle('Weather Data - Moving Average Smoothing', y=1.02)

for panel, e in zip(ax, params):
    smoothed_col = f'{e}_smooth_ma_30d'
    # centred rolling mean over 30 daily values
    smooth_df[smoothed_col] = smooth_df[e].rolling(window=30, center=True).mean()
    sns.lineplot(x='date', y=e, label=f'{e} (Signal)', data=smooth_df, ax=panel)
    sns.lineplot(x='date', y=smoothed_col, label=f'{e} smoothed:\n 30 days', data=smooth_df, color=NF_ORANGE, ax=panel)

    # legend to the right of each panel, outside the plotting area
    panel.legend(title='', loc='center left', bbox_to_anchor=(1, 0.5))
    panel.set(title='')


fig.savefig("../plots/Weather_MA_Smoothing.png",dpi=300)

Apart from the fact that we can clearly see that something is off with our wind speed and wind gust data here (the data looks different from May 2021 on, and wind gusts are furthermore very odd between spring 2015 and the end of 2019), there are some very clear seasonal patterns in almost all features.

We can decompose them in further plots.

We start with a yearly cycle:

In [None]:
# Features to decompose; commented-out entries are currently excluded.
elms=['temp',
 'feels_like',
 'pressure',
 'humidity',
 'clouds',
 #'visibility',
 'wind_speed',
 #'wind_gust',
 #'vislies'
 ]

# We use the daily mean instead of the hourly values to keep the plots readable.
# The means are computed once, and only for the selected features: a
# DataFrame-wide agg('mean') raises a TypeError in pandas >= 2.0 because df
# also holds non-numeric columns, and it was recomputed on every iteration.
daily_means = df.groupby('date')[elms].mean()

# Additive decomposition with a period of 365 daily values (yearly cycle).
for e in elms:
    seasonal_decompose(x=daily_means[e],
                        model='additive',
                        two_sided=True,
                        period=365
                        ).plot()

As expected, we can identify clear seasonal patterns for most features.

Then, we inspect daily cycles:

In [None]:
#daily cycle

#since the graph otherwise gets unreadable, it's necessary to only use a subset of data, like one or two months.
p_start='2016-07-01'
p_end='2016-08-01'

#to get a better estimate of the daily cycle, you can enter another timespan here (preferably 6 months earlier or later) that will also be included in the analysis.
# If you want to do this, set comparison = True
comparison = False
p_start2='2016-01-01'
p_end2='2016-02-01'

# Rows inside the primary timespan; the second timespan is only added when
# comparison is enabled. Previously, comparison = False replaced the primary
# timespan with the second one, so p_start / p_end were silently ignored.
in_period = (df['time'] > p_start) & (df['time'] < p_end)
if comparison:
    in_period = in_period | ((df['time'] > p_start2) & (df['time'] < p_end2))

# The selection does not depend on the feature, so it is done once up front.
hourly = df.loc[in_period].reset_index()

# Features to decompose; commented-out entries are currently excluded.
elms=['temp',
 'feels_like',
 'pressure',
 'humidity',
 'clouds',
 #'visibility',
 'wind_speed',
 #'wind_gust',
 #'vislies'
 ]

# Additive decomposition with a period of 24 rows.
# NOTE(review): assumes one observation per hour, so 24 rows span one day - confirm.
for e in elms:
    seasonal_decompose(x=hourly[e],
                        model='additive',
                        two_sided=True,
                        period=24
                        ).plot()

Here we see that although we can discover seasonal patterns in almost all features, some of the residual plots still exhibit recurring patterns. Further analysis needs to clarify where they come from.