In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib
import cufflinks as cf
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
import seaborn as sns
import random
import plotly.io as pio
import missingno as msno

# Switch cufflinks to offline mode: required to use plotly offline (no account required).
cf.go_offline()
# Initialise plotly's notebook mode so charts are graphed inline (IPython).
py.init_notebook_mode()

## Query and clean data

In [None]:
# Load the raw weather CSV, print its (rows, columns) shape and preview the first rows.
weather = pd.read_csv('weather_NY_2010_2018Nov.csv')
print(weather.shape)
weather.head()

In [None]:
# Summary statistics of the raw data (describe() defaults).
weather.describe()

In [None]:
# Print the column dtypes and non-null counts to spot columns with missing values.
weather.info()

In [None]:
# Percentage of missing values in each column: the mean of the boolean
# "is missing" mask is the missing fraction, scaled here to percent.
weather.isna().mean() * 100

In [None]:
# Order the observations chronologically by their Year / Month / Day columns.
weather = weather.sort_values(by=['Year', 'Month', 'Day'])

In [None]:
# Visualize missing data with a missingno matrix: white gaps mark missing values.
msno.matrix(weather)

Here, we assume that a missing SnowDepth value means there was no snow on that day, rather than a genuinely missing measurement, so missing values are filled with 0.

In [None]:
# Treat a missing SnowDepth as "no snow" and fill it with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# emits a FutureWarning on pandas 2.x and leaves `weather` unchanged once
# Copy-on-Write is enabled.
weather['SnowDepth'] = weather['SnowDepth'].fillna(0)

In [None]:
# Combine the separate Year / Month / Day columns into one datetime column.
date_parts = weather[["Year", "Month", "Day"]]
weather['Date'] = pd.to_datetime(date_parts)

In [None]:
# Persist the cleaned frame (without the index) so the EDA section can reload it from disk.
weather.to_csv('weather_cleaned.csv', index=False)

# EDA

In [None]:
# Reload the cleaned data for analysis.
# CSV stores no dtypes, so Date would come back as plain text; parse_dates
# restores it to datetime64 so grouping and plot axes work on real dates.
df = pd.read_csv('weather_cleaned.csv', parse_dates=['Date'])
df.head()


In [None]:
# List the column names available for the analysis below.
df.columns

The various temperature-derived variables are correlated with one another, as are the wind-speed variables. There is also a moderately strong correlation between the temperature variables and month, as seasonality would dictate, and a strong negative correlation between snow (SnowDepth, SnowIce) and the temperature variables.

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# Non-numeric columns such as Date are excluded up front: pandas >= 2.0 no
# longer drops them silently inside corr() and raises instead. Selecting by
# dtype works on every pandas version.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heatmap of the correlation matrix of the numeric (and boolean) columns.
# Non-numeric columns such as Date must be excluded before corr(): pandas >= 2.0
# raises on them instead of dropping them silently.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr())

# Group and aggregate by variables

In [None]:
# Average every remaining column per Date after dropping the first four columns by position.
# NOTE(review): the positional slice presumably skips identifier/date-part
# columns that should not be averaged - confirm against df.columns.
trailing_columns = df.iloc[:, 4:]
df_agg_date = trailing_columns.groupby('Date').mean()

In [None]:
# Area plot of the per-date mean, minimum and maximum temperature.
# Each entry is (column, legend name, line colour, stackgroup). Every trace
# keeps its own stackgroup, exactly as before, so the three areas are drawn
# independently rather than stacked on top of one another.
temp_traces = [
    ('MeanTemp', 'Mean Temperature', '#003f5c', 'two'),
    ('MinTemp', 'Min Temperature', '#58508d', 'three'),
    ('MaxTemp', 'Max Temperature', '#bc5090', 'one'),
]

temp_fig = go.Figure()
for column, label, colour, group in temp_traces:
    temp_fig.add_trace(go.Scatter(
        x=df_agg_date.index, y=df_agg_date[column],
        hoverinfo='x+y',
        mode='lines',
        line=dict(width=0.5, color=colour), name=label,
        stackgroup=group
    ))

# A figure should stand alone when skimmed: add a title and axis labels.
# update_layout returns the figure, so as the cell's last expression it also
# renders the chart.
temp_fig.update_layout(
    title='Temperature by Date',
    xaxis_title='Date',
    yaxis_title='Temperature',
)

In [None]:
# Area plot of the per-date wind measures.
# Each entry is (column, legend name, stackgroup). Every trace keeps its own
# stackgroup, exactly as before, so the areas are drawn independently rather
# than stacked on top of one another.
wind_traces = [
    ('WindSpeed', 'Wind Speed', 'three'),
    ('MaxSustainedWind', 'Max Sustained Wind', 'two'),
    ('Gust', 'Wind Gust', 'one'),
]

wind_fig = go.Figure()
for column, label, group in wind_traces:
    wind_fig.add_trace(go.Scatter(
        x=df_agg_date.index, y=df_agg_date[column],
        hoverinfo='x+y',
        mode='lines',
        line=dict(width=0.5), name=label,
        stackgroup=group
    ))

# A figure should stand alone when skimmed: add a title and axis labels.
# update_layout returns the figure, so as the cell's last expression it also
# renders the chart.
wind_fig.update_layout(
    title='Wind Speed by Date',
    xaxis_title='Date',
    yaxis_title='Speed',
)

In [None]:
# Standard deviation of each numeric column within every Date.
# Only numeric/boolean columns are aggregated: pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them from the result.
value_columns = df.select_dtypes(include=['number', 'bool']).columns
df_std_date = df.groupby('Date')[value_columns].std()
df_std_date

In [None]:
# Area plot of the standard deviation of wind speed, indexed like df_std_date.
# NOTE: this rebinds `wind_fig`, replacing the wind figure built in the earlier cell.
wind_fig = go.Figure()

wind_fig.add_trace(go.Scatter(
    x=df_std_date.index, y=df_std_date['WindSpeed'],
    hoverinfo='x+y',
    mode='lines',
    line=dict(width=0.5), name='Wind Speed',
    stackgroup='three'
))
# To compare other measures (e.g. 'MaxSustainedWind' or 'Gust', assuming they
# are present in df_std_date), add further traces with x=df_std_date.index.

# Size, title and axis label are set in a single call. update_layout returns the
# figure, so as the cell's last expression it also renders the chart.
wind_fig.update_layout(
    height=600,
    width=600,
    title={
        'text': "Standard Deviation of Wind Speed",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    yaxis=dict(title='Speed'),
)

Wind speed appears to have a higher standard deviation as time passes, whereas wind gust does not show the same pattern. It is hypothesized that, with climate change, more extreme weather patterns will become more frequent.

In [None]:
# Scatter view of the wind-speed standard deviation per date.
# plotly.express is already imported as `px` in the notebook's first cell, so
# it is not re-imported here.
fig = px.scatter(x=df_std_date.index, y=df_std_date['WindSpeed'])
# Label the figure so it can be read on its own.
fig.update_layout(
    title='Standard Deviation of Wind Speed',
    xaxis_title='Date',
    yaxis_title='Speed',
)
fig.show()

In [None]:
# Standard deviation of each numeric column within every Year.
# NOTE: this rebinds df_std_date, replacing the per-Date frame computed earlier.
# Non-numeric columns such as Date are excluded before grouping: pandas >= 2.0
# raises on them in std() instead of silently dropping them.
df_std_date = df.select_dtypes(include=['number', 'bool']).groupby('Year').std()
df_std_date