In [1]:
import bokeh.io
# Imported here only for completeness, to make clear where the functions
# called below live (you probably already imported this earlier).


# Clear any previously configured bokeh output mode first, then configure
# bokeh so that show() renders figures inline in the notebook cells.
bokeh.io.reset_output()
bokeh.io.output_notebook()

In [2]:
import numpy as np
import pandas as pd
import tigramite
from tigramite import data_processing as pp
from tigramite import plotting as tp
from tigramite.pcmci import PCMCI
from tigramite.independence_tests import ParCorr, GPDC, CMIknn, CMIsymb
import time
from bokeh.plotting import figure, show, output_file
import bokeh
from causal_analysis.discovery import load_data

# EDA to explore causal hypotheses

Load the intermediate data from disk.

In [3]:
# Load the processed CSV through the project helper `load_data`.
# NOTE(review): the helper's return type is not visible in this notebook;
# the cells below use the result like a pandas DataFrame -- confirm.
sensor_data = load_data("../data/processed/causalDiscoveryData.csv")

In [4]:
# Preview the first rows to check which columns were loaded.
sensor_data.head()

Unnamed: 0,location,timestamp,lat,lon,dayOfYear,minuteOfDay,dayOfWeek,isWeekend,pressure_1,pressure_sealevel,...,precip_intensity,precip_probability,precip_type,pressure,uv_index,visibility,wind_bearing,wind_gust,wind_speed,minuteOfYear
405987,123,2017-01-12 14:00:00,52.54,13.386,12,840,5,0,,,...,0.0,0.0,,,0.0,10.01,224.0,5.55,4.42,1560
405988,123,2017-01-12 15:00:00,52.54,13.386,12,900,5,0,,,...,0.0,0.0,,,0.0,10.01,226.0,6.04,4.55,1620
407232,123,2017-01-12 16:00:00,52.54,13.386,12,960,5,0,,,...,0.0,0.0,,,0.0,10.01,211.0,5.14,4.12,1680
403857,123,2017-01-12 17:00:00,52.54,13.386,12,1020,5,0,,,...,0.0,0.0,,,0.0,10.01,214.0,5.53,3.89,1740
406241,123,2017-01-12 18:00:00,52.54,13.386,12,1080,5,0,,,...,0.0,0.0,,,0.0,10.01,211.0,4.91,3.9,1800


Subset the data to a single, arbitrarily chosen day (day 153 of the year) to investigate patterns.

In [5]:
# Keep only the rows whose dayOfYear equals 153.
subset = sensor_data.loc[sensor_data["dayOfYear"].eq(153)]

In [6]:
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap

# Scatter plot of temperature against the minute of the day for the
# selected subset; marker colour is mapped from the dayOfYear column.
p = figure(
    title="Time of Day vs. Temperature",
    output_backend="webgl",
)
mapper = linear_cmap(
    field_name="dayOfYear",
    palette=Spectral6,
    low=min(subset["dayOfYear"]),
    high=max(subset["dayOfYear"]),
)
p.circle(
    x="minuteOfDay",
    y="temperature",
    source=subset,
    color=mapper,
    fill_alpha=0.2,
    size=10,
)
show(p)

As we can see, the data shows a seasonal (daily) pattern, with higher temperatures during the day and lower ones at night.

In [7]:
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap

# Scatter plot of the humidity_sensor column against the minute of the day
# for the selected subset; marker colour is mapped from the dayOfYear column.
p = figure(
    title="Time of Day vs. Humidity on Sensor",
    output_backend="webgl",
)
mapper = linear_cmap(
    field_name="dayOfYear",
    palette=Spectral6,
    low=min(subset["dayOfYear"]),
    high=max(subset["dayOfYear"]),
)
p.circle(
    x="minuteOfDay",
    y="humidity_sensor",
    source=subset,
    color=mapper,
    fill_alpha=0.2,
    size=10,
)
show(p)

Humidity shows a seasonal (daily) pattern that is roughly the inverse of the one for temperature.

In [8]:
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap

# Scatter plot of the humidity column against the minute of the day for
# the selected subset; marker colour is mapped from the dayOfYear column.
p = figure(
    title="Time of Day vs. Humidity on Weather Station",
    output_backend="webgl",
)
mapper = linear_cmap(
    field_name="dayOfYear",
    palette=Spectral6,
    low=min(subset["dayOfYear"]),
    high=max(subset["dayOfYear"]),
)
p.circle(
    x="minuteOfDay",
    y="humidity",
    source=subset,
    color=mapper,
    fill_alpha=0.2,
    size=10,
)
show(p)

In this plot, the pattern shows even more clearly.

In [None]:
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap

# Scatter plot of precipitation probability against humidity.
# Only the first 1000 rows are plotted (presumably to keep rendering light).
# Positional slicing (.iloc) is used on purpose: .loc[:1000] slices by index
# *label*, which only means "the first rows" when the frame carries a default
# 0..n-1 index, and can raise KeyError or return an arbitrary span otherwise.
p = figure(
    title="Humidity vs. Precipitation Probability",
    output_backend="webgl",
)
p.circle(
    x="humidity",
    y="precip_probability",
    source=sensor_data.iloc[:1000],
    fill_alpha=0.2,
    size=10,
)
show(p)

High humidity appears to be associated with a higher precipitation probability; for lower humidity values, the probability stays around 0.

In [None]:
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap

# Scatter plot of the p1 column against the humidity_sensor column.
# Only the first 1000 rows are plotted (presumably to keep rendering light).
# Positional slicing (.iloc) is used on purpose: .loc[:1000] slices by index
# *label*, which only means "the first rows" when the frame carries a default
# 0..n-1 index, and can raise KeyError or return an arbitrary span otherwise.
p = figure(
    title="Humidity vs. P10 Pollution",
    output_backend="webgl",
)
p.circle(
    x="humidity_sensor",
    y="p1",
    source=sensor_data.iloc[:1000],
    fill_alpha=0.2,
    size=10,
)
show(p)

There appears to be little relation between humidity and pollution, even in the region of >70% humidity.