<img style="float: right;" src="hyperstream.svg">

# HyperStream Tutorial 5: Workflows

- What are workflows?
- 

In [1]:
%load_ext watermark

import sys
sys.path.append("../") # Add parent dir in the Path

from hyperstream import HyperStream
from hyperstream import TimeInterval
from hyperstream.utils import UTC

from datetime import datetime
from utils import plot_multiple_stock
from dateutil.parser import parse

%watermark -v -m -p hyperstream -g

hs = HyperStream(loglevel=20)
print hs

CPython 2.7.6
IPython 5.3.0

hyperstream 0.3.0-beta

compiler   : GCC 4.8.4
system     : Linux
release    : 3.19.0-80-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit
Git hash   : be5062cf8b5fda236fd35a7f80ec4b8c03cc14c9
HyperStream version 0.3.0-beta, connected to mongodb://localhost:27017/hyperstream


In [2]:
def dateparser(dt):
    return parse(dt.replace('M', '-')).replace(tzinfo=UTC)

ti_all = TimeInterval(datetime(1999, 1, 1).replace(tzinfo=UTC),
                      datetime(2013, 1, 1).replace(tzinfo=UTC))
ti_sample = TimeInterval(datetime(2007, 1, 1).replace(tzinfo=UTC),
                         datetime(2007, 3, 1).replace(tzinfo=UTC))

countries = ['USA', 'Asia', 'NZ', 'Australia']
temp_tools_csv = {}
temp_streams = {}
for country in countries:
    temp_tools_csv[country] = hs.plugins.example.tools.csv_reader(
            'data/TimeSeriesDatasets_130207/Temp{}.csv'.format(country),
            header=True, dateparser=dateparser)
    temp_streams[country] = hs.channel_manager.memory.get_or_create_stream(country)
    temp_tools_csv[country].execute(sources=[], sink=temp_streams[country],
                                    interval=ti_all)
    # Print two examples per stream
    print('\n{}: some samples'.format(country))
    for key, value in temp_streams[country].window(ti_sample).items():
        print '[%s]: %s' % (key, value)


USA: some samples
[2007-01-18 00:00:00+00:00]: {'ChicagoMax': 2.1, 'LosAngelesMin': 7.0, 'HoustonMax': 16.5, 'NYMax': 8.0, 'SeattleMax': 7.4, 'SeattleMin': 0.2, 'ChicagoMin': -6.1, 'NYMin': -0.9, 'HoustonMin': 5.7, 'LosAngelesMax': 18.7}
[2007-02-18 00:00:00+00:00]: {'ChicagoMax': -3.3, 'LosAngelesMin': 9.9, 'HoustonMax': 20.4, 'NYMax': 2.9, 'SeattleMax': 10.3, 'SeattleMin': 3.4, 'ChicagoMin': -11.6, 'NYMin': -5.7, 'HoustonMin': 6.3, 'LosAngelesMax': 19.3}

Asia: some samples
[2007-01-18 00:00:00+00:00]: {'NewDelhiMax': 21.7, 'NewDelhiMin': 7.0, 'HongKongMin': 13.3, 'KualaLumpurMin': 23.7, 'TokyoMax': 10.9, 'KualaLumpurMax': 31.8, 'HongKongMax': 19.3, 'BangkokMin': 23.4, 'BangkokMax': 33.4, 'TokyoMin': 4.6}
[2007-02-18 00:00:00+00:00]: {'NewDelhiMax': 24.1, 'NewDelhiMin': 11.9, 'HongKongMin': 17.5, 'KualaLumpurMin': 23.3, 'TokyoMax': 12.8, 'KualaLumpurMax': 32.7, 'HongKongMax': 23.3, 'BangkokMin': 24.3, 'BangkokMax': 34.0, 'TokyoMin': 5.0}

NZ: some samples
[2007-01-18 00:00:00+00:00]

In [3]:
country = 'Australia'
cities = [key for key, value in temp_streams[country].window().items()[0].value.iteritems()]
mapping = {}
for city in cities:
    mapping[city] = city

data = {city:[] for city in cities}
time = []
for key, values in temp_streams[country].window().items():
    time.append(str(key))
    for city, temp in values.iteritems():
        data[city].append(temp)
        
names = data.keys()
data = [value for key, value in data.iteritems()]
        
plot_multiple_stock(data, time=time, names=names, title='Temperatures in {}'.format(country))

In [4]:
from hyperstream import StreamInstance
from hyperstream import StreamId

this_stream = temp_streams[country]

# It is similar to a database channel
A = hs.channel_manager.assets
this_cities = A.get_or_create_stream('cities_{}'.format(country))

A.write_to_stream(stream_id=this_cities.stream_id, data=StreamInstance(ti_all.end, mapping))

this_cities.window(TimeInterval.up_to_now()).items()

[StreamInstance(timestamp=datetime.datetime(2013, 1, 1, 0, 0, tzinfo=<bson.tz_util.FixedOffset object at 0x7f63706e8310>), value={u'BrisbaneMax': u'BrisbaneMax', u'Melbournemax': u'Melbournemax', u'BrisbaneMin': u'BrisbaneMin', u'CanberraMin': u'CanberraMin', u'GoldCoastMax': u'GoldCoastMax', u'MelbourneMin': u'MelbourneMin', u'Canberramax': u'Canberramax', u'GoldCoastMin': u'GoldCoastMin', u'SydneyMin': u'SydneyMin', u'SydneyMax': u'SydneyMax'})]

In [5]:
splitter_tool = hs.tools.splitter_from_stream(element=None, use_mapping_keys_only=False)

In [6]:
for city in this_cities.window(TimeInterval.up_to_now()).last().value:
    hs.plate_manager.meta_data_manager.insert(parent='root', data=city, tag='city', identifier='city_'+city)

KeyError: 'Identifier city_BrisbaneMax already exists in tree'

In [14]:
cities_plate = hs.plate_manager.create_plate(plate_id='C', meta_data_id='city', parent_plate=None, 
                                             values=[], complement=True, description='Cities')
this_country_temps = []
for city in this_cities.window(TimeInterval.up_to_now()).last().value:
    this_country_temps.append(hs.channel_manager.memory.get_or_create_stream(stream_id=StreamId(name='temperature', meta_data=(('city', city),))))

INFO:root:Plate with id C already exists


In [15]:
splitter_tool.execute(source=this_stream, splitting_stream=this_cities, output_plate=cities_plate, 
                      interval=ti_all, input_plate_value=None, sinks=this_country_temps)



In [16]:
this_country_temps[0].window(ti_all).items()



[]