# WikiRecentPhase1 - 

This sample demonstrates creating a Streams Python application that performs some analytics, and viewing the results.
The data source is a live SSE feed, provided by Wikipedia, that publishes updates as they occur in the Wikipedia universe. 

## Continuous processing with Streams.

The [WikiRecentPhase0](./WikiRecentPhase0.ipynb) notebook illustrated accessing continuous stream events from a notebook. 
Data is collected and averages are calculated only as long as the notebook is open, which is rather inconvenient for the 
workstation user. Running a notebook continuously is feasible but fraught with issues.  Better, have the server stage the events so that they can be rendered or processed on demand. ~~Best, stage the events and do the aggregation.~~
This processes events as long as the browser is open.

## Overview 
**About the sample** 
The application receives Wikipedia updates via an SSE feed that transmits updates as they occur on the Wikipedia site. 
The updates are filtered, rendered and viewed using Pandas.

**How it works**

The Python application created in this notebook is submitted to the IBM Streams service for execution. Once the application is running in the service, you can connect to it from the notebook to retrieve the results.


### Documentation

- [Streams Python development guide](https://ibmstreams.github.io/streamsx.documentation/docs/latest/python/)
- [Streams Python API](https://streamsxtopology.readthedocs.io/)







## Submit the application that has a filter. 

- rough cut look at the data
- plot the 'type' data
- look at the filtered data

##  Collect in buffer / Aggregate
- Last 1000 in local buffer & aggregate
- Render
- Push code to server.

## Links
[Topology](https://streamsxtopology.readthedocs.io/en/latest/index.html) documentation.
[Widgets](https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20Basics.html)

<a name="setup"></a>
# 1. Setup
### 1.1 Add credentials for the IBM Streams service

With the cell below selected, click the "Connect to instance" button in the toolbar to insert the credentials for the service.

<a target="blank" href="https://developer.ibm.com/streamsdev/wp-content/uploads/sites/15/2019/02/connect_icp4d.gif">See an example</a>.

### 1.2 Verify `streamsx` package version

Run the cell below to check which version of the `streamsx` package is installed.  

If you need to upgrade,
- Use `!pip install --user --upgrade streamsx` to upgrade the package. 
- Or, use  `!pip install --user streamsx==somever` to install a specific version of the package. 

In [None]:
# Install components
!pip install --user --upgrade streamsx

In [None]:
# Print the installed streamsx version so it can be checked before continuing.
import streamsx.topology.context
print("streamsx package version: " + streamsx.topology.context.__version__)

## ? Explain this, or just say these are the imports we're using. 

In [None]:
# %load common_imports.py
import pandas as pd

from IPython.core.debugger import set_trace
from IPython.display import display, clear_output

from statistics import mean
from collections import deque
from collections import Counter

import matplotlib.pyplot as plt
import ipywidgets as widgets
%matplotlib inline

from sseclient import SSEClient as EventSource

from ipywidgets import Button, HBox, VBox, Layout

from streamsx.topology.topology import *
import streamsx.rest as rest

import streamsx.topology.context
print("streamsx package version: " + streamsx.topology.context.__version__)

## Support functions for Jupyter 
Make interacting with the Streams data friendlier.

In [None]:
def catchInterrupt(func):
    """Decorator that swallows ``KeyboardInterrupt`` raised by ``func``.

    When a notebook cell is interrupted the rendered display is lost unless
    the interrupt is caught, so the wrapper catches it and returns quietly.

    TODO * <view>.stop_data_fetch()  # stop

    Args:
        func: Callable to wrap.

    Returns:
        A wrapper that forwards all arguments to ``func`` and returns its
        result, or ``None`` when the call was interrupted.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def catch_interrupt(*args, **kwargs):
        try:
            # Propagate the result instead of silently discarding it.
            return func(*args, **kwargs)
        except KeyboardInterrupt:
            return None
    return catch_interrupt


def display_view_stop(eventView, period=2):
    """Show a view through streamsx.rest_primitives.View.display() along with a stop button."""
    def _halt_fetch(clicked):
        # Stop the view's data fetch and relabel the button that was pressed.
        eventView.stop_data_fetch()
        clicked.description = "Stopped"

    stop_button = widgets.Button(description="Stop Updating")
    display(stop_button)
    eventView.display(period=period)
    stop_button.on_click(_halt_fetch)

def view_events(views):
    """
    Offer the given views as a radio-button list and display
    whichever view gets selected.

    """
    names = [entry.name for entry in views]
    views_by_name = {name: entry for name, entry in zip(names, views)}
    chooser = widgets.RadioButtons(
        options=names,
        value=None,
        description='Select view to display',
        disabled=False,
    )

    def _selection_changed(change):
        # React only to notifications for the 'label' trait.
        if change['name'] != 'label':
            return
        clear_output(wait=True)
        # Stop every view's fetch before showing the newly selected one.
        for entry in views:
            entry.stop_data_fetch()
        display(chooser)
        display_view_stop(views_by_name[change['new']], period=2)

    chooser.observe(_selection_changed)
    display(chooser)

def find_job(instance, job_name=None):
    """Return the first job in `instance` matching `job_name`, or None.

    Only the part of each job's ``applicationName`` after the last "::"
    is compared against `job_name`.
    """
    matches = (
        candidate
        for candidate in instance.get_jobs()
        if candidate.applicationName.split("::")[-1] == job_name
    )
    return next(matches, None)
    
def get_view(instance, job_name=None, view_name="view"):
    """Fetch the views called `view_name` from the job named `job_name`.

    Args:
        instance: Streams instance that is searched with ``find_job``.
        job_name: Application name of the job (the part after the last "::").
        view_name: Name passed on to the job's ``get_views``.

    Returns:
        Whatever ``job.get_views(view_name)`` returns; callers in this
        notebook index the result, so it is presumably a list of views.

    Raises:
        LookupError: If no job named `job_name` is found in `instance`.
    """
    job = find_job(instance, job_name)
    if job is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise LookupError("Failed to locate job: {!r}".format(job_name))
    return job.get_views(view_name)
    

def display_views(instance, job_name):
    """Find the job called `job_name` and show a selector for all of its views."""
    located = find_job(instance, job_name=job_name)
    if located is None:
        print("Failed to locate job")
        return
    view_events(located.get_views())
        
def list_cancel_jobs(_instance=None):
    """
    Build a widget for interactively picking jobs to cancel.

    Returns an HBox holding a SelectMultiple list of the instance's jobs,
    labelled "name:health", and a 'Cancel' toggle button. If there are no
    jobs the list is simply empty.

    """
    jobs_by_label = {}
    for job in _instance.get_jobs():
        jobs_by_label["{}:{}".format(job.name, job.health)] = job

    job_picker = widgets.SelectMultiple(
        options=jobs_by_label.keys(),
        value=[],
        rows=len(jobs_by_label),
        description="Select job(s) to cancel",
        layout=Layout(width='60%'),
    )
    cancel_toggle = widgets.ToggleButton(
        value=False,
        description='Cancel',
        disabled=False,
        button_style='warning',  # one of: 'success', 'info', 'warning', 'danger', ''
        tooltip='Delete selected jobs',
        icon="stop",
    )

    def _cancel_selected(change):
        # Cancel every highlighted job, then disable both widgets.
        for label in job_picker.value:
            print("canceling job:", label, jobs_by_label[label].cancel())
        cancel_toggle.disabled = True
        job_picker.disabled = True

    cancel_toggle.observe(_cancel_selected, names='value')

    return HBox([job_picker, cancel_toggle])

## Connect to the server: ICP4D or Cloud instance
Attempt to import `icpd_core`. If the import fails, `cfg` is set to `None` and we know we're using 
the Cloud.

In [None]:
# ICP4D  - injected by 'Connected to instance' menu item
from streamsx.topology import context
try:
    # If icpd_core cannot be imported, the except branch selects the cloud path.
    from icpd_core import icpd_util
    cfg = icpd_util.get_service_instance_details(name='sample-icp1')
    # Switch off SSL certificate verification for this service configuration.
    cfg[context.ConfigParams.SSL_VERIFY] = False
    instance = rest.Instance.of_service(cfg)
    print("Within ICP4D")
except ImportError:
    # cfg = None is the flag later cells test to choose between ICP4D and cloud.
    cfg = None
    print("Outside ICP4D")
# ICP4D


In [None]:
# disable 'InsecureRequestWarning'  - must be put after startup
# Only done on the ICP4D path (cfg is set), where SSL verification was switched off.
if cfg is not None:
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
# NOT ICP4D : cloud access - 
if cfg is None:
    # credential and common are presumably local helper modules kept next to
    # this notebook; credential supplies the service credentials used below.
    import credential   # remove @ ICP4D
    import common
    # TODO * check if instance is up
    # - link up to first cell (can you do a test and execute)
    # Connect to the 'Streaming3Turbine' service with VCAP-style credentials.
    sc = rest.StreamingAnalyticsConnection(service_name='Streaming3Turbine', vcap_services={'streaming-analytics':[{'name':'Streaming3Turbine','credentials':credential.streaming3Turbine}]})
    # Use the first instance the connection reports.
    instance = sc.get_instances()[0]
    # Render the views.....

### List jobs to cancel....
This page will submit a job named 'WikiPhase1'. If it's running you'll want to cancel it before submitting a new version. If it is running and you do not need a new version, there is no need to cancel/submit; you can just proceed to the [Viewing data section](#viewingData).


In [None]:
# Build the job-cancel selector; as the cell's last expression the returned HBox is rendered.
list_cancel_jobs(instance)

# Composing the Streams application
- get data from wiki using SSE
- filter data, separate out the humans and robots
- setup views : allEvents, allHumans, paredHumans, paredAll

## Receive messages updates from Wikipedia
As updates are made to Wikipedia pages the changes are sent over an SSE feed. The get_events() function receives the events and, acting as a [source](https://streamsxtopology.readthedocs.io/en/latest/streamsx.topology.topology.html#streamsx.topology.topology.Topology.source), pushes them onto the Streams stream.

In [None]:
def get_events():
    """Yield recent Wikipedia changes decoded from the wikimedia SSE feed.

    Iterates the ``recentchange`` event stream, skips events whose data is
    empty, and parses each payload as JSON. Payloads that fail to parse are
    reported with ``print`` and dropped so the stream keeps flowing.

    Yields:
        The object decoded from each event's JSON payload.
    """
    # Imported locally: json is not imported by the visible notebook cells,
    # and this way the function carries its own dependency.
    import json

    for change in EventSource('https://stream.wikimedia.org/v2/stream/recentchange'):
        if not change.data:
            continue
        try:
            obj = json.loads(change.data)
        except json.JSONDecodeError as err:
            # json.decoder.JSONDecodeError is the same class, so one handler suffices.
            print("JSON l1 error:", err, "Invalid JSON:", change.data)
        else:
            yield obj


## Filter messages
The [filter](https://streamsxtopology.readthedocs.io/en/latest/streamsx.topology.topology.html#streamsx.topology.topology.Stream.filter) is used to break out messages not generated by robots.

## View into the live stream
The [view](https://streamsxtopology.readthedocs.io/en/latest/streamsx.topology.topology.html#streamsx.topology.topology.Stream.view) enables access to the live stream at runtime. We spread views liberally throughout the application to observe how the processing is proceeding.

<a id='composeBuildSubmit'></a>
## Compose, build and submit the Streams application.

In [None]:
def WikiPhase1(jobName=None):
    """
    Assemble the wiki-events topology and hand it back to the caller.

    The graph reads the SSE feed, splits off the non-bot events, pares
    both streams down to a handful of fields and attaches a view to each
    stage. Returns a dict whose "topo" entry is the composed Topology.

    """
    topology = Topology(name=jobName)
    topology.add_pip_package('sseclient')

    # Source: every wiki event, with a view attached to the raw feed.
    wiki_feed = topology.source(get_events, name="WikiFeed")
    wiki_feed.view(buffer_time=1.0, sample_size=1, name="allEvents", description="All wiki events")

    # Keep only the events whose 'bot' flag is False.
    humans = wiki_feed.filter(lambda evt: evt['bot'] is False, name='humansOnlyFilter')
    humans.view(buffer_time=1.0, sample_size=5, name="allHumans", description="All human events")

    # Reduce the human events to the fields we move around.
    pared_humans = humans.map(
        lambda evt: {
            'timestamp': evt['timestamp'],
            'type': evt['type'],
            'wiki': evt['wiki'],
            'user': evt['user'],
            'title': evt['title'],
        },
        name="paredFields",
    )
    pared_humans.view(buffer_time=1.0, sample_size=200, name="paredHumans", description="Human events pared")

    # Reduce the complete feed the same way, keeping the 'bot' flag as well.
    pared_all = wiki_feed.map(
        lambda evt: {
            'timestamp': evt['timestamp'],
            'type': evt['type'],
            'wiki': evt['wiki'],
            'bot': evt['bot'],
            'user': evt['user'],
            'title': evt['title'],
        },
        name="paredAll",
    )
    pared_all.view(buffer_time=1.0, sample_size=200, name="paredAll", description="All events pared")

    return {"topo": topology}

## Submitting job : ICP or Cloud

In [None]:
#cfg=icpd_util.get_service_instance_details(name='sample-icp1')
# ICP4D path: runs only when the connection cell populated cfg.
if cfg is not None:
    resp = WikiPhase1(jobName="WikiPhase1")
    # Disable SSL certificate verification if necessary
    cfg[context.ConfigParams.SSL_VERIFY] = False

    # Submit the composed topology with the "DISTRIBUTED" context type.
    submission_result = context.submit("DISTRIBUTED",
                                   resp['topo'], 
                                   config=cfg)
    # The submission_result object contains information about the running application, or job
    if submission_result.job:
        print("JobId: ", submission_result['id'] , "Name: ", submission_result['name'])


In [None]:
# Cloud path: runs only when the ICP4D import failed and cfg was set to None.
if cfg is None:
    resp = WikiPhase1(jobName="WikiPhase1")
    # common.submitProcess is a helper that is not shown here; presumably
    # cancel=True cancels an existing job of the same name first - TODO confirm.
    submitStatus = common.submitProcess(topology=resp['topo'],
                                        streamsService="Streaming3Turbine",
                                        buildType="DISTRIBUTED",
                                        serviceType="STREAMING_ANALYTIC",
                                        jobName="WikiPhase1",
                                        cancel=True)

## What does the application look like.....

Graph of the running application. Since we're using live data, the values on the edges between the nodes
will be different, but the nodes should be the same. Notice that the count of tuples on the input port of botFilter is 
different from the count on the output port: only tuples that match the criteria appear on the output port.

**TO Decide**

The graph should stay but not telling how to access the console has been squashed, do
not show things that are not 'blessed'.

I can make this into a multipart gif, which gives a better idea of what is going on. But not having a console that they can go to in order to see the live graph may not be a good idea.

![graph of application](images/stillPhase1.jpg)

![stillPhase1.jpg](attachment:stillPhase1.jpg)

<a id='viewingData'></a>
## Viewing data 

The running application has a number of views to see what data is moving through the stream. The following 
cell will fetch each view's queue and display its data when selected. 

|view name | description of data is the view |
|---------|-------------|
|allEvents  | all fields of all events  |
|allHumans | all fields of events where field 'bot' is **False** |
|paredAll | subset of fields for all events |
|paredHumans | subset of fields of all events where field 'bot' is **False**|

You will want to stop fetching the view data when done.



In [None]:
# Render the views: show a selector for every view of the 'WikiPhase1' job.
display_views(instance, job_name="WikiPhase1")

## View notes:
- constructing the view object : https://streamsxtopology.readthedocs.io/en/latest/streamsx.topology.topology.html?highlight=view#streamsx.topology.topology.Stream.view
- methods on the view object : https://streamsxtopology.readthedocs.io/en/latest/streamsx.topology.topology.html?highlight=view#streamsx.topology.topology.View

## Graph frequency of 'type' events and 'bots'

Show bot versus non-bot activity in relation to event type. 


In [None]:
# tally the bots/types
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import ipywidgets as widgets
%matplotlib inline
from collections import Counter

@catchInterrupt
def tally_bot(view, ele_window=20):
    """Continuously plot bot vs. non-bot event counts per event type.

    Each pass fetches a batch of tuples from `view`, counts them by 'type'
    separately for bot and non-bot events, and redraws a stacked bar chart.
    The loop runs until it is interrupted.

    Args:
        view: View whose tuples carry 'bot' and 'type' fields.
        ele_window: Upper limit of the chart's y axis.
    """
    nam_list = ['new', 'edit', 'log', 'categorize']
    view.start_data_fetch()
    try:
        while True:
            listTuples = view.fetch_tuples(max_tuples=20, timeout=4)
            # Fresh counters per batch; types outside nam_list are counted
            # but never plotted.
            cntbot = Counter()
            cntnobot = Counter()
            for evt in listTuples:
                if evt['bot']:
                    cntbot[evt['type']] += 1
                else:
                    cntnobot[evt['type']] += 1
            bot_list = [cntbot[key] for key in nam_list]
            nobot_list = [cntnobot[key] for key in nam_list]

            df = pd.DataFrame({'bot': bot_list, ' nobot': nobot_list}, index=nam_list)
            df.plot.bar(rot=0, stacked=True)
            plt.ylim(0.0, ele_window)
            plt.show()
            clear_output(wait=True)
    finally:
        # Runs on KeyboardInterrupt too, so the fetch started above is
        # always stopped when the loop ends.
        view.stop_data_fetch()

# Look up the "paredAll" views of the job and plot from the first one until the cell is interrupted.
view = get_view(instance, job_name="WikiPhase1", view_name="paredAll")
tally_bot(view=view[0])



## Graph frequency of *bot*less 'type'  : Avg vs instant

Display the last set of counts and the average of the last 20 sets.

Offload the aggregating of the data to the server. We can connect to the server at
any time to view the flow of event types. Currently the window is set to 200, which translates to
a window of approximately 20 seconds. Setting the value to 1200 will give 120 seconds of events.

In [None]:
# Aggregate collections of rows - support

class chunking_average:
    """Running mean of per-key counts over the most recent chunks.

    Args:
        init_base: Mapping whose keys name the series to track.
        mean_elements: Number of most recent chunks averaged per key.
    """

    def __init__(self, init_base, mean_elements=20):
        # Start empty so the average reflects only chunks actually seen,
        # not a phantom leading zero.
        self.deques = {key: deque(maxlen=mean_elements) for key in init_base.keys()}

    def aggregate(self, chunk):
        """Record `chunk` and return the current mean for every tracked key.

        Args:
            chunk: Mapping of key to count for the latest chunk. A key that
                is missing, or whose value is falsy, is recorded as 0 so
                quiet chunks pull the average down as they should.

        Returns:
            Dict mapping each tracked key to the mean of its recorded
            values; 0 for a key with no recorded values.
        """
        for key, window in self.deques.items():
            window.append(chunk.get(key) or 0)
        # An empty window only occurs when mean_elements is 0.
        return {key: (mean(window) if window else 0)
                for key, window in self.deques.items()}

In [None]:
# tally the types

@catchInterrupt
def tally_types(view, ele_window=20):
    """Continuously plot per-type event counts next to their running average.

    Each pass fetches a batch of tuples from `view`, counts them by 'type',
    feeds the counts to a ``chunking_average`` and redraws a bar chart of
    the latest counts beside the running averages. The loop runs until it
    is interrupted.

    Args:
        view: View whose tuples carry a 'type' field.
        ele_window: Upper limit of the chart's y axis.
    """
    nam_list = ['new', 'edit', 'log', 'categorize']
    run_avg = chunking_average({key: 0 for key in nam_list})
    view.start_data_fetch()
    try:
        while True:
            listTuples = view.fetch_tuples(max_tuples=20, timeout=3)
            cnt = Counter({key: 0 for key in nam_list})
            for evt in listTuples:
                cnt[evt['type']] += 1
            avg = run_avg.aggregate(cnt)
            evt_list = [cnt[key] for key in nam_list]
            avg_list = [avg[key] for key in nam_list]
            df = pd.DataFrame({'count': evt_list, 'running avg': avg_list}, index=nam_list)
            df.plot.bar(rot=0)
            plt.ylim(0.0, ele_window)
            plt.show()
            clear_output(wait=True)
    finally:
        # Runs on KeyboardInterrupt too, so the fetch started above is
        # always stopped when the loop ends.
        view.stop_data_fetch()
# Look up the "paredHumans" views of the job and plot from the first one until the cell is interrupted.
view = get_view(instance, job_name="WikiPhase1", view_name="paredHumans")
tally_types(view=view[0])