In [1]:
from datetime import datetime
import matplotlib.pyplot as plt
import ujson as json
import math
import os
import re
import statistics
import sys

import pandas as pd

# Make the local EvenTDT and multiplex-plot checkouts importable.
sys.path.append(os.path.expanduser("~/GitHub/EvenTDT"))
sys.path.append(os.path.expanduser("~/GitHub/multiplex-plot"))
# Re-running this cell appends the same entries again, so drop the duplicates.
# `dict.fromkeys` keeps the first occurrence of every entry and therefore the
# original search order; a `set` would shuffle the entries and could change which
# module is picked up when a name exists in more than one location.
sys.path = list(dict.fromkeys(sys.path))

from eventdt import summarization
from eventdt.objects.exportable import Exportable

# Directories and file-name stems of the timelines analysed below.
# The stems are joined with their directory and a `.json` suffix when the files are opened.
eld_dir = os.path.expanduser('~/DATA/analyses/tdt/eld-70/timelines/')
original = [
    '#WALSUI-s3-f20',
    '#TURITA-s3-f20',
    '#SOUARS-s3-f20',
    '#LEIMUN-s10-f5',
    '#SCOCZE-s3-f20',
    '#HUNFRA-s5-f20',
]
# same events and order as `original`, with a `-filtered` suffix
filtered = [
    '#WALSUI-s3-f20-filtered',
    '#TURITA-s3-f20-filtered',
    '#SOUARS-s3-f20-filtered',
    '#LEIMUN-s10-f5-filtered',
    '#SCOCZE-s3-f20-filtered',
    '#HUNFRA-s5-f20-filtered',
]

fuego_dir = os.path.expanduser('~/DATA/analyses/tdt/fuego-15/timelines/')
fuego = [
    '#WALSUI-v10-b0.7-e0.7-grouped',
    '#TURITA-v15-b0.6-e0.6-grouped',
    '#SOUARS-v15-b0.7-e0.7-grouped',
    '#LEIMUN-v30-b0.6-e0.6-grouped',
    '#SCOCZE-v15-b0.7-e0.7-grouped',
    '#HUNFRA-v15-b0.7-e0.7-grouped',
]

In [2]:
%%time
# Gather the tweet counts stored in each original ELD timeline.
# Rows are keyed by the `#XXXXXX` code matched at the start of the file-name stem.
re_event = re.compile('#[A-Z]{6}')
rows = { }
for _original in original:
    path = os.path.join(eld_dir, f"{ _original }.json")
    with open(path, 'r') as file:
        # only the first line of the file is parsed
        data = json.loads(file.readline())
        timeline = Exportable.decode(data)
        counts = {
            'read': timeline['read'],
            'consumed': timeline['consumed'],
            'filtered': timeline['filtered'],
        }
        rows[re_event.findall(_original)[0]] = counts

# one row per event; the `%` columns are fractions of the tweets that were read
eld_df = pd.DataFrame.from_dict(rows, orient="index")
eld_df['consumed%'] = eld_df['consumed'] / eld_df['read']
eld_df['filtered%'] = eld_df['filtered'] / eld_df['read']
eld_df.to_csv('../exports/6.3-eld.csv')
eld_df.head(10)

CPU times: user 6.93 s, sys: 900 ms, total: 7.83 s
Wall time: 7.21 s


Unnamed: 0,read,consumed,filtered,consumed%,filtered%
#WALSUI,87717,87717,67641,1.0,0.771128
#TURITA,109888,109888,81897,1.0,0.745277
#SOUARS,97874,97874,84400,1.0,0.862333
#LEIMUN,209132,209132,178743,1.0,0.85469
#SCOCZE,120194,120194,94811,1.0,0.788816
#HUNFRA,122069,122069,97646,1.0,0.799925


In [3]:
%%time
# Gather the tweet counts stored in each filtered ELD timeline, plus the number of
# filters listed under the timeline's `pcmd` entry.
# NOTE: this rebinds `eld_df`, replacing the frame built from the original timelines.
re_event = re.compile('#[A-Z]{6}')
rows = { }
for _filtered in filtered:
    path = os.path.join(eld_dir, f"{ _filtered }.json")
    with open(path, 'r') as file:
        # only the first line of the file is parsed
        data = json.loads(file.readline())
        timeline = Exportable.decode(data)
        counts = {
            'read': timeline['read'],
            'consumed': timeline['consumed'],
            'filtered': timeline['filtered'],
            'filters': len(timeline['pcmd']['filters']),
        }
        rows[re_event.findall(_filtered)[0]] = counts

# one row per event; the `%` columns are fractions of the tweets that were read
eld_df = pd.DataFrame.from_dict(rows, orient="index")
eld_df['consumed%'] = eld_df['consumed'] / eld_df['read']
eld_df['filtered%'] = eld_df['filtered'] / eld_df['read']
eld_df.to_csv('../exports/6.3-feld.csv')
eld_df.head(10)

CPU times: user 2.72 s, sys: 155 ms, total: 2.87 s
Wall time: 2.89 s


Unnamed: 0,read,consumed,filtered,filters,consumed%,filtered%
#WALSUI,87717,42901,32107,70,0.489084,0.366029
#TURITA,109888,50545,37013,70,0.459968,0.336825
#SOUARS,97874,41207,34869,70,0.421021,0.356264
#LEIMUN,209132,106505,88906,70,0.509272,0.425119
#SCOCZE,120194,61480,48247,70,0.511506,0.401409
#HUNFRA,122069,52152,40315,70,0.427234,0.330264


In [4]:
# Column-wise averages over the events. `eld_df` was last assigned from the
# filtered timelines, so these figures describe the filtered runs.
eld_df.mean(axis=0)

read         124479.000000
consumed      59131.666667
filtered      46909.500000
filters          70.000000
consumed%         0.469681
filtered%         0.369318
dtype: float64

Note that each FUEGO timeline has several similar `pcmd` keys:

- `read`: the number of read tweets from the file (all tweets, unless there's sampling)
- `split.consumed`: the number of tweets sent to each stream; **all** tweets are sent to each stream
- `filter.consumed`: each stream consumer has a filter consumer, and this key describes the number of tweets that pass the stream's filter
- `consumed`: the number of tweets consumed by each FUEGO consumer; should be identical to the `filter.consumed` key
- `filtered`: the number of tweets that pass the FUEGO consumer's filters in each stream

> Note: The `filtered.total` key calculated below is the number of tweets processed by the consumer after its own filtering.
  In other words, the `filtered.total` key represents the number of tweets used to calculate nutrition and burst.
  However, tweets can belong to multiple streams, and thus be processed multiple times.
  Therefore the number of **unique** filtered and processed tweets is smaller than this number.
>  
> In the same way, the `filtered.total%` key is not the percentage of total tweets that are processed.
  However, it can be thought of as how much the consumer simplifies the dataset (in terms of the number of tweets).
  Therefore it still makes sense to report these statistics, especially if we assume that all tweets take the same amount of processing power: a `filtered.total%` of 100% means we processed as many tweets as there are in the dataset, even if we processed just one tweet repeatedly.

In [5]:
# Load the statistics of every FUEGO timeline. `consumed` and `filtered` are kept as
# per-stream collections and summarised into scalar columns further down.
# `re_event` is the pattern compiled in an earlier cell.
rows = { }
streams = None
for _fuego in fuego:
    path = os.path.join(fuego_dir, f"{ _fuego }.json")
    with open(path, 'r') as file:
        # only the first line of the file is parsed
        data = json.loads(file.readline())
        timeline = Exportable.decode(data)

        # every timeline must list the same streams, in the same order
        assert streams is None or streams == timeline['pcmd']['splits']
        streams = timeline['pcmd']['splits']

        # every stream must have been sent all of the tweets that were read
        assert all(timeline['read'] == sent for sent in timeline['split.consumed'])

        stats = {
            'read': timeline['read'],
            'consumed': timeline['consumed'],
            'filtered': timeline['filtered'],
            'splits.n': len(timeline['pcmd']['splits']),
        }
        rows[re_event.findall(_fuego)[0]] = stats

fuego_df = pd.DataFrame.from_dict(rows, orient="index")

# summary of the tweets consumed by the streams of each event
fuego_df['consumed.min'] = fuego_df['consumed'].map(min)
fuego_df['consumed.max'] = fuego_df['consumed'].map(max)
fuego_df['consumed.median'] = fuego_df['consumed'].map(statistics.median)
fuego_df['consumed.avg'] = fuego_df['consumed'].map(statistics.mean)
fuego_df['consumed.total'] = fuego_df['consumed'].map(sum)
fuego_df['consumed.total%'] = fuego_df['consumed.total'] / fuego_df['read']

# summary of the tweets that passed the filters in the streams of each event
fuego_df['filtered.min'] = fuego_df['filtered'].map(min)
fuego_df['filtered.max'] = fuego_df['filtered'].map(max)
fuego_df['filtered.median'] = fuego_df['filtered'].map(statistics.median)
fuego_df['filtered.avg'] = fuego_df['filtered'].map(statistics.mean)
fuego_df['filtered.total'] = fuego_df['filtered'].map(sum)
fuego_df['filtered.total%'] = fuego_df['filtered.total'] / fuego_df['read']

fuego_df.to_csv('../exports/6.3-ueld.csv')
fuego_df.head(10)

Unnamed: 0,read,consumed,filtered,splits.n,consumed.min,consumed.max,consumed.median,consumed.avg,consumed.total,consumed.total%,filtered.min,filtered.max,filtered.median,filtered.avg,filtered.total,filtered.total%
#WALSUI,87717,"[2309, 7852, 10222, 6227, 4961, 2688, 9797, 93...","[998, 816, 3714, 2221, 2276, 788, 2507, 394, 2...",15,78,10222,2688,3862.8,57942,0.660556,47,3714,816,1294.666667,19420,0.221394
#TURITA,109888,"[2963, 6790, 16234, 8323, 4892, 4274, 8820, 14...","[1152, 835, 6836, 3431, 1914, 1588, 2764, 414,...",15,35,16234,4274,4710.866667,70663,0.643046,8,6836,1152,1763.533333,26453,0.240727
#SOUARS,97874,"[1619, 5971, 10808, 7264, 4055, 2159, 12074, 1...","[884, 1214, 5249, 3126, 1552, 892, 4941, 993, ...",15,207,12074,2159,3881.8,58227,0.594918,68,5249,993,1593.133333,23897,0.244161
#LEIMUN,209132,"[13696, 11953, 30929, 16317, 5765, 4112, 32354...","[8275, 2228, 18462, 8969, 2208, 1519, 12728, 3...",15,105,32354,7621,10327.866667,154918,0.740767,71,18462,3791,4936.533333,74048,0.354073
#SCOCZE,120194,"[1828, 5746, 33185, 9191, 5444, 2574, 6831, 47...","[765, 936, 19162, 2715, 2203, 728, 1728, 110, ...",15,79,33185,3185,5408.0,81120,0.674909,34,19162,936,2361.266667,35419,0.294682
#HUNFRA,122069,"[1525, 7272, 15975, 10088, 4664, 3219, 9772, 1...","[578, 933, 7739, 4778, 1904, 1922, 3779, 422, ...",15,105,15975,4664,4861.733333,72926,0.597416,67,7739,1904,2080.333333,31205,0.255634


In [6]:
# Column-wise averages over the events.
# `consumed` and `filtered` hold per-stream lists (object dtype) and cannot be averaged;
# `numeric_only=True` skips them explicitly. Without it, pandas >= 2.0 raises a TypeError
# instead of silently dropping the non-numeric columns.
fuego_df.mean(numeric_only=True)

read               124479.000000
splits.n               15.000000
consumed.min          101.500000
consumed.max        20007.333333
consumed.median      4098.500000
consumed.avg         5508.844444
consumed.total      82632.666667
consumed.total%         0.651935
filtered.min           49.166667
filtered.max        10193.666667
filtered.median      1598.666667
filtered.avg         2338.244444
filtered.total      35073.666667
filtered.total%         0.268445
dtype: float64

Note that by observing the distribution of tweets in each stream, we can immediately draw some conclusions.

- The stream about goals received a quarter of all tweets in the game between Scotland and the Czech Republic after Patrik Schick's wonder goal
- The stream with _utd_ was only active in the match involving Manchester United
- The stream with _champion[s]_ and _league_ was most active in the match between Leicester City and Manchester United, receiving 15% of all tweets with Champions League qualification in play
- Most of the other streams, especially the one about the start and end of each _half_ and _sub[stitutions]_, received a consistent rate of tweets

In [7]:
# Build a stream-by-event table with the share of each event's tweets consumed by every stream.
# Each stream is labelled with its comma-separated tokens.
_streams = [ ', '.join(stream) for stream in streams ]
consumed = { label: { } for label in _streams }
for event, row in fuego_df.iterrows():
    # `row['consumed']` holds one count per stream, in the same order as `streams`
    for label, tweets in zip(_streams, row['consumed']):
        # Fraction of all the tweets read for the event that this stream consumed.
        # Dividing by row['consumed.total'] instead would make each column sum up to 1.
        consumed[label][event] = tweets / row['read']

stream_dict = pd.DataFrame.from_dict(consumed, orient="index")
stream_dict

Unnamed: 0,#WALSUI,#TURITA,#SOUARS,#LEIMUN,#SCOCZE,#HUNFRA
"foul, refere, book, decis, var, given, pen, dive, ref, penalti",0.026323,0.026964,0.016542,0.06549,0.015209,0.012493
"gol, stream, onlin, free, reddit, link, EXCLUDED-manchest, ff, live",0.089515,0.06179,0.061007,0.057155,0.047806,0.059573
"goal, score, conced, equalis, offsid, assist",0.116534,0.147732,0.110428,0.147892,0.276095,0.130869
"need, half, sub, second, lead, 2nd",0.07099,0.075741,0.074218,0.078022,0.076468,0.082642
"keeper, best, goalkeep, defend, EXCLUDED-kepa, save",0.056557,0.044518,0.041431,0.027566,0.045293,0.038208
"deflect, kick, corner, shot, net",0.030644,0.038894,0.022059,0.019662,0.021415,0.02637
"champion, final, leagu, footbal, win",0.111689,0.080264,0.123363,0.154706,0.056833,0.080053
"tackl, dribbl, yellow, red, card",0.010648,0.013095,0.017921,0.026959,0.003927,0.009634
"touch, cross, ball, pass",0.045362,0.057368,0.04843,0.036441,0.026499,0.04223
"world, class, striker",0.022937,0.022195,0.011934,0.013939,0.036133,0.042959
