In [2]:
import pandas as pd
import numpy as np
import re
import json
import ast
import tweepy
from textblob import TextBlob
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer
import emoji
import yake

In [4]:
activity_watch = pd.read_json("../data/activity_watch_2021_11.json")
#separating the buckets column as that is where the dictionary of relevant data is

activity_watch = activity_watch['buckets'].apply(pd.Series)
activity_watch
#Further splitting the events column to extract the time stamp, duration and data

activity_watch_split = pd.DataFrame(activity_watch['events'][0])
activity_watch_split

Unnamed: 0,timestamp,duration,data
0,2021-11-13T14:49:19.458000+00:00,76.420,"{'app': 'msedge.exe', 'title': 'ActivityWatch ..."
1,2021-11-13T14:49:18.332000+00:00,0.000,"{'app': 'msedge.exe', 'title': 'Watchers and 6..."
2,2021-11-13T14:48:43.893000+00:00,33.322,"{'app': 'msedge.exe', 'title': 'Releases · Act..."
3,2021-11-13T14:48:17.172000+00:00,25.607,"{'app': 'msedge.exe', 'title': 'fix: fixed imp..."
4,2021-11-13T14:48:14.915000+00:00,1.137,"{'app': 'msedge.exe', 'title': 'Releases · Act..."
...,...,...,...
86122,2021-02-27T15:09:10.151000+00:00,4.846,"{'app': 'SearchApp.exe', 'title': 'Search'}"
86123,2021-02-27T15:08:49.183000+00:00,19.718,"{'app': 'msedge.exe', 'title': 'Sponsor @Activ..."
86124,2021-02-27T15:08:24.630000+00:00,23.357,"{'app': 'msedge.exe', 'title': 'ActivityWatch ..."
86125,2021-02-27T15:08:23.428000+00:00,0.000,"{'app': 'msedge.exe', 'title': 'https://github..."


In [5]:
#dropping the data column in a copy of this

activity_watch_split_timestamps = activity_watch_split.drop(columns=["data"])
activity_watch_split_timestamps

Unnamed: 0,timestamp,duration
0,2021-11-13T14:49:19.458000+00:00,76.420
1,2021-11-13T14:49:18.332000+00:00,0.000
2,2021-11-13T14:48:43.893000+00:00,33.322
3,2021-11-13T14:48:17.172000+00:00,25.607
4,2021-11-13T14:48:14.915000+00:00,1.137
...,...,...
86122,2021-02-27T15:09:10.151000+00:00,4.846
86123,2021-02-27T15:08:49.183000+00:00,19.718
86124,2021-02-27T15:08:24.630000+00:00,23.357
86125,2021-02-27T15:08:23.428000+00:00,0.000


In [6]:
#splitting the data column of activity_watch_split, to extract the app and title being used

activity_watch_split_events = activity_watch_split['data'].apply(pd.Series)
activity_watch_split_events

Unnamed: 0,app,title
0,msedge.exe,ActivityWatch and 68 more pages - Personal - M...
1,msedge.exe,Watchers and 68 more pages - Personal - Micros...
2,msedge.exe,Releases · ActivityWatch/activitywatch and 68 ...
3,msedge.exe,"fix: fixed import in main, moved macos permiss..."
4,msedge.exe,Releases · ActivityWatch/activitywatch and 68 ...
...,...,...
86122,SearchApp.exe,Search
86123,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...
86124,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
86125,msedge.exe,https://github.com/ActivityWatch and 22 more p...


In [7]:
#combining the two split activity watch dataframes so that it contains both the timestamp, app, title and duration

activity_watch_final = pd.concat([activity_watch_split_timestamps, activity_watch_split_events], axis=1)
activity_watch_final

Unnamed: 0,timestamp,duration,app,title
0,2021-11-13T14:49:19.458000+00:00,76.420,msedge.exe,ActivityWatch and 68 more pages - Personal - M...
1,2021-11-13T14:49:18.332000+00:00,0.000,msedge.exe,Watchers and 68 more pages - Personal - Micros...
2,2021-11-13T14:48:43.893000+00:00,33.322,msedge.exe,Releases · ActivityWatch/activitywatch and 68 ...
3,2021-11-13T14:48:17.172000+00:00,25.607,msedge.exe,"fix: fixed import in main, moved macos permiss..."
4,2021-11-13T14:48:14.915000+00:00,1.137,msedge.exe,Releases · ActivityWatch/activitywatch and 68 ...
...,...,...,...,...
86122,2021-02-27T15:09:10.151000+00:00,4.846,SearchApp.exe,Search
86123,2021-02-27T15:08:49.183000+00:00,19.718,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...
86124,2021-02-27T15:08:24.630000+00:00,23.357,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
86125,2021-02-27T15:08:23.428000+00:00,0.000,msedge.exe,https://github.com/ActivityWatch and 22 more p...


In [8]:
#checking for nulls in the title column that may not have been labelled as null

activity_watch_nulls = activity_watch_final[activity_watch_final['title'] == ""]
activity_watch_nulls

Unnamed: 0,timestamp,duration,app,title
18,2021-11-13T14:46:23.338000+00:00,0.000,explorer.exe,
20,2021-11-13T14:46:18.901000+00:00,1.113,explorer.exe,
24,2021-11-13T14:46:04.388000+00:00,0.000,explorer.exe,
26,2021-11-13T14:45:54.404000+00:00,4.446,explorer.exe,
30,2021-11-13T14:45:29.754000+00:00,10.089,explorer.exe,
...,...,...,...,...
86052,2021-02-27T15:23:45.428000+00:00,0.100,explorer.exe,
86088,2021-02-27T15:15:53.395000+00:00,0.000,explorer.exe,
86092,2021-02-27T15:15:44.524000+00:00,0.027,explorer.exe,
86096,2021-02-27T15:15:38.558000+00:00,0.042,explorer.exe,


In [10]:
#extracting just the duplicates in the timestamp column

activity_watch_final_duplicates = activity_watch_final[activity_watch_final.duplicated(subset=["timestamp"]) == True]
activity_watch_final_duplicates.duplicated(subset=["timestamp"]).value_counts()

#diving deeper into the duplicates to see if other variables such as "app" and "title" are the same
# if they are, it means that it is one app spawning multiple processes ( or one for each vscode window lol)


#taking only the maximum value duplicate

activity_watch_final = activity_watch_final.sort_values('duration').drop_duplicates('timestamp', keep='last')
activity_watch_final

Unnamed: 0,timestamp,duration,app,title
38163,2021-07-02T17:57:46.667000+00:00,0.000,msedge.exe,support and 15 more pages - Personal - Microso...
46002,2021-05-21T07:11:52.717000+00:00,0.000,msedge.exe,Untitled and 25 more pages - Personal - Micros...
46014,2021-05-21T07:06:24.968000+00:00,0.000,msedge.exe,Gmail and 24 more pages - Personal - Microsoft...
46015,2021-05-21T07:06:23.681000+00:00,0.000,msedge.exe,New tab and 24 more pages - Personal - Microso...
46016,2021-05-21T07:06:22.436000+00:00,0.000,msedge.exe,https://calendly.com and 23 more pages - Perso...
...,...,...,...,...
77339,2021-03-15T04:57:50.661000+00:00,36343.626,unknown,
72485,2021-03-27T02:29:07.006000+00:00,42341.804,LockApp.exe,Windows Default Lock Screen
72738,2021-03-25T23:30:00.056000+00:00,48138.768,LockApp.exe,Windows Default Lock Screen
71314,2021-03-30T23:44:12.699000+00:00,50594.155,LockApp.exe,Windows Default Lock Screen


In [11]:
#renaming duration and timestamp column to reflect the appropriate units

activity_watch_final.rename(columns={"duration":"duration_seconds"}, inplace=True)
activity_watch_final.rename(columns={"timestamp":"timestamp_utc"}, inplace=True)
activity_watch_final

Unnamed: 0,timestamp_utc,duration_seconds,app,title
38163,2021-07-02T17:57:46.667000+00:00,0.000,msedge.exe,support and 15 more pages - Personal - Microso...
46002,2021-05-21T07:11:52.717000+00:00,0.000,msedge.exe,Untitled and 25 more pages - Personal - Micros...
46014,2021-05-21T07:06:24.968000+00:00,0.000,msedge.exe,Gmail and 24 more pages - Personal - Microsoft...
46015,2021-05-21T07:06:23.681000+00:00,0.000,msedge.exe,New tab and 24 more pages - Personal - Microso...
46016,2021-05-21T07:06:22.436000+00:00,0.000,msedge.exe,https://calendly.com and 23 more pages - Perso...
...,...,...,...,...
77339,2021-03-15T04:57:50.661000+00:00,36343.626,unknown,
72485,2021-03-27T02:29:07.006000+00:00,42341.804,LockApp.exe,Windows Default Lock Screen
72738,2021-03-25T23:30:00.056000+00:00,48138.768,LockApp.exe,Windows Default Lock Screen
71314,2021-03-30T23:44:12.699000+00:00,50594.155,LockApp.exe,Windows Default Lock Screen


In [12]:
#checking for numerical anomalies that look too small

# activity_watch_final[activity_watch_final["duration_seconds"] == 0]

### TODO: Write to out to it's own dataset & Create a category for tabs open but never visited

#checking for numerical anomalies that look too big

# activity_watch_final[activity_watch_final["duration_seconds"] > 14].sort_values("duration_seconds", ascending=False)[30:50]

Unnamed: 0,timestamp_utc,duration_seconds,app,title
76763,2021-03-18T00:54:53.119000+00:00,13866.799,LockApp.exe,Windows Default Lock Screen
72050,2021-03-28T20:54:20.495000+00:00,13779.963,unknown,unknown
35652,2021-07-14T08:50:05.965000+00:00,13752.258,msedge.exe,Update on exploring HMO Partnership - oreogund...
77079,2021-03-17T00:12:32.322000+00:00,13736.064,LockApp.exe,Windows Default Lock Screen
82300,2021-03-08T01:26:55.219000+00:00,13505.167,unknown,
37730,2021-07-03T10:36:44.571000+00:00,13231.558,Code.exe,eeg.py - eeg-notebooks - Visual Studio Code
73656,2021-03-23T01:30:36.216000+00:00,12998.759,msedge.exe,"Inbox (3,030) - oreogundipe@gmail.com - Gmail ..."
51980,2021-04-27T18:21:22.732000+00:00,12952.699,msrdc.exe,MININT-IICQLJG.northamerica.corp.microsoft.com...
67867,2021-04-08T17:24:24.674000+00:00,12786.679,msrdc.exe,MININT-IICQLJG.northamerica.corp.microsoft.com...
80797,2021-03-11T04:35:56.409000+00:00,12723.571,CodeSetup-stable-f30a9b73e8ffc278e71575118b6bf...,Setup


In [13]:
#removing rows with LockApp

activity_watch_final.drop(index=activity_watch_final[activity_watch_final['app'] == 'LockApp.exe'].index, inplace=True)

#removing rows with unknown

activity_watch_final.drop(index=activity_watch_final[activity_watch_final['app'] == 'unknown'].index, inplace=True)

In [14]:
#checking for different variations of the same name
activity_watch_final["app"].value_counts()

#sorting the values from the earliest date to the latest date
activity_watch_final.sort_values("timestamp_utc", ascending=True, inplace=True)

#resetting the index values
activity_watch_final.reset_index(drop=True, inplace=True)
activity_watch_final

Unnamed: 0,timestamp_utc,duration_seconds,app,title
0,2021-02-27T15:07:29.345000+00:00,52.882,msedge.exe,ActivityWatch/activitywatch: The best free and...
1,2021-02-27T15:08:23.428000+00:00,0.000,msedge.exe,https://github.com/ActivityWatch and 22 more p...
2,2021-02-27T15:08:24.630000+00:00,23.357,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
3,2021-02-27T15:08:49.183000+00:00,19.718,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...
4,2021-02-27T15:09:10.151000+00:00,4.846,SearchApp.exe,Search
...,...,...,...,...
78667,2021-11-13T14:48:14.915000+00:00,1.137,msedge.exe,Releases · ActivityWatch/activitywatch and 68 ...
78668,2021-11-13T14:48:17.172000+00:00,25.607,msedge.exe,"fix: fixed import in main, moved macos permiss..."
78669,2021-11-13T14:48:43.893000+00:00,33.322,msedge.exe,Releases · ActivityWatch/activitywatch and 68 ...
78670,2021-11-13T14:49:18.332000+00:00,0.000,msedge.exe,Watchers and 68 more pages - Personal - Micros...


Rename activity titles

In [21]:
#For names that include "code" we will be changing it to "VSCode" to avoid double counting

#TODO: change to lamda
for index in range(len(activity_watch_final["app"])):
    if "Code" in activity_watch_final["app"][index]:
        activity_watch_final["app"][index] = "VSCode"

#For names that include "spotify" we will be changing it to "Spotify" to avoid double counting

for index in range(len(activity_watch_final["app"])):
    if "Spotify" in activity_watch_final["app"][index] or "spotify" in activity_watch_final["app"][index]:
        activity_watch_final["app"][index] = "Spotify"

#For titles that are empty we are converting them to null values
for index in range(len(activity_watch_final["title"])):
    if len(activity_watch_final["title"][index]) == 0:
        activity_watch_final["title"][index] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## extracting topics using yake

In [22]:
language = "en"
max_ngram_size = 4
deduplication_threshold = 0.9
numOfKeywords = 5 # the max number of results to expect

#creating a function that removes key words with low value, and then extracting key words for titles with "msedge" as the named app
browsing_yake_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

activity_watch_edge_events = activity_watch_final[activity_watch_final["app"] == "msedge.exe"]

skip_words = ["Personal", "Microsoft", "Edge", "pages", "tab", "slack", "Ọrẹ̀", "Ore", "oreHGA", "April", "Ogundipe", "Timi", "ORE_OGUNDIPE_T4.pdf", "oreHGA", "(Ore Ogundipe)", "Ọrẹ̀ Ògúndípẹ̀", "API" ]

def getkeywords(text):
    #TODO: get rid of stop words before doing extraction
    # - punctuations
    # - "Personal - Microsoft Edge"
    if type(text) is not str:
        return None

    text = text.replace(" - Personal - Microsoft Edge", "")
    text = text.replace("slack", "")
    text = text.replace("Slack", "")
    text = text.replace("Twitter", "")
    text = text.replace("tweets", "")
    text = text.replace("Ore", "")
    text = text.replace("Ọrẹ̀", "")
    text = text.replace("Orehga", "")
    text = text.replace("oreHGA", "")
    text = text.replace("April", "")
    text = text.replace("Ogundipe", "")
    text = text.replace("Timi", "")
    text = text.replace("ORE_OGUNDIPE_T4.pdf", "")
    text = text.replace("(Ore Ogundipe)", "")
    text = text.replace("Ọrẹ̀ Ògúndípẹ̀", "")
    text = text.replace("buraksekili/", "")
    text = text.replace("meet", "")
    text = text.replace("Meet", "")

    yake_keywords = browsing_yake_extractor.extract_keywords(text)

    resulting_keywords = []
    for result in yake_keywords:
        if result[0] in skip_words:
            continue
        # convert all keyword entries to lower case
        # each keyword has a structure of (keyword, confidence)
        resulting_keywords.append((result[0].lower(), result[1]))
    
    return resulting_keywords

activity_watch_edge_events["keywords"] = activity_watch_edge_events["title"].apply(lambda x: getkeywords(x))
activity_watch_edge_events["keywords"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0        [(open-source automated time tracker, 0.002262...
1                                                       []
2                   [(activitywatch, 0.15831692877998726)]
3        [(activitywatch on github sponsors, 0.00377236...
7                        [(untitled, 0.15831692877998726)]
                               ...                        
78667    [(releases, 0.05393656033701752), (activitywat...
78668    [(moved macos permission prompt, 0.00580131487...
78669    [(releases, 0.05393656033701752), (activitywat...
78670                    [(watchers, 0.15831692877998726)]
78671               [(activitywatch, 0.15831692877998726)]
Name: keywords, Length: 48991, dtype: object

we then go ahead to a frequency mapping for the event classifications to pick the most relevant

In [23]:
browser_keyword_freq_dict = {}
for keyword_entries in activity_watch_edge_events["keywords"]:
    if keyword_entries is None:
        continue
    
    # each keyword has a structure of (keyword, confidence)
    # we don't need confidence so we only take keyword
    for single_keyword_entry in keyword_entries:
        keyword = single_keyword_entry[0]
        if browser_keyword_freq_dict.get(keyword) == None:
            browser_keyword_freq_dict[keyword] = 1
        else:
            browser_keyword_freq_dict[keyword] += 1

# the result of browser_keyword_freq_dict has a strcuture of (keyword, appearance_count)
sorted_browser_keyword_freq_dict = dict(sorted(browser_keyword_freq_dict.items(), key=lambda item: item[1], reverse=True))

the result of browser_keyword_freq_dict has a strcuture of (keyword, appearance_count)

now we select a single winner called "context" from the keyword entries

In [26]:
#choosing the most relevant topic out of all the selected topics

def choose_winning_key(keyword_entries):
    """This is chosen as a function of occurence in total data set
    and confidence
    
    use weight ( confidence * keyword frequency in total dataset) to
    decide what is the winning keyword/event classification"""
    if keyword_entries is None or len(keyword_entries) < 1:
        return None
        
    keyword_weights_dict = {}
    # check frequency count for each word in descending order of confidence
    for entry in keyword_entries:
        # essentially weight = confidence * frequency
        keyword_weights_dict[entry[0]] = entry[1] * browser_keyword_freq_dict[entry[0]]

    # the dictionary will be of structure {"event_keyword", "weight"}
    # pick the keyword with the highest weight
    sorted_keyword_weights_dict = dict(sorted(keyword_weights_dict.items(), key=lambda item: item[1], reverse=True))
    
    #TODO:what happens when there's a tie - for now we're just picking the heighest
    return list(sorted_keyword_weights_dict.keys())[0]
    


activity_watch_edge_events["event_classification"] = activity_watch_edge_events["keywords"].apply(choose_winning_key)

activity_watch_edge_events["event_classification"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


tweets                      1895
gmail                       1494
bing                        1382
neurotechx                  1268
ndi                          887
                            ... 
computationally fast,not       1
selective                      1
tanjalo.com                    1
david eagleman                 1
obyezeks                       1
Name: event_classification, Length: 4143, dtype: int64

## Get an event classification for VSCode events

In [27]:
#Removing words from VSCode that gives us little information, and classifying based on hihg value words

activity_watch_events_vscode = activity_watch_final[activity_watch_final["app"] == "VSCode"]

def get_vscode_context(text):
    if type(text) is not str:
        return (None, None)
    # output from this will be "filename", "project"
    # first split string with space " - "
    text = text.replace("Visual Studio Code - Insiders", "")
    text = text.replace("Visual Studio Code", "")
    text = text.replace("(Untracked)", "")
    text = text.replace("(Working Tree)", "")
    text = text.replace("●", "")
    text = text.rstrip(" - ")

    # VSCode has a structure of how the window title is defined (need to validate if this is same on mac)
    # filename - projectname - Visua Studio Code
    text_split = text.split(" - ")
    # sort in descending order
    reverse_text_split_sort = sorted(text_split, key=text_split.index, reverse=True)

    # item 0 will be project name
    # item 1 will be the file name
    project = None
    filename = None

    if len(reverse_text_split_sort) == 1:
        project = reverse_text_split_sort[0]

    if len(reverse_text_split_sort) >= 2:
        project = reverse_text_split_sort[0]
        filename = reverse_text_split_sort[1]

    # return a tuple "(project, filename)"
    return (project, filename)


# calling the column keywords to be consistent with activity watch
activity_watch_events_vscode["keywords"] = activity_watch_events_vscode["title"].apply(get_vscode_context)

#seeing what the data looks like
activity_watch_events_vscode["event_classification"] = activity_watch_events_vscode["keywords"].apply(lambda x: x[0])


combined_aw_edge_vscode =  activity_watch_edge_events.append(activity_watch_events_vscode)
combined_aw_edge_vscode_final = combined_aw_edge_vscode.drop(["timestamp_utc", "app", "duration_seconds", "title"], axis=1)
combined_aw_edge_vscode_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,keywords,event_classification
0,"[(open-source automated time tracker, 0.002262...",activitywatch
1,[],
2,"[(activitywatch, 0.15831692877998726)]",activitywatch
3,"[(activitywatch on github sponsors, 0.00377236...",github sponsors
7,"[(untitled, 0.15831692877998726)]",untitled
...,...,...
78336,"(logger, package.json)",logger
78337,"(logger, .gitignore)",logger
78338,"(, None)",
78339,"(logger, package.json)",logger


## combine vscode & msedge events

In [28]:

activity_watch_dataframe_final = pd.concat([activity_watch_final, combined_aw_edge_vscode_final], axis=1, join="outer")
activity_watch_dataframe_final["event_classification"].value_counts()[30:60]

issue                                208
adaobi                               208
google docs                          201
notion                               194
python                               186
eeg-notebooks developers             181
sharing                              178
search                               173
vauban                               162
report.pdf                           152
famasi                               150
updating                             149
correlations                         148
mne                                  147
collection of classic eeg            140
latest tweets                        139
home                                 133
status                               133
future africa                        130
demystifying brain                   128
eeg notebooks                        126
google.com                           126
akb-intel                            125
general                              123
neurosity-resear

In [30]:
#changing mislaballed topics

def format_label(label):
    if type(label) == str:
        label = label.lower()

    if label == "eeg-notebooks developers":
        label = "eeg-notebooks"
    if label == "calendly":
        label = "planning"
    if label == "google calendar":
        label = "planning"
    if label == "latest tweets":
        label = "tweets"
    if label == "brain-dump":
        label = "braindump"
    if label == "eeg-notebooks project":
        label = "eeg-notebooks"
    if label == "spotify streaming":
        label = "spotify"
    if label == "openai-api":
        label = "openai api"
        
    
    return label

activity_watch_dataframe_final["event_classification"] = activity_watch_dataframe_final["event_classification"].apply(format_label)

In [31]:
#Filling in null values in the classifcation column, with values from the app column

activity_watch_dataframe_final["event_classification"].fillna(activity_watch_dataframe_final["app"], inplace=True)

# replace null labels as "untitled"
def null_label(label):
    if label == "":
        label = "untitled"
    return label
        
activity_watch_dataframe_final["event_classification"] = activity_watch_dataframe_final["event_classification"].apply(null_label)

format and explort activity watch data

In [36]:
activity_watch_dataframe_final[0:10]

Unnamed: 0,timestamp_utc,duration_seconds,app,title,keywords,event_classification
0,2021-02-27T15:07:29.345000+00:00,52.882,msedge.exe,ActivityWatch/activitywatch: The best free and...,"[(open-source automated time tracker, 0.002262...",activitywatch
1,2021-02-27T15:08:23.428000+00:00,0.0,msedge.exe,https://github.com/ActivityWatch and 22 more p...,[],msedge.exe
2,2021-02-27T15:08:24.630000+00:00,23.357,msedge.exe,ActivityWatch and 22 more pages - Personal - M...,"[(activitywatch, 0.15831692877998726)]",activitywatch
3,2021-02-27T15:08:49.183000+00:00,19.718,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...,"[(activitywatch on github sponsors, 0.00377236...",github sponsors
4,2021-02-27T15:09:10.151000+00:00,4.846,SearchApp.exe,Search,,SearchApp.exe
5,2021-02-27T15:09:16.200000+00:00,13.051,explorer.exe,,,explorer.exe
6,2021-02-27T15:09:30.308000+00:00,3.712,aw-qt.exe,aw-qt,,aw-qt.exe
7,2021-02-27T15:09:35.229000+00:00,0.104,msedge.exe,Untitled and 23 more pages - Personal - Micros...,"[(untitled, 0.15831692877998726)]",untitled
8,2021-02-27T15:09:36.444000+00:00,0.089,msedge.exe,localhost:5600 and 23 more pages - Personal - ...,"[(localhost, 0.04491197687864554)]",localhost
9,2021-02-27T15:09:37.642000+00:00,22.687,msedge.exe,ActivityWatch and 23 more pages - Personal - M...,"[(activitywatch, 0.15831692877998726)]",activitywatch


In [37]:
from datetime import datetime, timedelta
# --- initial potential 
#activity_watch_summarized = summary_day, total_time_surfing, what_app_did_user_spend_x_percent_of_time_on (one column per percent) , events_usage (tuple: event_classification (encoded), sum_total_time)
# i realised that this can actually be better derived on the fly with visualizations
# when summarize this much, we lose a lot of granularity

activity_watch_dataframe_final["summary_day"] = activity_watch_dataframe_final['timestamp_utc'].apply(
    lambda timestamp: datetime.fromisoformat(timestamp).strftime("%Y-%m-%d"))

N=10
aw_summary_grouped = activity_watch_dataframe_final.groupby(["summary_day", "app", "event_classification"], as_index=False)
aw_summary_durations = aw_summary_grouped.agg({
                            'duration_seconds': 'sum'
                        })
# aw_summary_durations = aw_summary_durations.apply(lambda g: g.sort_values(by=["duration_seconds"], ascending=False).head(N))

aw_summary_durations


aw_summary_durations['summary_day_end'] = aw_summary_durations['summary_day'].apply(
    lambda value: (datetime.strptime(value, "%Y-%m-%d") + timedelta(days=1))
)

aw_summary_durations.to_csv('aw_data_grouped_nov_21.csv', index=False)