In [1]:
import pandas as pd
import numpy as np
import re
import json
import ast
import tweepy
from textblob import TextBlob
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer
import emoji
import yake
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

As we are still unsure of the data we want, and are subject to changing our minds, I will be dropping irrelevant columns at the very end. This will allow us to quickly re-add columns in case we do change our minds.

In [4]:
#reading in activity watch data

activity_watch = pd.read_json("data/activity_watch_2021_08_04.json")

activity_watch

Unnamed: 0,buckets
aw-watcher-window_DESKTOP-TOACMEE,"{'id': 'aw-watcher-window_DESKTOP-TOACMEE', 'c..."


In [5]:
#separating the buckets column as that is where the dictionary of relevant data is

activity_watch = activity_watch['buckets'].apply(pd.Series)
activity_watch

Unnamed: 0,id,created,name,type,client,hostname,events
aw-watcher-window_DESKTOP-TOACMEE,aw-watcher-window_DESKTOP-TOACMEE,2021-02-27T07:07:20.499312+00:00,,currentwindow,aw-watcher-window,DESKTOP-TOACMEE,[{'timestamp': '2021-08-04T15:32:07.636000+00:...


In [6]:
#Further splitting the events column to extract the time stamp, duration and data

activity_watch_split = pd.DataFrame(activity_watch['events'][0])
activity_watch_split

Unnamed: 0,timestamp,duration,data
0,2021-08-04T15:32:07.636000+00:00,10.968,"{'app': 'msedge.exe', 'title': 'ActivityWatch ..."
1,2021-08-04T15:31:54.423000+00:00,11.985,"{'app': 'msedge.exe', 'title': 'ActivityWatch ..."
2,2021-08-04T15:31:41.178000+00:00,12.019,"{'app': 'msedge.exe', 'title': 'ActivityWatch ..."
3,2021-08-04T15:31:19.433000+00:00,20.519,"{'app': 'msedge.exe', 'title': 'ActivityWatch ..."
4,2021-08-04T15:31:18.201000+00:00,0.000,"{'app': 'msedge.exe', 'title': 'Untitled and 2..."
...,...,...,...
57185,2021-02-27T15:09:10.151000+00:00,4.846,"{'app': 'SearchApp.exe', 'title': 'Search'}"
57186,2021-02-27T15:08:49.183000+00:00,19.718,"{'app': 'msedge.exe', 'title': 'Sponsor @Activ..."
57187,2021-02-27T15:08:24.630000+00:00,23.357,"{'app': 'msedge.exe', 'title': 'ActivityWatch ..."
57188,2021-02-27T15:08:23.428000+00:00,0.000,"{'app': 'msedge.exe', 'title': 'https://github..."


In [7]:
#dropping the data column in a copy of this

activity_watch_split_1 = activity_watch_split.drop(columns=["data"])
activity_watch_split_1

Unnamed: 0,timestamp,duration
0,2021-08-04T15:32:07.636000+00:00,10.968
1,2021-08-04T15:31:54.423000+00:00,11.985
2,2021-08-04T15:31:41.178000+00:00,12.019
3,2021-08-04T15:31:19.433000+00:00,20.519
4,2021-08-04T15:31:18.201000+00:00,0.000
...,...,...
57185,2021-02-27T15:09:10.151000+00:00,4.846
57186,2021-02-27T15:08:49.183000+00:00,19.718
57187,2021-02-27T15:08:24.630000+00:00,23.357
57188,2021-02-27T15:08:23.428000+00:00,0.000


In [8]:
#splitting the data column of activity_watch_split, to extract the app and title being used

activity_watch_split_2 = activity_watch_split['data'].apply(pd.Series)
activity_watch_split_2

Unnamed: 0,app,title
0,msedge.exe,ActivityWatch and 24 more pages - Personal - M...
1,msedge.exe,ActivityWatch and 23 more pages - Personal - M...
2,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
3,msedge.exe,ActivityWatch and 21 more pages - Personal - M...
4,msedge.exe,Untitled and 21 more pages - Personal - Micros...
...,...,...
57185,SearchApp.exe,Search
57186,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...
57187,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
57188,msedge.exe,https://github.com/ActivityWatch and 22 more p...


In [9]:
#combining the two split activity watch dataframes so that it contains both the timestamp, app, title and duration

activity_watch_final = pd.concat([activity_watch_split_1, activity_watch_split_2], axis=1)
activity_watch_final

Unnamed: 0,timestamp,duration,app,title
0,2021-08-04T15:32:07.636000+00:00,10.968,msedge.exe,ActivityWatch and 24 more pages - Personal - M...
1,2021-08-04T15:31:54.423000+00:00,11.985,msedge.exe,ActivityWatch and 23 more pages - Personal - M...
2,2021-08-04T15:31:41.178000+00:00,12.019,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
3,2021-08-04T15:31:19.433000+00:00,20.519,msedge.exe,ActivityWatch and 21 more pages - Personal - M...
4,2021-08-04T15:31:18.201000+00:00,0.000,msedge.exe,Untitled and 21 more pages - Personal - Micros...
...,...,...,...,...
57185,2021-02-27T15:09:10.151000+00:00,4.846,SearchApp.exe,Search
57186,2021-02-27T15:08:49.183000+00:00,19.718,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...
57187,2021-02-27T15:08:24.630000+00:00,23.357,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
57188,2021-02-27T15:08:23.428000+00:00,0.000,msedge.exe,https://github.com/ActivityWatch and 22 more p...


Here I will be checking for any duplicates in the timestamp column, and any anomalies in the duration column. I will also be going over any nulls in the app and title column.

In [12]:
#checking for nulls in the title column that may not have been labelled as null

activity_watch_nulls = activity_watch_final[activity_watch_final['title'] == ""]
activity_watch_nulls


Unnamed: 0,timestamp,duration,app,title
5,2021-08-04T15:31:16.893000+00:00,0.000,explorer.exe,
7,2021-08-04T15:31:09.770000+00:00,3.548,explorer.exe,
16,2021-08-04T15:30:23.346000+00:00,0.000,explorer.exe,
28,2021-08-04T15:28:04.976000+00:00,0.000,explorer.exe,
48,2021-08-04T15:24:02.825000+00:00,6.236,explorer.exe,
...,...,...,...,...
57115,2021-02-27T15:23:45.428000+00:00,0.100,explorer.exe,
57151,2021-02-27T15:15:53.395000+00:00,0.000,explorer.exe,
57155,2021-02-27T15:15:44.524000+00:00,0.027,explorer.exe,
57159,2021-02-27T15:15:38.558000+00:00,0.042,explorer.exe,


In [13]:
#checking if these null columns have any useful value in the app column

activity_watch_nulls["app"].value_counts()

explorer.exe                                                      3060
unknown                                                            982
msedge.exe                                                         286
msrdc.exe                                                           79
EXCEL.EXE                                                           39
CredentialUIBroker.exe                                              21
LockApp.exe                                                         15
SystemSettingsAdminFlows.exe                                        10
dwm.exe                                                             10
WINWORD.EXE                                                          7
Code - Insiders.exe                                                  6
Skype.exe                                                            5
Adobe Premiere Rush.exe                                              5
Code.exe                                                             5
Screen

The majority of it is unlabelled browsing on Windows File Explorer. We have two options:

Option 1) Drop all these empty columns
Option 2) Keep these empty columns

This will depend on how much information the original "app" column gives us, so I will be checking that first.

In [14]:
activity_watch_final["app"].value_counts()[0:15]

msedge.exe                  32774
explorer.exe                 6408
Code.exe                     4437
unknown                      3268
msrdc.exe                    2322
Code - Insiders.exe          1342
LockApp.exe                  1179
WindowsTerminal.exe           710
notepad.exe                   692
ShellExperienceHost.exe       519
msrdcw.exe                    489
ApplicationFrameHost.exe      441
Spotify.exe                   418
EXCEL.EXE                     334
SearchApp.exe                 292
Name: app, dtype: int64

Given that the majority of apps are msedge and explorer, and the null "app" values represent 9% and 16% of the original "app" column, it is best that we keep this data, as they are statistically significant.

Now we will be checking for duplicates in the timestamp column.

In [15]:
#extracting just the duplicates in the timestamp column

activity_watch_final_duplicates = activity_watch_final[activity_watch_final.duplicated(subset=["timestamp"]) == True]
activity_watch_final_duplicates.duplicated(subset=["timestamp"]).value_counts()



False    437
True      57
dtype: int64

In [16]:
#diving deeper into the duplicates to see if other variables such as "app" and "title" are the same
# if they are, it means that it is one app spawning multiple processes ( or one for each vscode window lol)

activity_watch_final_duplicates[30:51]

Unnamed: 0,timestamp,duration,app,title
49828,2021-03-14T14:20:57.804000+00:00,311.013,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...
49829,2021-03-14T14:20:57.804000+00:00,300.541,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...
49830,2021-03-14T14:20:57.804000+00:00,290.18,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...
49831,2021-03-14T14:20:57.804000+00:00,279.835,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...
49832,2021-03-14T14:20:57.804000+00:00,269.534,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...
49833,2021-03-14T14:20:57.804000+00:00,259.147,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...
49834,2021-03-14T14:20:57.804000+00:00,248.822,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...
49835,2021-03-14T14:20:57.804000+00:00,238.499,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...
49836,2021-03-14T14:20:57.804000+00:00,228.142,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...
49837,2021-03-14T14:20:57.804000+00:00,217.765,Code - Insiders.exe,● scratch.ipynb - braindump - Visual Studio Co...


In [17]:
#taking only the maximum value duplicate

activity_watch_final = activity_watch_final.sort_values('duration').drop_duplicates('timestamp', keep='last')
activity_watch_final

Unnamed: 0,timestamp,duration,app,title
21489,2021-05-05T13:09:08.089000+00:00,0.000,msrdc.exe,Remote Desktop
27528,2021-04-22T06:58:18.030000+00:00,0.000,msedge.exe,[WIP] Decoding tutorial by hubertjb · Pull Req...
27527,2021-04-22T06:58:19.195000+00:00,0.000,msedge.exe,[WIP] Decoding tutorial by hubertjb · Pull Req...
53307,2021-03-08T05:20:25.275000+00:00,0.000,msedge.exe,Discrete Fourier Transform (numpy.fft) — NumPy...
27526,2021-04-22T06:58:20.392000+00:00,0.000,explorer.exe,
...,...,...,...,...
48402,2021-03-15T04:57:50.661000+00:00,36343.626,unknown,
43548,2021-03-27T02:29:07.006000+00:00,42341.804,LockApp.exe,Windows Default Lock Screen
43801,2021-03-25T23:30:00.056000+00:00,48138.768,LockApp.exe,Windows Default Lock Screen
42377,2021-03-30T23:44:12.699000+00:00,50594.155,LockApp.exe,Windows Default Lock Screen


In [18]:
#renaming duration and timestamp column to reflect the appropriate units

activity_watch_final.rename(columns={"duration":"duration_seconds"}, inplace=True)
activity_watch_final.rename(columns={"timestamp":"timestamp_utc"}, inplace=True)
activity_watch_final


Unnamed: 0,timestamp_utc,duration_seconds,app,title
21489,2021-05-05T13:09:08.089000+00:00,0.000,msrdc.exe,Remote Desktop
27528,2021-04-22T06:58:18.030000+00:00,0.000,msedge.exe,[WIP] Decoding tutorial by hubertjb · Pull Req...
27527,2021-04-22T06:58:19.195000+00:00,0.000,msedge.exe,[WIP] Decoding tutorial by hubertjb · Pull Req...
53307,2021-03-08T05:20:25.275000+00:00,0.000,msedge.exe,Discrete Fourier Transform (numpy.fft) — NumPy...
27526,2021-04-22T06:58:20.392000+00:00,0.000,explorer.exe,
...,...,...,...,...
48402,2021-03-15T04:57:50.661000+00:00,36343.626,unknown,
43548,2021-03-27T02:29:07.006000+00:00,42341.804,LockApp.exe,Windows Default Lock Screen
43801,2021-03-25T23:30:00.056000+00:00,48138.768,LockApp.exe,Windows Default Lock Screen
42377,2021-03-30T23:44:12.699000+00:00,50594.155,LockApp.exe,Windows Default Lock Screen


In [19]:
#checking for numerical anomalies

activity_watch_final["duration_seconds"].describe()


count    56696.000000
mean        74.072117
std        757.938974
min          0.000000
25%          0.000000
50%          3.497000
75%         14.809250
max      53778.259000
Name: duration_seconds, dtype: float64

I think we should remove titles with a duration of 0 seconds.

In [21]:
#checking for numerical anomalies that look too small

activity_watch_final[activity_watch_final["duration_seconds"] == 0]

### TODO:I just realised this could be a caretgory for tabs that I opened but never visited


Unnamed: 0,timestamp_utc,duration_seconds,app,title
21489,2021-05-05T13:09:08.089000+00:00,0.0,msrdc.exe,Remote Desktop
27528,2021-04-22T06:58:18.030000+00:00,0.0,msedge.exe,[WIP] Decoding tutorial by hubertjb · Pull Req...
27527,2021-04-22T06:58:19.195000+00:00,0.0,msedge.exe,[WIP] Decoding tutorial by hubertjb · Pull Req...
53307,2021-03-08T05:20:25.275000+00:00,0.0,msedge.exe,Discrete Fourier Transform (numpy.fft) — NumPy...
27526,2021-04-22T06:58:20.392000+00:00,0.0,explorer.exe,
...,...,...,...,...
46446,2021-03-20T14:37:35.077000+00:00,0.0,unknown,unknown
45616,2021-03-21T19:09:21.752000+00:00,0.0,msedge.exe,(2) Twitter and 27 more pages - Personal - Mic...
7688,2021-07-08T06:49:25.925000+00:00,0.0,msedge.exe,https://compneuro.neuromatch.io/tutorials/W0D0...
45961,2021-03-21T15:45:58.873000+00:00,0.0,msedge.exe,https://services.securekeyconcierge.com/cbs/sa...


In [22]:
#checking for numerical anomalies that look too big

activity_watch_final[activity_watch_final["duration_seconds"] > 14].sort_values("duration_seconds", ascending=False)[30:50]

Unnamed: 0,timestamp_utc,duration_seconds,app,title
48142,2021-03-17T00:12:32.322000+00:00,13736.064,LockApp.exe,Windows Default Lock Screen
53363,2021-03-08T01:26:55.219000+00:00,13505.167,unknown,
8793,2021-07-03T10:36:44.571000+00:00,13231.558,Code.exe,eeg.py - eeg-notebooks - Visual Studio Code
44719,2021-03-23T01:30:36.216000+00:00,12998.759,msedge.exe,"Inbox (3,030) - oreogundipe@gmail.com - Gmail ..."
23043,2021-04-27T18:21:22.732000+00:00,12952.699,msrdc.exe,MININT-IICQLJG.northamerica.corp.microsoft.com...
38930,2021-04-08T17:24:24.674000+00:00,12786.679,msrdc.exe,MININT-IICQLJG.northamerica.corp.microsoft.com...
51860,2021-03-11T04:35:56.409000+00:00,12723.571,CodeSetup-stable-f30a9b73e8ffc278e71575118b6bf...,Setup
9929,2021-07-01T22:10:47.178000+00:00,12402.146,Spotify.exe,Spotify Premium
12990,2021-06-09T07:54:09.595000+00:00,12295.218,msedge.exe,Netflix and 4 more pages - Personal - Microsof...
36774,2021-04-12T21:40:19.294000+00:00,12071.589,LockApp.exe,Windows Default Lock Screen


Time spent on LockApp.exe is time spent on the laptops homepage, therefore we will be removing it.

In [23]:
#removing rows with LockApp

activity_watch_final.drop(index=activity_watch_final[activity_watch_final['app'] == 'LockApp.exe'].index, inplace=True)



In [24]:
#removing rows with unknown

activity_watch_final.drop(index=activity_watch_final[activity_watch_final['app'] == 'unknown'].index, inplace=True)

In [25]:
#checking for different variations of the same name
activity_watch_final["app"].value_counts()

msedge.exe                                                        32524
explorer.exe                                                       6359
Code.exe                                                           4410
msrdc.exe                                                          2321
Code - Insiders.exe                                                1245
                                                                  ...  
CodeSetup-insider-84fe402d655e029eb1a5c04e675bf64788fa7fcf.tmp        1
WWAHost.exe                                                           1
CodeSetup-insider-1082913dd012e21cc5f35a06f15c480ca447f67f.tmp        1
spotify_installer-1.1.56.595.g2d2da0de-20.exe                         1
CodeSetup-insider-e590188f17162393f50feec19263398e6fe02d13.tmp        1
Name: app, Length: 98, dtype: int64

In [26]:
#sorting the values from the earliest date to the latest date
activity_watch_final.sort_values("timestamp_utc", ascending=True, inplace=True)

In [27]:
#resetting the index values
activity_watch_final.reset_index(drop=True, inplace=True)
activity_watch_final

Unnamed: 0,timestamp_utc,duration_seconds,app,title
0,2021-02-27T15:07:29.345000+00:00,52.882,msedge.exe,ActivityWatch/activitywatch: The best free and...
1,2021-02-27T15:08:23.428000+00:00,0.000,msedge.exe,https://github.com/ActivityWatch and 22 more p...
2,2021-02-27T15:08:24.630000+00:00,23.357,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
3,2021-02-27T15:08:49.183000+00:00,19.718,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...
4,2021-02-27T15:09:10.151000+00:00,4.846,SearchApp.exe,Search
...,...,...,...,...
52270,2021-08-04T15:31:18.201000+00:00,0.000,msedge.exe,Untitled and 21 more pages - Personal - Micros...
52271,2021-08-04T15:31:19.433000+00:00,20.519,msedge.exe,ActivityWatch and 21 more pages - Personal - M...
52272,2021-08-04T15:31:41.178000+00:00,12.019,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
52273,2021-08-04T15:31:54.423000+00:00,11.985,msedge.exe,ActivityWatch and 23 more pages - Personal - M...


In [28]:
#For names that include "code" I will be changing it to "VSCode" to avoid double counting

for index in range(len(activity_watch_final["app"])):
    if "Code" in activity_watch_final["app"][index]:
        activity_watch_final["app"][index] = "VSCode"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [29]:
#For names that include "spotify" I will be changing it to "Spotify" to avoid double counting

for index in range(len(activity_watch_final["app"])):
    if "Spotify" in activity_watch_final["app"][index] or "spotify" in activity_watch_final["app"][index]:
        activity_watch_final["app"][index] = "Spotify"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [30]:
#For titles that are empty I am converting them to null values

for index in range(len(activity_watch_final["title"])):
    if len(activity_watch_final["title"][index]) == 0:
        activity_watch_final["title"][index] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [31]:
activity_watch_final[0:30]

Unnamed: 0,timestamp_utc,duration_seconds,app,title
0,2021-02-27T15:07:29.345000+00:00,52.882,msedge.exe,ActivityWatch/activitywatch: The best free and...
1,2021-02-27T15:08:23.428000+00:00,0.0,msedge.exe,https://github.com/ActivityWatch and 22 more p...
2,2021-02-27T15:08:24.630000+00:00,23.357,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
3,2021-02-27T15:08:49.183000+00:00,19.718,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...
4,2021-02-27T15:09:10.151000+00:00,4.846,SearchApp.exe,Search
5,2021-02-27T15:09:16.200000+00:00,13.051,explorer.exe,
6,2021-02-27T15:09:30.308000+00:00,3.712,aw-qt.exe,aw-qt
7,2021-02-27T15:09:35.229000+00:00,0.104,msedge.exe,Untitled and 23 more pages - Personal - Micros...
8,2021-02-27T15:09:36.444000+00:00,0.089,msedge.exe,localhost:5600 and 23 more pages - Personal - ...
9,2021-02-27T15:09:37.642000+00:00,22.687,msedge.exe,ActivityWatch and 23 more pages - Personal - M...


In [32]:
#double checking for values that may be the same but have different names
activity_watch_final["timestamp_utc"].value_counts()

2021-04-16T09:09:13.024000+00:00    1
2021-08-03T05:36:17.193000+00:00    1
2021-04-23T11:06:24.698000+00:00    1
2021-04-21T14:15:29.104000+00:00    1
2021-04-18T11:29:04.395000+00:00    1
                                   ..
2021-03-19T05:12:13.271000+00:00    1
2021-04-13T03:51:07.382000+00:00    1
2021-05-31T05:20:01.493000+00:00    1
2021-05-25T09:03:08.868000+00:00    1
2021-04-20T02:33:11.808000+00:00    1
Name: timestamp_utc, Length: 52275, dtype: int64

In [33]:
activity_watch_final[activity_watch_final["app"]== "msedge.exe"].iloc[0:30]

Unnamed: 0,timestamp_utc,duration_seconds,app,title
0,2021-02-27T15:07:29.345000+00:00,52.882,msedge.exe,ActivityWatch/activitywatch: The best free and...
1,2021-02-27T15:08:23.428000+00:00,0.0,msedge.exe,https://github.com/ActivityWatch and 22 more p...
2,2021-02-27T15:08:24.630000+00:00,23.357,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
3,2021-02-27T15:08:49.183000+00:00,19.718,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...
7,2021-02-27T15:09:35.229000+00:00,0.104,msedge.exe,Untitled and 23 more pages - Personal - Micros...
8,2021-02-27T15:09:36.444000+00:00,0.089,msedge.exe,localhost:5600 and 23 more pages - Personal - ...
9,2021-02-27T15:09:37.642000+00:00,22.687,msedge.exe,ActivityWatch and 23 more pages - Personal - M...
10,2021-02-27T15:10:01.559000+00:00,0.031,msedge.exe,Features - ActivityWatch Forum and 23 more pag...
11,2021-02-27T15:10:02.769000+00:00,17.903,msedge.exe,Top Features topics - ActivityWatch Forum and ...
12,2021-02-27T15:10:21.865000+00:00,13.1,msedge.exe,ActivityWatch and 23 more pages - Personal - M...


In [35]:
#extracting key topics using yake --- this is used to get more context from browser events

language = "en"
max_ngram_size = 4
deduplication_threshold = 0.9
numOfKeywords = 5 # the max number of results to expect

In [36]:
#creating a function that removes key words with low value, and then extracting key words for titles with "msedge" as the named app
browsing_yake_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

activity_watch_edge_events = activity_watch_final[activity_watch_final["app"] == "msedge.exe"]

skip_words = ["Personal", "Microsoft", "Edge", "pages", "tab", "slack", "Ọrẹ̀", "Ore", "oreHGA", "April", "Ogundipe", "Timi", "ORE_OGUNDIPE_T4.pdf", "oreHGA", "(Ore Ogundipe)", "Ọrẹ̀ Ògúndípẹ̀", "API" ]

def getkeywords(text):
    #TODO: get rid of stop words before doing extraction
    # - punctuations
    # - "Personal - Microsoft Edge"
    if type(text) is not str:
        return None

    text = text.replace(" - Personal - Microsoft Edge", "")
    text = text.replace("slack", "")
    text = text.replace("Slack", "")
    text = text.replace("Twitter", "")
    text = text.replace("tweets", "")
    text = text.replace("Ore", "")
    text = text.replace("Ọrẹ̀", "")
    text = text.replace("Orehga", "")
    text = text.replace("oreHGA", "")
    text = text.replace("April", "")
    text = text.replace("Ogundipe", "")
    text = text.replace("Timi", "")
    text = text.replace("ORE_OGUNDIPE_T4.pdf", "")
    text = text.replace("(Ore Ogundipe)", "")
    text = text.replace("Ọrẹ̀ Ògúndípẹ̀", "")
    text = text.replace("buraksekili/", "")
    text = text.replace("meet", "")
    text = text.replace("Meet", "")

    yake_keywords = browsing_yake_extractor.extract_keywords(text)

    resulting_keywords = []
    for result in yake_keywords:
        if result[0] in skip_words:
            continue
        # convert all keyword entries to lower case
        # each keyword has a structure of (keyword, confidence)
        resulting_keywords.append((result[0].lower(), result[1]))
    
    return resulting_keywords

activity_watch_edge_events["keywords"] = activity_watch_edge_events["title"].apply(lambda x: getkeywords(x))
activity_watch_edge_events["keywords"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0        [(open-source automated time tracker, 0.002262...
1                                                       []
2                   [(activitywatch, 0.15831692877998726)]
3        [(activitywatch on github sponsors, 0.00377236...
7                        [(untitled, 0.15831692877998726)]
                               ...                        
52270                    [(untitled, 0.15831692877998726)]
52271               [(activitywatch, 0.15831692877998726)]
52272               [(activitywatch, 0.15831692877998726)]
52273               [(activitywatch, 0.15831692877998726)]
52274               [(activitywatch, 0.15831692877998726)]
Name: keywords, Length: 32524, dtype: object

In [37]:
# we then go ahead to a frequency mapping for the event classifications to pick the most relevant

browser_keyword_freq_dict = {}
for keyword_entries in activity_watch_edge_events["keywords"]:
    if keyword_entries is None:
        continue
    
    # each keyword has a structure of (keyword, confidence)
    # we don't need confidence so we only take keyword
    for single_keyword_entry in keyword_entries:
        keyword = single_keyword_entry[0]
        if browser_keyword_freq_dict.get(keyword) == None:
            browser_keyword_freq_dict[keyword] = 1
        else:
            browser_keyword_freq_dict[keyword] += 1

# the result of browser_keyword_freq_dict has a strcuture of (keyword, appearance_count)
sorted_browser_keyword_freq_dict = dict(sorted(browser_keyword_freq_dict.items(), key=lambda item: item[1], reverse=True))

the result of browser_keyword_freq_dict has a strcuture of (keyword, appearance_count)

now we select a single winner called "context" from the keyword entries

In [38]:
#choosing the most relevant topic out of all the selected topics

def choose_winning_key(keyword_entries):
    """This is chosen as a function of occurence in total data set
    and confidence
    
    use weight ( confidence * keyword frequency in total dataset) to
    decide what is the winning keyword/event classification"""
    if keyword_entries is None or len(keyword_entries) < 1:
        return None
        
    keyword_weights_dict = {}
    # check frequency count for each word in descending order of confidence
    for entry in keyword_entries:
        # essentially weight = confidence * frequency
        keyword_weights_dict[entry[0]] = entry[1] * browser_keyword_freq_dict[entry[0]]

    # the dictionary will be of structure {"event_keyword", "weight"}
    # pick the keyword with the highest weight
    sorted_keyword_weights_dict = dict(sorted(keyword_weights_dict.items(), key=lambda item: item[1], reverse=True))
    
    #TODO:what happens when there's a tie - for now we're just picking the heighest
    return list(sorted_keyword_weights_dict.keys())[0]
    


activity_watch_edge_events["event_classification"] = activity_watch_edge_events["keywords"].apply(choose_winning_key)

activity_watch_edge_events["event_classification"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


tweets              1289
gmail               1073
bing                 913
neurotechx           861
whatsapp             611
                    ... 
columns to json        1
complete               1
floyd mayweather       1
binweek                1
timigod                1
Name: event_classification, Length: 3081, dtype: int64

In [39]:
#seeing what the topics are for different rows of data

test_index = 7522
print(activity_watch_edge_events.iloc[test_index]["title"])
print(activity_watch_edge_events.iloc[test_index]["keywords"])
print(activity_watch_edge_events.iloc[test_index]["event_classification"])
# activity_watch_edge_events["event_classification"][420]

Meet - Debugging Decide Uploads and 6 more pages - Personal - Microsoft​ Edge
[('debugging decide uploads', 0.001881309737406442), ('debugging decide', 0.012602360123953448), ('decide uploads', 0.02140921543860024)]
decide uploads


In [40]:
activity_watch_edge_events[["title", "event_classification"]][11500:11550]

Unnamed: 0,title,event_classification
18360,People following Sotiris Nanopoulos (@davinci2...,sotiris nanopoulos
18361,Sotiris Nanopoulos (@davinci260) / Twitter and...,nanopoulos
18362,"David Fowler 🇧🇧 on Twitter: "". @mattklein123 q...",david
18363,Latest Tweets / Twitter and 15 more pages - Pe...,tweets
18364,"The Recount on Twitter: ""Biden: “Was it an acc...",accident
18365,Latest Tweets / Twitter and 15 more pages - Pe...,tweets
18369,Latest Tweets / Twitter and 15 more pages - Pe...,tweets
18370,Latest Tweets / Twitter and 14 more pages - Pe...,tweets
18371,Twitter and 14 more pages - Personal - Microso...,
18372,"mogwai. on Twitter: ""are there people who have...",mogwai


Get an event classification for VSCode events

In [41]:
#Removing words from VSCode that gives us little information, and classifying based on hihg value words

activity_watch_events_vscode = activity_watch_final[activity_watch_final["app"] == "VSCode"]

def get_vscode_context(text):
    if type(text) is not str:
        return (None, None)
    # output from this will be "filename", "project"
    # first split string with space " - "
    text = text.replace("Visual Studio Code - Insiders", "")
    text = text.replace("Visual Studio Code", "")
    text = text.replace("(Untracked)", "")
    text = text.replace("(Working Tree)", "")
    text = text.replace("●", "")
    text = text.rstrip(" - ")

    # VSCode has a structure of how the window title is defined (need to validate if this is same on mac)
    # filename - projectname - Visua Studio Code
    text_split = text.split(" - ")
    # sort in descending order
    reverse_text_split_sort = sorted(text_split, key=text_split.index, reverse=True)

    # item 0 will be project name
    # item 1 will be the file name
    project = None
    filename = None

    if len(reverse_text_split_sort) == 1:
        project = reverse_text_split_sort[0]

    if len(reverse_text_split_sort) >= 2:
        project = reverse_text_split_sort[0]
        filename = reverse_text_split_sort[1]

    # return a tuple "(project, filename)"
    return (project, filename)


# calling the column keywords to be consistent with activity watch
activity_watch_events_vscode["keywords"] = activity_watch_events_vscode["title"].apply(get_vscode_context)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


for now, we will only use the project as the event classification

In [42]:
#seeing what the data looks like
activity_watch_events_vscode["event_classification"] = activity_watch_events_vscode["keywords"].apply(lambda x: x[0])

activity_watch_events_vscode["event_classification"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


fusion                                                2643
braindump                                             1091
eeg-notebooks                                          902
blog                                                   276
                                                       193
akb-intel                                              125
neurosity-research-program-vscode                      121
Welcome                                                 42
neurosity-browser-extension                             36
Setup                                                   34
webvr-musicalforest                                     31
neoish                                                  29
gpt3-sandbox                                            27
Getting Started                                         26
Open Folder                                             15
like.js                                                  9
Select Python interpreter                               

In [43]:
#combine vscode & msedge events

combined_aw_edge_vscode =  activity_watch_edge_events.append(activity_watch_events_vscode)
combined_aw_edge_vscode_final = combined_aw_edge_vscode.drop(["timestamp_utc", "app", "duration_seconds", "title"], axis=1)
combined_aw_edge_vscode_final

Unnamed: 0,keywords,event_classification
0,"[(open-source automated time tracker, 0.002262...",activitywatch
1,[],
2,"[(activitywatch, 0.15831692877998726)]",activitywatch
3,"[(activitywatch on github sponsors, 0.00377236...",github sponsors
7,"[(untitled, 0.15831692877998726)]",untitled
...,...,...
52253,"(fusion, main.py )",fusion
52254,"(fusion, main.py)",fusion
52255,"(fusion, main.py)",fusion
52256,"(fusion, main.py)",fusion


In [44]:

activity_watch_final[0:10]

Unnamed: 0,timestamp_utc,duration_seconds,app,title
0,2021-02-27T15:07:29.345000+00:00,52.882,msedge.exe,ActivityWatch/activitywatch: The best free and...
1,2021-02-27T15:08:23.428000+00:00,0.0,msedge.exe,https://github.com/ActivityWatch and 22 more p...
2,2021-02-27T15:08:24.630000+00:00,23.357,msedge.exe,ActivityWatch and 22 more pages - Personal - M...
3,2021-02-27T15:08:49.183000+00:00,19.718,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...
4,2021-02-27T15:09:10.151000+00:00,4.846,SearchApp.exe,Search
5,2021-02-27T15:09:16.200000+00:00,13.051,explorer.exe,
6,2021-02-27T15:09:30.308000+00:00,3.712,aw-qt.exe,aw-qt
7,2021-02-27T15:09:35.229000+00:00,0.104,msedge.exe,Untitled and 23 more pages - Personal - Micros...
8,2021-02-27T15:09:36.444000+00:00,0.089,msedge.exe,localhost:5600 and 23 more pages - Personal - ...
9,2021-02-27T15:09:37.642000+00:00,22.687,msedge.exe,ActivityWatch and 23 more pages - Personal - M...


In [45]:
#activity watch with final data

activity_watch_dataframe_final = pd.concat([activity_watch_final, combined_aw_edge_vscode_final], axis=1, join="outer")
activity_watch_dataframe_final["event_classification"].value_counts()[30:60]

python                               151
dash                                 149
notion                               145
vauban                               126
akb-intel                            125
mne                                  124
neurosity-research-program-vscode    121
sharing                              121
project                              118
status                               118
activitywatch                        105
eeg notebooks                        103
demystifying brain                   102
search                               100
collection of classic eeg            100
general                               93
future africa                         93
dashboard                             92
computational neuroscience            91
inprivate                             89
dazn                                  89
streamlit                             85
collection                            84
google developers                     83
home            

In [46]:
activity_watch_dataframe_final["event_classification"].value_counts()[0:30]

fusion                      3019
tweets                      1289
braindump                   1091
gmail                       1073
bing                         913
eeg-notebooks                908
neurotechx                   861
whatsapp                     611
google calendar              591
untitled                     554
ndi                          509
page                         388
neurosity                    372
linkedin                     361
notifications                288
blog                         285
documentation                244
netflix                      231
google                       228
openai api                   214
pull request                 212
questrade                    208
                             193
issue                        177
eeg-notebooks developers     172
stack overflow               169
google drive                 165
adaobi                       163
github                       163
calendly                     152
Name: even

In [47]:
activity_watch_dataframe_final["event_classification"].value_counts()[90:120]

knabu                          48
bloomberg-terminal             47
plan                           47
air canada                     46
eeg                            45
jupyter notebook               45
localhost                      44
wealthsimple                   43
app                            43
dash for python                42
free online appointment        42
interact                       42
investing in peoples           42
Welcome                        42
microsoft                      42
decentralized start-up         41
crypto                         40
messages                       40
error                          39
https                          39
send money                     38
date and time                  37
click documentation            37
python string                  37
neurosity-browser-extension    36
ted talk                       36
brain computer                 35
oura api                       35
Setup                          34
canada revenue

In [45]:
#replace mislablled titles with correct values

activity_watch_dataframe_final["title"].iloc[[5393, 5395, 5413]] = "neurotechx"
activity_watch_dataframe_final["title"].iloc[[12293,12295]] = "linkedin"
activity_watch_dataframe_final["title"].iloc[[3925,3930,3934,3938, 3940, 3944, 3947 ]] = "linkedin"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [48]:
#changing mislaballed topics

def format_label(label):
    if label == "eeg-notebooks developers":
        label = "eeg-notebooks"
    if label == "calendly":
        label = "planning"
    if label == "google calendar":
        label = "planning"
    if label == "latest tweets":
        label = "tweets"
    if label == "brain-dump":
        label = "braindump"
    if label == "eeg-notebooks project":
        label = "eeg-notebooks"
    if label == "spotify streaming":
        label = "spotify"
    if label == "openai-api":
        label = "openai api"
        
        
    
        
    return label

activity_watch_dataframe_final["event_classification"] = activity_watch_dataframe_final["event_classification"].apply(format_label)

In [49]:
activity_watch_dataframe_final["event_classification"].value_counts()[0:60]

fusion                               3019
tweets                               1359
braindump                            1113
eeg-notebooks                        1091
gmail                                1073
bing                                  913
neurotechx                            861
planning                              743
whatsapp                              611
untitled                              554
ndi                                   509
page                                  388
neurosity                             372
linkedin                              361
notifications                         288
openai api                            287
blog                                  285
documentation                         244
netflix                               231
google                                228
pull request                          212
questrade                             208
                                      193
issue                             

In [50]:
#making sure everything is in lowercase
activity_watch_dataframe_final["event_classification"] = activity_watch_dataframe_final["event_classification"].str.lower()
activity_watch_dataframe_final["event_classification"].value_counts()

fusion                                   3019
tweets                                   1359
braindump                                1113
eeg-notebooks                            1091
gmail                                    1073
                                         ... 
investigators have recovered millions       1
paypal.me                                   1
emmanuel                                    1
birthday reminders                          1
invitation to wearable                      1
Name: event_classification, Length: 3108, dtype: int64

In [51]:
#Filling in null values in the classifcation column, with values from the app column


activity_watch_dataframe_final["event_classification"].fillna(activity_watch_dataframe_final["app"], inplace=True)
activity_watch_dataframe_final["event_classification"].fillna(activity_watch_dataframe_final["app"], inplace=True)

In [52]:
def null_label(label):
    if label == "":
        label = "untitled"
    return label
        
activity_watch_dataframe_final["event_classification"] = activity_watch_dataframe_final["event_classification"].apply(null_label)


In [53]:
activity_watch_dataframe_final["event_classification"].value_counts()

explorer.exe            6359
msedge.exe              5012
fusion                  3019
msrdc.exe               2321
tweets                  1359
                        ... 
openneuro                  1
neurocircuits              1
mirella pappe              1
aina                       1
images in powerpoint       1
Name: event_classification, Length: 3173, dtype: int64

In [52]:
#reading google calendar data in

g_calendar = pd.read_csv("data/google_calendar_ore_2021_04_18.csv")
g_calendar.columns

Index(['kind', 'etag', 'id', 'status', 'htmlLink', 'created', 'updated',
       'summary', 'creator', 'organizer', 'start', 'end', 'recurringEventId',
       'originalStartTime', 'visibility', 'iCalUID', 'sequence', 'reminders',
       'eventType', 'description', 'colorId', 'transparency', 'location',
       'attendees', 'guestsCanInviteOthers', 'privateCopy', 'source',
       'hangoutLink', 'conferenceData', 'extendedProperties',
       'guestsCanModify', 'guestsCanSeeOtherGuests'],
      dtype='object')

In [53]:
#inspecting the data to identify any potential anomalies
g_calendar["created"].unique()

array(['2015-04-04T02:15:49.000Z', '2016-04-01T01:56:31.000Z',
       '2020-10-19T15:35:00.000Z', '2020-10-19T15:34:38.000Z',
       '2020-10-19T15:34:39.000Z', '2020-12-17T16:17:18.000Z',
       '2020-12-13T19:54:36.000Z', '2020-12-25T20:07:28.000Z',
       '2020-12-26T14:36:21.000Z', '2020-12-26T20:06:11.000Z',
       '2020-12-23T23:29:59.000Z', '2020-12-31T20:26:42.000Z',
       '2021-01-01T01:12:06.000Z', '2021-01-01T10:22:12.000Z',
       '2020-12-21T01:50:28.000Z', '2021-01-04T22:56:34.000Z',
       '2021-01-06T15:56:23.000Z', '2021-01-10T20:33:09.000Z',
       '2021-01-12T10:00:46.000Z', '2021-01-14T17:55:48.000Z',
       '2021-01-15T06:34:48.000Z', '2020-12-21T18:30:05.000Z',
       '2021-01-13T15:53:47.000Z', '2021-01-15T18:00:44.000Z',
       '2021-01-12T08:52:04.000Z', '2021-01-17T23:38:48.000Z',
       '2021-01-18T15:14:06.000Z', '2020-12-23T20:39:07.000Z',
       '2021-01-22T06:10:43.000Z', '2021-01-22T09:05:15.000Z',
       '2021-01-21T10:09:12.000Z', '2021-01-25T02:27:55

In [54]:
#inspecting the data to identify any potential anomalies
g_calendar["location"].value_counts()

Google Meet (instructions in description)                                                 59
Amsterdam AMS                                                                              1
https://us02web.zoom.us/j/84101445821                                                      1
https://meet.google.com/vra-emhm-zoo                                                       1
https://meet.google.com/kgk-qgha-gai                                                       1
Accra ACC                                                                                  1
https://zoom.us/j/97477945016?pwd=c1FLMWJjZDVucnFGY2k1MXV4MHpDUT09                         1
Schiphol Airport (Evert v/d Beekstraat 202, 1118 Amsterdam North Holland, Netherlands)     1
https://zoom.us/j/92547005201                                                              1
Kotoka International Airport (Accra Accra, Ghana)                                          1
Tribeca Hotel, Accra                                                  

In [55]:
#inspecting the data to identify any potential anomalies
type(g_calendar["start"][0])

str

The columns start and end both contain dictionaries, so I will be separating them into dataframes then recombining.

In [56]:
#As discovered above, the dictionaries have been stored as strings. I will need to convert them into dictionaries so that I can split the columns.
g_calendar_start = g_calendar["start"].apply(ast.literal_eval)
g_calendar_start

0      {'dateTime': '2021-04-14T16:00:00-07:00', 'tim...
1                                 {'date': '2021-03-15'}
2              {'dateTime': '2021-01-11T04:00:00-08:00'}
3      {'dateTime': '2021-01-10T14:15:00-08:00', 'tim...
4      {'dateTime': '2021-01-11T04:00:00-08:00', 'tim...
                             ...                        
252            {'dateTime': '2021-04-17T12:15:00-07:00'}
253            {'dateTime': '2021-04-17T18:15:00-07:00'}
254            {'dateTime': '2021-04-17T20:30:00-07:00'}
255            {'dateTime': '2021-04-17T21:00:00-07:00'}
256            {'dateTime': '2021-04-17T21:00:00-07:00'}
Name: start, Length: 257, dtype: object

In [57]:
#The start times for google calendar events
g_calendar_start = g_calendar_start.apply(pd.Series)
g_calendar_start.rename(columns={"dateTime": "start_dateTime_utc"}, inplace=True)
g_calendar_start

Unnamed: 0,start_dateTime_utc,timeZone,date
0,2021-04-14T16:00:00-07:00,Europe/Dublin,
1,,,2021-03-15
2,2021-01-11T04:00:00-08:00,,
3,2021-01-10T14:15:00-08:00,Atlantic/Reykjavik,
4,2021-01-11T04:00:00-08:00,Europe/Berlin,
...,...,...,...
252,2021-04-17T12:15:00-07:00,,
253,2021-04-17T18:15:00-07:00,,
254,2021-04-17T20:30:00-07:00,,
255,2021-04-17T21:00:00-07:00,,


In [58]:
#splitting google calendar end date
g_calendar_end = g_calendar["end"].apply(ast.literal_eval)
g_calendar_end = g_calendar_end.apply(pd.Series)
g_calendar_end.rename(columns={"dateTime": "end_dateTime_utc"}, inplace=True)
g_calendar_end

Unnamed: 0,end_dateTime_utc,timeZone,date
0,2021-04-14T17:00:00-07:00,Europe/Dublin,
1,,,2021-03-16
2,2021-01-11T13:55:00-08:00,,
3,2021-01-10T20:50:00-08:00,Europe/Berlin,
4,2021-01-11T13:55:00-08:00,America/Vancouver,
...,...,...,...
252,2021-04-17T18:00:00-07:00,,
253,2021-04-17T20:15:00-07:00,,
254,2021-04-17T21:00:00-07:00,,
255,2021-04-17T22:15:00-07:00,,


from looking at these different datetimes, it looks as though it is in UTC minus X hours, depending on the timezone it is in.

In [59]:
#combining both the start and end columns into one dataframe

g_calendar_times_combined = g_calendar_start.join(g_calendar_end, lsuffix='_act', rsuffix='_res')
g_calendar_times_combined

Unnamed: 0,start_dateTime_utc,timeZone_act,date_act,end_dateTime_utc,timeZone_res,date_res
0,2021-04-14T16:00:00-07:00,Europe/Dublin,,2021-04-14T17:00:00-07:00,Europe/Dublin,
1,,,2021-03-15,,,2021-03-16
2,2021-01-11T04:00:00-08:00,,,2021-01-11T13:55:00-08:00,,
3,2021-01-10T14:15:00-08:00,Atlantic/Reykjavik,,2021-01-10T20:50:00-08:00,Europe/Berlin,
4,2021-01-11T04:00:00-08:00,Europe/Berlin,,2021-01-11T13:55:00-08:00,America/Vancouver,
...,...,...,...,...,...,...
252,2021-04-17T12:15:00-07:00,,,2021-04-17T18:00:00-07:00,,
253,2021-04-17T18:15:00-07:00,,,2021-04-17T20:15:00-07:00,,
254,2021-04-17T20:30:00-07:00,,,2021-04-17T21:00:00-07:00,,
255,2021-04-17T21:00:00-07:00,,,2021-04-17T22:15:00-07:00,,


In [60]:
#combining the calendar times dataframe with the original dataframe

g_calendar_final = g_calendar.join(g_calendar_times_combined, lsuffix='_act', rsuffix='_res')
g_calendar_final

Unnamed: 0,kind,etag,id,status,htmlLink,created,updated,summary,creator,organizer,...,conferenceData,extendedProperties,guestsCanModify,guestsCanSeeOtherGuests,start_dateTime_utc,timeZone_act,date_act,end_dateTime_utc,timeZone_res,date_res
0,calendar#event,"""2885397680780000""",7ui7kk61env04pff2r4oet7ba4_20210414T230000Z,confirmed,https://www.google.com/calendar/event?eid=N3Vp...,2015-04-04T02:15:49.000Z,2016-03-06T17:05:31.131Z,Jemimah's Birthday,"{'email': 'oreogundipe@gmail.com', 'displayNam...","{'email': 'oreogundipe@gmail.com', 'displayNam...",...,,,,,2021-04-14T16:00:00-07:00,Europe/Dublin,,2021-04-14T17:00:00-07:00,Europe/Dublin,
1,calendar#event,"""2918951584034000""",ccsj6c9oc9j62b9j70qjeb9k6gp3ab9pchi3cb9m6so36e...,confirmed,https://www.google.com/calendar/event?eid=Y2Nz...,2016-04-01T01:56:31.000Z,2016-04-01T01:56:32.052Z,It's your day Capt. OG,"{'email': 'oreogundipe@gmail.com', 'displayNam...","{'email': 'oreogundipe@gmail.com', 'displayNam...",...,,,,,,,2021-03-15,,,2021-03-16
2,calendar#event,"""3211334650498000""",_6tlnaqrle5p6cpb4dhmj4phpehlmio9j6hlj4ord71jn0...,confirmed,https://www.google.com/calendar/event?eid=XzZ0...,2020-10-19T15:35:00.000Z,2020-12-17T13:49:10.095Z,Flight to Vancouver (KL 681),"{'email': 'oreogundipe@gmail.com', 'self': True}",{'email': 'unknownorganizer@calendar.google.com'},...,,,,,2021-01-11T04:00:00-08:00,,,2021-01-11T13:55:00-08:00,,
3,calendar#event,"""3211334719809000""",_60q30c1g60o30e1i60o4ac1g60rj8gpl88rj2c1h84s34...,confirmed,https://www.google.com/calendar/event?eid=XzYw...,2020-10-19T15:34:38.000Z,2020-12-17T13:50:40.936Z,KLM flight 590 to Amsterdam (N8JTST),"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-01-10T14:15:00-08:00,Atlantic/Reykjavik,,2021-01-10T20:50:00-08:00,Europe/Berlin,
4,calendar#event,"""3211334720967000""",_60q30c1g60o30e1i60o4ac1g60rj8gpl88rj2c1h84s34...,confirmed,https://www.google.com/calendar/event?eid=XzYw...,2020-10-19T15:34:39.000Z,2020-12-17T13:50:41.515Z,KLM flight 681 to Vancouver (N8JTST),"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-01-11T04:00:00-08:00,Europe/Berlin,,2021-01-11T13:55:00-08:00,America/Vancouver,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,calendar#event,"""3237363132626000""",5mtl0s7s1l9f9d11584bmvvpih,confirmed,https://www.google.com/calendar/event?eid=NW10...,2021-04-17T17:46:06.000Z,2021-04-17T17:46:06.313Z,fusion - putting data together,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-04-17T12:15:00-07:00,,,2021-04-17T18:00:00-07:00,,
253,calendar#event,"""3237363179034000""",1961jk9r264751n3i683cdoi7g,confirmed,https://www.google.com/calendar/event?eid=MTk2...,2021-04-17T17:46:29.000Z,2021-04-17T17:46:29.517Z,go outside,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-04-17T18:15:00-07:00,,,2021-04-17T20:15:00-07:00,,
254,calendar#event,"""3237363211286000""",6o9tc9gtav6i52ond6tph8ujqq,confirmed,https://www.google.com/calendar/event?eid=Nm85...,2021-04-17T17:46:45.000Z,2021-04-17T17:46:45.643Z,somma - java homework,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-04-17T20:30:00-07:00,,,2021-04-17T21:00:00-07:00,,
255,calendar#event,"""3237363241070000""",6upb8fv4kkur6ugmunc8jsesm1,confirmed,https://www.google.com/calendar/event?eid=NnVw...,2021-04-17T17:47:00.000Z,2021-04-17T17:47:00.535Z,eat & nap,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-04-17T21:00:00-07:00,,,2021-04-17T22:15:00-07:00,,


In [61]:
g_calendar_final["timeZone_act"].value_counts()

America/Vancouver     112
Africa/Algiers         14
America/New_York        5
Europe/London           2
Europe/Berlin           1
Europe/Dublin           1
Atlantic/Reykjavik      1
Name: timeZone_act, dtype: int64

In [62]:
#here I will be dropping the irrelevant columns

Based on the results above I will only be keeping the following columns as they provide us with the most information:

- meeting start time (g_calendar_start)
- meeting end time (g_clanedar_end)
- timezone_act
- created
- summary
- description

In [64]:
#reading oura file in

oura_file = open('data/oura_data_ore_2021_04_18.json')
oura_json = oura_file.read()
oura_data = json.loads(oura_json)


In [65]:
#splitting oura_data into first dataframe which just has activity data. I will be combining it with other oura dataframes on the summary_date column as that column is in all dataframes

oura_activity = pd.DataFrame.from_records(oura_data["activity"])
oura_activity
oura_activity["summary_date"].describe()

count             84
unique            84
top       2021-04-07
freq               1
Name: summary_date, dtype: object

In [66]:
#extracting restufl periods into it's own dataframe

oura_restful_periods = pd.DataFrame.from_records(oura_data["restful_periods"])
oura_restful_periods
oura_restful_periods["summary_date"].describe()

count             52
unique            39
top       2021-02-07
freq               3
Name: summary_date, dtype: object

In [67]:
#extracting readiness into it's own dataframe

oura_readiness = pd.DataFrame.from_records(oura_data["readiness"])
type(oura_readiness["summary_date"][5])
oura_readiness

Unnamed: 0,period_id,rest_mode_state,score,score_activity_balance,score_hrv_balance,score_previous_day,score_previous_night,score_recovery_index,score_resting_hr,score_sleep_balance,score_temperature,summary_date
0,0,0,87,0,0,0,71,100,95,0,99,2021-01-21
1,1,0,73,0,0,90,62,48,90,0,100,2021-01-22
2,1,0,87,0,0,89,82,100,93,80,90,2021-01-23
3,2,0,81,92,0,93,70,81,95,70,98,2021-01-24
4,0,0,51,59,0,74,32,7,82,50,97,2021-01-25
...,...,...,...,...,...,...,...,...,...,...,...,...
77,1,0,47,36,70,87,39,57,50,19,92,2021-04-13
78,0,0,44,31,64,51,64,64,27,25,88,2021-04-14
79,1,0,54,40,66,80,46,74,76,25,100,2021-04-15
80,0,0,46,52,69,68,26,41,68,21,90,2021-04-16


In [68]:
#extracting sleep into it's own dataframe

oura_sleep = pd.DataFrame.from_records(oura_data["sleep"])
oura_sleep["summary_date"][3]

'2021-01-24'

In [69]:
#left joining datarame on the summary column first
oura_combined = oura_activity.join(oura_restful_periods, lsuffix='_act', rsuffix='_res')
oura_combined

Unnamed: 0,average_met,cal_active,cal_total,class_5min,daily_movement,day_end,day_start,high,inactive,inactivity_alerts,...,bedtime_end,bedtime_start,breath_average,duration,period_id,summary_date_res,timezone_res,hr_average,hr_lowest,rmssd
0,0.93750,23,1942,0000000000000000000000000000000000000000000000...,461,2021-01-21T03:59:59-08:00,2021-01-20T04:00:00-08:00,0,141,1,...,2021-01-21T07:36:24-08:00,2021-01-21T07:23:24-08:00,15.250,780.0,1.0,2021-01-21,-480.0,,,
1,1.34375,234,2311,2232332222222221222222322222333332322223311221...,4151,2021-01-22T03:59:59-08:00,2021-01-21T04:00:00-08:00,1,792,1,...,2021-01-23T13:25:29-08:00,2021-01-23T13:09:29-08:00,15.125,960.0,3.0,2021-01-23,-480.0,59.00,58.0,64.0
2,1.34375,290,2350,1211222221111111122212211222222222222224434333...,5337,2021-01-23T03:59:59-08:00,2021-01-22T04:00:00-08:00,0,612,0,...,2021-01-23T14:50:29-08:00,2021-01-23T14:39:29-08:00,15.125,660.0,4.0,2021-01-23,-480.0,60.00,60.0,
3,1.15625,96,2048,2232222211122211111111111111111111112211111111...,1832,2021-01-24T03:59:59-08:00,2021-01-23T04:00:00-08:00,0,517,1,...,2021-01-28T15:34:12-08:00,2021-01-28T15:06:12-08:00,15.250,1680.0,1.0,2021-01-28,-480.0,85.67,83.0,29.0
4,1.68750,707,2894,1111111111111111111111221212000000000221111112...,13257,2021-01-25T03:59:59-08:00,2021-01-24T04:00:00-08:00,2,703,0,...,2021-01-30T13:01:59-08:00,2021-01-30T12:40:59-08:00,14.875,1260.0,1.0,2021-01-30,-480.0,65.75,64.0,63.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,1.65625,721,2859,1111112112111112222222222222222323222222212211...,12814,2021-04-14T03:59:59-07:00,2021-04-13T04:00:00-07:00,20,710,1,...,,,,,,,,,,
80,1.40625,301,2400,1111111111111111111112111111111111111121121121...,5541,2021-04-15T03:59:59-07:00,2021-04-14T04:00:00-07:00,0,774,1,...,,,,,,,,,,
81,1.37500,193,2324,1111111111111112111112221222222222233333222222...,3148,2021-04-16T03:59:59-07:00,2021-04-15T04:00:00-07:00,1,1062,3,...,,,,,,,,,,
82,1.50000,550,2609,1111111111111111122111111111111111111221122211...,10674,2021-04-17T03:59:59-07:00,2021-04-16T04:00:00-07:00,2,451,0,...,,,,,,,,,,


In [70]:
#combining the readiness dataframe to the above combined dataframe
oura_combined_1 = oura_combined.join(oura_readiness, lsuffix='_act_1', rsuffix='_res_1')
oura_combined_1

Unnamed: 0,average_met,cal_active,cal_total,class_5min,daily_movement,day_end,day_start,high,inactive,inactivity_alerts,...,score_res_1,score_activity_balance,score_hrv_balance,score_previous_day,score_previous_night,score_recovery_index,score_resting_hr,score_sleep_balance,score_temperature,summary_date
0,0.93750,23,1942,0000000000000000000000000000000000000000000000...,461,2021-01-21T03:59:59-08:00,2021-01-20T04:00:00-08:00,0,141,1,...,87.0,0.0,0.0,0.0,71.0,100.0,95.0,0.0,99.0,2021-01-21
1,1.34375,234,2311,2232332222222221222222322222333332322223311221...,4151,2021-01-22T03:59:59-08:00,2021-01-21T04:00:00-08:00,1,792,1,...,73.0,0.0,0.0,90.0,62.0,48.0,90.0,0.0,100.0,2021-01-22
2,1.34375,290,2350,1211222221111111122212211222222222222224434333...,5337,2021-01-23T03:59:59-08:00,2021-01-22T04:00:00-08:00,0,612,0,...,87.0,0.0,0.0,89.0,82.0,100.0,93.0,80.0,90.0,2021-01-23
3,1.15625,96,2048,2232222211122211111111111111111111112211111111...,1832,2021-01-24T03:59:59-08:00,2021-01-23T04:00:00-08:00,0,517,1,...,81.0,92.0,0.0,93.0,70.0,81.0,95.0,70.0,98.0,2021-01-24
4,1.68750,707,2894,1111111111111111111111221212000000000221111112...,13257,2021-01-25T03:59:59-08:00,2021-01-24T04:00:00-08:00,2,703,0,...,51.0,59.0,0.0,74.0,32.0,7.0,82.0,50.0,97.0,2021-01-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,1.65625,721,2859,1111112112111112222222222222222323222222212211...,12814,2021-04-14T03:59:59-07:00,2021-04-13T04:00:00-07:00,20,710,1,...,54.0,40.0,66.0,80.0,46.0,74.0,76.0,25.0,100.0,2021-04-15
80,1.40625,301,2400,1111111111111111111112111111111111111121121121...,5541,2021-04-15T03:59:59-07:00,2021-04-14T04:00:00-07:00,0,774,1,...,46.0,52.0,69.0,68.0,26.0,41.0,68.0,21.0,90.0,2021-04-16
81,1.37500,193,2324,1111111111111112111112221222222222233333222222...,3148,2021-04-16T03:59:59-07:00,2021-04-15T04:00:00-07:00,1,1062,3,...,35.0,51.0,57.0,86.0,40.0,23.0,1.0,17.0,96.0,2021-04-17
82,1.50000,550,2609,1111111111111111122111111111111111111221122211...,10674,2021-04-17T03:59:59-07:00,2021-04-16T04:00:00-07:00,2,451,0,...,,,,,,,,,,


In [71]:
#combining the readiness dataframe to the above combined dataframe, to make the full oura dataframe
oura_test_combined_final = oura_combined_1.join(oura_sleep, lsuffix='_act_1', rsuffix='_res_1')
oura_test_combined_final

Unnamed: 0,average_met,cal_active,cal_total,class_5min,daily_movement,day_end,day_start,high,inactive,inactivity_alerts,...,score_efficiency,score_latency,score_rem,score_total,summary_date_res_1,temperature_delta,temperature_deviation,timezone,total_res_1,temperature_trend_deviation
0,0.93750,23,1942,0000000000000000000000000000000000000000000000...,461,2021-01-21T03:59:59-08:00,2021-01-20T04:00:00-08:00,0,141,1,...,86.0,91.0,63.0,84.0,2021-01-21,-0.15,-0.15,-480.0,28830.0,
1,1.34375,234,2311,2232332222222221222222322222333332322223311221...,4151,2021-01-22T03:59:59-08:00,2021-01-21T04:00:00-08:00,1,792,1,...,83.0,67.0,39.0,55.0,2021-01-22,-0.10,-0.10,-480.0,21420.0,0.01
2,1.34375,290,2350,1211222221111111122212211222222222222224434333...,5337,2021-01-23T03:59:59-08:00,2021-01-22T04:00:00-08:00,0,612,0,...,69.0,97.0,86.0,82.0,2021-01-23,-0.32,-0.32,-480.0,28410.0,0.01
3,1.15625,96,2048,2232222211122211111111111111111111112211111111...,1832,2021-01-24T03:59:59-08:00,2021-01-23T04:00:00-08:00,0,517,1,...,86.0,86.0,66.0,61.0,2021-01-24,-0.26,-0.26,-480.0,22890.0,0.02
4,1.68750,707,2894,1111111111111111111111221212000000000221111112...,13257,2021-01-25T03:59:59-08:00,2021-01-24T04:00:00-08:00,2,703,0,...,76.0,91.0,72.0,69.0,2021-01-27,0.16,0.16,-480.0,25020.0,0.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,1.65625,721,2859,1111112112111112222222222222222323222222212211...,12814,2021-04-14T03:59:59-07:00,2021-04-13T04:00:00-07:00,20,710,1,...,,,,,,,,,,
80,1.40625,301,2400,1111111111111111111112111111111111111121121121...,5541,2021-04-15T03:59:59-07:00,2021-04-14T04:00:00-07:00,0,774,1,...,,,,,,,,,,
81,1.37500,193,2324,1111111111111112111112221222222222233333222222...,3148,2021-04-16T03:59:59-07:00,2021-04-15T04:00:00-07:00,1,1062,3,...,,,,,,,,,,
82,1.50000,550,2609,1111111111111111122111111111111111111221122211...,10674,2021-04-17T03:59:59-07:00,2021-04-16T04:00:00-07:00,2,451,0,...,,,,,,,,,,


In [72]:
#inspecting what columns to drop
oura_test_combined_final.columns

Index(['average_met', 'cal_active', 'cal_total', 'class_5min',
       'daily_movement', 'day_end', 'day_start', 'high', 'inactive',
       'inactivity_alerts', 'low', 'medium', 'met_1min', 'met_min_high',
       'met_min_inactive', 'met_min_low', 'met_min_medium', 'non_wear', 'rest',
       'rest_mode_state_act_1', 'score_act_1', 'score_meet_daily_targets',
       'score_move_every_hour', 'score_recovery_time', 'score_stay_active',
       'score_training_frequency', 'score_training_volume', 'steps',
       'summary_date_act', 'target_calories', 'target_km', 'target_miles',
       'timezone_act', 'to_target_km', 'to_target_miles', 'total_act_1',
       'bedtime_end_act_1', 'bedtime_start_act_1', 'breath_average_act_1',
       'duration_act_1', 'period_id_act_1', 'summary_date_res', 'timezone_res',
       'hr_average_act_1', 'hr_lowest_act_1', 'rmssd_act_1', 'period_id_res_1',
       'rest_mode_state_res_1', 'score_res_1', 'score_activity_balance',
       'score_hrv_balance', 'score_prev

In [73]:
#renaming columns with the correct unit of time
oura_test_combined_final.rename(columns={"inactive":"inactive_minutes"}, inplace=True)
oura_test_combined_final.rename(columns={"non_wear":"non_wear_minutes"}, inplace=True)
oura_test_combined_final.rename(columns={"rest":"rest_minutes"}, inplace=True)
oura_test_combined_final.rename(columns={"low":"low_minutes"}, inplace=True)
oura_test_combined_final.rename(columns={"medium":"medium_minutes"}, inplace=True)
oura_test_combined_final.rename(columns={"high":"high_minutes"}, inplace=True)
oura_test_combined_final.rename(columns={"awake":"awake_seconds"}, inplace=True)
oura_test_combined_final.rename(columns={"rem":"rem_seconds"}, inplace=True)
oura_test_combined_final.rename(columns={"light":"light_seconds"}, inplace=True)
oura_test_combined_final.rename(columns={"deep":"deep_seconds"}, inplace=True)
oura_test_combined_final.rename(columns={"temperature_delta":"temperature_delta_celsius"}, inplace=True)
oura_test_combined_final.rename(columns={"cal_total":"cal_total_kcal"}, inplace=True)

In [74]:
#checking for numerical anomalies

oura_test_combined_final["inactive_minutes"].describe()

count      84.000000
mean      623.785714
std       182.630846
min       141.000000
25%       516.500000
50%       648.500000
75%       745.250000
max      1062.000000
Name: inactive_minutes, dtype: float64

In [75]:
#checking for numerical anomalies

oura_test_combined_final["medium_minutes"].describe()

count     84.000000
mean      28.119048
std       24.372315
min        2.000000
25%       10.000000
50%       22.000000
75%       38.000000
max      119.000000
Name: medium_minutes, dtype: float64

In [76]:
# checking for numerical anomalies

oura_test_combined_final["daily_movement"].describe()

count       84.000000
mean      6216.178571
std       4326.205648
min        461.000000
25%       3137.750000
50%       5568.500000
75%       7763.500000
max      22091.000000
Name: daily_movement, dtype: float64

In [77]:
#checking for numerical anomalies

oura_test_combined_final["score_resting_hr"].describe()

count     82.000000
mean      80.256098
std       23.369185
min        1.000000
25%       70.000000
50%       88.500000
75%       99.000000
max      100.000000
Name: score_resting_hr, dtype: float64

In [78]:
#checking for numerical anomalies in the

oura_test_combined_final[oura_test_combined_final["daily_movement"] > 20000]["daily_movement"]

71    22091
Name: daily_movement, dtype: int64

As a note, the summary_date column is calculated as activity period between 4am - 3:59am user's local time. With day_start == 4am local time, and day_end == 3:59am local time

After going through different numerical columns we concluded that most data is correct, including any extremes found. Data that we are unsure of (such as the heart rate) will be left alone for the time being as we are unsure of how it is calculated and if we will be using it. If we decide to use it we will figure out how the values were calculated, and then remove it if necessary.

Going through the API documentation, it became apparent that the value '0' was often put in as a substitute for null. I will be changing these values to null.

In [79]:
# creating a function that changes the value 0 to null in places where 0 represents no available data

def replace_zero(column):
    for index in range(len(column)):
        if column[index] == 0:
            column[index] = np.nan

In [80]:
#replacing all zeros that are representative of null to null
replace_zero(oura_test_combined_final["score_recovery_time"])
replace_zero(oura_test_combined_final["score_training_volume"])
replace_zero(oura_test_combined_final["score_training_frequency"])
replace_zero(oura_test_combined_final["score_meet_daily_targets"])
replace_zero(oura_test_combined_final["score_move_every_hour"])
replace_zero(oura_test_combined_final["score_stay_active"])
replace_zero(oura_test_combined_final["score"])
replace_zero(oura_test_combined_final["score_total"])
replace_zero(oura_test_combined_final["score_rem"])
replace_zero(oura_test_combined_final["score_efficiency"])
replace_zero(oura_test_combined_final["score_deep"])
replace_zero(oura_test_combined_final["score_latency"])
replace_zero(oura_test_combined_final["score_disturbances"])
replace_zero(oura_test_combined_final["score_alignment"])

In [81]:
oura_test_combined_final["score_alignment"].value_counts()

100.0    46
94.0      2
88.0      2
85.0      2
32.0      1
9.0       1
69.0      1
1.0       1
41.0      1
31.0      1
45.0      1
65.0      1
84.0      1
78.0      1
70.0      1
61.0      1
89.0      1
91.0      1
71.0      1
87.0      1
37.0      1
86.0      1
98.0      1
72.0      1
96.0      1
68.0      1
Name: score_alignment, dtype: int64

In [82]:
oura_test_combined_final["summary_date_act"].value_counts()

2021-04-07    1
2021-02-13    1
2021-02-10    1
2021-03-02    1
2021-04-03    1
             ..
2021-02-04    1
2021-02-11    1
2021-03-15    1
2021-02-25    1
2021-02-24    1
Name: summary_date_act, Length: 84, dtype: int64

In [83]:
twitter_tweets = pd.read_json("data/ore_tweets_content.json")
twitter_tweets

Unnamed: 0,tweet
0,"{'retweeted': False, 'source': '<a href=""http:..."
1,"{'retweeted': False, 'source': '<a href=""http:..."
2,"{'retweeted': False, 'source': '<a href=""http:..."
3,"{'retweeted': False, 'source': '<a href=""http:..."
4,"{'retweeted': False, 'source': '<a href=""http:..."
...,...
2699,"{'retweeted': False, 'source': '<a href=""http:..."
2700,"{'retweeted': False, 'source': '<a href=""http:..."
2701,"{'retweeted': False, 'source': '<a href=""http:..."
2702,"{'retweeted': False, 'source': '<a href=""http:..."


In [84]:
#splitting the above dataframe

twitter_tweets_split = twitter_tweets['tweet'].apply(pd.Series)
twitter_tweets_split


Unnamed: 0,retweeted,source,entities,display_text_range,favorite_count,in_reply_to_status_id_str,id_str,in_reply_to_user_id,truncated,retweet_count,id,in_reply_to_status_id,created_at,favorited,full_text,lang,in_reply_to_screen_name,in_reply_to_user_id_str,possibly_sensitive,extended_entities
0,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 50]",0,1031515975928213505,1031526032153735168,29176998,False,0,1031526032153735168,1031515975928213505,Mon Aug 20 12:59:05 +0000 2018,False,@silentworks @jakuuire @pharingee Nice! I’m ga...,en,silentworks,29176998,,
1,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 16]",0,1031500040408387584,1031503428672008193,29176998,False,0,1031503428672008193,1031500040408387584,Mon Aug 20 11:29:16 +0000 2018,False,@silentworks 💀💀💀,und,silentworks,29176998,,
2,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 140]",0,,1031263237025812485,,False,0,1031263237025812485,,Sun Aug 19 19:34:50 +0000 2018,False,RT @IbukunOg: Benjamin @DadaBen_ ! I’m so prou...,en,,,,
3,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 85]",0,,1031262871240486912,,False,0,1031262871240486912,,Sun Aug 19 19:33:23 +0000 2018,False,RT @Mz_Chi: Here is a link to the video if you...,en,,,False,
4,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 121]",0,,1031262859899154432,,False,0,1031262859899154432,,Sun Aug 19 19:33:20 +0000 2018,False,RT @Mz_Chi: I'm going to make a YouTube video ...,en,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 140]",0,,1236903007364632578,,False,0,1236903007364632578,,Mon Mar 09 06:33:52 +0000 2020,False,RT @DavisVilums: Exactly 5 years ago I started...,en,,,,
2700,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 97]",0,,1236902584012509184,,False,0,1236902584012509184,,Mon Mar 09 06:32:11 +0000 2020,False,RT @paulbatum: Come work with me on Azure Func...,en,,,False,
2701,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [{'text': 'dataviz', 'indices': [...","[0, 140]",0,,1235819868873605120,,False,0,1235819868873605120,,Fri Mar 06 06:49:51 +0000 2020,False,RT @sxywu: ICYMI MY LATEST🎊is a physical #data...,en,,,,
2702,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 59]",2,1235524406778032128,1235568746711756801,29176998,False,1,1235568746711756801,1235524406778032128,Thu Mar 05 14:11:59 +0000 2020,False,@silentworks https://t.co/mBQ2Zx0Rqt\n\nYou kn...,en,silentworks,29176998,False,


In [85]:
#splitting different user's mentioned into the same column
def get_user_mentions(entities):
    if entities is None:
        return []
    
    user_mentions_list = entities["user_mentions"]

    return [ single_mention["screen_name"] for single_mention in user_mentions_list]

twitter_tweets_split["users_mentioned"] = twitter_tweets_split["entities"].apply(get_user_mentions)

twitter_tweets_split["users_mentioned"]

0       [silentworks, jakuuire, pharingee]
1                            [silentworks]
2                     [IbukunOg, DadaBen_]
3                                 [Mz_Chi]
4                                 [Mz_Chi]
                       ...                
2699                         [DavisVilums]
2700                           [paulbatum]
2701                               [sxywu]
2702                         [silentworks]
2703                         [emeka_boris]
Name: users_mentioned, Length: 2704, dtype: object

In [86]:
#final tweets dataframe
twitter_tweets_final = twitter_tweets_split
twitter_tweets_final.columns

Index(['retweeted', 'source', 'entities', 'display_text_range',
       'favorite_count', 'in_reply_to_status_id_str', 'id_str',
       'in_reply_to_user_id', 'truncated', 'retweet_count', 'id',
       'in_reply_to_status_id', 'created_at', 'favorited', 'full_text', 'lang',
       'in_reply_to_screen_name', 'in_reply_to_user_id_str',
       'possibly_sensitive', 'extended_entities', 'users_mentioned'],
      dtype='object')

Here I will be performin sentiment analysis on the tweets. To do so I will first need to clean the contents of the tweets - specifically get rid of any text that says "RT" or  user names.

In [87]:
#Creating new dataframe and new features
twitter_tweets_sentiment = pd.DataFrame(twitter_tweets_final)
twitter_tweets_sentiment["clean_text"] = twitter_tweets_sentiment["full_text"]
twitter_tweets_sentiment


Unnamed: 0,retweeted,source,entities,display_text_range,favorite_count,in_reply_to_status_id_str,id_str,in_reply_to_user_id,truncated,retweet_count,...,created_at,favorited,full_text,lang,in_reply_to_screen_name,in_reply_to_user_id_str,possibly_sensitive,extended_entities,users_mentioned,clean_text
0,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 50]",0,1031515975928213505,1031526032153735168,29176998,False,0,...,Mon Aug 20 12:59:05 +0000 2018,False,@silentworks @jakuuire @pharingee Nice! I’m ga...,en,silentworks,29176998,,,"[silentworks, jakuuire, pharingee]",@silentworks @jakuuire @pharingee Nice! I’m ga...
1,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 16]",0,1031500040408387584,1031503428672008193,29176998,False,0,...,Mon Aug 20 11:29:16 +0000 2018,False,@silentworks 💀💀💀,und,silentworks,29176998,,,[silentworks],@silentworks 💀💀💀
2,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 140]",0,,1031263237025812485,,False,0,...,Sun Aug 19 19:34:50 +0000 2018,False,RT @IbukunOg: Benjamin @DadaBen_ ! I’m so prou...,en,,,,,"[IbukunOg, DadaBen_]",RT @IbukunOg: Benjamin @DadaBen_ ! I’m so prou...
3,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 85]",0,,1031262871240486912,,False,0,...,Sun Aug 19 19:33:23 +0000 2018,False,RT @Mz_Chi: Here is a link to the video if you...,en,,,False,,[Mz_Chi],RT @Mz_Chi: Here is a link to the video if you...
4,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 121]",0,,1031262859899154432,,False,0,...,Sun Aug 19 19:33:20 +0000 2018,False,RT @Mz_Chi: I'm going to make a YouTube video ...,en,,,,,[Mz_Chi],RT @Mz_Chi: I'm going to make a YouTube video ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 140]",0,,1236903007364632578,,False,0,...,Mon Mar 09 06:33:52 +0000 2020,False,RT @DavisVilums: Exactly 5 years ago I started...,en,,,,,[DavisVilums],RT @DavisVilums: Exactly 5 years ago I started...
2700,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 97]",0,,1236902584012509184,,False,0,...,Mon Mar 09 06:32:11 +0000 2020,False,RT @paulbatum: Come work with me on Azure Func...,en,,,False,,[paulbatum],RT @paulbatum: Come work with me on Azure Func...
2701,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [{'text': 'dataviz', 'indices': [...","[0, 140]",0,,1235819868873605120,,False,0,...,Fri Mar 06 06:49:51 +0000 2020,False,RT @sxywu: ICYMI MY LATEST🎊is a physical #data...,en,,,,,[sxywu],RT @sxywu: ICYMI MY LATEST🎊is a physical #data...
2702,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 59]",2,1235524406778032128,1235568746711756801,29176998,False,1,...,Thu Mar 05 14:11:59 +0000 2020,False,@silentworks https://t.co/mBQ2Zx0Rqt\n\nYou kn...,en,silentworks,29176998,False,,[silentworks],@silentworks https://t.co/mBQ2Zx0Rqt\n\nYou kn...


In [88]:
#Identifying tweets that are retweets
for index in range(len(twitter_tweets_sentiment)):
    if bool("RT" in twitter_tweets_sentiment["clean_text"][index]) == True:
        twitter_tweets_sentiment["retweeted"][index] = True
    else:
        twitter_tweets_sentiment["retweeted"][index] = False
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_tweets_sentiment["retweeted"][index] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_tweets_sentiment["retweeted"][index] = True


In [89]:
twitter_tweets_sentiment["retweeted"].value_counts()

True     1841
False     863
Name: retweeted, dtype: int64

In [90]:
#Removing RT, Punctuation etc
#create seperate column for RT

twitter_tweets_sentiment["clean_text"]= twitter_tweets_sentiment["clean_text"].replace(to_replace ='[RT]', value = '', regex = True)
twitter_tweets_sentiment["clean_text"]= twitter_tweets_sentiment["clean_text"].replace(to_replace ='@[\w:]+', value = '', regex = True)
twitter_tweets_sentiment["clean_text"] = twitter_tweets_sentiment.clean_text.str.lower()
twitter_tweets_sentiment["clean_text"][0:20]

0                                      nice! i’m game 💯
1                                                   💀💀💀
2       benjamin  ! i’m so proud of you! i see this ...
3       here is a link to the video if you be intere...
4       i'm going to make a youube video showing how...
5            from dusk till dawn https://t.co/bziziam9w
6       "maybe you don't understand"\nẹgbàmí, what w...
7       here is something that i discovered quite ea...
8       woohoo! my first linkedin article sharing my...
9       update!: #forlooppod is now available on iun...
10      at  2.0., one thing you are certain of getti...
11       oh: “be like a compiler, and ignore comments.”
12      important notice to developers of software t...
13      his would be good for    😂 and we can put  o...
14       hello ess,\ni am working on my master's deg...
15      when you remember that one edge case that yo...
16      on your first day at the new job, squash eve...
17      do you like statistics? do you dislike s

In [91]:
#remove emojis
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

twitter_tweets_sentiment["clean_text"] = twitter_tweets_sentiment["clean_text"].apply(remove_emoji)

In [92]:
#remove links
links = "((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)"
twitter_tweets_sentiment["clean_text"]= twitter_tweets_sentiment["clean_text"].replace(to_replace =links, value = '', regex = True)
twitter_tweets_sentiment["clean_text"][156]

'  his is part of a full day of ai programming worldwide! join us on wednesday at '

In [93]:
#remove punctuation
twitter_tweets_sentiment["clean_text"]= twitter_tweets_sentiment["clean_text"].replace(to_replace =r'[^\w\s]', value = '', regex = True)




In [94]:
#remove double white spaces
twitter_tweets_sentiment["clean_text"]= twitter_tweets_sentiment["clean_text"].replace(to_replace =r'[\s]{2,}', value = ' ', regex = True)


In [95]:
#remove white space from the start of sentences

twitter_tweets_sentiment["clean_text"] = twitter_tweets_sentiment["clean_text"].apply(lambda x: x.strip())
twitter_tweets_sentiment["clean_text"]

0                                            nice im game
1                                                        
2       benjamin im so proud of you i see this guy tak...
3        here is a link to the video if you be interested
4       im going to make a youube video showing how yo...
                              ...                        
2699    exactly 5 years ago i started my journey to vi...
2700    come work with me on azure functions for more ...
2701    icymi my laesis a physical dataviz of women in...
2702                                  you know what to do
2703                                          yes you can
Name: clean_text, Length: 2704, dtype: object

In [96]:
twitter_tweets_sentiment["full_text"][13]

'RT @Babajiide: This Would be good for @codebeast @segunfamisa @Soloxpress 😂 and we can put @unicodeveloper on watch list as he as a babe no…'

In [97]:
#Calculating sentiment using TextBlob. Polarity in this case is meant to show how positive or negative the tweet was, with super positive being 1 and negative being -1.

twitter_tweets_sentiment[['polarity', 'subjectivity']] = twitter_tweets_sentiment["clean_text"].apply(lambda text: pd.Series(TextBlob(text).sentiment))
twitter_tweets_sentiment[0:10]

Unnamed: 0,retweeted,source,entities,display_text_range,favorite_count,in_reply_to_status_id_str,id_str,in_reply_to_user_id,truncated,retweet_count,...,full_text,lang,in_reply_to_screen_name,in_reply_to_user_id_str,possibly_sensitive,extended_entities,users_mentioned,clean_text,polarity,subjectivity
0,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 50]",0,1.0315159759282136e+18,1031526032153735168,29176998.0,False,0,...,@silentworks @jakuuire @pharingee Nice! I’m ga...,en,silentworks,29176998.0,,,"[silentworks, jakuuire, pharingee]",nice im game,0.1,0.7
1,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 16]",0,1.0315000404083876e+18,1031503428672008193,29176998.0,False,0,...,@silentworks 💀💀💀,und,silentworks,29176998.0,,,[silentworks],,0.0,0.0
2,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 140]",0,,1031263237025812485,,False,0,...,RT @IbukunOg: Benjamin @DadaBen_ ! I’m so prou...,en,,,,,"[IbukunOg, DadaBen_]",benjamin im so proud of you i see this guy tak...,0.245455,0.684848
3,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 85]",0,,1031262871240486912,,False,0,...,RT @Mz_Chi: Here is a link to the video if you...,en,,,False,,[Mz_Chi],here is a link to the video if you be interested,0.25,0.5
4,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 121]",0,,1031262859899154432,,False,0,...,RT @Mz_Chi: I'm going to make a YouTube video ...,en,,,,,[Mz_Chi],im going to make a youube video showing how yo...,0.0,0.0
5,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'user_mentions': [{'name': 'The Chi', 'screen...","[0, 55]",0,,1031262697109835777,,False,0,...,RT @Mz_Chi: From dusk till dawn https://t.co/B...,en,,,False,{'media': [{'expanded_url': 'https://twitter.c...,[Mz_Chi],from dusk till dawn,0.0,0.0
6,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 115]",0,,1031205963964067842,,False,0,...,"RT @allenakinkunle: ""Maybe you don't understan...",ca,,,False,,[allenakinkunle],maybe you dont understand\nẹgbàmí what was he ...,0.0,0.0
7,True,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 139]",0,,1031147154709069824,,False,0,...,RT @asemota: There is something that I discove...,en,,,,,[asemota],here is something that i discovered quite earl...,0.152083,0.360417
8,True,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [{'text': 'WISPDEFCONSQUAD', 'ind...","[0, 120]",0,,1031074729610895360,,False,0,...,RT @Ebunsky: Woohoo! My first LinkedIn article...,en,,,False,,[Ebunsky],woohoo my first linkedin article sharing my de...,0.25,0.333333
9,True,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [{'text': 'forLoopPod', 'indices'...","[0, 140]",0,,1030796350580031489,,False,0,...,RT @forLoopAfrica: Update!: #forLoopPod is now...,en,,,,,[forloopAfrica],update forlooppod is now available on iunes no...,0.3,0.4


Twitter has multiple timestamps per day (as you can tweet more than once)
GCalender multiple timestamps per day (more than one meetings)
Oura data one per day

In [98]:
#Calculating Negative, Positive, Neutral and Compound values using Vader
import nltk
nltk.download('vader_lexicon')

for index, row in twitter_tweets_sentiment["clean_text"].iteritems():
    score = SentimentIntensityAnalyzer().polarity_scores(row)
    neg = score["neg"]
    neu = score["neu"]
    pos = score["pos"]
    comp = score["compound"]
    if score["neg"] > score["pos"]:
        twitter_tweets_sentiment.loc[index, "sentiment"] = 'negative'
    elif score["pos"] > score["neg"]:
        twitter_tweets_sentiment.loc[index, "sentiment"] = 'positive'
    else:
        twitter_tweets_sentiment.loc[index, "sentiment"] = 'neutral'
    twitter_tweets_sentiment.loc[index, "compound"] = score["compound"]

twitter_tweets_sentiment.head(10)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\oreog\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,retweeted,source,entities,display_text_range,favorite_count,in_reply_to_status_id_str,id_str,in_reply_to_user_id,truncated,retweet_count,...,in_reply_to_screen_name,in_reply_to_user_id_str,possibly_sensitive,extended_entities,users_mentioned,clean_text,polarity,subjectivity,sentiment,compound
0,False,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 50]",0,1.0315159759282136e+18,1031526032153735168,29176998.0,False,0,...,silentworks,29176998.0,,,"[silentworks, jakuuire, pharingee]",nice im game,0.1,0.7,positive,0.4215
1,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 16]",0,1.0315000404083876e+18,1031503428672008193,29176998.0,False,0,...,silentworks,29176998.0,,,[silentworks],,0.0,0.0,neutral,0.0
2,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 140]",0,,1031263237025812485,,False,0,...,,,,,"[IbukunOg, DadaBen_]",benjamin im so proud of you i see this guy tak...,0.245455,0.684848,positive,0.6113
3,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 85]",0,,1031262871240486912,,False,0,...,,,False,,[Mz_Chi],here is a link to the video if you be interested,0.25,0.5,positive,0.4019
4,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 121]",0,,1031262859899154432,,False,0,...,,,,,[Mz_Chi],im going to make a youube video showing how yo...,0.0,0.0,negative,-0.128
5,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'user_mentions': [{'name': 'The Chi', 'screen...","[0, 55]",0,,1031262697109835777,,False,0,...,,,False,{'media': [{'expanded_url': 'https://twitter.c...,[Mz_Chi],from dusk till dawn,0.0,0.0,neutral,0.0
6,True,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 115]",0,,1031205963964067842,,False,0,...,,,False,,[allenakinkunle],maybe you dont understand\nẹgbàmí what was he ...,0.0,0.0,neutral,0.0
7,True,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 139]",0,,1031147154709069824,,False,0,...,,,,,[asemota],here is something that i discovered quite earl...,0.152083,0.360417,positive,0.8176
8,True,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [{'text': 'WISPDEFCONSQUAD', 'ind...","[0, 120]",0,,1031074729610895360,,False,0,...,,,False,,[Ebunsky],woohoo my first linkedin article sharing my de...,0.25,0.333333,positive,0.7269
9,True,"<a href=""http://twitter.com/download/iphone"" r...","{'hashtags': [{'text': 'forLoopPod', 'indices'...","[0, 140]",0,,1030796350580031489,,False,0,...,,,,,[forloopAfrica],update forlooppod is now available on iunes no...,0.3,0.4,positive,0.2263


In [99]:
#drop empty rows



In [100]:
#example sentiment
print(twitter_tweets_sentiment["clean_text"][123])

#vader
print(twitter_tweets_sentiment["sentiment"][123])

#textblog
print(twitter_tweets_sentiment["polarity"][123])

last week wasnt too bad
positive
-0.3499999999999999


In [101]:
#final tweets dataframe

tweets_final_dataframe = twitter_tweets_sentiment

In [102]:
# get topics from tweets

# we can apply this to the cleaned tweets to see if there's something good there
tweets_final_dataframe["topics"] = tweets_final_dataframe["clean_text"].apply(lambda x: browsing_yake_extractor.extract_keywords(x))

In [103]:
tweets_final_dataframe["topics"][450]

[('walking freely today', 0.0042542192213185686),
 ('josiah was murdered', 0.015380821171891606),
 ('party in humboldt', 0.02570861714399338),
 ('white man', 0.02570861714399338),
 ('years ago', 0.02570861714399338)]

In [104]:
#Checking how many words contain short hand
shorthand = ["omg", "til", "tfwiw", "tgif", "icymi", "lol", "lool", "loool"]
count = 0

for word in shorthand:
    for text in twitter_tweets_sentiment["clean_text"]:
        if bool(word in text) == True:
            count = count + 1
            
count

70

Changing these words will need to be done when putting this into production

In [105]:
activity_watch_dataframe_final.columns

Index(['timestamp_utc', 'duration_seconds', 'app', 'title', 'keywords',
       'event_classification'],
      dtype='object')

## combine the rows together to form a single dataset 


activity_watch_summarized = summary_day, total_time_surfing, what_app_did_user_spend_x_percent_of_time_on (one column per percent) , events_usage (tuple: event_classification (encoded), sum_total_time)

In [106]:
oura_test_combined_final.columns

Index(['average_met', 'cal_active', 'cal_total_kcal', 'class_5min',
       'daily_movement', 'day_end', 'day_start', 'high_minutes',
       'inactive_minutes', 'inactivity_alerts', 'low_minutes',
       'medium_minutes', 'met_1min', 'met_min_high', 'met_min_inactive',
       'met_min_low', 'met_min_medium', 'non_wear_minutes', 'rest_minutes',
       'rest_mode_state_act_1', 'score_act_1', 'score_meet_daily_targets',
       'score_move_every_hour', 'score_recovery_time', 'score_stay_active',
       'score_training_frequency', 'score_training_volume', 'steps',
       'summary_date_act', 'target_calories', 'target_km', 'target_miles',
       'timezone_act', 'to_target_km', 'to_target_miles', 'total_act_1',
       'bedtime_end_act_1', 'bedtime_start_act_1', 'breath_average_act_1',
       'duration_act_1', 'period_id_act_1', 'summary_date_res', 'timezone_res',
       'hr_average_act_1', 'hr_lowest_act_1', 'rmssd_act_1', 'period_id_res_1',
       'rest_mode_state_res_1', 'score_res_1', 'scor

In [107]:
tweets_final_dataframe.columns

Index(['retweeted', 'source', 'entities', 'display_text_range',
       'favorite_count', 'in_reply_to_status_id_str', 'id_str',
       'in_reply_to_user_id', 'truncated', 'retweet_count', 'id',
       'in_reply_to_status_id', 'created_at', 'favorited', 'full_text', 'lang',
       'in_reply_to_screen_name', 'in_reply_to_user_id_str',
       'possibly_sensitive', 'extended_entities', 'users_mentioned',
       'clean_text', 'polarity', 'subjectivity', 'sentiment', 'compound',
       'topics'],
      dtype='object')

In [108]:
tweets_final_dataframe.iloc[1233]

retweeted                                                                False
source                       <a href="http://twitter.com/download/iphone" r...
entities                     {'hashtags': [], 'symbols': [], 'user_mentions...
display_text_range                                                     [0, 46]
favorite_count                                                               2
in_reply_to_status_id_str                                                  NaN
id_str                                                     1319278958156734470
in_reply_to_user_id                                                        NaN
truncated                                                                False
retweet_count                                                                2
id                                                         1319278958156734470
in_reply_to_status_id                                                      NaN
created_at                                      Thu 

In [109]:
g_calendar_final.columns

Index(['kind', 'etag', 'id', 'status', 'htmlLink', 'created', 'updated',
       'summary', 'creator', 'organizer', 'start', 'end', 'recurringEventId',
       'originalStartTime', 'visibility', 'iCalUID', 'sequence', 'reminders',
       'eventType', 'description', 'colorId', 'transparency', 'location',
       'attendees', 'guestsCanInviteOthers', 'privateCopy', 'source',
       'hangoutLink', 'conferenceData', 'extendedProperties',
       'guestsCanModify', 'guestsCanSeeOtherGuests', 'start_dateTime_utc',
       'timeZone_act', 'date_act', 'end_dateTime_utc', 'timeZone_res',
       'date_res'],
      dtype='object')

In [110]:
g_calendar_final.sample(10)

Unnamed: 0,kind,etag,id,status,htmlLink,created,updated,summary,creator,organizer,...,conferenceData,extendedProperties,guestsCanModify,guestsCanSeeOtherGuests,start_dateTime_utc,timeZone_act,date_act,end_dateTime_utc,timeZone_res,date_res
185,calendar#event,"""3228403934781000""",ac3gk0hrgh9im9tp203db3cd2s_20210415T020000Z,confirmed,https://www.google.com/calendar/event?eid=YWMz...,2021-01-20T12:30:44.000Z,2021-03-18T03:35:31.198Z,NeuroTech Hacking,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-04-14T19:00:00-07:00,America/Vancouver,,2021-04-14T20:30:00-07:00,America/Vancouver,
132,calendar#event,"""3222976715751000""",3pu3cntn8me9ckgl3me3pu7sb9_20210314T190000Z,confirmed,https://www.google.com/calendar/event?eid=M3B1...,2020-12-06T19:52:42.000Z,2021-03-14T03:25:53.253Z,Adaobi/Ore Weekend Sync,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,"{'entryPoints': [{'entryPointType': 'video', '...",,,,2021-03-14T12:00:00-07:00,America/Vancouver,,2021-03-14T13:00:00-07:00,America/Vancouver,
50,calendar#event,"""3225067660702000""",rganm03dr0t95v677lcj71ssoo,confirmed,https://www.google.com/calendar/event?eid=cmdh...,2021-02-05T14:03:50.000Z,2021-02-05T14:03:50.351Z,Imposter Syndrome in Software Engineering,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-02-11T10:00:00-08:00,,,2021-02-11T11:00:00-08:00,,
107,calendar#event,"""3227878782712000""",8mdt3atmbh8juob1m1iv4irr98,confirmed,https://www.google.com/calendar/event?eid=OG1k...,2021-02-14T19:40:52.000Z,2021-02-21T20:29:51.356Z,Umar Faruq Akinwunmi and Ọrẹ̀ Ògúndípẹ̀,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,{'createRequest': {'requestId': '1b57bdf4-d275...,,,,2021-02-21T12:30:00-08:00,,,2021-02-21T13:00:00-08:00,,
84,calendar#event,"""3225691056730000""",fev6bb43bdk2hn7ljsbn2at5bg_20210121T134500Z,confirmed,https://www.google.com/calendar/event?eid=ZmV2...,2021-01-20T12:30:44.000Z,2021-02-14T17:59:53.383Z,BrainDump Project Updates,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-01-21T05:45:00-08:00,America/Vancouver,,2021-01-21T06:45:00-08:00,America/Vancouver,
104,calendar#event,"""3227403442458000""",85pf1omdibd7uieghgtr7b1bs4,confirmed,https://www.google.com/calendar/event?eid=ODVw...,2021-02-19T02:28:41.000Z,2021-02-19T02:28:41.229Z,The cleanrooms where it happened - 10 years of...,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-02-25T17:00:00-08:00,,,2021-02-25T18:00:00-08:00,,
87,calendar#event,"""3225691056730000""",fev6bb43bdk2hn7ljsbn2at5bg_20210127T134500Z,confirmed,https://www.google.com/calendar/event?eid=ZmV2...,2021-01-20T12:30:44.000Z,2021-02-14T17:59:53.383Z,BrainDump Project Updates,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-01-27T05:45:00-08:00,America/Vancouver,,2021-01-27T06:45:00-08:00,America/Vancouver,
116,calendar#event,"""3229042140458000""",84ak0fgt67fh6ibhrqsci2qad0,confirmed,https://www.google.com/calendar/event?eid=ODRh...,2021-02-27T13:11:39.000Z,2021-02-28T14:04:30.229Z,Ada and Ọrẹ̀ Ògúndípẹ̀,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-02-28T06:30:00-08:00,,,2021-02-28T07:00:00-08:00,,
216,calendar#event,"""3235232815770000""",d7ocbgcqks135l97ah4ab9o63o,confirmed,https://www.google.com/calendar/event?eid=ZDdv...,2021-04-05T04:26:03.000Z,2021-04-05T09:53:27.885Z,Ore and Neo,{'email': 'neo@creativitykills.co'},{'email': 'neo@creativitykills.co'},...,{'createRequest': {'requestId': '58022a90-ebeb...,,,,2021-04-10T06:30:00-07:00,,,2021-04-10T06:45:00-07:00,,
200,calendar#event,"""3233681389790000""",2g2kk42fhecdbudvus11mrfldo,confirmed,https://www.google.com/calendar/event?eid=Mmcy...,2021-03-27T10:24:54.000Z,2021-03-27T10:24:54.895Z,a16z Jay Bradner on the future of bio & heal...,"{'email': 'oreogundipe@gmail.com', 'self': True}","{'email': 'oreogundipe@gmail.com', 'self': True}",...,,,,,2021-03-29T17:00:00-07:00,,,2021-03-29T18:00:00-07:00,,


## format and export activity watch data

In [54]:
# now we need to get an aggreate summarization on the existing dataste

activity_watch_dataframe_final[0:10]

Unnamed: 0,timestamp_utc,duration_seconds,app,title,keywords,event_classification
0,2021-02-27T15:07:29.345000+00:00,52.882,msedge.exe,ActivityWatch/activitywatch: The best free and...,"[(open-source automated time tracker, 0.002262...",activitywatch
1,2021-02-27T15:08:23.428000+00:00,0.0,msedge.exe,https://github.com/ActivityWatch and 22 more p...,[],msedge.exe
2,2021-02-27T15:08:24.630000+00:00,23.357,msedge.exe,ActivityWatch and 22 more pages - Personal - M...,"[(activitywatch, 0.15831692877998726)]",activitywatch
3,2021-02-27T15:08:49.183000+00:00,19.718,msedge.exe,Sponsor @ActivityWatch on GitHub Sponsors and ...,"[(activitywatch on github sponsors, 0.00377236...",github sponsors
4,2021-02-27T15:09:10.151000+00:00,4.846,SearchApp.exe,Search,,SearchApp.exe
5,2021-02-27T15:09:16.200000+00:00,13.051,explorer.exe,,,explorer.exe
6,2021-02-27T15:09:30.308000+00:00,3.712,aw-qt.exe,aw-qt,,aw-qt.exe
7,2021-02-27T15:09:35.229000+00:00,0.104,msedge.exe,Untitled and 23 more pages - Personal - Micros...,"[(untitled, 0.15831692877998726)]",untitled
8,2021-02-27T15:09:36.444000+00:00,0.089,msedge.exe,localhost:5600 and 23 more pages - Personal - ...,"[(localhost, 0.04491197687864554)]",localhost
9,2021-02-27T15:09:37.642000+00:00,22.687,msedge.exe,ActivityWatch and 23 more pages - Personal - M...,"[(activitywatch, 0.15831692877998726)]",activitywatch


In [60]:
from datetime import datetime, timedelta
# --- initial potential 
#activity_watch_summarized = summary_day, total_time_surfing, what_app_did_user_spend_x_percent_of_time_on (one column per percent) , events_usage (tuple: event_classification (encoded), sum_total_time)
# i realised that this can actually be better derived on the fly with visualizations
# when summarize this much, we lose a lot of granularity

activity_watch_dataframe_final["summary_day"] = activity_watch_dataframe_final['timestamp_utc'].apply(
    lambda timestamp: datetime.fromisoformat(timestamp).strftime("%Y-%m-%d"))

N=10
aw_summary_grouped = activity_watch_dataframe_final.groupby(["summary_day", "app", "event_classification"], as_index=False)
aw_summary_durations = aw_summary_grouped.agg({
                            'duration_seconds': 'sum'
                        })
# aw_summary_durations = aw_summary_durations.apply(lambda g: g.sort_values(by=["duration_seconds"], ascending=False).head(N))

aw_summary_durations


aw_summary_durations['summary_day_end'] = aw_summary_durations['summary_day'].apply(
    lambda value: (datetime.strptime(value, "%Y-%m-%d") + timedelta(days=1))
)

aw_summary_durations.to_csv('aw_data_grouped.csv', index=False)