## Adding necessary imports

In [106]:
import numpy as np
import pandas as pd
from datetime import datetime
import plotly
import plotly.express as px
import plotly.graph_objs as go
%matplotlib inline

## Method for reading the dataset of i-th User

In [107]:
def read_user_data(userno):
    """Load one user's behacom CSV using memory-friendly column dtypes.

    The dtypes are inferred from the first 1000 rows and then narrowed:

    * ``timestamp`` stays ``float64``;
    * columns whose name contains ``average`` or ``stddev``, and any column
      inferred as ``float64``, become ``float32``;
    * integer columns whose name starts with ``press`` or contains
      ``counter`` / ``usage`` become ``int8``, other integer columns
      ``int32`` -- but only when every value in the file fits. A column that
      does not fit keeps the next wider integer type instead of overflowing.
    * every other column keeps its inferred dtype.

    Parameters
    ----------
    userno : int
        User number; the file read is
        ``data/User{userno}/User{userno}_behacom.csv``.

    Returns
    -------
    pandas.DataFrame
        The complete file with the dtypes described above.
    """
    filename = f"data/User{userno}/User{userno}_behacom.csv"
    print(f'[INFO] reading file <{filename}>...')

    # Infer dtypes from the first 1000 rows. ``nrows`` reads the same rows a
    # first chunk of size 1000 would, without leaving a chunk reader open.
    sample = pd.read_csv(filename, encoding='latin-1', nrows=1000)

    column_types = {}
    int_targets = {}  # integer column -> narrowest dtype we would like to use
    for col, dtype in sample.dtypes.items():
        inferred = dtype.name
        if col == 'timestamp':
            column_types[col] = 'float64'
        elif 'average' in col or 'stddev' in col or inferred == 'float64':
            column_types[col] = 'float32'
        elif inferred == 'int64':
            # Parse at full width first; narrowing happens after the value
            # range of the whole file is known.
            column_types[col] = 'int64'
            is_small_counter = (
                col.startswith('press') or 'counter' in col or 'usage' in col
            )
            int_targets[col] = 'int8' if is_small_counter else 'int32'
        else:
            column_types[col] = inferred

    data_user = pd.read_csv(filename, encoding='latin-1', dtype=column_types)

    # Narrow each integer column to the smallest candidate type that can hold
    # all of its values; columns that fit nowhere stay int64.
    safe_casts = {}
    for col, preferred in int_targets.items():
        values = data_user[col]
        if values.empty:
            safe_casts[col] = preferred
            continue
        lowest, highest = values.min(), values.max()
        for candidate in (preferred, 'int32'):
            limits = np.iinfo(candidate)
            if limits.min <= lowest and highest <= limits.max:
                safe_casts[col] = candidate
                break
    if safe_casts:
        data_user = data_user.astype(safe_casts)

    return data_user

## Task 1: Read dataset
All user datasets are read iteratively and stored in a list, ```df_users```; a progress message is printed for each file as it is read.
Note that the ```timestamp``` feature is converted to ```datetime``` and stored in the dataframe as ```date_time```.

In [108]:
# Load every user's dataset and record how many rows each one has.
# NOTE(review): the markdown further down refers to users up to 11 and to
# "User 7", while only 3 users are loaded here -- confirm the intended count.
total_users = 3
df_users = []
basic_info = {'name': [], 'length': []}
for i in range(total_users):
    df = read_user_data(i)
    # `timestamp` holds epoch milliseconds. Stating the unit explicitly keeps
    # the conversion independent of how `astype` interprets float values.
    df['date_time'] = pd.to_datetime(df['timestamp'], unit='ms')
    df_users.append(df)
    basic_info['name'].append(f'User {i}')
    basic_info['length'].append(df.shape[0])

[INFO] reading file <data/User0/User0_behacom.csv>...
[INFO] reading file <data/User1/User1_behacom.csv>...
[INFO] reading file <data/User2/User2_behacom.csv>...


## Task 2 & 3: Dataset overview
In the first figure we see the volume of data/input per user and observe that User 7 contains the most and User 2 the least amount of data.


In [109]:
# Turn the collected name/length lists into a frame and draw one bar per user.
basic_info = pd.DataFrame(basic_info)
fig = px.bar(
    basic_info,
    x='name',
    y='length',
    color='name',
    title='Distribution of data per user',
)
fig.show()

In the following figure we plot the keystroke trend per user over time. First, the ```date_time``` feature is grouped at daily frequency, then ```keystroke_counter``` is summed for each day. Finally, the result is plotted. Note that each user's _(users 0 to 11)_ first and last input dates are also shown in the figure's legend.

In [110]:
# One line per user: total keystrokes per calendar day.
fig = go.Figure()
for i in range(total_users):
    df_grp_keystroke = (
        df_users[i]
        .groupby(pd.Grouper(key='date_time', freq='D'))
        .agg({"keystroke_counter": "sum"})
        .reset_index()
        .sort_values(by='date_time')
    )
    # First and last day covered by this user's data, shown in the legend.
    calendar_days = df_grp_keystroke['date_time'].dt.date
    startDate = calendar_days.min()
    endDate = calendar_days.max()
    fig.add_trace(
        go.Scatter(
            x=df_grp_keystroke['date_time'],
            y=df_grp_keystroke['keystroke_counter'],
            name=f'User {i}, start: {startDate}, end: {endDate}',
        )
    )

fig.update_layout(title='Keystroke trend per user over time')
fig.show()

Here the ```date_time``` feature is again grouped at daily frequency, but this time the number of inputs (rows) recorded per day is counted.

In [111]:
# One line per user: number of recorded rows per calendar day.
fig = go.Figure()
for i in range(total_users):
    rows_per_day = df_users[i].groupby(pd.Grouper(key='date_time', freq='D')).size()
    df_grp_input_trend = (
        rows_per_day
        .to_frame(name='counts')
        .reset_index()
        .sort_values(by='date_time')
    )
    fig.add_trace(
        go.Scatter(
            x=df_grp_input_trend['date_time'],
            y=df_grp_input_trend['counts'],
            name=f'User {i}',
        )
    )

fig.update_layout(title='Daily input distribution per user')
fig.show()

## Task 7: Insight about user's behaviors
One interesting fact to learn is each user's most used application. Here we group the dataset by ```current_app``` and sum ```current_app_foreground_time```; the app with the maximum foreground time is then stored for each user. This information is plotted in the first figure. From the second figure we learn which app is most often a user's top app.

In [141]:
columns = ['user', 'current_app', 'current_app_foreground_time']
data = []
for i in range(total_users):
    # Total foreground time per application for this user ...
    df_current_app = df_users[i].groupby(pd.Grouper(key='current_app')).agg({"current_app_foreground_time": "sum"}).reset_index()
    # ... and the single application with the largest total.
    row_current_app = df_current_app.loc[df_current_app['current_app_foreground_time'].idxmax()]
    data.append([f'User {i}', row_current_app.current_app, row_current_app.current_app_foreground_time])
# most used app by user
df_current_app = pd.DataFrame(data=data, columns=columns)
# regex=False removes the literal text ".exe"; as a regular expression the
# leading "." would match any character.
df_current_app['current_app'] = df_current_app['current_app'].str.replace('.exe', '', regex=False)
fig = px.bar(df_current_app, x='user', y='current_app_foreground_time', color='current_app', title='Most active/used app per User')
fig.show()
# most used app count: for how many users each app is the top app
df_current_app = df_current_app.groupby(pd.Grouper(key='current_app')).size().to_frame(name='counts').reset_index()
fig = px.bar(df_current_app, x='current_app', y='counts', color='current_app', title='Most active/used app')
fig.show()


The default value of regex will change from True to False in a future version.



Another interesting observation: the paper describing the dataset contains a typo in a feature name. On page 6, in the first row of Table 4, ```click_speed_aveage_N``` should be ```click_speed_average_N```.

## Task 4 & 5

In [5]:
# Work on a chronologically ordered copy of User 0's data.
df_user0 = df_users[0].sort_values(by='date_time')
# Preview the first rows so the table recorded below is reproduced on re-run.
df_user0.head()

Unnamed: 0,timestamp,keystroke_counter,erase_keys_counter,erase_keys_percentage,press_press_average_interval,press_press_stddev_interval,press_release_average_interval,press_release_stddev_interval,word_counter,word_average_length,...,system_average_cpu,system_stddev_cpu,current_app_average_mem,current_app_stddev_mem,system_average_mem,system_stddev_mem,received_bytes,sent_bytes,USER,date_time
0,1574257000000.0,14,2,0.14,200.690002,165.470001,107.0,36.450001,2,5.0,...,13.03,14.42,11.07,0.0,1.15,2.57,1618568.0,274869.0,0,2019-11-20 13:44:15.128
1,1574258000000.0,37,3,0.08,825.690002,1842.589966,132.679993,96.690002,2,10.5,...,17.5,6.45,1.44,0.08,12.71,0.97,14835273.0,449913.0,0,2019-11-20 13:45:18.073
2,1574258000000.0,24,0,0.0,2399.870117,5181.549805,122.220001,28.139999,3,5.0,...,19.73,4.89,11.66,0.56,16.5,6.15,5097547.0,272460.0,0,2019-11-20 13:46:18.881
3,1574258000000.0,28,0,0.0,2087.780029,4272.740234,308.109985,272.179993,1,3.0,...,13.6,3.18,12.55,1.31,25.07,2.55,3926179.0,293861.0,0,2019-11-20 13:47:19.398
4,1574258000000.0,71,6,0.08,421.290009,1260.380005,153.820007,138.589996,5,11.2,...,16.09,8.56,12.89,1.32,25.719999,2.54,2106149.0,241120.0,0,2019-11-20 13:48:20.304


In [6]:
# Print a summary of User 0's frame: index range, column count, dtypes and memory usage.
df_user0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6059 entries, 0 to 6058
Columns: 12052 entries, timestamp to date_time
dtypes: datetime64[ns](1), float32(6022), float64(1), int32(138), int8(5888), object(2)
memory usage: 176.6+ MB


In [43]:
# Features whose 10-minute (600 s) sums are combined into `user_activity`.
activity_features = [
    'keystroke_counter',
    'mouse_average_movement_duration',
    'changes_between_apps',
    'click_speed_average_0',
    'click_speed_average_1',
    'click_speed_average_2',
    'click_speed_average_3',
]
# Named aggregations: total_<feature> = sum of <feature> within each bin.
activity_sums = {f'total_{feature}': (feature, 'sum') for feature in activity_features}

df_grp_user0 = (
    df_user0
    .groupby(pd.Grouper(key='date_time', freq='600s'))
    .agg(**activity_sums)
    .reset_index()
)

# Add the totals one after another, in the order listed above.
total_columns = list(activity_sums)
activity_total = df_grp_user0[total_columns[0]]
for total_column in total_columns[1:]:
    activity_total = activity_total + df_grp_user0[total_column]
df_grp_user0['user_activity'] = activity_total

In [44]:
# Print a summary of the 10-minute aggregate: row count, per-column non-null counts and dtypes.
df_grp_user0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7515 entries, 0 to 7514
Data columns (total 9 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   date_time                              7515 non-null   datetime64[ns]
 1   total_keystroke_counter                7515 non-null   float64       
 2   total_mouse_average_movement_duration  7515 non-null   float32       
 3   total_changes_between_apps             7515 non-null   int32         
 4   total_click_speed_average_0            7515 non-null   float32       
 5   total_click_speed_average_1            7515 non-null   float32       
 6   total_click_speed_average_2            7515 non-null   float32       
 7   total_click_speed_average_3            7515 non-null   float32       
 8   user_activity                          7515 non-null   float64       
dtypes: datetime64[ns](1), float32(5), float64(2), int32(1)
memory us

In [101]:
# Preview the first rows of the 10-minute aggregate.
# NOTE(review): the saved output of this cell lists `md5`, `log10` and
# `new_idx`, which are only created by cells further down -- the cell was
# executed out of order, so a fresh top-to-bottom run will not show them here.
df_grp_user0.head()

Unnamed: 0,date_time,total_keystroke_counter,total_mouse_average_movement_duration,total_changes_between_apps,total_click_speed_average_0,total_click_speed_average_1,total_click_speed_average_2,total_click_speed_average_3,user_activity,md5,log10,hour_of_day,new_idx
0,2019-11-20 13:40:00,194.0,1893.25,10,831.33,111.0,435.299988,0.0,3474.88,0.0,0.0,13,0
1,2019-11-20 13:50:00,18.0,868.77002,10,651.11,0.0,92.0,0.0,1639.88,0.0,0.0,13,1
2,2019-11-20 14:00:00,128.0,1871.420044,10,1410.13,296.5,290.0,0.0,4006.05,0.0,0.0,14,2
3,2019-11-20 14:10:00,200.0,2600.290039,17,524753100000.0,0.0,821.119995,0.0,524753100000.0,0.0,0.0,14,3
4,2019-11-20 14:20:00,341.0,1762.530029,22,1646.97,0.0,475.0,0.0,4247.5,104950600000.0,11.020985,14,4


In [45]:
# Moving average of `user_activity` over 5 consecutive bins; the leading rows
# that do not yet have a full window are set to 0.
activity_windows = df_grp_user0['user_activity'].rolling(5)
df_grp_user0['md5'] = activity_windows.mean().fillna(0)

In [46]:
# Log-scale the moving average. Values <= 0 have no finite logarithm, so they
# are masked to NaN before the call (this avoids the divide-by-zero warning)
# and end up as 0, as do any infinite results.
# The calls are chained and assigned back in one step: an in-place `replace`
# on a column selection is not guaranteed to modify the frame itself.
positive_md5 = df_grp_user0['md5'].where(df_grp_user0['md5'] > 0)
df_grp_user0['log10'] = (
    np.log10(positive_md5)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)


divide by zero encountered in log10



In [28]:
# Count the missing values in every column of the aggregate.
df_grp_user0.isna().sum()

date_time                                0
total_keystroke_counter                  0
total_mouse_average_movement_duration    0
total_changes_between_apps               0
total_click_speed_average_0              0
total_click_speed_average_1              0
total_click_speed_average_2              0
total_click_speed_average_3              0
user_activity                            0
md5                                      0
log10                                    0
dtype: int64

In [47]:
# Scatter of the log-scaled moving average against time, one marker per bin.
fig = go.Figure()
log_activity_points = go.Scatter(
    x=df_grp_user0['date_time'],
    y=df_grp_user0['log10'],
    mode='markers',
)
fig.add_trace(log_activity_points)
fig.show()

### Info
1. There is no data on Sundays


In [100]:
# Attach a 0-based running position to every aggregated row.
row_count = len(df_grp_user0)
df_grp_user0['new_idx'] = range(row_count)

In [50]:
# Hour of day (0-23) taken from each row's timestamp.
df_user0['hour_of_day'] = df_user0['date_time'].dt.hour

# Features whose per-hour sums are combined into `user_activity`.
hourly_features = [
    'keystroke_counter',
    'mouse_average_movement_duration',
    'changes_between_apps',
    'click_speed_average_0',
    'click_speed_average_1',
    'click_speed_average_2',
    'click_speed_average_3',
]
# Named aggregations: total_<feature> = sum of <feature> within each hour.
hourly_sums = {f'total_{feature}': (feature, 'sum') for feature in hourly_features}

df_grp2_user0 = (
    df_user0
    .groupby(pd.Grouper(key='hour_of_day'))
    .agg(**hourly_sums)
    .reset_index()
)

# Add the totals one after another, in the order listed above.
hourly_total_columns = list(hourly_sums)
hourly_activity = df_grp2_user0[hourly_total_columns[0]]
for hourly_total_column in hourly_total_columns[1:]:
    hourly_activity = hourly_activity + df_grp2_user0[hourly_total_column]
df_grp2_user0['user_activity'] = hourly_activity

In [51]:
# Preview the first rows of the per-hour aggregate.
df_grp2_user0.head()

Unnamed: 0,hour_of_day,total_keystroke_counter,total_mouse_average_movement_duration,total_changes_between_apps,total_click_speed_average_0,total_click_speed_average_1,total_click_speed_average_2,total_click_speed_average_3,user_activity
0,13,2634.0,47771.558594,62,525176900000.0,2850.580078,7001.160156,0.0,525177000000.0
1,14,1669.0,12679.520508,78,1180759000000.0,296.5,2304.449951,0.0,1180760000000.0
2,15,3375.0,27691.400391,149,599579600000.0,1925.420044,7971.709961,0.0,599579600000.0
3,16,4743.0,40354.621094,163,3970916000000.0,1522.01001,8019.540039,0.0,3970916000000.0
4,18,8835.0,96267.023438,59,1161061000000.0,2672.5,11562.75,0.0,1161061000000.0


## Activeness of User 0 over a day
From this histogram we can track user's sleep activity.

We observe that there is little or no activity from 00:00 until 08:00, so this is probably the user's sleep time. The user is also highly active around midday, at the end of the working day, and into the night.

In [98]:
# Share of User 0's total activity falling into each hour of the day.
fig = px.histogram(
    df_grp2_user0,
    x="hour_of_day",
    y='user_activity',
    nbins=24,
    histnorm='probability',
    title='Activeness of User 0 over hour of day',
)
fig.show()