# Notebook that analyses users' day-to-day routines



In [1]:
from glob import glob
import re
from datetime import datetime, timedelta
from collections import defaultdict
from progress.bar import Bar
import pandas as pd
import numpy as np
from wenet_models import LocationPoint
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [2]:
# Initialise plotly's offline notebook mode; run this before the iplot() calls below.
init_notebook_mode(connected=True)

In [3]:
# Glob pattern selecting every location CSV to load.
locations_glob_expr = '/idiap/temp/wdroz/locations/*.csv'

In [4]:
# Expand the pattern into the concrete list of CSV paths. glob() returns its
# matches in arbitrary, filesystem-dependent order, so sort them: the order of
# this list determines the order of the users further down, and therefore which
# users the "first"/"second" user plots show.
all_location_files = sorted(glob(locations_glob_expr))
# Cell output: number of files found.
len(all_location_files)

241

In [5]:
# user -> night -> activity vector; filled by the per-night loop further down.
user_night_activities = defaultdict(dict)
# The same vectors indexed the other way round: night -> user -> activity vector.
night_user_activities = defaultdict(dict)
# Captures the run of characters between a '/' and `_location.csv` in a file
# path (the run may not contain '/', '\' or '_'); it is used as the user id.
user_regex = re.compile(r'\/([^/\\_]+)_location\.csv')

In [6]:
# Load every location CSV into its own DataFrame (indexed by local time) and
# record, in the same order, the user id parsed from each file name.
df_list = []
users_list = []
bar = Bar("processing", max=len(all_location_files))
for location_file in all_location_files:
    bar.next()
    # Resolve the user before reading the file, so that a CSV which does not
    # follow the `<user>_location.csv` naming fails with a clear message rather
    # than an AttributeError on `None`.
    user_match = user_regex.search(location_file)
    if user_match is None:
        raise ValueError(f'cannot extract a user id from file name {location_file!r}')
    current_user = user_match.group(1)
    df = pd.read_csv(location_file)
    # `timestamp` plus the `timezone` offset, interpreted as seconds, yields a
    # naive local-time timestamp for each record.
    df['date'] = pd.to_datetime(df['timestamp'] + df['timezone'], unit='s')
    df = df.set_index('date')
    # Keep only the first record for any repeated timestamp.
    df = df[~df.index.duplicated(keep='first')]
    users_list.append(current_user)
    df_list.append(df)
bar.finish()

In [7]:
# Stack the per-file frames into a single DataFrame.
df_all = pd.concat(df_list)

In [8]:
# Cell output: preview of the first rows of the combined data.
df_all.head()

Unnamed: 0_level_0,userid,night,type,timestamp,timezone,local_time,source,latitude,longitude,speed,accuracy,provider,bearing
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2014-09-26 23:49:42,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768182,7200,20140926234942,357197052974055,47.254089,8.86881,0.0,20.0,network,0.0
2014-09-26 23:50:02,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768202,7200,20140926235002,357197052974055,47.254088,8.868793,0.0,20.0,network,0.0
2014-09-26 23:50:22,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768222,7200,20140926235022,357197052974055,47.254091,8.86879,0.0,20.0,network,0.0
2014-09-26 23:52:42,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768362,7200,20140926235242,357197052974055,47.254071,8.868794,0.0,20.0,network,0.0
2014-09-26 23:53:02,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768382,7200,20140926235302,357197052974055,47.254075,8.868798,0.0,20.0,network,0.0


In [9]:
# Cell output: the distinct `night` labels (YYYYMMDD values) present in the data.
df_all['night'].unique()

array([20140926, 20140927, 20140920, 20141003, 20141004, 20141010,
       20141011, 20141018, 20141017, 20141024, 20141025, 20141031,
       20141101, 20141107, 20141108, 20141114, 20141115, 20141121,
       20141122, 20141123, 20140919, 20141116, 20141128, 20141129,
       20141005, 20141023, 20141205, 20141219, 20141226, 20141227,
       20150116, 20141206, 20141019, 20140921, 20141012, 20141013,
       20141020, 20141026, 20150102, 20150109, 20150117, 20141212,
       20141213, 20150103, 20150110, 20141102], dtype=object)

In [10]:
# Cell output: earliest timestamp in the combined index.
df_all.index.min()

Timestamp('2014-09-19 20:00:10')

In [11]:
# Cell output: latest timestamp in the combined index.
df_all.index.max()

Timestamp('2015-01-18 03:15:59')

In [12]:
# Build, for every (night, user) pair, a binary activity vector with one entry
# per 30-minute slot from 20:00 to 04:00 inclusive (17 slots): 1 when the user
# has at least one location record in the slot, 0 otherwise. Users without any
# record for a night still get an all-zero vector.
for name, grouped in df_all.groupby('night'):
    # `night` is a YYYYMMDD label. Everything derived from it is identical for
    # all users, so it is computed once per night instead of once per user.
    night = str(name)
    start_date = datetime.strptime(f'{night} 20:00:00', '%Y%m%d %H:%M:%S')
    end_date = start_date + timedelta(hours=8)
    slots = pd.date_range(start=start_date, end=end_date, freq='30min')
    for user in users_list:
        # Only `latitude` is needed to detect activity. Counting its non-null
        # values per slot marks exactly the slots where a median would be
        # defined, without aggregating the text columns (which recent pandas
        # versions refuse to take a median of).
        latitudes = grouped.loc[grouped['userid'] == user, 'latitude']
        slot_counts = latitudes.resample('30min').count().reindex(slots, fill_value=0)
        activity_vector = (slot_counts > 0).astype(int).tolist()
        user_night_activities[user][night] = activity_vector
        night_user_activities[night][user] = activity_vector
        

In [48]:
# Bar chart showing, for each night (in sorted order), the percentage of users
# that have at least one active slot during that night.
nb_users = len(user_night_activities)

sorted_nights = sorted(night_user_activities)
night_labels = [f'N {night_key}' for night_key in sorted_nights]
active_user_percentages = [
    100 * (
        sum(1 for slots in night_user_activities[night_key].values() if sum(slots) >= 1)
        / nb_users
    )
    for night_key in sorted_nights
]
trace_night_activities = [go.Bar(y=active_user_percentages, x=night_labels)]

nights_axis = dict(title='Nights', showticklabels=False)
percent_axis = dict(title='Percent of users that have records', ticksuffix='%')
layout_night_actitivies = go.Layout(
    title='Nights binary activities',
    xaxis=nights_axis,
    yaxis=percent_axis,
)

fig = go.Figure(data=trace_night_activities, layout=layout_night_actitivies)
iplot(fig, filename='grouped-bar')

In [49]:
# Cell output: number of users that have activity vectors.
nb_users

241

In [74]:
def plot_user_activities(user, user_dict):
    """Plot one user's binary activity vectors, one step line per night.

    Nights whose vector contains no active slot are left out of the figure.

    Args:
        user: User identifier; only used in the figure title.
        user_dict: Mapping from a night label to that night's activity
            vector (a sequence of 0/1 values, one per half-hour slot).

    Returns:
        Whatever ``iplot`` returns for the rendered figure.
    """
    trace_activities = [
        go.Scatter(
            x=list(range(len(vector))),
            y=vector,
            line=dict(shape='hv'),
            name=night,
        )
        for night, vector in user_dict.items()
        if sum(vector) >= 1
    ]
    # The y values are this user's 0/1 activity flags per slot, not a
    # percentage of users, so the axis is labelled accordingly.
    layout_activities = go.Layout(
        title=f'Activities for user {user}',
        xaxis=dict(title='Night time (half-hour unit)'),
        yaxis=dict(title='Activity (1 = records in slot, 0 = none)'),
    )

    fig = go.Figure(data=trace_activities, layout=layout_activities)
    return iplot(fig, filename='user_activities')

In [75]:
# Plot the first user in the dictionary's iteration order.
user, user_dict = list(user_night_activities.items())[0]
plot_user_activities(user, user_dict)

In [76]:
# Plot the second user in the dictionary's iteration order.
user, user_dict = list(user_night_activities.items())[1]
plot_user_activities(user, user_dict)

In [105]:
# Split the user -> nights mapping into two parallel tuples. Note that this
# rebinds `users_list` (until now the list built while loading the files) to a
# tuple of the dictionary's keys.
users_list, user_dict_list = zip(*user_night_activities.items())
# Flatten every user's nights into one list, keeping only vectors that have at
# least one active slot.
all_vectors = [
    night_vector
    for nights_of_user in user_dict_list
    for night_vector in nights_of_user.values()
    if sum(night_vector) > 0
]
# Cell output: the first five vectors.
all_vectors[:5]

[[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
 [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
 [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]

In [100]:
import plotly.figure_factory as ff
import numpy as np
# Dendrogram over a small sample: the first five activity vectors.
dendro = ff.create_dendrogram(np.array(all_vectors[:5]))
iplot(dendro, filename='sample_dendrogram')

In [103]:
# Dendrogram over the first 50 activity vectors.
# NOTE(review): the filename says 'all' but only a 50-vector slice is passed —
# confirm whether the full set was intended.
dendro = ff.create_dendrogram(np.array(all_vectors[:50]))
iplot(dendro, filename='all_dendrogram')

In [104]:
# Cell output: number of activity vectors with at least one active slot.
len(all_vectors)

1748