# Notebook that create user day representation



In [40]:
from glob import glob
import re
from copy import deepcopy
from datetime import datetime, timedelta
from collections import defaultdict
from progress.bar import Bar
import pandas as pd
import numpy as np
from wenet_models import LocationPoint, UserPlaceTimeOnly
from wenet_algo import estimate_stay_points, estimate_stay_regions, labelize_stay_region
from wenet_tools import time_difference_ms
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [2]:
init_notebook_mode(connected=True)

In [3]:
locations_glob_expr = '/idiap/temp/wdroz/locations/*.csv'

In [15]:
df_ambiance = pd.read_csv('/idiap/temp/wdroz/wenet/surveys/ambiance_survey.csv', sep=',', encoding="ISO-8859-1")

In [4]:
all_location_files = glob(locations_glob_expr)
len(all_location_files)

241

In [68]:
user_regex = re.compile(r'\/([^/\\_]+)_location\.csv')

In [6]:
df_list = []
users_list = []
bar = Bar("processing", max=len(all_location_files))
for location_file in all_location_files:
    bar.next()
    df = pd.read_csv(location_file)
    df['date'] = pd.to_datetime(df['timestamp'] + df['timezone'], unit='s')
    df = df.set_index('date')
    df = df[~df.index.duplicated(keep='first')]
    current_user = re.search(user_regex, location_file).group(1)
    users_list.append(current_user)
    df_list.append(df)
bar.finish()

In [7]:
df_all = pd.concat(df_list)

In [8]:
df_all.head()

Unnamed: 0_level_0,userid,night,type,timestamp,timezone,local_time,source,latitude,longitude,speed,accuracy,provider,bearing
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2014-09-26 23:49:42,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768182,7200,20140926234942,357197052974055,47.254089,8.86881,0.0,20.0,network,0.0
2014-09-26 23:50:02,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768202,7200,20140926235002,357197052974055,47.254088,8.868793,0.0,20.0,network,0.0
2014-09-26 23:50:22,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768222,7200,20140926235022,357197052974055,47.254091,8.86879,0.0,20.0,network,0.0
2014-09-26 23:52:42,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768362,7200,20140926235242,357197052974055,47.254071,8.868794,0.0,20.0,network,0.0
2014-09-26 23:53:02,02f95f41-cc1c-42d9-ab19-ef86a2fbbf4e,20140926,Location,1411768382,7200,20140926235302,357197052974055,47.254075,8.868798,0.0,20.0,network,0.0


In [9]:
df_all['night'].unique()

array([20140926, 20140927, 20140920, 20141003, 20141004, 20141010,
       20141011, 20141018, 20141017, 20141024, 20141025, 20141031,
       20141101, 20141107, 20141108, 20141114, 20141115, 20141121,
       20141122, 20141123, 20140919, 20141116, 20141128, 20141129,
       20141005, 20141023, 20141205, 20141219, 20141226, 20141227,
       20150116, 20141206, 20141019, 20140921, 20141012, 20141013,
       20141020, 20141026, 20150102, 20150109, 20150117, 20141212,
       20141213, 20150103, 20150110, 20141102], dtype=object)

In [12]:
df_all.index.min()

Timestamp('2014-09-19 20:00:10')

In [13]:
df_all.index.max()

Timestamp('2015-01-18 03:15:59')

In [98]:
def get_locations_from_df(df):
    locations = []
    for index, row in df.iterrows():
        try:
            accuracy = row['accuracy']
            pts_t = datetime.fromtimestamp(row['timestamp'])
            location = LocationPoint(pts_t, row['latitude'], row['longitude'], accuracy)
            locations.append(location)
        except ValueError:
            locations.append(None)
    return locations

In [18]:
def get_labelled_stay_regions(df, stay_regions):
    user_places = []
    for index, row in df.iterrows():
        pts_t = datetime.strptime(row['timestamp'], "%Y-%m-%d %H:%M:%S")
        if row['place_type'] == 'personal':
            place = row['place_id_name']
        else:
            place = row['place_type']
        user_place_time_only = UserPlaceTimeOnly(pts_t, place, user)
        user_place = user_place_time_only.to_user_place_from_stay_points(stay_points, max_delta_time_ms=1000*60*3)
        if user_place is not None:
            user_places.append(user_place)
    labelled_stay_regions = labelize_stay_region(stay_regions, user_places)
    stay_regions_set = set(stay_regions) - labelled_stay_regions
    return labelled_stay_regions

In [29]:
users_labelled_stay_regions = dict()
users_stay_regions = dict()
for user in users_list:
    df_user_locations = df_all[df_all['userid'] == user]
    user_locations = get_locations_from_df(df_user_locations)
    stay_points = estimate_stay_points(user_locations)
    if len(stay_points) < 1:
        continue
    df_user_ambiance = df_ambiance[df_ambiance['user'] == user]
    stay_regions = estimate_stay_regions(stay_points, distance_threshold_m=20)
    labelled_stay_regions = get_labelled_stay_regions(df_user_ambiance, stay_regions)
    c = labelled_stay_regions
    users_stay_regions[user] = stay_regions

In [30]:
len(users_labelled_stay_regions)

148

In [31]:
len(users_list)

241

In [39]:
set([l._label for user, regions in users_labelled_stay_regions.items() for l in regions])

{'bar',
 'club',
 'events',
 'friend_home',
 'home_no_parents',
 'home_with_parents',
 'other_cabane en foret avec parents',
 'other_friends place',
 'public',
 'restaurant',
 'school'}

In [42]:
regions_mapping = { 'no_data' : 0,
                    'unknow' : 1,
                    'unknow_region' : 2,
                    'bar' : 3,
                    'club' : 4,
                    'events' : 5,
                    'friend_home' : 6,
                    'home_no_parents' : 7,
                    'home_with_parents' : 8,
                    'other_cabane en foret avec parents' : 9,
                    'other_friends place' : 9,
                    'public' : 10,
                    'restaurant': 11,
                    'school': 12}

In [104]:
def get_locations_from_df_without_time(df):
    locations = []
    for index, row in df.iterrows():
        try:
            accuracy = row['accuracy']
            pts_t = None
            location = LocationPoint(pts_t, row['latitude'], row['longitude'], accuracy)
            locations.append(location)
        except ValueError:
            locations.append(None)
    return locations


def create_bag_of_words(df, labelled_stay_regions, stay_regions, regions_mapping):
    big_vector = []
    inner_vector = [0]*(max(regions_mapping.values()) + 1)
    locations = get_locations_from_df_without_time(df)
    for location in locations:
        current_vector = deepcopy(inner_vector)
        if location is None or np.isnan(location._lat):
            current_vector[regions_mapping['no_data']] = 1
        else:
            is_in_region = False
            for region in labelled_stay_regions:
                if location in region:
                    current_vector[regions_mapping[region._label]] = 1
                    is_in_region = True
                    break
            for region in stay_regions:
                if location in region:
                    current_vector[regions_mapping['unknow_region']] = 1
                    is_in_region = True
                    break
            if not is_in_region:
                current_vector[regions_mapping['unknow']] = 1
        big_vector += current_vector
    return big_vector

In [105]:
user_night_activities = defaultdict(dict)
night_user_activities = defaultdict(dict)
for name, grouped in df_all.groupby('night'):
    for user in users_labelled_stay_regions.keys():
        df_user_night = grouped[grouped['userid'] == user]
        night = str(name)
        year = night[:4]
        month = night[4:6]
        days = night[6:]
        start_date = datetime.strptime(f'{year}-{month}-{days} 20:00:00', "%Y-%m-%d %H:%M:%S")
        end_date = start_date + timedelta(hours=8)
        df_median = df_user_night.resample('30T').median()
        df_user_activity = df_median.reindex(pd.date_range(start=start_date, end=end_date, freq='30T'))
        labelled_stay_regions = users_labelled_stay_regions[user]
        stay_regions = users_stay_regions[user]
        activity_vector = create_bag_of_words(df_user_activity, labelled_stay_regions, stay_regions, regions_mapping)
        user_night_activities[user][str(night)] = activity_vector
        night_user_activities[str(night)][user] = activity_vector

In [133]:
from sklearn.decomposition import LatentDirichletAllocation
X = [v for user, nights in user_night_activities.items() for night, v in nights.items()]
lda = LatentDirichletAllocation(n_components=15, random_state=0, n_jobs=-1)
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=15, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [134]:
user_night_activities['0387ce90-5e83-4ee8-a3ee-bc2e09877249']['20140919']

[1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [135]:
lda.transform([user_night_activities['0387ce90-5e83-4ee8-a3ee-bc2e09877249']['20140919']])

array([[0.00370371, 0.0037037 , 0.00370371, 0.00370371, 0.00370371,
        0.23967466, 0.23986935, 0.00370371, 0.0037037 , 0.23989581,
        0.0037037 , 0.23981938, 0.00370371, 0.00370371, 0.00370371]])

In [136]:
Y = lda.transform(X)
trace = go.Histogram(x=[np.argmax(y) for y in Y],
                     histnorm='probability')

data = [trace]

# Plot and embed in ipython notebook!
iplot(data, filename='basic-scatter')

In [138]:
lda.transform(X[:5])

array([[0.00370371, 0.0037037 , 0.00370371, 0.00370371, 0.00370371,
        0.23967466, 0.23986935, 0.00370371, 0.0037037 , 0.23989581,
        0.0037037 , 0.23981938, 0.00370371, 0.00370371, 0.00370371],
       [0.00370371, 0.44642035, 0.00370371, 0.00370371, 0.0037037 ,
        0.0037037 , 0.0037037 , 0.00370371, 0.00370371, 0.0037037 ,
        0.00370371, 0.0037037 , 0.0037037 , 0.00370371, 0.50543147],
       [0.00370371, 0.0037037 , 0.00370371, 0.00370371, 0.00370371,
        0.23967466, 0.23986935, 0.00370371, 0.0037037 , 0.23989581,
        0.0037037 , 0.23981938, 0.00370371, 0.00370371, 0.00370371],
       [0.00370371, 0.00370371, 0.00370371, 0.00370371, 0.0037037 ,
        0.00370371, 0.00370371, 0.00370371, 0.0037037 , 0.00370371,
        0.00370371, 0.00370371, 0.00370371, 0.00370371, 0.9481481 ],
       [0.00370371, 0.09678881, 0.00370371, 0.00370371, 0.0037037 ,
        0.00370371, 0.00370371, 0.00370371, 0.00370371, 0.00370371,
        0.00370371, 0.00370371, 0.00370371, 

In [144]:
user_specific = list(user_night_activities['0387ce90-5e83-4ee8-a3ee-bc2e09877249'].values())
user_specific_results = lda.transform(user_specific)
np.mean(user_specific_results, axis=0)

array([0.00370371, 0.01737748, 0.00781428, 0.00370371, 0.00370371,
       0.20376604, 0.2039311 , 0.00370371, 0.0037037 , 0.20395354,
       0.00370371, 0.22030956, 0.00370371, 0.00370371, 0.11321833])

In [146]:
user_specific = list(user_night_activities['05f35693-7fec-4372-af78-7bd904c187e0'].values())
user_specific_results = lda.transform(user_specific)
np.mean(user_specific_results, axis=0)

array([0.04769359, 0.00370371, 0.02890034, 0.0132094 , 0.04570527,
       0.1664457 , 0.1666297 , 0.05171471, 0.00370371, 0.2052659 ,
       0.00556423, 0.16654929, 0.0159967 , 0.019215  , 0.05970276])