In [19]:
__author__ = 'thomas'

# Alright ! Let's do this
# We have csv file containing data from Ally app
# The idea will be to extract as many insights as possible
# Let's first start by importing the file


# import libraries
import numpy as np, matplotlib.pyplot as plt, matplotlib, pandas as pd, os

# Import the data file as a Pandas DataFrame
path = os.getcwd()
rq = pd.read_csv(path + '/data/route_queries.csv')

In [20]:
# Check the columns and their data types

print(rq.dtypes)

user_id                    object
query_time                 object
search_time                object
search_mode                object
search_origin_lat         float64
search_origin_lon         float64
search_destination_lat    float64
search_destination_lon    float64
dtype: object


In [94]:
# Convert dates from object to date type:

rq['query_time'] = (rq['query_time'].apply(pd.to_datetime))
rq['search_time'] = (rq['search_time'].apply(pd.to_datetime))

print(rq.dtypes)

user_id                            object
query_time                 datetime64[ns]
search_time                datetime64[ns]
search_mode                        object
search_origin_lat                 float64
search_origin_lon                 float64
search_destination_lat            float64
search_destination_lon            float64
query_to_search_time      timedelta64[ns]
dtype: object


In [22]:
# Visualize 1st lines of the dataset

print(rq.head(2))
print('\n')

In [23]:
# What is the size of our sample ?
rq_size = len(rq.index)
print('Number of events: ' + str(rq_size))
print(rq.columns)  # Column names

Number of events: 100000
Index(['user_id', 'query_time', 'search_time', 'search_mode',
       'search_origin_lat', 'search_origin_lon', 'search_destination_lat',
       'search_destination_lon'],
      dtype='object')


In [24]:
# How many unique users ? And how many time did they use the app

users = rq[rq.columns[0]].value_counts()

In [25]:
number_users = len(users.index)
print('Number of unique users: ' + str(number_users))

Number of unique users: 32510


In [26]:
print(users)

d41d8cd98f00b204e9800998ecf8427e    6878
a44bbb4db0084e68c96ed6a17aa97313     140
4a25d4bcfa26037be8083006f1bc1f04     140
77be395d12db5497c92262c29dd34863      95
71d4b2afefb76ae7425fab144b7dd455      93
475bbed04c2f3d2fb0edfd907321ed47      87
3aa75cc75fd1e6a927a9d331d935f341      83
2ab1d1675c0f3cf88bc7fba70f2d9941      81
867cb0d89417b93f0dbe6048b6833f51      79
ad31b423cf4e815a65f2bd80ef8ff971      74
088c51a5724b7b1e70a16bbc6916268a      72
83322d4f01d8bd256f5ba0b79d847d7b      71
e04a6062d5393c3b1930fd4762e99267      70
de12297aada54f874205fa78c4ce9c86      68
694d446ec2af0730a269881e8de9a24e      64
4d52afe2ec00f8fd776f80bb17cfb017      62
b7871e52737dbfdf9b0f05c0103690ba      61
89b5e18606a126d2037c1a7eb9c8796b      60
42e08f81945193e874aea11a125dd5fd      60
3589e2570a7bb65f162d54ba9e3c4462      59
189f3a10606d9d5d66f67498ddf8386d      59
7287d36301959c929f2855762fb8f09b      58
91bd605234d087ee872a5e97ff2bd7c4      57
5d6e311b6dc53279de7444dc06cc722b      56
20979ed8542495ef

In [27]:
# The first user appear almost 7000 times ! It's quite huge ! Let's remove it for visualisation reason.
# Showing the number of use from the 20 first customers except our guy.
users2 = users[1:20]
users2.plot(kind='bar')
plt.show()

In [28]:
# How many users have used the app more than once ?
print(str(len(users[users>1])) + ' users are recurring users over all time')
print('They represent ' + str(100*round(len(users[users>1]) / number_users,4)) + '% of the total number of users')
print('\n')

14273 users are recurring users over all time
They represent 43.9% of the total number of users




In [29]:
# Returning users within a certain time frame
# Define time frame
ret_tf = pd.Timedelta('30 days')
# Calculate column query_time - previous(query_time)

In [30]:
# Are users searching their route in advance or at last minute ?

rq['query_to_search_time'] = rq['search_time'] - rq['query_time']
print(rq['query_to_search_time'])

0       -1 days +23:59:05
1       -1 days +23:59:52
2       -1 days +23:59:41
3       -1 days +23:59:20
4       -1 days +23:59:27
5       -1 days +23:59:25
6       -1 days +23:59:49
7       -1 days +23:59:48
8         0 days 01:13:20
9       -1 days +23:59:14
10      -1 days +23:59:26
11        0 days 00:51:02
12        0 days 00:02:49
13        0 days 02:43:38
14      -1 days +23:59:49
15      -1 days +23:59:22
16      -1 days +23:59:28
17      -1 days +23:59:03
18      -1 days +23:59:43
19      -1 days +23:59:05
20        0 days 00:14:22
21        0 days 00:30:18
22      -1 days +23:59:51
23      -1 days +23:58:59
24        0 days 00:59:32
25        1 days 18:18:12
26        0 days 00:18:10
27        0 days 03:49:45
28      -1 days +23:59:29
29        0 days 01:42:09
               ...       
99970   -1 days +23:59:17
99971   -1 days +23:59:04
99972   -1 days +23:59:09
99973     0 days 00:06:58
99974   -1 days +23:58:47
99975     2 days 08:49:34
99976   -1 days +23:59:53
99977   -1 d

In [31]:
# Negative values are due to the small delay between app opening and the actual time the user
# tap the search button. We can see that they are really small.
# They should not be considered for our use case... BUT they give us
# a very useful information: the time to complete the form !
# It's actually an indicator we could use for UX evaluation (and improvement) (we'll come back on this later!)

In [32]:
# Let's check how much time in advance the user look for a route
rq_in_advance = rq[rq['query_to_search_time'] > pd.Timedelta('0 days')]
print(rq_in_advance['query_to_search_time'].describe())

count                     29905
mean     1 days 00:22:43.617923
std      7 days 01:14:21.059081
min             0 days 00:00:01
25%             0 days 00:43:40
50%             0 days 02:59:35
75%             0 days 12:54:50
max           724 days 19:18:00
Name: query_to_search_time, dtype: object


In [33]:
print(str(len(rq_in_advance.index)) + ' events are use to check the route in advance, they represent ' +
      str(round(len(rq_in_advance.index) / rq_size * 100, 2)) + '% of our the app usage')

29905 events are use to check the route in advance, they represent 29.9% of our the app usage


In [34]:
# Looks like there are some outliers here, some people searching for long time in the future.. Doesn't look like
# a normal usage of Ally, let's limit it to 30 days in the future to exclude extreme cases.
rq_in_advance_filter = rq_in_advance[rq_in_advance['query_to_search_time'] < pd.Timedelta('30 days')]
print(rq_in_advance_filter['query_to_search_time'].describe())
rq_in_advance_filter2 = rq_in_advance[rq_in_advance['query_to_search_time'] > pd.Timedelta('30 days')]
print(rq_in_advance_filter2['query_to_search_time'].describe())

count                     29781
mean     0 days 17:06:32.825459
std      2 days 03:52:19.202079
min             0 days 00:00:01
25%             0 days 00:43:19
50%             0 days 02:58:14
75%             0 days 12:46:08
max            29 days 23:59:21
Name: query_to_search_time, dtype: object
count                        124
mean     73 days 18:19:45.959677
std      74 days 20:08:00.572751
min             30 days 00:39:28
25%      39 days 08:35:11.500000
50%             51 days 19:44:00
75%      78 days 15:01:51.250000
max            724 days 19:18:00
Name: query_to_search_time, dtype: object


In [35]:
# Let's describe the distribution of this usage in a histogram, per hour
rq_in_advance_filter['query_to_search_time'] = rq_in_advance_filter['query_to_search_time'].apply(
    lambda x: x.total_seconds() ) # not working !!!
usage_in_advance = rq_in_advance_filter[rq_in_advance_filter['query_to_search_time'].value_counts()]
usage_in_advance.plot(kind='hist', alpha=0.5)
plt.show()

AttributeError: 'numpy.timedelta64' object has no attribute 'total_seconds'

In [55]:
# Are the users using the app with departure datetime or arrival datetime ?
search_mode = rq_in_advance_filter['search_mode'].value_counts()
print(search_mode)

departure    20770
arrival       9011
Name: search_mode, dtype: int64


In [80]:
print(str(round(search_mode['departure'] / search_mode.sum()*100)) + '% of users are using departure')
print(str(round(search_mode['arrival'] / search_mode.sum()*100)) + '% of users are using arrival')

70.0% of users are using departure
30.0% of users are using arrival


In [85]:
rq_last_minute = rq[rq['query_to_search_time'] < pd.Timedelta('0 days')]
rq_last_minute['query_to_search_time'].describe()

count                       69955
mean     -1 days +22:55:30.824186
std        1 days 01:22:16.632744
min           -111 days +05:30:42
25%             -1 days +23:59:08
50%             -1 days +23:59:24
75%             -1 days +23:59:41
max             -1 days +23:59:59
Name: query_to_search_time, dtype: object

In [88]:
# It looks like some user managed to search for a time in the past up to 111 days..
# On average 1 hours before the actual time...
# Although this shoud not be possible on the app, it appears on the data. This could be a bug.
# EDIT: after a few tests, I managed to reproduce that bug.. :)
# However there seems to be very low number of such cases as the 25% shows.
# I will bet on wrong data and exclude them from this sample:
rq_last_minute = rq_last_minute[rq_last_minute['query_to_search_time'] > pd.Timedelta('-1 days')]
rq_last_minute['query_to_search_time'].describe()

count                       69791
mean     -1 days +23:48:06.929016
std        0 days 01:42:43.621974
min             -1 days +00:00:09
25%             -1 days +23:59:08
50%             -1 days +23:59:24
75%             -1 days +23:59:41
max             -1 days +23:59:59
Name: query_to_search_time, dtype: object

In [89]:
# According to this data set, 75% of the users are completing the search screen in less than 1 minute, 
# even if the mean is quite big (12 mins) due to bigger values in the sample.
# This could be a KPI for UX. The goal being to lower the time on the screen of the user 
# so that he can access the results he needs quickly.