In [1]:
import pandas as pd
import numpy as np
import utils, plot_help
import matplotlib.pyplot as plt

#avoid warning popping up
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

%matplotlib inline

In [2]:
event_df = utils.chunk_loader('data/cleaned/checkin_eventlog.csv')
event_df.head()

Unnamed: 0,business_id
2010-01-15 22:59:12,FaHADZARwnY4yvlvpnsfGA
2010-01-15 23:29:06,3KTY5XZfavc85u0_rLFHrw
2010-01-15 23:42:01,t-o_Sraneime4DDhWrQRBA
2010-01-16 00:54:25,Z2fCgJdvjhZVP34V86UKCg
2010-01-16 02:02:04,qwvpcuHVkOM8UoyfXmdIBA


### get first and last checkin of business and span over which checkins are being recorded.

In [3]:
#create a groupby object
event_df_groupby = event_df.reset_index(level=0).groupby(by='business_id', as_index=False)

#get first
first_checkin = event_df_groupby.min()
first_checkin = first_checkin.rename(columns={'index': 'first'})
first_checkin['first'] = pd.to_datetime(first_checkin['first']) 

first_checkin.head()

Unnamed: 0,business_id,first
0,--1UhMGODdWsrMastO9DZw,2016-04-26 19:49:16
1,--6MefnULPED_I942VcFNA,2011-06-04 18:22:23
2,--7zmmkVg-IMGaXbuVd0SQ,2015-01-17 01:49:14
3,--8LPVSo5i0Oo61X01sV9A,2016-07-08 16:43:30
4,--9QQLMTbFzLJ_oT-ON3Xw,2010-06-26 17:39:07


In [4]:
#get last
last_checkin = event_df_groupby.max()
last_checkin = last_checkin.rename(columns={'index': 'last'})
last_checkin['last'] = pd.to_datetime(last_checkin['last']) 

last_checkin.head()

Unnamed: 0,business_id,last
0,--1UhMGODdWsrMastO9DZw,2017-05-03 17:58:02
1,--6MefnULPED_I942VcFNA,2018-10-21 22:58:14
2,--7zmmkVg-IMGaXbuVd0SQ,2018-11-03 17:22:03
3,--8LPVSo5i0Oo61X01sV9A,2016-07-08 16:43:30
4,--9QQLMTbFzLJ_oT-ON3Xw,2018-06-16 18:44:45


In [5]:
first_last_df = pd.merge(left=first_checkin, 
                         right=last_checkin, 
                         how='inner',
                         left_on='business_id', 
                         right_on='business_id')

#get difference between first and last as seconds
first_last_df['span_checkin'] = (first_last_df['last'] - first_last_df['first']).apply(lambda x: x.seconds)

first_last_df.head()

Unnamed: 0,business_id,first,last,span_checkin
0,--1UhMGODdWsrMastO9DZw,2016-04-26 19:49:16,2017-05-03 17:58:02,79726
1,--6MefnULPED_I942VcFNA,2011-06-04 18:22:23,2018-10-21 22:58:14,16551
2,--7zmmkVg-IMGaXbuVd0SQ,2015-01-17 01:49:14,2018-11-03 17:22:03,55969
3,--8LPVSo5i0Oo61X01sV9A,2016-07-08 16:43:30,2016-07-08 16:43:30,0
4,--9QQLMTbFzLJ_oT-ON3Xw,2010-06-26 17:39:07,2018-06-16 18:44:45,3938


### Evaluate what is the average monthly checkin for each business

In [6]:
event_df_month = event_df.copy()
event_df_month['month'] = pd.DatetimeIndex(event_df.index).month
event_df_month['ones'] = 1

event_df_month.head()

Unnamed: 0,business_id,month,ones
2010-01-15 22:59:12,FaHADZARwnY4yvlvpnsfGA,1,1
2010-01-15 23:29:06,3KTY5XZfavc85u0_rLFHrw,1,1
2010-01-15 23:42:01,t-o_Sraneime4DDhWrQRBA,1,1
2010-01-16 00:54:25,Z2fCgJdvjhZVP34V86UKCg,1,1
2010-01-16 02:02:04,qwvpcuHVkOM8UoyfXmdIBA,1,1


In [7]:
#get total checkin by month
checkin_month_count = event_df_month.groupby(by=['business_id', 'month'], as_index=False).sum()
checkin_month_count.head()

Unnamed: 0,business_id,month,ones
0,--1UhMGODdWsrMastO9DZw,4,2
1,--1UhMGODdWsrMastO9DZw,5,1
2,--1UhMGODdWsrMastO9DZw,8,1
3,--1UhMGODdWsrMastO9DZw,10,1
4,--1UhMGODdWsrMastO9DZw,11,1


In [8]:
#group by business ID to get average monthly checkin
checkin_month_avg = checkin_month_count.groupby(by='business_id', as_index=False).mean()
checkin_month_avg.head()

Unnamed: 0,business_id,month,ones
0,--1UhMGODdWsrMastO9DZw,7.6,1.2
1,--6MefnULPED_I942VcFNA,6.5,12.166667
2,--7zmmkVg-IMGaXbuVd0SQ,6.5,12.5
3,--8LPVSo5i0Oo61X01sV9A,7.0,1.0
4,--9QQLMTbFzLJ_oT-ON3Xw,6.5,2.583333


In [9]:
#combine in single dataframe
df_checkin = pd.merge(left=checkin_month_avg, 
                      right = first_last_df, 
                      how='inner', 
                      on='business_id')

df_checkin = df_checkin.drop(columns=['month'])

df_checkin = df_checkin.rename(columns={'ones': 'avg_month_checkin'})

df_checkin.head()

Unnamed: 0,business_id,avg_month_checkin,first,last,span_checkin
0,--1UhMGODdWsrMastO9DZw,1.2,2016-04-26 19:49:16,2017-05-03 17:58:02,79726
1,--6MefnULPED_I942VcFNA,12.166667,2011-06-04 18:22:23,2018-10-21 22:58:14,16551
2,--7zmmkVg-IMGaXbuVd0SQ,12.5,2015-01-17 01:49:14,2018-11-03 17:22:03,55969
3,--8LPVSo5i0Oo61X01sV9A,1.0,2016-07-08 16:43:30,2016-07-08 16:43:30,0
4,--9QQLMTbFzLJ_oT-ON3Xw,2.583333,2010-06-26 17:39:07,2018-06-16 18:44:45,3938


In [10]:
df_checkin.to_csv('data/cleaned/checkin_feat.csv')