In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
plt.rcParams['figure.figsize'] = (20,6)

In [85]:
data = pd.read_csv('data/full_data.csv', delimiter = ';', parse_dates=True)

In [86]:
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [87]:
data.head()

Unnamed: 0,timestamp,customer_no,location,weekday,weekday_num,time,customer_id
0,2019-09-02 07:03:00,1,dairy,Monday,0,07:03:00,0_1
1,2019-09-02 07:03:00,2,dairy,Monday,0,07:03:00,0_2
2,2019-09-02 07:04:00,3,dairy,Monday,0,07:04:00,0_3
3,2019-09-02 07:04:00,4,dairy,Monday,0,07:04:00,0_4
4,2019-09-02 07:04:00,5,spices,Monday,0,07:04:00,0_5


In [88]:
data.dtypes

timestamp      datetime64[ns]
customer_no             int64
location               object
weekday                object
weekday_num             int64
time                   object
customer_id            object
dtype: object

In [89]:
# Number of customers that are not "checked out"
(data['customer_id'].nunique()) - (data[data['location'] == 'checkout'].value_counts().count())

28

In [90]:
# Drop these 28 customers

In [91]:
# Get index of customers without 'checkout'
# cl = []
# for i in range(1,len(data['customer_id'].nunique()+1):
#     data[data['customer_id'] == i][-1:]['location']
#     if data[data['customer_id'] == i][-1:]['location'].all() != 'checkout':
#         cl.append(i)
#     else:
#         pass

In [92]:
#len(cl)

In [93]:
# DIFFERENCE BETWEEN CHECKOUT-NO & CUST-NO:
check_c = set(data[data["location"]=="checkout"]["customer_id"].unique()) # number of checked out customers
all_c = set(data["customer_id"].unique()) # number of all customers
diff = all_c.difference(check_c) # difference between all & checked out

#FILL IN 'CHECKOUTS'
for cust in diff:
    data = data.append({"timestamp":"2019-09-02 22:00:00","customer_id":cust,
    "location":"checkout"}, ignore_index=True)

In [94]:
#data.tail(30)

In [95]:
data.drop(columns=['weekday','weekday_num', 'time','customer_no'], inplace=True)

In [96]:
data

Unnamed: 0,timestamp,location,customer_id
0,2019-09-02 07:03:00,dairy,0_1
1,2019-09-02 07:03:00,dairy,0_2
2,2019-09-02 07:04:00,dairy,0_3
3,2019-09-02 07:04:00,dairy,0_4
4,2019-09-02 07:04:00,spices,0_5
...,...,...,...
24900,2019-09-02 22:00:00,checkout,0_1437
24901,2019-09-02 22:00:00,checkout,4_1500
24902,2019-09-02 22:00:00,checkout,0_1447
24903,2019-09-02 22:00:00,checkout,4_1509


In [98]:
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [99]:
data['weekday'] = data['timestamp'].dt.day_name()

In [101]:
data.sort_values(['customer_id', 'timestamp'], inplace=True)

In [102]:
data

Unnamed: 0,timestamp,location,customer_id,weekday,time
0,2019-09-02 07:03:00,dairy,0_1,Monday,07:03:00
8,2019-09-02 07:05:00,checkout,0_1,Monday,07:05:00
15,2019-09-02 07:06:00,fruit,0_10,Monday,07:06:00
25,2019-09-02 07:08:00,checkout,0_10,Monday,07:08:00
304,2019-09-02 07:56:00,drinks,0_100,Monday,07:56:00
...,...,...,...,...,...
23101,2019-09-06 17:20:00,drinks,4_999,Friday,17:20:00
23109,2019-09-06 17:21:00,fruit,4_999,Friday,17:21:00
23132,2019-09-06 17:24:00,drinks,4_999,Friday,17:24:00
23141,2019-09-06 17:25:00,fruit,4_999,Friday,17:25:00


In [103]:
data.set_index('timestamp', inplace=True)

In [104]:
data.index

DatetimeIndex(['2019-09-02 07:03:00', '2019-09-02 07:05:00',
               '2019-09-02 07:06:00', '2019-09-02 07:08:00',
               '2019-09-02 07:56:00', '2019-09-02 08:03:00',
               '2019-09-02 17:44:00', '2019-09-02 17:50:00',
               '2019-09-02 17:51:00', '2019-09-02 17:45:00',
               ...
               '2019-09-06 17:18:00', '2019-09-06 17:19:00',
               '2019-09-06 17:20:00', '2019-09-06 17:23:00',
               '2019-09-06 17:18:00', '2019-09-06 17:20:00',
               '2019-09-06 17:21:00', '2019-09-06 17:24:00',
               '2019-09-06 17:25:00', '2019-09-06 17:27:00'],
              dtype='datetime64[ns]', name='timestamp', length=24905, freq=None)

In [107]:
# To fill in rows for the minutes that are not in the files,
data2 = data.groupby('customer_id').resample('1min').fillna('ffill')

In [120]:
data2.drop(columns='customer_id', inplace=True)

In [126]:
data2.reset_index(inplace=True)

In [128]:
data2.set_index('timestamp', inplace=True)

In [166]:
data2.head(3)

Unnamed: 0_level_0,customer_id,location,weekday,before,after
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-09-02 07:03:00,0_1,dairy,Monday,dairy,dairy
2019-09-02 07:04:00,0_1,dairy,Monday,dairy,checkout
2019-09-02 07:05:00,0_1,checkout,Monday,checkout,fruit


In [133]:
# Add a column where customer was 'before' and 'after' (timestep is minutes)
data2['before'] = data2['location']
data2['after'] = data2['before'].shift(-1)
data2.tail(20)

Unnamed: 0_level_0,customer_id,location,weekday,before,after
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-09-06 17:28:00,4_997,drinks,Friday,drinks,drinks
2019-09-06 17:29:00,4_997,drinks,Friday,drinks,drinks
2019-09-06 17:30:00,4_997,drinks,Friday,drinks,checkout
2019-09-06 17:31:00,4_997,checkout,Friday,checkout,drinks
2019-09-06 17:18:00,4_998,drinks,Friday,drinks,fruit
2019-09-06 17:19:00,4_998,fruit,Friday,fruit,drinks
2019-09-06 17:20:00,4_998,drinks,Friday,drinks,drinks
2019-09-06 17:21:00,4_998,drinks,Friday,drinks,drinks
2019-09-06 17:22:00,4_998,drinks,Friday,drinks,checkout
2019-09-06 17:23:00,4_998,checkout,Friday,checkout,fruit


In [142]:
# Filter out unnecessary rows
data3 = data2[data2['before'] != 'checkout']

In [143]:
data2.shape

(130034, 5)

In [144]:
data3.shape

(46590, 5)

In [153]:
transition_matrix = pd.crosstab(data3['after'], data3['before'], normalize=0)

In [156]:
transition_matrix

before,dairy,drinks,fruit,spices
after,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
checkout,0.2464,0.282062,0.344503,0.127035
dairy,0.838,0.006824,0.077747,0.077428
drinks,0.121374,0.678043,0.081305,0.119278
fruit,0.089383,0.086065,0.767042,0.057511
spices,0.184886,0.170944,0.130127,0.514043


In [154]:
transition_matrix.sum(axis=1)

after
checkout    1.0
dairy       1.0
drinks      1.0
fruit       1.0
spices      1.0
dtype: float64

In [180]:
#########################################################################################
### Which is the right axis to use for the crosstabs? ---> added a test matrix below
transition_matrix_test = pd.crosstab(data3['before'], data3['after'], normalize=0)

In [181]:
transition_matrix_test

after,checkout,dairy,drinks,fruit,spices
before,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dairy,0.102773,0.737483,0.058487,0.049899,0.051358
drinks,0.215505,0.011001,0.598499,0.088012,0.086983
fruit,0.200784,0.095608,0.054745,0.598353,0.05051
spices,0.149889,0.19276,0.162591,0.090822,0.403938


In [177]:
transition_matrix_test.sum(axis=1)

before
dairy     1.0
drinks    1.0
fruit     1.0
spices    1.0
dtype: float64

In [None]:
#########################################################################################

In [None]:
#########################################################################################
### Copied from Course code - to be adapted

In [None]:
# # transition prob matrix
# P = np.array([[0.4, 0.666],
#               [0.6, 0.333]])

# # initial state distribution (day 0)
# S = np.array([1.0, 0.0])   # <-- we start with a 100% sunny day

In [None]:
# S1 = np.dot(S, P.T)
# S1

In [None]:
# S2 = np.dot(S1, P.T)
# S2

In [None]:
# np.dot(np.dot(np.dot(np.dot(np.dot(S, P.T), P.T), P.T), P.T), P.T)

In [None]:
# result = []
# s = S.copy()
# for i in range(10):
#     result.append(s)
#     s = np.dot(s, P.T)

In [None]:
# pd.DataFrame(result).plot()