### Bussiness Goal:

Calculate transition probabilities based on Markov Chains with the supermarket data to run a Monte Carlo Stimulation(MCMC).

A Markov Chain describes a Stochastic process where each state depends only on the previous one.

Each transition in a Markov Chain happens with a transition probability that is conditional on the present state. These probabilities can be written as a transition probability matrix P.

Long term dependencies exist in Markov Chains, but they are fully encoded in the transition probabilities. 

In [None]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

<b> Try first for the one day. </b>

In [12]:
monday = pickle.load(open('monday.pkl', 'rb'))
monday[:10]

Unnamed: 0_level_0,customer_no,location,hour,day
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-09-02 07:03:00,1,dairy,7,monday
2019-09-02 07:03:00,2,dairy,7,monday
2019-09-02 07:04:00,3,dairy,7,monday
2019-09-02 07:04:00,4,dairy,7,monday
2019-09-02 07:04:00,5,spices,7,monday
2019-09-02 07:04:00,6,spices,7,monday
2019-09-02 07:04:00,7,spices,7,monday
2019-09-02 07:04:00,8,fruit,7,monday
2019-09-02 07:05:00,1,checkout,7,monday
2019-09-02 07:05:00,5,checkout,7,monday


<b> State space </b>

In [14]:
S = ['dairy', 'drinks', 'fruit', 'spices', 'checkout']

states = monday[['location']]
states

Unnamed: 0_level_0,location
timestamp,Unnamed: 1_level_1
2019-09-02 07:03:00,dairy
2019-09-02 07:03:00,dairy
2019-09-02 07:04:00,dairy
2019-09-02 07:04:00,dairy
2019-09-02 07:04:00,spices
...,...
2019-09-06 21:50:00,dairy
2019-09-06 21:50:00,checkout
2019-09-06 21:50:00,checkout
2019-09-06 21:50:00,drinks


In [15]:
monday['following'] = monday.groupby('customer_no')['location'].shift(-1).to_frame()
monday['following'].replace(np.NaN, 'checkout', inplace=True)

In [16]:
P_monday = pd.crosstab(monday['location'], monday['following'], normalize ='index')

In [17]:
P_monday

following,checkout,dairy,drinks,fruit,spices
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
checkout,0.205474,0.229742,0.119051,0.304436,0.141297
dairy,0.391323,0.0,0.223125,0.18957,0.195982
drinks,0.53726,0.027145,0.0,0.21895,0.216645
fruit,0.499024,0.238383,0.13608,0.00039,0.126123
spices,0.251199,0.323388,0.273308,0.152104,0.0


In [18]:
P = P_monday.to_numpy()
P


array([[2.05473911e-01, 2.29742483e-01, 1.19050829e-01, 3.04435756e-01,
        1.41297020e-01],
       [3.91322932e-01, 0.00000000e+00, 2.23124599e-01, 1.89570421e-01,
        1.95982047e-01],
       [5.37259923e-01, 2.71446863e-02, 0.00000000e+00, 2.18950064e-01,
        2.16645327e-01],
       [4.99023819e-01, 2.38383444e-01, 1.36079656e-01, 3.90472472e-04,
        1.26122608e-01],
       [2.51198721e-01, 3.23388386e-01, 2.73308471e-01, 1.52104422e-01,
        0.00000000e+00]])

In [19]:
P_monday.loc[['dairy']]

following,checkout,dairy,drinks,fruit,spices
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dairy,0.391323,0.0,0.223125,0.18957,0.195982


In [21]:
current_state = np.full((5, 5), 0)

In [22]:
current_state[0][0] = 1
current_state

array([[1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [23]:
np.dot(current_state, P)

array([[0.20547391, 0.22974248, 0.11905083, 0.30443576, 0.14129702],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [24]:
np.random.choice(S, p=P.loc['fruit'])

AttributeError: 'numpy.ndarray' object has no attribute 'loc'

<b> Seems to work let's continue with the next </b>

In [27]:
weeekdays = pickle.load(open('weekdays.pkl', 'rb'))
weeekdays[:10]

Unnamed: 0,timestamp,customer_no,location,hour,day
0,2019-09-02 07:03:00,1_monday,dairy,7,monday
1,2019-09-02 07:03:00,2_monday,dairy,7,monday
2,2019-09-02 07:04:00,3_monday,dairy,7,monday
3,2019-09-02 07:04:00,4_monday,dairy,7,monday
4,2019-09-02 07:04:00,5_monday,spices,7,monday
5,2019-09-02 07:04:00,6_monday,spices,7,monday
6,2019-09-02 07:04:00,7_monday,spices,7,monday
7,2019-09-02 07:04:00,8_monday,fruit,7,monday
8,2019-09-02 07:05:00,1_monday,checkout,7,monday
9,2019-09-02 07:05:00,5_monday,checkout,7,monday


Rename the customers column to every day customers.

In [28]:
weeekdays.groupby('customer_no')['location'].count().to_frame()
weeekdays[:20]

Unnamed: 0,timestamp,customer_no,location,hour,day
0,2019-09-02 07:03:00,1_monday,dairy,7,monday
1,2019-09-02 07:03:00,2_monday,dairy,7,monday
2,2019-09-02 07:04:00,3_monday,dairy,7,monday
3,2019-09-02 07:04:00,4_monday,dairy,7,monday
4,2019-09-02 07:04:00,5_monday,spices,7,monday
5,2019-09-02 07:04:00,6_monday,spices,7,monday
6,2019-09-02 07:04:00,7_monday,spices,7,monday
7,2019-09-02 07:04:00,8_monday,fruit,7,monday
8,2019-09-02 07:05:00,1_monday,checkout,7,monday
9,2019-09-02 07:05:00,5_monday,checkout,7,monday


In [29]:
weeekdays['order'] = weeekdays.groupby('customer_no')['location'].shift().to_frame()
weeekdays['order'].replace(np.NaN, 'first', inplace=True)

weeekdays[:20]

Unnamed: 0,timestamp,customer_no,location,hour,day,order
0,2019-09-02 07:03:00,1_monday,dairy,7,monday,first
1,2019-09-02 07:03:00,2_monday,dairy,7,monday,first
2,2019-09-02 07:04:00,3_monday,dairy,7,monday,first
3,2019-09-02 07:04:00,4_monday,dairy,7,monday,first
4,2019-09-02 07:04:00,5_monday,spices,7,monday,first
5,2019-09-02 07:04:00,6_monday,spices,7,monday,first
6,2019-09-02 07:04:00,7_monday,spices,7,monday,first
7,2019-09-02 07:04:00,8_monday,fruit,7,monday,first
8,2019-09-02 07:05:00,1_monday,checkout,7,monday,dairy
9,2019-09-02 07:05:00,5_monday,checkout,7,monday,spices


In [30]:
weeekdays['following'] = weeekdays.groupby('customer_no')['location'].shift(-1).to_frame()
weeekdays

Unnamed: 0,timestamp,customer_no,location,hour,day,order,following
0,2019-09-02 07:03:00,1_monday,dairy,7,monday,first,checkout
1,2019-09-02 07:03:00,2_monday,dairy,7,monday,first,checkout
2,2019-09-02 07:04:00,3_monday,dairy,7,monday,first,checkout
3,2019-09-02 07:04:00,4_monday,dairy,7,monday,first,checkout
4,2019-09-02 07:04:00,5_monday,spices,7,monday,first,checkout
...,...,...,...,...,...,...,...
5120,2019-09-06 21:50:00,1500_friday,dairy,21,friday,fruit,
5121,2019-09-06 21:50:00,1507_friday,checkout,21,friday,dairy,
5122,2019-09-06 21:50:00,1508_friday,checkout,21,friday,dairy,
5123,2019-09-06 21:50:00,1509_friday,drinks,21,friday,first,


In [31]:
weeekdays['following'].replace(np.NaN, 'checkout', inplace=True)
weeekdays[:40]

Unnamed: 0,timestamp,customer_no,location,hour,day,order,following
0,2019-09-02 07:03:00,1_monday,dairy,7,monday,first,checkout
1,2019-09-02 07:03:00,2_monday,dairy,7,monday,first,checkout
2,2019-09-02 07:04:00,3_monday,dairy,7,monday,first,checkout
3,2019-09-02 07:04:00,4_monday,dairy,7,monday,first,checkout
4,2019-09-02 07:04:00,5_monday,spices,7,monday,first,checkout
5,2019-09-02 07:04:00,6_monday,spices,7,monday,first,dairy
6,2019-09-02 07:04:00,7_monday,spices,7,monday,first,drinks
7,2019-09-02 07:04:00,8_monday,fruit,7,monday,first,checkout
8,2019-09-02 07:05:00,1_monday,checkout,7,monday,dairy,checkout
9,2019-09-02 07:05:00,5_monday,checkout,7,monday,spices,checkout


In [32]:
P = pd.crosstab(weeekdays['location'], weeekdays['following'], normalize='index')
P

following,checkout,dairy,drinks,fruit,spices
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
checkout,1.0,0.0,0.0,0.0,0.0
dairy,0.393033,0.0,0.222483,0.189357,0.195127
drinks,0.53726,0.027145,0.0,0.21895,0.216645
fruit,0.500195,0.237993,0.13608,0.0,0.125732
spices,0.251998,0.323122,0.272776,0.152104,0.0


In [33]:

def select_random_id():
    customer_id = weeekdays['customer_no'].sample(n=1, replace=True).unique()
    
    return customer_id[0]
select_random_id()

'1234_friday'

In [37]:
def generate_next_state(customer):

    state = weeekdays[weeekdays['customer_no']==customer]['location']
    state = state.iloc[0]
    S = ['dairy', 'drinks', 'fruit', 'spices', 'checkout']
    P = pd.crosstab(weeekdays['location'], weeekdays['following'], normalize='index')
    
    return np.random.choice(S, p=P.loc[state])

generate_next_state(customer)

'dairy'

In [None]:
STATES = ['dairy', 'drinks', 'fruit', 'spices', 'checkout']

def generate_next_state(self):#takes customer as parameter?
        '''Based on a calculated transition probabilities matrix, returns the next probable state for
    a given customer.
    '''
        state = self.location
        state = state.iloc[0]
        P = self.transition_probabilities
    
        return np.random.choice(STATES, p=P.loc[state])