In [1]:
import pandas as pd
import random
import glob

### Get the complete dataframe for one customer
- columns = ['timestamp', 'customer_id', 'location', 'before', 'after']
- timestamp: complete from entrance to checkout at 1-minute resolution

In [2]:
def get_one_customer(data, weekday_tag, customer_no):
    """
    Build a gap-free, 1-minute-resolution location history for one customer.

    input:
        data: dataframe - records of one weekday; the index is expected to hold
            the record timestamps, and the columns 'customer_no' and
            'location' must be present
        weekday_tag: str - e.g. 'mo','tu','we','th','fr'
        customer_no: int - customer number as found in data['customer_no']
    output dataframe for one customer:
        timestamp : 1min frequency, from the first to the last record
        customer_id : str weekday_tag + customer_no
        location : location at the corresponding time (the last known location
            is carried forward over minutes without a record)
        before : location one minute earlier ('entrance' for the first row)
        after : same as location
        If the last location is not 'checkout', one closing row is added one
        minute after the last record, describing the transition
        last location -> 'checkout'.
    raises:
        IndexError: if customer_no does not occur in data
    """
    # get data of one customer
    one_customer = data.loc[data['customer_no'] == customer_no]
    if one_customer.empty:
        raise IndexError(f"customer_no {customer_no!r} not found in data")

    # generate complete time index (min/max so an unsorted index still works)
    one_timeind = pd.date_range(
        start=one_customer.index.min(), end=one_customer.index.max(), freq='min'
    )
    # initiate dataframe with complete time index
    df_one = pd.DataFrame({'timestamp': one_timeind})
    # modify customer id by adding weekday information
    df_one['customer_id'] = weekday_tag + str(customer_no)

    # attach the recorded locations; minutes without a record become NaN and
    # are then filled with the last known location
    observed = one_customer['location'].rename_axis('timestamp').reset_index()
    df_one = df_one.merge(observed, on='timestamp', how='left')
    df_one['location'] = df_one['location'].ffill()

    # for consistency: each customer starts with entrance, stops with checkout
    df_one['before'] = ['entrance'] + df_one['location'].iloc[:-1].tolist()
    df_one['after'] = df_one['location']

    # if not ended by checkout, add one closing row. The row must describe the
    # transition from the *last* location to checkout; copying the last row
    # would keep its 'before' (the second-to-last location) and its timestamp.
    if df_one['after'].iloc[-1] != 'checkout':
        last = df_one.iloc[-1]
        closing = pd.DataFrame([{
            'timestamp': last['timestamp'] + pd.Timedelta(minutes=1),
            'customer_id': last['customer_id'],
            'location': 'checkout',
            'before': last['location'],
            'after': 'checkout',
        }])
        df_one = pd.concat([df_one, closing], ignore_index=True)

    return df_one

### Generate dataframe containing all customers on monday

In [None]:
# data_mo = pd.DataFrame(columns=['timestamp', 'customer_id', 'location', 'before', 'after'])
# df = pd.read_csv('../data/monday.csv', sep=';', parse_dates=True, index_col=[0])
# for i_customers in df['customer_no'].unique():
#     one_customer = get_one_customer(df, 'mo', i_customers)
#     data_mo = data_mo.append(one_customer)

# data_mo.to_csv('data_monday.csv')

### Generate dataframe containing all customers of the whole week

In [None]:
# data_all = pd.DataFrame(columns=['timestamp', 'customer_id', 'location', 'before', 'after'])
# for file in glob.glob('../data/*.csv'):
#     weekday_tag = file[8:10]
#     print(weekday_tag)
#     df = pd.read_csv(file, sep=';', parse_dates=True, index_col=[0])
#     for i_customers in df['customer_no'].unique():
#         one_customer = get_one_customer(df, weekday_tag, i_customers)
#         data_all = data_all.append(one_customer)
        
# data_all.to_csv('data_week.csv')

### Calculate the transition probability matrix from the entire data

In [11]:
def calculate_tpm(data_mo):
    """
    Calculate the transition probability matrix.

    input : dataframe that contains at least a 'before' and an 'after' column
    output : square dataframe of transition probabilities; row label = state
        before, column label = state after. Rows and columns cover the six
        known store states plus any further state found in the data. Each row
        with observed transitions sums to 1; states never seen as 'before'
        get an all-zero row. 'checkout' is forced to be absorbing.
    """
    # initiate transition probability matrix with the known states
    states = ['entrance', 'dairy', 'drinks', 'fruit', 'spices', 'checkout']
    tpm = pd.DataFrame(0, index=states, columns=states)
    # row-normalised transition counts observed in the data
    observed = pd.crosstab(data_mo['before'], data_mo['after'], normalize='index')
    # label-aligned addition keeps every known state; unobserved cells come
    # out as NaN and are replaced by probability 0
    tpm_fill = (tpm + observed).fillna(0)
    # checkout is the absorbing state: drop any observed transition out of it
    # and set the self-transition to 1. Use .loc on the frame itself; chained
    # assignment (frame['checkout'].iloc[...] = 1) is a silent no-op under
    # pandas Copy-on-Write.
    tpm_fill.loc['checkout'] = 0.0
    tpm_fill.loc['checkout', 'checkout'] = 1.0
    return tpm_fill

# Load the week-wide customer table. 'data_week.csv' must already exist in the
# working directory; the only code in this notebook that writes it is the
# commented-out cell above.
data_all = pd.read_csv('data_week.csv')
# Estimate the transition probability matrix from the 'before'/'after' columns.
tpm = calculate_tpm(data_all)
# Save the matrix to 'tpm_all.csv'.
tpm.to_csv('tpm_all.csv')
# Print the raw probability array first, then the labelled dataframe.
print(tpm.values)
print(tpm)

[[1.         0.         0.         0.         0.         0.        ]
 [0.10291054 0.73720655 0.05860497 0.         0.04987896 0.05139898]
 [0.21595231 0.01089526 0.59831432 0.         0.08788159 0.08695652]
 [0.00120741 0.28722833 0.15334049 0.         0.3769788  0.18124497]
 [0.20160529 0.09592383 0.05484734 0.         0.59694681 0.05067674]
 [0.15054963 0.19324518 0.16313526 0.         0.09096702 0.40210292]]
          checkout     dairy    drinks  entrance     fruit    spices
checkout  1.000000  0.000000  0.000000       0.0  0.000000  0.000000
dairy     0.102911  0.737207  0.058605       0.0  0.049879  0.051399
drinks    0.215952  0.010895  0.598314       0.0  0.087882  0.086957
entrance  0.001207  0.287228  0.153340       0.0  0.376979  0.181245
fruit     0.201605  0.095924  0.054847       0.0  0.596947  0.050677
spices    0.150550  0.193245  0.163135       0.0  0.090967  0.402103


### Generate customer behaviour: Markov chain simulation

In [12]:
def customer_mcmc(tpm, max_len=50):
    """
    Simulate customer behaviour with a Markov chain model.

    input :
        tpm : transition probability matrix (dataframe); row label = current
            state, column label = next state. Must contain an 'entrance' row
            and a 'checkout' column.
        max_len : int - maximum length of the returned sequence (default 50)
    output : list of states that starts with 'entrance' and always ends with
        'checkout'. If the chain has not reached 'checkout' by itself after
        max_len - 1 states, 'checkout' is appended so the sequence is complete.
    raises :
        ValueError : if max_len < 2, or if the current state has no outgoing
            transition probability (all-zero row)
        KeyError : if the current state has no row in tpm
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 ('entrance' and 'checkout')")

    states = list(tpm.columns)
    seq = ['entrance']  # initiate sequence starting with entrance
    # leave room for one final state so a forced checkout still fits max_len
    while len(seq) < max_len - 1:
        current = seq[-1]
        # the row of the current state holds the weights of every next state,
        # in the same order as tpm.columns
        weights = tpm.loc[current].tolist()
        if sum(weights) <= 0:
            raise ValueError(
                f"no outgoing transition probability from state {current!r}"
            )
        next_state = random.choices(states, weights=weights)[0]
        seq.append(next_state)
        if next_state == 'checkout':
            return seq
    # length cap reached without checkout: close the visit explicitly so the
    # result honours the "ends with checkout" contract
    seq.append('checkout')
    return seq

# Simulate one customer's path using the transition matrix computed above.
seq = customer_mcmc(tpm)
# Bare expression: the notebook displays the simulated state sequence.
seq

['entrance', 'dairy', 'dairy', 'dairy', 'dairy', 'checkout']