In [1]:
import pandas as pd
import random
import glob

- dataframe for one customer 

In [2]:
def get_one_customer(data, weekday_tag, customer_no):
    """
    Build a minute-by-minute location table for a single customer.

    input:
        data: dataframe - data of one weekday, indexed by timestamp, with at
            least the columns 'customer_no' and 'location'
        weekday_tag: str - e.g. 'mo','tu','we','th','fr'
        customer_no: int - as in the data dataframe
    output dataframe for one customer:
        timestamp : 1min frequency, from the first to the last observed row
        customer_id : str weekday_tag + customer_no
        location : location at corresponding time (forward-filled)
        before : location one minute earlier ('entrance' for the first row)
        after : same as location, except on the closing row described below

    If the last observed location is not 'checkout', one closing row is added
    whose 'before' is the last observed location and whose 'after' is
    'checkout'. Its other columns (and its index label) are copied from the
    last observed row.
    """
    # get data of one customer
    one_customer = data.loc[data['customer_no'] == customer_no]
    # generate complete time index
    one_timeind = pd.date_range(start=one_customer.index[0], end=one_customer.index[-1], freq='min')
    # initiate dataframe with complete time index
    df_one = pd.DataFrame({'timestamp': one_timeind})
    # modify customer id by adding weekday information
    df_one['customer_id'] = weekday_tag + str(customer_no)
    # fill in the dataframe; rename_axis guarantees the merge key exists even
    # when the incoming index is unnamed
    observed = one_customer['location'].rename_axis('timestamp').reset_index()
    df_one = df_one.merge(observed, on='timestamp', how='left').ffill()

    # for consistency : each customer starts with entrance, stops with checkout
    df_one['before'] = df_one['location'].shift(1, fill_value='entrance')
    df_one['after'] = df_one['location']

    # if not ended by checkout add one line ending with checkout
    if df_one['after'].iloc[-1] != 'checkout':
        closing = df_one.iloc[[-1]].copy()
        # the transition into checkout starts from the last observed location,
        # not from the location one minute before it
        closing['before'] = closing['location']
        closing['after'] = 'checkout'
        df_one = pd.concat([df_one, closing])

    return df_one

- generate dataframe containing all customers on monday or whole week

In [None]:
# data_mo = pd.DataFrame(columns=['timestamp', 'customer_id', 'location', 'before', 'after'])
# df = pd.read_csv('../data/monday.csv', sep=';', parse_dates=True, index_col=[0])
# for i_customers in df['customer_no'].unique():
#     one_customer = get_one_customer(df, 'mo', i_customers)
#     data_mo = data_mo.append(one_customer)

# data_mo.to_csv('data_monday.csv')

In [None]:
# data_all = pd.DataFrame(columns=['timestamp', 'customer_id', 'location', 'after'])
# for file in glob.glob('../data/*.csv'):
#     weekday_tag = file[8:10]
#     print(weekday_tag)
#     df = pd.read_csv(file, sep=';', parse_dates=True, index_col=[0])
#     for i_customers in df['customer_no'].unique():
#         one_customer = get_one_customer(df, weekday_tag, i_customers)
#         data_all = data_all.append(one_customer)
# data_all.to_csv('data_all.csv')

- calculate the transition probability matrix with data_mo

In [3]:
def calculate_tpm(data_mo):
    """
    calculate the transition probability matrix
    input : dataframe that at least contains a 'before' and an 'after' column
    output : dataframe of transition probabilities, rows = 'before' state,
        columns = 'after' state. Every state in the fixed list below is
        present; transitions that never occur get probability 0 and
        'checkout' is forced to be absorbing.
    """
    # initiate transition probability matrix with every known state
    states = ['entrance', 'dairy', 'drinks', 'fruit', 'spices', 'checkout']
    template = pd.DataFrame(0, index=states, columns=states)
    # row-normalised transition counts observed in the data
    observed = pd.crosstab(data_mo['before'], data_mo['after'], normalize='index')
    # align observed probabilities onto the full state grid; pairs that never
    # occur come out as NaN and are replaced with probability 0
    tpm_fill = (template + observed).fillna(0)
    # checkout is the absorbing state: write the whole row with label-based
    # indexing so the result is a valid distribution and the assignment is not
    # lost to chained indexing
    tpm_fill.loc['checkout'] = 0.0
    tpm_fill.loc['checkout', 'checkout'] = 1.0
    return tpm_fill

# Load the per-minute monday table. 'data_monday.csv' is the file name written
# by the commented-out generation cell above, so presumably that cell has to be
# run once before this one works on a fresh checkout.
data_mo = pd.read_csv('data_monday.csv')
# Estimate the transition probabilities and display them as the cell output.
tpm = calculate_tpm(data_mo)
tpm

Unnamed: 0,checkout,dairy,drinks,entrance,fruit,spices
checkout,1.0,0.0,0.0,0.0,0.0,0.0
dairy,0.089061,0.744559,0.062428,0.0,0.051546,0.052405
drinks,0.209268,0.01122,0.609756,0.0,0.090732,0.079024
entrance,0.002757,0.283942,0.15989,0.0,0.358374,0.195038
fruit,0.205479,0.088063,0.050881,0.0,0.607828,0.04775
spices,0.145069,0.191524,0.176854,0.0,0.096985,0.389568


In [5]:
# Persist the transition probability matrix to the relative path 'tpm.csv'.
tpm.to_csv('tpm.csv')

- generate customer behaviour : markov chain simulation

In [25]:
def customer_mcmc(tpm, max_len=50):
    """
    simulate customer behavior with markov chain model
    input :
        tpm : transition probability matrix (dataframe); row labels are the
            current state, column labels the next state, and 'entrance' must
            be a row label
        max_len : maximum number of states in the returned sequence
            (default 50); at least one transition is always drawn
    output : list of states starting with 'entrance'. It ends with 'checkout'
        unless max_len is reached first, in which case the sequence is
        returned as-is without a final 'checkout'.
    """
    # candidate next states never change, so compute them once
    states = list(tpm.columns)
    seq = ['entrance']  # initiate sequence starting with entrance
    while True:
        # weights for the next step are the row of the current state
        weights = tpm.loc[seq[-1]].tolist()
        next_state = random.choices(states, weights=weights)[0]
        seq.append(next_state)
        # stop on the absorbing state, or when the length cap is hit
        if next_state == 'checkout' or len(seq) >= max_len:
            break
    return seq

# Draw one simulated customer path and display it. No random seed is set in the
# visible cells, so the sequence will differ from run to run.
seq = customer_mcmc(tpm)
seq

['entrance',
 'spices',
 'dairy',
 'dairy',
 'dairy',
 'dairy',
 'dairy',
 'drinks',
 'fruit',
 'fruit',
 'checkout']