## SPICED Academy, Week 10: Supermarket Churn with Markov Chains

This project demonstrates in-depth knowledge of Markov Chains in the context of a real-world business problem. 

A probability matrix was dynamically generated from a labeled dataset of customers that documented the time spent in each supermarket section up to and including checkout. To achieve the results seen below, the original CSV was processed in a separate file and accessed by importing the Python class "Supermarket". In the exploratory data analysis phase, specific customer segments, trends in movement between supermarket sections, and revenue data were generated. A Markov Chain was then hardcoded in Python and used to make predictions from randomly generated customer data.

All code will be progressively reworked to use NumPy wherever possible.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
from Supermarket import Supermarket

In [None]:
s = Supermarket()

In [None]:
supermarket = s.supermarket

In [None]:
# One DataFrame per day: stack that day's per-customer frames on top of each other.
super_list = [pd.concat(customer_frames) for customer_frames in supermarket]

#### Calculate the total number of customers in each section over time

In [None]:
# For every day, count the rows recorded per (timestamp, location) pair.
count_list = []

for frame in super_list:
    # Stored back on the daily frame, so later cells see the categorical column too.
    frame['customer_no'] = frame['customer_no'].astype('category')
    per_section = frame.groupby(['timestamp', 'location'])['customer_no'].count()
    count_list.append(per_section)

#### Display the number of customers at checkout over time

In [None]:
# One figure per day: customers recorded at the checkout, totalled per hour.
for day in count_list:
    # Pivot locations into columns and keep only the checkout counts.
    checkout_counts = day.unstack(1)['checkout']
    # Sum the per-timestamp counts; .count() would only count how many
    # timestamps had a non-missing checkout value, not how many customers.
    plt.plot(checkout_counts.resample('h').sum())
    plt.show()

#### Calculate the time each customer spent in the market

In [None]:
# Time between each customer's first and last recorded timestamp,
# collected as one list of durations per day.
diffs_all = []

for day in supermarket:
    diffs = []
    for df in day:
        # 'timestamp' lives in the index; reset_index() exposes it as a column.
        timestamps = df.reset_index()['timestamp']
        diffs.append(timestamps.max() - timestamps.min())
    diffs_all.append(diffs)

In [None]:
diffs_all[0][0:5] # Time spent in store by first 5 customers on Monday

#### Calculate the total number of customers present in the supermarket over time.

In [None]:
# Per day: rows per timestamp and section (checkout rows excluded),
# plus a 'total' column summing across the sections.
new_count_list = []

for frame in super_list:
    frame['customer_no'] = frame['customer_no'].astype('category')
    shoppers = frame[frame['location'] != 'checkout']
    per_section = (
        shoppers.groupby(['timestamp', 'location'])['customer_no']
        .count()
        .unstack(1)
    )
    per_section['total'] = per_section.sum(axis=1)
    new_count_list.append(per_section)

In [None]:
new_count_list[0]

In [None]:
super_list[0]

#### Our business managers think that the first section customers visit follows a different pattern than the following ones. Plot the distribution of customers across their first visited section versus the following sections (treat all sections visited after the first as “following”).

In [None]:
# Go through each customer separately in s.supermarket
# Label first 'location' as 'f{location_first} first'
    # set local variable 'location_first'
    # if 'location' == 'location_first': // df['is first'] = 'f{location_first} first'
    # else: 
        # for index, row in df.iterrows():
            # df['is first'] = 'f{location_first} next'
# Label all subsequent 'location' as 'f{location_first} next'
# Add new column with new data
# Concatenate all customers per day
# Make one pie chart for 'f{location} first' vs 'f{location} next', per day
# Visualize each pie chart

In [None]:
sam = s.supermarket[0][0]

In [None]:
sam

In [None]:
sam = sam.reset_index()
location_first = sam['location'][0]
location_first

In [None]:
sam['is first'] = 0

In [None]:
# Mark every row whose location equals the customer's first location.
# The column is cast to object first because it was initialised with the integer 0
# and is about to hold strings as well.
sam['is first'] = sam['is first'].astype(object)
# Assign through .loc on the frame itself: the chained form
# sam['is first'][index] = ... may write to a temporary copy and be lost.
sam.loc[sam['location'] == location_first, 'is first'] = f'{location_first} first'

In [None]:
sam

In [None]:
# Every row still holding the placeholder 0 becomes '<first location> next'.
sam['is first'] = sam['is first'].astype(object)
# Single .loc assignment instead of chained indexing, which may write to a temporary copy.
sam.loc[sam['is first'] == 0, 'is first'] = f'{location_first} next'

In [None]:
sam

In [None]:
# For every customer, tag the leading run of rows spent in their first section
# as '<section> first'; all other rows keep the placeholder 0, which the next
# cell replaces with '<section> next'.
for day_number, day in enumerate(supermarket):
    for df in day:
        # Positional lookup: the frames are indexed by timestamp, not by 0..n-1.
        location_first = df['location'].iloc[0]
        # True from the first row whose location differs onwards, so a later
        # return to the first section is not counted as "first" again.
        left_first = (df['location'] != location_first).to_numpy().cumsum() > 0
        labels = np.full(len(df), 0, dtype=object)
        labels[~left_first] = f'{location_first} first'
        # Whole-column assignment instead of df['is first'][index] = ...,
        # which may write to a temporary copy.
        df['is first'] = labels
    # Report the day by position; interpolating `day` itself would dump
    # every customer DataFrame of that day into the output.
    print(f'********day {day_number} FINISHED********')

In [None]:
# Replace the remaining placeholder 0 in 'is first' with '<first section> next'.
for day_number, day in enumerate(supermarket):
    for df in day:
        # Positional lookup: the frames are indexed by timestamp, not by 0..n-1.
        location_first = df['location'].iloc[0]
        # The column mixes the integer placeholder with string labels.
        df['is first'] = df['is first'].astype(object)
        unlabeled = (df['is first'] == 0).to_numpy()
        # .loc on the frame itself; chained indexing may write to a temporary copy.
        df.loc[unlabeled, 'is first'] = f'{location_first} next'
    # Report the day by position; interpolating `day` itself would dump
    # every customer DataFrame of that day into the output.
    print(f'*********day {day_number} FINISHED********')

In [None]:
supermarket[0]

In [None]:
# One DataFrame per day, built from the (now labelled) per-customer frames.
supermarket_list = [pd.concat(customer_frames) for customer_frames in supermarket]

In [None]:
supermarket_list[0].groupby('is first')['location'].count()

In [None]:
total_supermarket_df = pd.concat(supermarket_list)

In [None]:
grouped_by = total_supermarket_df.groupby('is first')['location'].count()

In [None]:
grouped_by

In [None]:
# Row counts per label as (first, next) pairs.
# NOTE(review): these look like a hard-coded snapshot of `grouped_by` - confirm
# they still match after any change to the data or the labelling.
first_vs_next = {
    'dairy': (11789, 17052),
    'drinks': (3566, 6339),
    'fruit': (8837, 19722),
    'spices': (2653, 13473),
}
explode = (0.1, 0)  # offset the first slice ('<section> first') from the centre

# One pie chart per section, all drawn with identical settings.
for section, sizes in first_vs_next.items():
    labels = [f'{section} first', f'{section} next']

    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

    plt.show()

#### Estimate the total revenue for a customer value using the following table:

In [None]:
# Give every recorded row a flat 'time_spent' of 30; a later cell divides the
# per-section sums by 60.
for frame in supermarket_list:
    frame['time_spent'] = 30

#multiply each sum by value in table
    # for index, row in df.iterrows: //if index[1] == 'spices':
        #round((seconds * 3.0) / 60), 2)
# append sums to table

In [None]:
# take .sum() of the 30's by 'location' using groupby
# Per day: 'time_spent' summed per location and divided by 60.
# Iterates the list itself instead of range(5), so the number of days is not hard-coded.
min_groups = [
    frame.groupby('location')['time_spent'].sum() / 60
    for frame in supermarket_list
]

In [None]:
min_groups[0]

In [None]:
#multiply each sum by value in table
for num in range(5):
    min_groups[num]['dairy'] = (min_groups[num]['dairy'] * 5)
    min_groups[num]['drinks'] = (min_groups[num]['drinks'] * 6)
    min_groups[num]['fruit'] = (min_groups[num]['fruit'] * 4)
    min_groups[num]['spices'] = (min_groups[num]['spices'] * 3)   

In [None]:
revenue_totals = min_groups

In [None]:
revenue_totals[0]

In [None]:
from functools import reduce

# Fold the per-day Series into one by label-aligned addition; fill_value=0
# treats a label missing from one of the two operands as 0.
d = reduce(lambda x, y: x.add(y, fill_value=0), revenue_totals)

#### Which is the most profitable section according to your data?

In [None]:
# Take the max of previous answer

In [None]:
d.idxmax(axis=0, skipna=True)

### 5. Simulate a single customer
#### Write a program that uses a transition probability matrix to simulate the journey of a single customer through the market. Use a Markov model to represent the state of a customer. Use one-minute time intervals for the transitions. Once a customer reaches the checkout, consider them “churned” – do not simulate them any longer.
• Print all state changes
• Set the transition probabilities manually
• Later add the probabilities extracted from the data
• Extend the model to use separate probabilities for the first location
• Implement the customer as a class

In [None]:
sam

In [None]:
sam['shift'] = sam['location'].shift(1)

In [None]:
# The first row has no previous location (shift(1) leaves it missing); fill it with 'checkout'.
# sam.iloc[0]['shift'] = ... assigned into a temporary copy of the row and never
# reached `sam`, so write through .loc on the frame instead.
sam.loc[sam.index[0], 'shift'] = 'checkout'

In [None]:
sam

In [None]:
sam1 = sam

In [None]:
sam1

In [None]:
real_count = sam1.reset_index().groupby(['shift', 'location'])['timestamp'].count().unstack()

In [None]:
real_count

In [None]:
real_proba = real_count.apply(lambda row: row / row.sum(), axis=1)

In [None]:
product = len(real_proba)*len(real_proba.columns)

In [None]:
real_proba = real_count.apply(lambda row: row / row.sum(), axis=1)

In [None]:
real_proba

In [None]:
class Customer(Supermarket):
    """Skeleton for a customer model; the lookup methods are not implemented yet.

    Every method below is a placeholder that raises ``NotImplementedError``
    until real logic is written.
    """

    def __init__(self):
        # NOTE(review): Supermarket.__init__ is not invoked here (unchanged from
        # the previous version) - confirm whether it should be.
        # Attributes start unset; merely reading them, as before, raised AttributeError.
        self.location = None
        self.id = None
        self.timestamp = None

    def get_location(self, timestamp):
        """Placeholder: return the customer's location for ``timestamp``."""
        raise NotImplementedError

    def get_first(self, id):
        """Placeholder: return the first location for customer ``id``."""
        raise NotImplementedError

    def get_following(self, id, location):  # this will run, or be part of, the simulation
        """Placeholder: return the location following ``location`` for customer ``id``."""
        raise NotImplementedError

    def get_proba(self):
        """Placeholder: return the transition probabilities."""
        raise NotImplementedError

#### We would like to analyze how customers switch between sections of the supermarket. Calculate and visualize the probability of transitions from section A to B by counting all observed transitions.

#### E.g. if a customer was in the fruit section, later in the spices section, and went back to fruit, we observe two transitions: fruit → spices and spices → fruit .
The checkout is a special terminal state, from which customers cannot leave.

* Draw a state diagram
* Display the transition probability matrix
* Visualize the probabilities using an appropriate library (consider NetworkX or PyGraphViz)

In [None]:
def get_proba(supermarket):
    """Return per-customer transition probability tables, grouped by day.

    Parameters
    ----------
    supermarket : iterable of iterables of DataFrame
        One inner iterable per day, one DataFrame per customer. Each frame
        needs a 'location' column and a 'timestamp' that ``reset_index()``
        turns into a column.

    Returns
    -------
    list of list of DataFrame
        For each customer, a table with rows 'old_location' and columns
        'location' in which every row is divided by its own sum. Transitions
        that were not observed stay NaN.
    """
    probs = []
    for day in supermarket:
        prob_day = []
        for customer in day:
            # assign() works on a copy, so the caller's frame does not gain an
            # 'old_location' column as a side effect.
            transitions = customer.assign(old_location=customer['location'].shift(1))
            counts = (
                transitions.reset_index()
                .groupby(['old_location', 'location'])['timestamp']
                .count()
                .unstack()
            )
            # Normalise each row so the observed transitions sum to 1.
            prob_day.append(counts.div(counts.sum(axis=1), axis=0))
        probs.append(prob_day)
    return probs

In [None]:
def get_counts(supermarket):
    """Return per-customer transition count tables, grouped by day.

    Parameters
    ----------
    supermarket : iterable of iterables of DataFrame
        One inner iterable per day, one DataFrame per customer. Each frame
        needs a 'location' column and a 'timestamp' that ``reset_index()``
        turns into a column.

    Returns
    -------
    list of list of DataFrame
        For each customer, a table with rows 'old_location' and columns
        'location' holding the number of observed transitions. Transitions
        that were not observed stay NaN.
    """
    counts = []
    for day in supermarket:
        count_day = []
        for customer in day:
            # assign() works on a copy, so the caller's frame does not gain an
            # 'old_location' column as a side effect.
            transitions = customer.assign(old_location=customer['location'].shift(1))
            count_day.append(
                transitions.reset_index()
                .groupby(['old_location', 'location'])['timestamp']
                .count()
                .unstack()
            )
        counts.append(count_day)
    return counts

In [None]:
probs = get_proba(supermarket)

In [None]:
print(probs[0][2])

#### Calculate and visualize the probability of transitions from section A to B by counting all observed transitions.

In [None]:
data = {'spices_old':[0,0,0,0,0], 'dairy_old':[0,0,0,0,0], 'drinks_old':[0,0,0,0,0], 'fruit_old':[0,0,0,0,0]}

In [None]:
probabilities = pd.DataFrame.from_dict(data, orient='index', columns = ['spices', 'dairy', 'drinks', 'fruit', 'checkout'])

In [None]:
probabilities

In [None]:
counts = pd.DataFrame.from_dict(data, orient='index', columns = ['spices', 'dairy', 'drinks', 'fruit', 'checkout'])

In [None]:
counts

In [None]:
all_counts = get_counts(supermarket)

In [None]:
all_counts[0][0]

In [None]:
# Stack each day's per-customer count tables into a single table per day.
all_counts1 = [pd.concat(customer_tables) for customer_tables in all_counts]

In [None]:
all_counts1 = pd.concat(all_counts1)

In [None]:
all_sum = all_counts1.groupby('old_location').sum()

In [None]:
all_sum_prob = all_sum.apply(lambda row: row / row.sum(), axis=1)

#### Display the transition probability matrix

In [None]:
all_sum_prob

In [None]:
# Checkout is terminal: probability 1 of staying at checkout, 0 of going anywhere else.
# Built from the column labels so it does not depend on the columns' order.
all_sum_prob.loc['checkout'] = [1 if column == 'checkout' else 0 for column in all_sum_prob.columns]

In [None]:
all_sum_prob

In [None]:
all_sum_prob.loc[:,'Total'] = all_sum_prob.sum(axis=1)

In [None]:
all_sum_prob

In [None]:
new = sam.loc[(sam['location']=='fruit')]

In [None]:
new.reset_index()['timestamp']

In [None]:
all_sum_prob.drop('Total', axis=1, inplace=True)

In [None]:
all_sum_prob

In [None]:
# Take the section order from the matrix columns: each weight list in PROB is a
# row of all_sum_prob, so random.choices(LOCATIONS, PROB[...]) only pairs every
# weight with the right section when both use the same order.
LOCATIONS = list(all_sum_prob.columns)

In [None]:
PROB = all_sum_prob.to_dict('split')

In [None]:
PROB.pop('columns', None)

In [None]:
PROB = dict(zip(PROB['index'], PROB['data']))

In [None]:
PROB

In [None]:
import random

In [None]:
class _Customer:
    """A simulated customer moving between sections as a Markov chain.

    Relies on the module-level ``LOCATIONS`` (section names) and ``PROB``
    (current section -> list of transition weights, in ``LOCATIONS`` order).

    Parameters
    ----------
    ci_d
        Identifier of the customer.
    location : optional
        Section the customer starts in. When omitted, a starting section is
        drawn uniformly at random from every entry of ``LOCATIONS`` except
        'checkout'.
    """

    def __init__(self, ci_d, location=None):
        self.ci_d = ci_d
        if location is None:
            # The starting section used to be read from an unrelated global
            # named `location`; choose one explicitly instead.
            location = random.choice([loc for loc in LOCATIONS if loc != 'checkout'])
        self.location = location

    def __repr__(self):
        return f'{self.ci_d}, {self.location}, {self.checked_out}'

    @property
    def checked_out(self):
        """True once the customer has reached the 'checkout' section."""
        return self.location == 'checkout'

    def move(self):
        """Advance one step using the weights of the current section.

        Does nothing once the customer has checked out, because checkout is
        the terminal state.
        """
        if self.checked_out:
            return
        # The next section depends on where the customer is now, not on a
        # freshly drawn random section.
        self.location = random.choices(LOCATIONS, PROB[self.location])[0]

In [None]:
c = _Customer(1)

In [None]:
c

### Simulating the movement of a hypothetical customer

In [None]:
while not c.checked_out:
    c.move()
    print(c)

In [None]:
# One manual transition starting from the fruit section.
location_last = 'fruit'
# random.choices returns a list; take its single element so `location`
# (and `location_last` below) hold a section name rather than a one-item list.
location = random.choices(LOCATIONS, PROB[location_last])[0]
print(location)
location_last = location

In [None]:
population = [_Customer(i) for i in range(100)]

In [None]:
# Walk every simulated customer through the store until they reach checkout,
# printing their state after each move.
for shopper in population:
    while not shopper.checked_out:
        shopper.move()
        print(shopper)