# Marketing Channel Attribution with Markov Chains

<!-- TODO: add a cover photo -->

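# Goal

Estimate how much each marketing channel contributes to conversions: every cookie's touchpoints are collapsed into a path (`Start` → channels → `Conversion` or `Null`), and those paths are modeled as a Markov chain.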

This notebook is based on this article: https://towardsdatascience.com/marketing-channel-attribution-with-markov-chains-in-python-part-2-the-complete-walkthrough-733c65b23323

In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from collections import defaultdict

%matplotlib inline

# Data Load

In [4]:
# Load the raw attribution data; the path is relative to the notebook's working directory.
data_df = pd.read_csv('attribution_data.csv')

In [5]:
# Preview the first rows of the raw data.
data_df.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel
0,00000FkCnDfDDf0iC97iC703B,2018-07-03T13:02:11Z,impression,0,0.0,Instagram
1,00000FkCnDfDDf0iC97iC703B,2018-07-17T19:15:07Z,impression,0,0.0,Online Display
2,00000FkCnDfDDf0iC97iC703B,2018-07-24T15:51:46Z,impression,0,0.0,Online Display
3,00000FkCnDfDDf0iC97iC703B,2018-07-29T07:44:51Z,impression,0,0.0,Online Display
4,0000nACkD9nFkBBDECD3ki00E,2018-07-03T09:44:57Z,impression,0,0.0,Paid Search


In [9]:
# Check column dtypes and non-null counts.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586737 entries, 0 to 586736
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   cookie            586737 non-null  object 
 1   time              586737 non-null  object 
 2   interaction       586737 non-null  object 
 3   conversion        586737 non-null  int64  
 4   conversion_value  586737 non-null  float64
 5   channel           586737 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 26.9+ MB


In [8]:
# Summary statistics for the numeric columns.
data_df.describe()

Unnamed: 0,conversion,conversion_value
count,586737.0,586737.0
mean,0.030063,0.187871
std,0.17076,1.084498
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,8.5


# EDA

In [34]:
# List the distinct marketing channels present in the data.
data_df['channel'].unique()

array(['Paid Search', 'Online Video', 'Online Display', 'Instagram',
       'Facebook'], dtype=object)

# Data Preprocessing

In [15]:
# Order rows by cookie (descending) and, within each cookie, by time (ascending).
# `time` is still a string here; ISO-8601 timestamps in a uniform format sort chronologically.
data_df = data_df.sort_values(by=['cookie', 'time'], ascending=[False, True])

In [18]:
# Number each cookie's rows 1, 2, 3, ... in their current (time-sorted) order.
data_df['visit_order'] = data_df.groupby('cookie').cumcount().add(1)

In [19]:
# Preview the sorted data with the new `visit_order` column.
data_df.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel,visit_order
586736,ooooohAFofEnonEikhAi3fF9o,2018-07-14T17:17:12Z,impression,0,0.0,Paid Search,1
586734,ooooiBh70D3k3BfAhDFfii9h7,2018-07-03T12:57:25Z,impression,0,0.0,Paid Search,1
586735,ooooiBh70D3k3BfAhDFfii9h7,2018-07-19T08:17:59Z,impression,0,0.0,Online Video,2
586731,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-06T23:30:38Z,impression,0,0.0,Online Display,1
586732,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-12T23:50:45Z,impression,0,0.0,Online Display,2


In [21]:
# Collapse each cookie's rows into a list of its distinct channels, in order of first
# appearance. Repeat touches of the same channel are dropped by `unique()`.
df_paths = (
    data_df
    .groupby('cookie')['channel']
    .agg(lambda channels: channels.unique().tolist())
    .reset_index()
)

In [29]:
# Preview the first per-cookie path.
# NOTE(review): the saved output shows a `conversion` column that the preceding cell does
# not create, so it looks stale from out-of-order execution. Re-run from a fresh kernel.
df_paths.head(1)

Unnamed: 0,cookie,channel,conversion
0,00000FkCnDfDDf0iC97iC703B,"[Instagram, Online Display]",0


In [30]:
# Keep only the final row per cookie (rows are already time-sorted within each cookie),
# so `conversion` reflects the outcome of that cookie's last interaction.
df_last_interaction = (
    data_df[['cookie', 'conversion']]
    .drop_duplicates(subset='cookie', keep='last')
)

In [31]:
# Preview the last-interaction lookup table.
df_last_interaction.head(1)

Unnamed: 0,cookie,conversion
586736,ooooohAFofEnonEikhAi3fF9o,0


In [32]:
# Attach each cookie's last-interaction conversion flag to its path (left join on cookie).
df_paths = df_paths.merge(df_last_interaction, on='cookie', how='left')

In [33]:
# Inspect the merged frame.
df_paths.head(1)

Unnamed: 0,cookie,channel,conversion_x,conversion_y
0,00000FkCnDfDDf0iC97iC703B,"[Instagram, Online Display]",0,0


In [43]:
# Remove the stale `conversion_x` column. It only exists when the merge cell was run on a
# frame that already had a `conversion` column (pandas then suffixes both as _x/_y).
# On a clean top-to-bottom run the merge yields a single `conversion` column, so the
# absence of `conversion_x` is tolerated instead of raising KeyError.
df_paths = df_paths.drop(columns='conversion_x', errors='ignore')

In [44]:
# Restore the plain `conversion` name for the merged-in flag. `rename` ignores labels that
# are not present, so this is a no-op when the column is already called `conversion`.
df_paths = df_paths.rename({'conversion_y': 'conversion'}, axis='columns')

In [45]:
# Confirm the frame now has exactly cookie / channel / conversion.
df_paths.head(1)

Unnamed: 0,cookie,channel,conversion
0,00000FkCnDfDDf0iC97iC703B,"[Instagram, Online Display]",0


In [47]:
# Check row count (one row per cookie) and dtypes of the path table.
df_paths.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240108 entries, 0 to 240107
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   cookie      240108 non-null  object
 1   channel     240108 non-null  object
 2   conversion  240108 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 15.4+ MB


In [55]:
# Build each journey as one comma-separated string. The shared "Start, <channels>" prefix
# is computed once, then terminated with Null (conversion == 0) or Conversion (otherwise).
path_prefix = 'Start, ' + df_paths['channel'].apply(', '.join)
df_paths['path'] = np.where(
    df_paths['conversion'] == 0,
    path_prefix + ', Null',
    path_prefix + ', Conversion',
)

In [56]:
# Preview the string-encoded paths.
df_paths.head()

Unnamed: 0,cookie,channel,conversion,path
0,00000FkCnDfDDf0iC97iC703B,"[Instagram, Online Display]",0,"Start, Instagram, Online Display, Null"
1,0000nACkD9nFkBBDECD3ki00E,[Paid Search],0,"Start, Paid Search, Null"
2,0003EfE37E93D0BC03iBhBBhF,[Paid Search],0,"Start, Paid Search, Null"
3,00073CFE3FoFCn70fBhB3kfon,[Instagram],0,"Start, Instagram, Null"
4,00079hhBkDF3k3kDkiFi9EFAD,[Paid Search],0,"Start, Paid Search, Null"


In [57]:
# Turn each comma-separated path string into a list of states.
# This relies on no state name containing the ', ' separator.
df_paths['path'] = df_paths['path'].str.split(', ')

In [58]:
# Preview the list-encoded paths.
df_paths.head()

Unnamed: 0,cookie,channel,conversion,path
0,00000FkCnDfDDf0iC97iC703B,"[Instagram, Online Display]",0,"[Start, Instagram, Online Display, Null]"
1,0000nACkD9nFkBBDECD3ki00E,[Paid Search],0,"[Start, Paid Search, Null]"
2,0003EfE37E93D0BC03iBhBBhF,[Paid Search],0,"[Start, Paid Search, Null]"
3,00073CFE3FoFCn70fBhB3kfon,[Instagram],0,"[Start, Instagram, Null]"
4,00079hhBkDF3k3kDkiFi9EFAD,[Paid Search],0,"[Start, Paid Search, Null]"


In [59]:
# Keep only the cookie identifier and its state path; the helper columns are no longer needed.
df_paths = df_paths.loc[:, ['cookie', 'path']]

In [60]:
# Preview the final path table.
df_paths.head(1)

Unnamed: 0,cookie,path
0,00000FkCnDfDDf0iC97iC703B,"[Start, Instagram, Online Display, Null]"


# Markov Chains

## 1. Generate the Transition Matrix 

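Calculate the transition probabilities between all states in our state space (`Start`, each channel, `Null`, and `Conversion`). We begin by counting the total conversions and the base conversion rate.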

In [62]:
# NOTE: despite the name, this is a pandas Series whose elements are lists of states.
list_of_paths = df_paths['path']

In [63]:
# Count every occurrence of the 'Conversion' state across all paths.
total_conversions = sum(
    state == 'Conversion'
    for path in df_paths['path']
    for state in path
)

In [64]:
# Conversions per path (one path per cookie): the overall conversion rate.
base_conversion_rate = total_conversions / len(list_of_paths)

## 2. Calculate Removal Effects