### Referências 
### [Package ‘ChannelAttribution’](https://cran.r-project.org/web/packages/ChannelAttribution/ChannelAttribution.pdf), documento enviado pela Jéssica

### Aplicação da referência principal (enviada pela Jéssica) em python: [Markov Multi-Channel Attribution](https://stackoverflow.com/questions/51817219/channel-attribution-markov-chain-model-in-python)

### Referências, pacotes e ilustrações para explicar a aplicação e os resultados do método:
### [1- pip install](https://pypi.org/project/marketing-attribution-models/)
### [2- Marketing Multi-Channel Attribution model with R (part 1: Markov chains concept)](https://www.analyzecore.com/2016/08/03/attribution-model-r-part-1/)
### [3- Cadeia de Markov em python no github](https://github.com/franciscoicmc/simulacao/blob/master/Markov-PageRank.ipynb) 



In [1]:
import time
import pandas as pd
import numpy as np
import collections
from itertools import chain
import itertools
from scipy.stats import stats
import statistics 

### Problema do impacto da **propaganda** na **conversão**.
#### No dataframe a seguir **path** é o caminho que resulta em uma certa **probabilidade** de conversão **(conversions)** calculada pelo método de cadeia de Markov, em que cada termo (exemplo: google, instagram) que o compõe é o vértice do grafo que simboliza a cadeia de Markov associada.

In [9]:
# Load the raw channel-attribution spreadsheet and preview its first rows.
df = pd.read_excel("channel attribution example.xlsx")
df.head()

Unnamed: 0,path,conversions
0,google / organic,231
1,l.instagram.com / referral,228
2,(direct) / (none),204
3,m.facebook.com / referral,179
4,PK - Sapphire Brand Campaign,138


In [10]:
# Save the data as CSV; the file is read back with pd.read_csv before the model is run.
df.to_csv("channel.csv")

In [11]:
def unique(list1):
    """Return the distinct items of *list1*, keeping first-seen order.

    Parameters
    ----------
    list1 : iterable
        Items to de-duplicate. It is materialised once, so generators work.

    Returns
    -------
    list
        One entry per distinct item, in order of first appearance.
    """
    items = list(list1)
    try:
        # dict keys keep insertion order, so this is an O(n) ordered de-dup
        # instead of the O(n^2) "x not in list" scan.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys: fall back to the
        # equality-based scan so such inputs keep working.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

def split_fun(path):
    """Split a '>'-delimited path string into its list of steps.

    Surrounding whitespace of each step is left untouched.
    """
    separator = '>'
    steps = path.split(separator)
    return steps

def calculate_rank(vector):
    """Return the dense rank (0-based) of every element of *vector*.

    Equal values share a rank, and ranks follow the sorted order of the
    distinct values. The result has one rank per input element, in input order.
    """
    # Map each distinct value to its position in sorted order.
    rank_of = {value: position
               for position, value in enumerate(sorted(set(vector)))}
    return [rank_of[item] for item in vector]

def transition_matrix_func(import_data):
    """Build the first-order transition-probability matrix of the journeys.

    Each value of ``import_data['path']`` is a '>'-separated sequence of
    channels. Every journey is wrapped as ``start > ... > convert``, the steps
    are stripped of surrounding whitespace, and consecutive steps are counted
    as transitions.

    Parameters
    ----------
    import_data : pandas.DataFrame
        Must contain a string column ``'path'``. One row is one journey; no
        other column is read. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Square frame whose rows and columns are the sorted state names
        (channels plus ``'start'`` and ``'convert'``); the index is named
        ``'index'``. Each row holds the probabilities of moving from that
        state to every other state. Self-transitions are discarded before
        normalising, and the ``'convert'`` row is all zeros.
    """
    # Wrap each journey with the artificial start/convert states and flatten
    # every journey into one stream of stripped step names.
    journeys = 'start>' + import_data['path'] + '>convert'
    tokens = [step.strip() for journey in journeys for step in journey.split('>')]

    # Sorted distinct states and the integer code of every token, computed
    # once instead of re-deriving the unique list several times.
    states = sorted(set(tokens))
    position = {state: idx for idx, state in enumerate(states)}
    codes = np.fromiter((position[t] for t in tokens), dtype=np.intp,
                        count=len(tokens))

    # Count consecutive pairs. The flattened stream also pairs each 'convert'
    # with the next journey's 'start'; those counts live in the 'convert' row,
    # which is zeroed below, so they never reach the result.
    counts = np.zeros((len(states), len(states)), dtype=float)
    np.add.at(counts, (codes[:-1], codes[1:]), 1)

    # Repeated visits to the same channel are not treated as transitions.
    np.fill_diagonal(counts, 0)

    # Row-normalise; rows without outgoing transitions stay at zero instead
    # of producing 0/0 = NaN and a RuntimeWarning.
    row_totals = counts.sum(axis=1, keepdims=True)
    probabilities = np.divide(counts, row_totals,
                              out=np.zeros_like(counts),
                              where=row_totals > 0)

    x_df = pd.DataFrame(probabilities,
                        index=pd.Index(states, name='index'),
                        columns=states)
    # 'convert' is absorbing: it has no outgoing transitions.
    x_df.loc['convert', :] = 0
    return x_df

def simulation(trans,n):
    """Simulate one random walk over the transition matrix *trans*.

    The walk begins at 'start' and draws each next state with
    ``np.random.choice`` from the row of the current state. It stops as soon
    as 'convert' is drawn, or once it holds *n* states.

    Returns the list of visited states, 'start' included.
    """
    walk = [''] * n
    walk[0] = 'start'
    states = trans.columns

    for step in range(1, n):
        current = walk[step - 1]
        walk[step] = np.random.choice(states, 1, p=trans.loc[current, :])[0]
        if walk[step] == 'convert':
            # Conversion reached: drop the unused tail of the buffer.
            return walk[:step + 1]

    # No conversion within n states: the buffer is completely filled.
    return walk


def markov_chain(data_set,no_iteration=10,no_of_simulation=10000,alpha=5,max_path_length=1000):
    """Attribute conversions to channels by simulating a Markov chain.

    For each of *no_iteration* rounds the observed paths are resampled with
    replacement (one draw per observed conversion, weighted by each path's
    share of conversions), a transition matrix is built from the sample with
    ``transition_matrix_func`` and *no_of_simulation* random walks are drawn
    with ``simulation``. Per round, each channel's score (column
    ``removal_effects``) is the number of simulated walks that visited the
    channel divided by the total number of distinct-state visits over all
    walks; the scores of the channels are then rescaled so that they sum to
    the total number of conversions (``ass_conversion``).

    Parameters
    ----------
    data_set : pandas.DataFrame
        Needs the columns ``'path'`` ('>'-separated channels) and
        ``'conversions'`` (non-negative counts).
    no_iteration : int
        Number of resampling rounds (at least 1).
    no_of_simulation : int
        Number of random walks per round (at least 1).
    alpha : float
        Significance level in percent for the one-sided t-test.
    max_path_length : int
        Maximum number of states per simulated walk.

    Returns
    -------
    (final_df, final) : tuple of pandas.DataFrame
        ``final_df`` has one row per channel with the columns ``channel``,
        ``ass_conversion`` (mean over rounds, rescaled to sum to the total
        conversions), ``p_value`` (half of the two-sided p-value of a
        one-sample t-test against 0), ``confidence_status``, ``frequency``
        (number of rounds in which the channel appeared),
        ``standard_deviation`` and ``t_statistics``.
        ``final`` stacks the per-round tables (``path``, ``count``,
        ``removal_effects``, ``ass_conversion``).

    Raises
    ------
    ValueError
        If an iteration/simulation count is below 1, or the conversions are
        negative or sum to zero.

    Side effects
    ------------
    Prints the elapsed seconds of every round.
    """
    # Local import: does not rely on the deprecated ``scipy.stats.stats`` alias.
    from scipy.stats import ttest_1samp

    if no_iteration < 1:
        raise ValueError("no_iteration must be at least 1")
    if no_of_simulation < 1:
        raise ValueError("no_of_simulation must be at least 1")
    if (data_set['conversions'] < 0).any():
        raise ValueError("conversions must be non-negative")

    # Total conversions per distinct path; paths without conversions can
    # never be sampled, so they are dropped.
    import_dataset = data_set.groupby('path', as_index=False)['conversions'].sum()
    import_dataset = import_dataset[import_dataset['conversions'] > 0].reset_index(drop=True)
    total_conversions = int(import_dataset['conversions'].sum())
    if total_conversions == 0:
        raise ValueError("data_set contains no conversions")
    import_dataset['probability'] = import_dataset['conversions'] / total_conversions

    per_iteration = []
    for _ in range(no_iteration):
        start = time.time()

        # Bootstrap sample: one path per observed conversion.
        sampled_paths = np.random.choice(import_dataset['path'],
                                         size=total_conversions,
                                         p=import_dataset['probability'],
                                         replace=True)
        import_data = pd.DataFrame({'path': sampled_paths})
        import_data['conversions'] = 1

        tr_matrix = transition_matrix_func(import_data)
        channel_only = [state for state in tr_matrix.columns
                        if state not in ('start', 'convert')]

        # Count, for every state, the number of walks that visited it
        # (each state counted at most once per walk).
        counter = collections.Counter()
        for _ in range(no_of_simulation):
            counter.update(set(simulation(tr_matrix, max_path_length)))
        total_visits = sum(counter.values())

        df1 = pd.DataFrame({'path': list(counter.keys()),
                            'count': list(counter.values())})
        df1 = df1.sort_values('path').reset_index(drop=True)
        df1['removal_effects'] = df1['count'] / total_visits

        # Keep real channels only; copy so the assignments below do not
        # write into a view of the unfiltered frame.
        df1 = df1[df1['path'].isin(channel_only)].copy()
        df1['ass_conversion'] = (df1['removal_effects']
                                 / df1['removal_effects'].sum()
                                 * total_conversions)

        per_iteration.append(df1)
        print(time.time() - start)

    # DataFrame.append no longer exists in pandas 2.x; stack once with concat.
    final = pd.concat(per_iteration, ignore_index=True)

    # One-sided test per channel.
    #   H0: u = 0
    #   H1: u > 0
    rows = []
    for channel, values in final.groupby('path', sort=False)['ass_conversion']:
        x = values.to_numpy()
        test = ttest_1samp(x, 0)
        p_value = test.pvalue / 2  # halve the two-sided p-value

        if p_value <= alpha / 100:
            status = str(100 - alpha) + '% statistically confidence'
        else:
            status = str(100 - alpha) + '% statistically not confidence'

        rows.append({
            'channel': channel,
            'ass_conversion': x.mean(),
            'p_value': p_value,
            'confidence_status': status,
            # float keeps the column type produced by earlier versions
            'frequency': float(len(x)),
            # the sample standard deviation needs at least two observations
            'standard_deviation': x.std(ddof=1) if len(x) > 1 else float('nan'),
            't_statistics': test.statistic,
        })

    final_df = pd.DataFrame(rows, columns=['channel','ass_conversion','p_value','confidence_status','frequency','standard_deviation','t_statistics'])
    # Rescale the averaged attributions so they sum to the observed conversions.
    final_df['ass_conversion'] = (total_conversions * final_df['ass_conversion']
                                  / final_df['ass_conversion'].sum())

    return final_df,final

# Reload the exported CSV and run the attribution model on it.
import_dataset=pd.read_csv('channel.csv')

# data: per-channel summary table; dataset: stacked per-iteration tables.
data,dataset=markov_chain(import_dataset,no_iteration=10,no_of_simulation=10000,alpha=5)

10.319990873336792
10.768092393875122
12.088043451309204
10.632326126098633
11.044657230377197
10.47634243965149
11.269956350326538
10.61400032043457
11.260571002960205
10.740792036056519


In [12]:
# Display the first value returned by markov_chain (the per-channel summary).
data

Unnamed: 0,channel,ass_conversion,p_value,confidence_status,frequency,standard_deviation,t_statistics
0,(direct) / (none),1224.778811,2.262592e-19,95% statistically confidence,10.0,13.767197,281.912821
1,0e7307fc4f-EMAIL_CAMPAIGN_2018_08_13_09_04,1.74571,2.148308e-05,95% statistically confidence,10.0,0.751957,7.356682
2,10e1861a3d-EMAIL_CAMPAIGN_2018_10_07_11_03,7.338539,7.54605e-07,95% statistically confidence,10.0,2.097944,11.084555
3,1a106d9a00-EMAIL_CAMPAIGN_2018_11_06_05_23,2.241222,3.726883e-06,95% statistically confidence,10.0,0.77615,9.150439
4,3df8425c54-EMAIL_CAMPAIGN_2018_10_25_07_04_COP...,3.660202,2.847909e-05,95% statistically confidence,10.0,1.634686,7.095344
5,496348d35d-EMAIL_CAMPAIGN_2018_09_29_07_46,1.193217,0.0001716742,95% statistically confidence,10.0,0.677732,5.579097
6,49a41f9f72-EMAIL_CAMPAIGN_2018_09_14_06_54,3.445259,2.255534e-05,95% statistically confidence,10.0,1.493296,7.31103
7,4e1a75c7ff-EMAIL_CAMPAIGN_2018_10_16_05_14,3.36152,1.792898e-06,95% statistically confidence,10.0,1.065488,9.997465
8,4f7bef2225-EMAIL_CAMPAIGN_2018_11_02_06_53,2.711881,0.0001657227,95% statistically confidence,10.0,1.532748,5.606638
9,65ca25afa9-EMAIL_CAMPAIGN_2018_10_18_07_55,1.970873,0.0002930431,95% statistically confidence,10.0,1.20769,5.171371


In [13]:
# Let pandas display up to 1000 rows before truncating the output.
pd.set_option("display.max_rows", 1000)

### Considere uma probabilidade de conversões **residual**, definida como a probabilidade total de conversões menos a probabilidade que se obteria se não existisse o **path** (ou estratégia de propaganda específica); **removal_effects** é a razão entre a probabilidade de conversão **residual** e a **total**. Então, quanto maior o **removal_effects**, mais impacto esse **path** tem no número de conversões.

In [16]:
# Remove the row-display limit, then show the second value returned by
# markov_chain (the stacked per-iteration tables).
pd.set_option("display.max_rows", None)
dataset

Unnamed: 0,path,count,removal_effects,ass_conversion
0,(direct) / (none),7553,0.167405,1239.788956
1,0e7307fc4f-EMAIL_CAMPAIGN_2018_08_13_09_04,6,0.000133,0.984871
2,10e1861a3d-EMAIL_CAMPAIGN_2018_10_07_11_03,38,0.000842,6.237519
3,1a106d9a00-EMAIL_CAMPAIGN_2018_11_06_05_23,14,0.00031,2.298033
4,3df8425c54-EMAIL_CAMPAIGN_2018_10_25_07_04_COP...,41,0.000909,6.729955
5,496348d35d-EMAIL_CAMPAIGN_2018_09_29_07_46,1,2.2e-05,0.164145
6,49a41f9f72-EMAIL_CAMPAIGN_2018_09_14_06_54,29,0.000643,4.760212
7,4e1a75c7ff-EMAIL_CAMPAIGN_2018_10_16_05_14,19,0.000421,3.118759
8,4f7bef2225-EMAIL_CAMPAIGN_2018_11_02_06_53,17,0.000377,2.790469
9,65ca25afa9-EMAIL_CAMPAIGN_2018_10_18_07_55,10,0.000222,1.641452
