 ## Атрибуция трафика и Marketing Mix Modeling (MMM)
 ### Step-1
 *Модели атрибуции:*
 1. Last Touch 
 2. First Touch
 3. Linear
 4. Time Decay
 5. Position Based (U-Shaped) 
 
 *Атрибуция помогает:*
 * Рассчитать ROI (Return on Investment)
 * Оптимизировать маркетинговый бюджет
 * Анализировать покупательский путь

In [604]:
import pandas as pd

# Load the touch-level event log; the preview below lists its columns
# (week, user_id, channel, is_purchased, gmv).
df_event = pd.read_csv('./data/events.csv')
df_event.head()

Unnamed: 0,week,user_id,channel,is_purchased,gmv
0,1,7,bloggers,0,0
1,1,235,social_media,1,100
2,1,233,social_media,0,0
3,1,230,bloggers,0,0
4,1,220,bloggers,0,0


In [605]:
# Load the advertising spend per channel (columns shown below: channel, costs).
df_ad = pd.read_csv('./data/ad_costs.csv')
df_ad

Unnamed: 0,channel,costs
0,social_media,10000
1,mobile_ads,15000
2,bloggers,20000
3,context_ads,5000


In [606]:
import pandas as pd


def last_touch_attribution(events: pd.DataFrame) -> pd.DataFrame:
    """Attribute each purchase's GMV to the channel of the purchase event.

    Args:
        events: Event log with the columns ``week``, ``user_id``, ``channel``,
            ``is_purchased`` and ``gmv``. Rows with ``is_purchased == 1`` are
            the purchases; their own ``channel`` is the last touch.

    Returns:
        One row per ``(week, user_id)`` with a non-zero attributed GMV. The
        frame has integer columns ``week``, ``user_id``, one column per
        channel (alphabetical order) and ``total_gmv`` (sum of the channel
        columns). The four standard channels are always present, even when
        one of them has no purchases; any other channel found in ``events``
        gets its own column too.
    """
    # Keep a stable schema for the known channels and pick up any additional
    # channel from the data instead of failing on a hard-coded list.
    known_channels = ['bloggers', 'context_ads', 'mobile_ads', 'social_media']
    channels = sorted(set(known_channels) | set(events['channel'].dropna().unique()))

    purchases = events[events['is_purchased'] == 1]
    if purchases.empty:
        empty = pd.DataFrame(columns=['week', 'user_id', *channels, 'total_gmv'])
        return empty.astype(int)

    attribution = pd.pivot_table(purchases, values='gmv', index=['week', 'user_id'],
                                 columns=['channel'], aggfunc="sum")
    # Channels without any purchase are missing from the pivot; add them as NaN
    # so that they become zero columns below.
    attribution = attribution.reindex(columns=channels)
    attribution = attribution.rename_axis(None, axis=1).reset_index().fillna(0).astype(int)
    attribution['total_gmv'] = attribution[channels].sum(axis=1).astype(int)
    return attribution[attribution['total_gmv'] != 0]


def first_touch_attribution(events: pd.DataFrame) -> pd.DataFrame:
    """Attribute each purchase's GMV to the first channel of its journey.

    A journey is the run of one user's events that ends with a purchase
    (``is_purchased == 1``); that user's next event starts a new journey.
    The row order within each user is taken as the order of touches.

    Args:
        events: Event log with the columns ``week``, ``user_id``, ``channel``,
            ``is_purchased`` and ``gmv``.

    Returns:
        One row per ``(week, user_id)`` of a purchase with a non-zero
        attributed GMV. The frame has integer columns ``week``, ``user_id``,
        one column per channel (alphabetical order) and ``total_gmv``. The
        week is the week of the purchase event. The four standard channels
        are always present; any other channel found in ``events`` gets its
        own column too.
    """
    known_channels = ['bloggers', 'context_ads', 'mobile_ads', 'social_media']
    channels = sorted(set(known_channels) | set(events['channel'].dropna().unique()))

    df_event = events.copy()
    is_purchase = df_event['is_purchased'].eq(1)
    closed = is_purchase.astype(int)
    # Journey number = purchases made by this user strictly before the event,
    # so a purchase row still belongs to the journey it closes.
    journey = (closed.groupby(df_event['user_id']).cumsum() - closed).rename('journey')
    first_channel = (
        df_event.groupby([df_event['user_id'], journey])['channel'].transform('first')
    )

    # The GMV is recorded on the purchase row, so keep the purchase rows and
    # relabel them with the channel that opened the journey.
    df_event['channel'] = first_channel
    purchases = df_event[is_purchase]
    if purchases.empty:
        empty = pd.DataFrame(columns=['week', 'user_id', *channels, 'total_gmv'])
        return empty.astype(int)

    attribution = pd.pivot_table(purchases, values='gmv', index=['week', 'user_id'],
                                 columns=['channel'], aggfunc="sum")
    attribution = attribution.reindex(columns=channels)
    attribution = attribution.rename_axis(None, axis=1).reset_index().fillna(0).astype(int)
    attribution['total_gmv'] = attribution[channels].sum(axis=1).astype(int)
    return attribution[attribution['total_gmv'] != 0]


def linear_attribution(events: pd.DataFrame) -> pd.DataFrame:
    """Split each purchase's GMV equally across all touches of its journey.

    A journey is the run of one user's events that ends with a purchase
    (``is_purchased == 1``); that user's next event starts a new journey.
    Every touch receives ``gmv / number_of_touches``, so a channel touched
    several times in one journey accumulates several shares. The row order
    of ``events`` is taken as the order of touches.

    Args:
        events: Event log with the columns ``week``, ``user_id``, ``channel``,
            ``is_purchased`` and ``gmv``.

    Returns:
        One row per purchase, labelled with the index of the purchase event
        and sorted by ``week`` and ``user_id``. Columns are ``week``,
        ``user_id``, ``bloggers``, ``social_media``, ``context_ads``,
        ``mobile_ads``, any further channel found in ``events`` (in order of
        first appearance) and ``total_gmv`` (the purchase GMV as float).
        Channel values are rounded to two decimals.
    """
    known_channels = ['bloggers', 'social_media', 'context_ads', 'mobile_ads']
    extra_channels = [
        channel for channel in events['channel'].dropna().unique()
        if channel not in known_channels
    ]
    channels = known_channels + extra_channels
    column_of = {channel: position for position, channel in enumerate(channels)}

    # Single pass over the log: remember the channels every user has touched
    # since their last purchase and settle the journey on each purchase row.
    open_paths = {}
    share_rows = []
    for user_id, channel, is_purchased, gmv in zip(
        events['user_id'], events['channel'], events['is_purchased'], events['gmv']
    ):
        path = open_paths.setdefault(user_id, [])
        path.append(channel)
        if is_purchased != 1:
            continue
        shares = [0.0] * len(channels)
        for touched in path:
            shares[column_of[touched]] += gmv / len(path)
        share_rows.append(shares)
        path.clear()  # the next event of this user opens a new journey

    purchases = events.loc[events['is_purchased'] == 1, ['week', 'user_id', 'gmv']]
    result = pd.DataFrame(
        share_rows, index=purchases.index, columns=channels, dtype=float
    ).round(2)
    # Positional assignment: share_rows follows the purchase rows one-to-one.
    result.insert(0, 'user_id', purchases['user_id'].to_numpy())
    result.insert(0, 'week', purchases['week'].to_numpy())
    result['total_gmv'] = purchases['gmv'].to_numpy(dtype=float)

    return result.sort_values(by=['week', 'user_id'])


def u_shaped_attribution(events: pd.DataFrame, edge_weight: float = 0.4) -> pd.DataFrame:
    """Split each purchase's GMV with a position-based (U-shaped) rule.

    A journey is the run of one user's events that ends with a purchase
    (``is_purchased == 1``); that user's next event starts a new journey.
    The row order of ``events`` is taken as the order of touches. Weights
    per journey always add up to 100% of the purchase GMV:

    * one touch: the single touch receives everything;
    * two touches: each receives 50%;
    * three or more: the first and the last touch receive ``edge_weight``
      each, and the remaining ``1 - 2 * edge_weight`` is split equally
      between the touches in the middle.

    Args:
        events: Event log with the columns ``week``, ``user_id``, ``channel``,
            ``is_purchased`` and ``gmv``.
        edge_weight: Share given to the first and to the last touch of a
            journey with three or more touches (default 0.4, i.e. 40/20/40).

    Returns:
        One row per purchase, labelled with the index of the purchase event
        and sorted by ``week`` and ``user_id``. Columns are ``week``,
        ``user_id``, ``bloggers``, ``social_media``, ``context_ads``,
        ``mobile_ads``, any further channel found in ``events`` (in order of
        first appearance) and ``total_gmv`` (the purchase GMV as float).
        Channel values are rounded to two decimals.

    Raises:
        ValueError: If ``edge_weight`` is outside ``[0, 0.5]``.
    """
    if not 0 <= edge_weight <= 0.5:
        raise ValueError("edge_weight must be between 0 and 0.5")

    def position_weights(touches: int) -> list:
        """Return the GMV share of every position in a journey."""
        if touches == 1:
            return [1.0]
        if touches == 2:
            return [0.5, 0.5]
        middle = (1 - 2 * edge_weight) / (touches - 2)
        return [edge_weight] + [middle] * (touches - 2) + [edge_weight]

    known_channels = ['bloggers', 'social_media', 'context_ads', 'mobile_ads']
    extra_channels = [
        channel for channel in events['channel'].dropna().unique()
        if channel not in known_channels
    ]
    channels = known_channels + extra_channels
    column_of = {channel: position for position, channel in enumerate(channels)}

    # Single pass over the log: remember the channels every user has touched
    # since their last purchase and settle the journey on each purchase row.
    open_paths = {}
    share_rows = []
    for user_id, channel, is_purchased, gmv in zip(
        events['user_id'], events['channel'], events['is_purchased'], events['gmv']
    ):
        path = open_paths.setdefault(user_id, [])
        path.append(channel)
        if is_purchased != 1:
            continue
        shares = [0.0] * len(channels)
        for touched, weight in zip(path, position_weights(len(path))):
            shares[column_of[touched]] += gmv * weight
        share_rows.append(shares)
        path.clear()  # the next event of this user opens a new journey

    purchases = events.loc[events['is_purchased'] == 1, ['week', 'user_id', 'gmv']]
    result = pd.DataFrame(
        share_rows, index=purchases.index, columns=channels, dtype=float
    ).round(2)
    # Positional assignment: share_rows follows the purchase rows one-to-one.
    result.insert(0, 'user_id', purchases['user_id'].to_numpy())
    result.insert(0, 'week', purchases['week'].to_numpy())
    result['total_gmv'] = purchases['gmv'].to_numpy(dtype=float)

    return result.sort_values(by=['week', 'user_id'])


Формула для ROI:

$ROI = \frac{GMV - costs}{costs} \times 100\%$

In [615]:
import numpy as np

def roi(attribution: pd.DataFrame, ad_costs: pd.DataFrame) -> pd.DataFrame:
    """Build a per-channel ROI report from an attribution table.

    For every channel listed in ``ad_costs['channel']`` the matching column
    of ``attribution`` is summed and rounded to a whole number (``gmv``),
    and ``roi%`` is computed as ``(gmv - costs) / costs * 100``, rounded to
    a whole number. The input frames are left untouched; a copy of
    ``ad_costs`` with the two extra columns is returned.
    """
    report = ad_costs.copy()
    for name in report['channel'].tolist():
        rows = report['channel'].eq(name)
        attributed = round(attribution[name].sum())
        spend = report.loc[rows, 'costs']
        report.loc[rows, 'gmv'] = attributed
        report.loc[rows, 'roi%'] = ((attributed - spend) / spend * 100).round()
    return report

In [534]:
# Apply linear attribution to the whole event log.
df_linear = linear_attribution(df_event)

In [535]:
# Spot-check the attributed purchases of a single user (user_id == 7).
df_linear[df_linear['user_id']==7]

Unnamed: 0,week,user_id,bloggers,social_media,context_ads,mobile_ads,total_gmv
341,4,7,3.33,0.0,6.67,0.0,10.0
433,5,7,0.0,60.0,0.0,0.0,60.0
1005,11,7,5.0,5.0,0.0,0.0,10.0
2078,22,7,0.0,12.5,0.0,37.5,50.0
2274,24,7,50.0,0.0,0.0,0.0,50.0
2363,25,7,0.0,0.0,0.0,100.0,100.0
2449,26,7,0.0,0.0,200.0,0.0,200.0
3057,32,7,0.0,100.0,0.0,0.0,100.0
3349,35,7,0.0,16.67,0.0,33.33,50.0
4099,43,7,50.0,0.0,50.0,0.0,100.0


In [617]:
# ROI per channel, based on the linear attribution and the channel costs.
roi(df_linear, df_ad)

Unnamed: 0,channel,costs,gmv,roi%
0,social_media,10000,79320.0,694.0
1,mobile_ads,15000,59940.0,300.0
2,bloggers,20000,43070.0,116.0
3,context_ads,5000,53450.0,969.0


### Step-2: MMM (Marketing Mix Modeling)

In [618]:
import pandas as pd

# Load the sales data; the preview below lists its columns (day, category, sales).
df_sales = pd.read_csv('./model_data/sales.csv')
df_sales.head()

Unnamed: 0,day,category,sales
0,2022-01-01,Electronic,408.930079
1,2022-01-01,Fashion,515.645475
2,2022-01-01,Sport,353.163304
3,2022-01-01,Kids,486.977591
4,2022-01-02,Electronic,614.713931


In [619]:
# Load the daily advertising spend; the preview below shows one column per
# channel (TV, Website banners, SMM, Google Ads) next to `day`.
df_costs = pd.read_csv('./model_data/ad_costs.csv')
df_costs.head()

Unnamed: 0,day,TV,Website banners,SMM,Google Ads
0,2022-01-01,199.671415,99.430656,101.704142,226.61209
1,2022-01-02,130.430816,128.530231,98.259475,176.412739
2,2022-01-03,110.744869,68.405162,147.168987,92.783533
3,2022-01-04,185.734689,31.714003,2.126262,88.716796
4,2022-01-05,64.101503,57.374662,138.085609,31.907972


In [678]:
from sklearn.linear_model import LinearRegression
from typing import Tuple, Dict

import pandas as pd


def linreg_total_sales(
    sales: pd.DataFrame, ad_costs: pd.DataFrame
) -> Tuple[float, Dict[str, float]]:
    """
    Fit a linear regression of total daily sales on daily ad costs.

    Sales are summed per ``day`` (over all rows of ``sales``), joined to
    ``ad_costs`` on ``day`` (days missing from either frame are dropped by
    the inner join), and regressed on the four channel cost columns.

    Args:
        sales: Frame with at least the columns ``day`` and ``sales``.
        ad_costs: Frame with the column ``day`` and the cost columns
            ``TV``, ``SMM``, ``Website banners`` and ``Google Ads``.

    Returns:
        ``(r2, coef)`` where ``r2`` is the R2 score on the fitted data and
        ``coef`` maps each channel name to its coefficient, plus the key
        ``'intercept'``.

    Example (values are illustrative)::

        linreg_total_sales(df_sales, df_costs)
        (0.948, {
            'TV': 0.3,
            'SMM': 0.6,
            'Website banners': 1.0,
            'Google Ads': 0.5,
            'intercept': 452.0
            }
        )
    """
    columns = ['TV', 'SMM', 'Website banners', 'Google Ads']
    # Select the target column explicitly: the first positional argument of
    # GroupBy.sum is `numeric_only`, not a column name.
    daily_sales = sales.groupby('day', as_index=False)['sales'].sum()
    data = ad_costs.merge(daily_sales, on='day')
    X = data[columns].to_numpy()
    # A 1-D target makes coef_ a flat vector and intercept_ a scalar.
    y = data['sales'].to_numpy()
    reg = LinearRegression().fit(X, y)
    coef = dict(zip(columns, reg.coef_.tolist()))
    coef['intercept'] = float(reg.intercept_)
    r2 = float(reg.score(X, y))
    return r2, coef


def linreg_category_sales(
    sales: pd.DataFrame, ad_costs: pd.DataFrame
) -> Dict[str, Tuple[float, Dict[str, float]]]:
    """
    Fit one linear regression of daily sales on ad costs per sales category.

    ``sales`` is split by its ``category`` column and every part is passed,
    together with ``ad_costs``, to ``linreg_total_sales``.

    Args:
        sales: Frame with at least the columns ``day``, ``category`` and
            ``sales``.
        ad_costs: Daily ad costs, as expected by ``linreg_total_sales``.

    Returns:
        A dict mapping each category to the ``(r2, coef)`` tuple returned by
        ``linreg_total_sales`` for that category.

    Example (values are illustrative)::

        linreg_category_sales(df_sales, df_costs)
        {
            'Electronic': (0.948, {
                'TV': 0.3, 'SMM': 0.6,
                'Website banners': 1.0,
                'Google Ads': 0.5, 'intercept': 452.0
                }),
            'Fashion': (0.567, {
                'TV': 0.2, 'SMM': 0.3,
                'Website banners': 7.0,
                'Google Ads': 0.1,
                'intercept': 527.0
                }),
        }
    """
    # Use the `ad_costs` argument, not a notebook-level global, so that the
    # function works with whatever cost frame the caller passes in.
    return {
        category: linreg_total_sales(category_sales, ad_costs)
        for category, category_sales in sales.groupby('category')
    }


In [679]:
# Fit the regression on total daily sales across all categories.
linreg_total_sales(df_sales, df_costs)

(0.8475248162893827,
 {'TV': 0.3805954855349688,
  'SMM': 0.33240532272626777,
  'Website banners': 1.3648036894878135,
  'Google Ads': 0.4536083262513757,
  'intercept': 1870.702305403843})

In [666]:
# Fit one regression per sales category.
# NOTE(review): the output shown for this cell looks stale — its execution
# count (666) precedes the cell that defines the functions (678), and it shows
# a single 'TV' key holding an array instead of one coefficient per channel.
# Re-run the cell to refresh it.
linreg_category_sales(df_sales, df_costs)

{'Electronic': (0.591690277017956,
  {'TV': array([ 0.34243737, -0.00300553,  0.0766271 , -0.07718226]),
   'intercept': 480.25594633078884}),
 'Fashion': (0.6095692603845206,
  {'TV': array([-0.01536854,  0.57639751,  0.17654083, -0.04578732]),
   'intercept': 467.60754657080395}),
 'Kids': (0.6232808736587676,
  {'TV': array([ 0.06523498, -0.06453712, -0.10501504,  0.50738749]),
   'intercept': 477.88723287736855}),
 'Sport': (0.5505985300920266,
  {'TV': array([-0.01170833, -0.17644953,  1.2166508 ,  0.06919042]),
   'intercept': 444.9515796248817})}

### Сравнение

Атрибуция: 

**'+'** Конкретный путь клиента (но работает только для цифровых каналов)\
**'-'** Качество данных, шум (клиент может посещать сайт по другим причинам)

MMM:

**'+'** Учитывает все каналы (хорошо для макро-оптимизации)\
**'-'** Нужно больше данных (нужно учитывать сезонность, тренды, изменения цен)

### Минусы линейной регрессии для MMM
* Часто после определенного объема вложений эффективность канала начинает снижаться
* Не учитываются зависимости между рекламными каналами
* Рекламное воздействие может проявляться не сразу
* Нет учета сезонности

### Можно использовать
* Многомерную регрессию
* Модели временных рядов
* Нейронные сети
* Случайные леса и градиентный бустинг