# `목적`: Last Click Attribution 분석을 위한 데이터 추출

In [1]:
import pandas as pd
import plotly.express as px

### 데이터 로드

In [2]:
# Load the raw event log from a path relative to the working directory.
# A forward slash is used as the separator because it is accepted on Windows
# as well as on POSIX systems, whereas a backslash-separated path only
# resolves on Windows.
attr = pd.read_csv('data-files/07-attr.csv')
attr.head()

Unnamed: 0,source,campaign,user_id,event,timestamp,event_id
0,,,2,purchase,2023-01-01 00:12:44,1
1,,,1,visit,2023-01-01 05:18:27,2
2,,,1,visit,2023-01-01 07:12:20,3
3,Mata,B,2,visit,2023-01-01 09:20:32,4
4,Gogle,A,2,visit,2023-01-01 11:28:19,5


In [6]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
attr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   source     6 non-null      object
 1   campaign   6 non-null      object
 2   user_id    20 non-null     int64 
 3   event      20 non-null     object
 4   timestamp  20 non-null     object
 5   event_id   20 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 1.1+ KB


None

In [8]:
# Parse the timestamp strings into datetime values so that they can be
# ordered and subtracted chronologically in the steps below.
attr = attr.assign(timestamp=pd.to_datetime(attr['timestamp']))

In [9]:
# Order the log by user first, then chronologically within each user.
attr = attr.sort_values(by=['user_id', 'timestamp'])
attr.head()

Unnamed: 0,source,campaign,user_id,event,timestamp,event_id
1,,,1,visit,2023-01-01 05:18:27,2
2,,,1,visit,2023-01-01 07:12:20,3
5,,,1,purchase,2023-01-01 13:54:43,6
6,Mata,B,1,visit,2023-01-02 01:28:15,7
8,,,1,purchase,2023-01-02 08:32:18,9


### Last Click Attribution

(1) 캠페인 경유 방문이 구매로 이어진 경우

In [10]:
# Extract the purchase events.
attr_purc = attr.loc[attr['event'] == 'purchase']

In [11]:
# Extract the visit events that arrived through a campaign.
# The event type is checked explicitly: without it, any non-visit row that
# carries a campaign value (e.g. a tagged purchase) would be cross-matched
# with itself at a time gap of 0 and be picked as its own "last click".
attr_visit = attr[(attr['event'] == 'visit') & ~attr['campaign'].isna()]

In [26]:
# Pair every purchase event with every campaign visit event (cross join).
# Purchase columns receive the "_x" suffix, visit columns the "_y" suffix.
attr_cross = attr_purc.merge(attr_visit, how='cross')

In [27]:
# Keep only the rows in which the purchasing user and the visiting user match.
same_user = attr_cross['user_id_x'] == attr_cross['user_id_y']
attr_cross = attr_cross.loc[same_user]

In [28]:
# Keep only the visits that happened at or before the purchase they are
# paired with.
attr_cross = attr_cross.loc[attr_cross['timestamp_y'] <= attr_cross['timestamp_x']]

In [29]:
# For each purchase, keep only the campaign visit closest in time to it.
# attr_cross is itself the result of boolean filtering, so the gap column is
# added with assign() (which returns a new frame) instead of item assignment,
# which can trigger pandas' SettingWithCopyWarning on such a selection.
attr_cross = attr_cross.assign(
    td=(attr_cross['timestamp_x'] - attr_cross['timestamp_y']).dt.total_seconds()
)

# Row label of the smallest purchase-to-visit gap within each purchase event.
idx = attr_cross.groupby('event_id_x')['td'].idxmin()

attr_cross.loc[idx]

Unnamed: 0,source_x,campaign_x,user_id_x,event_x,timestamp_x,event_id_x,source_y,campaign_y,user_id_y,event_y,timestamp_y,event_id_y,td
29,,,2,purchase,2023-01-02 04:18:51,8,Gogle,A,2,visit,2023-01-01 11:28:19,5,60632.0
6,,,1,purchase,2023-01-02 08:32:18,9,Mata,B,1,visit,2023-01-02 01:28:15,7,25443.0
15,,,1,purchase,2023-01-03 16:00:08,19,Gogle,A,1,visit,2023-01-03 03:47:20,16,43968.0
35,,,2,purchase,2023-01-04 05:41:22,20,Gogle,A,2,visit,2023-01-01 11:28:19,5,238383.0


(2) 모든 구매 이벤트에 대한 마지막 방문 이벤트 (캠페인 경유 여부와 무관)

In [44]:
# Build the purchase table: keep the user, time and event id of every
# purchase and rename the latter two so they stay distinguishable from the
# visit columns after merging.
purc_columns = ['user_id', 'timestamp', 'event_id']
purc_renames = {'timestamp': 'purc_at', 'event_id': 'purc_id'}
attr_purc2 = attr_purc[purc_columns].rename(columns=purc_renames)
attr_purc2

Unnamed: 0,user_id,purc_at,purc_id
5,1,2023-01-01 13:54:43,6
8,1,2023-01-02 08:32:18,9
18,1,2023-01-03 16:00:08,19
0,2,2023-01-01 00:12:44,1
7,2,2023-01-02 04:18:51,8
19,2,2023-01-04 05:41:22,20


In [59]:
# Build the visit table from every visit event (with or without a campaign)
# and rename the time / event id columns to visit-specific names.
visit_columns = ['user_id', 'source', 'campaign', 'timestamp', 'event_id']
visit_renames = {'timestamp': 'visit_at', 'event_id': 'visit_id'}
is_visit = attr['event'] == 'visit'
attr_visit2 = attr.loc[is_visit, visit_columns].rename(columns=visit_renames)

attr_visit2

Unnamed: 0,user_id,source,campaign,visit_at,visit_id
1,1,,,2023-01-01 05:18:27,2
2,1,,,2023-01-01 07:12:20,3
6,1,Mata,B,2023-01-02 01:28:15,7
9,1,Mata,B,2023-01-02 14:02:58,10
10,1,Mata,B,2023-01-02 14:47:30,11
11,1,,,2023-01-02 15:18:52,12
12,1,,,2023-01-02 19:00:08,13
13,1,,,2023-01-02 22:58:19,14
15,1,Gogle,A,2023-01-03 03:47:20,16
16,1,,,2023-01-03 12:37:30,17


In [69]:
# Attach each user's visit events to that user's purchase events, using the
# purchases as the left (driving) side of the join.
attr_left = attr_purc2.merge(attr_visit2, how='left', on='user_id')
attr_left.head(10)

Unnamed: 0,user_id,purc_at,purc_id,source,campaign,visit_at,visit_id
0,1,2023-01-01 13:54:43,6,,,2023-01-01 05:18:27,2
1,1,2023-01-01 13:54:43,6,,,2023-01-01 07:12:20,3
2,1,2023-01-01 13:54:43,6,Mata,B,2023-01-02 01:28:15,7
3,1,2023-01-01 13:54:43,6,Mata,B,2023-01-02 14:02:58,10
4,1,2023-01-01 13:54:43,6,Mata,B,2023-01-02 14:47:30,11
5,1,2023-01-01 13:54:43,6,,,2023-01-02 15:18:52,12
6,1,2023-01-01 13:54:43,6,,,2023-01-02 19:00:08,13
7,1,2023-01-01 13:54:43,6,,,2023-01-02 22:58:19,14
8,1,2023-01-01 13:54:43,6,Gogle,A,2023-01-03 03:47:20,16
9,1,2023-01-01 13:54:43,6,,,2023-01-03 12:37:30,17


In [68]:
# Seconds elapsed from each visit to the purchase it is paired with; a
# negative value means the visit happened after the purchase.
visit_to_purchase = attr_left['purc_at'] - attr_left['visit_at']
attr_left['td'] = visit_to_purchase.dt.total_seconds()

# Keep only the visits that happened at or before the purchase.
# NOTE(review): a purchase with no visit at or before it has no row with
# td >= 0 and therefore drops out of the result here - confirm this is
# intended, given that this section is meant to cover all purchases.
attr_cross2 = attr_left.loc[attr_left['td'] >= 0]

# For each purchase, keep only the visit with the smallest gap.
idx2 = attr_cross2.groupby('purc_id')['td'].idxmin()
last_visits = attr_cross2.loc[idx2]
last_visits.sort_values(['user_id', 'purc_id'])

Unnamed: 0,user_id,purc_at,purc_id,source,campaign,visit_at,visit_id,td
1,1,2023-01-01 13:54:43,6,,,2023-01-01 07:12:20,3,24143.0
12,1,2023-01-02 08:32:18,9,Mata,B,2023-01-02 01:28:15,7,25443.0
29,1,2023-01-03 16:00:08,19,,,2023-01-03 12:37:30,17,12158.0
35,2,2023-01-02 04:18:51,8,Gogle,A,2023-01-01 11:28:19,5,60632.0
41,2,2023-01-04 05:41:22,20,,,2023-01-03 15:22:06,18,51556.0
