## 00. 환경설정

In [1]:
import pandas as pd
import tensorflow as tf
from datetime import datetime, date
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import xlearn as xl
K = tf.keras.backend

import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

## 01. Field-aware Factorization Machine
- xlearn으로 ffm 모델을 학습합니다.
- 주어진 avazu-ctr-prediction 데이터만 사용합니다.
- 모델의 성능은 분류 성능인 logloss로 평가합니다.

### 데이터 로드 및 전처리

In [2]:
def parse_date(x):
    """Convert a ``YYMMDDHH`` timestamp string into a ``YYYY-MM-DD`` date string.

    Kept as a named function so any code that still calls ``parse_date`` keeps
    working; the load below no longer goes through it.
    """
    return datetime.strptime(x, "%y%m%d%H").strftime("%Y-%m-%d")


# Read the timestamp column as text and convert it with one vectorized call.
# `read_csv(date_parser=...)` is deprecated since pandas 2.0 and ran the
# Python-level parser once per row.
click_df = pd.read_csv("../data/avazu-ctr-prediction/ad_click.csv", dtype={'datetime': str})
# Drop the hour component (normalize -> midnight) so the column carries
# day-level dates only, the same granularity `parse_date` produces.
click_df['datetime'] = pd.to_datetime(click_df['datetime'], format="%y%m%d%H").dt.normalize()
click_df

Unnamed: 0,id,click,datetime,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10019071520499579916,0,2014-10-21,1005,0,da79c793,71ed77a0,f028772b,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157
1,10025633842336165171,0,2014-10-21,1010,1,85f751fd,c4e18dd6,50e219e0,8c0dcd5a,7801e8d9,...,4,0,21665,320,50,2493,3,35,-1,117
2,10092735447533755726,0,2014-10-21,1002,0,61a8c644,948ff336,50e219e0,ecad2386,7801e8d9,...,0,0,19665,320,50,2253,2,303,-1,52
3,10141326312159899433,1,2014-10-21,1005,1,d9750ee7,98572c79,f028772b,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,-1,79
4,10141793556467368079,0,2014-10-21,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,...,1,0,20362,320,50,2333,0,39,-1,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,9705070284119894086,0,2014-10-30,1005,1,57fe1b20,5b626596,f028772b,ecad2386,7801e8d9,...,1,0,23722,320,50,2716,3,47,-1,23
199996,9907898844680985083,0,2014-10-30,1005,0,85f751fd,c4e18dd6,50e219e0,53de0284,d9b5648e,...,1,0,21706,320,50,2498,3,41,100111,61
199997,9925599241747576355,0,2014-10-30,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,22676,320,50,2616,0,35,-1,51
199998,9995064718229733761,0,2014-10-30,1002,0,c135a32f,b8393312,50e219e0,ecad2386,7801e8d9,...,0,0,17894,320,50,2039,2,39,100077,32


### FFM 학습 데이터 생성

In [3]:
# Keep only the label, the timestamp and the candidate feature columns.
# `.copy()` makes the result an independent frame, so the column assignments
# performed in later cells write to this frame directly instead of triggering
# pandas' SettingWithCopyWarning on a derived selection.
click_df = click_df[[
    'click', 'datetime', 'banner_pos', 'site_id', 'site_domain',
    'site_category', 'app_id', 'app_domain', 'app_category',
    'device_model', 'device_type', 'device_conn_type',
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'
]].copy()

In [4]:
# Columns used as FFM fields (one field per column).
feature_col = [
    'banner_pos',
    'site_id',
    'site_domain',
    'site_category',
    'app_id',
    'app_domain',
    'app_category',
    'device_model',
    'device_type',
    'device_conn_type',
    'C1',
    'C14',
    'C15',
    'C16',
    'C17',
    'C18',
    'C19',
    'C20',
    'C21',
]

# Replace every raw value with its integer category code, column by column.
for feature in feature_col:
    click_df[feature] = click_df[feature].astype("category").cat.codes

click_df

Unnamed: 0,click,datetime,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_model,...,device_conn_type,C1,C14,C15,C16,C17,C18,C19,C20,C21
0,0,2014-10-21,0,1546,767,18,1648,43,0,3696,...,0,2,375,3,2,102,0,2,0,42
1,0,2014-10-21,1,949,1316,5,965,43,3,441,...,0,5,719,3,2,196,3,2,0,39
2,0,2014-10-21,0,706,1001,5,1648,43,0,1676,...,0,1,410,3,2,116,2,21,0,15
3,1,2014-10-21,1,1541,1029,18,1648,43,0,2011,...,0,2,202,3,2,51,0,2,0,22
4,0,2014-10-21,0,617,1338,3,1648,43,0,3004,...,0,2,515,3,2,144,0,4,0,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,2014-10-30,1,647,604,18,1648,43,0,1272,...,0,2,1771,3,2,378,3,8,0,6
199996,0,2014-10-30,0,949,1316,5,564,91,3,2240,...,0,2,759,3,2,201,3,5,81,16
199997,0,2014-10-30,0,217,1620,1,1648,43,0,235,...,0,2,1272,3,2,294,0,2,0,14
199998,0,2014-10-30,0,1361,1251,5,1648,43,0,121,...,0,1,291,3,2,91,2,4,56,7


#### Feature dim size

In [5]:
# Number of distinct codes per feature column, keyed by column name
# (insertion order follows feature_col).
feature_dim = {
    feature: click_df[feature].nunique()
    for feature in feature_col
}
feature_dim

{'banner_pos': 7,
 'site_id': 1804,
 'site_domain': 1711,
 'site_category': 20,
 'app_id': 1776,
 'app_domain': 112,
 'app_category': 22,
 'device_model': 3751,
 'device_type': 4,
 'device_conn_type': 4,
 'C1': 7,
 'C14': 1934,
 'C15': 8,
 'C16': 9,
 'C17': 405,
 'C18': 4,
 'C19': 65,
 'C20': 159,
 'C21': 60}

In [6]:
# Total size of the FFM index space = sum of the per-field cardinalities.
print('number of variables:', sum(feature_dim.values()))

number of variables: 11862


#### 전체 데이터를 ffm의 variable number로 변환

In [7]:
# Shift each field's category codes by the running total of the preceding
# fields' cardinalities, so every (field, category) pair gets its own index
# in one global FFM index space.
# NOTE: the offsets are added in place, so this cell must run exactly once
# after the encoding cell; re-running it alone shifts the indices again.
idx = 0
for feature in feature_col:
    # Widen before adding: the codes were produced by `.cat.codes`, which uses
    # a narrow integer dtype (int8/int16) for low-cardinality columns. Adding a
    # large Python int to such a column relies on value-based upcasting and can
    # overflow or raise under NumPy 2 promotion rules (NEP 50).
    click_df[feature] = click_df[feature].astype("int64") + idx
    idx += feature_dim[feature]

click_df

Unnamed: 0,click,datetime,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_model,...,device_conn_type,C1,C14,C15,C16,C17,C18,C19,C20,C21
0,0,2014-10-21,0,1553,2578,3540,5190,5361,5430,9148,...,9207,9213,9593,11155,11162,11271,11574,11580,11643,11844
1,0,2014-10-21,1,956,3127,3527,4507,5361,5433,5893,...,9207,9216,9937,11155,11162,11365,11577,11580,11643,11841
2,0,2014-10-21,0,713,2812,3527,5190,5361,5430,7128,...,9207,9212,9628,11155,11162,11285,11576,11599,11643,11817
3,1,2014-10-21,1,1548,2840,3540,5190,5361,5430,7463,...,9207,9213,9420,11155,11162,11220,11574,11580,11643,11824
4,0,2014-10-21,0,624,3149,3525,5190,5361,5430,8456,...,9207,9213,9733,11155,11162,11313,11574,11582,11643,11844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,2014-10-30,1,654,2415,3540,5190,5361,5430,6724,...,9207,9213,10989,11155,11162,11547,11577,11586,11643,11808
199996,0,2014-10-30,0,956,3127,3527,4106,5409,5433,7692,...,9207,9213,9977,11155,11162,11370,11577,11583,11724,11818
199997,0,2014-10-30,0,224,3431,3523,5190,5361,5430,5687,...,9207,9213,10490,11155,11162,11463,11574,11580,11643,11816
199998,0,2014-10-30,0,1368,3062,3527,5190,5361,5430,5573,...,9207,9212,9509,11155,11162,11260,11576,11582,11699,11809


### Train / test 데이터 생성
- 날짜를 기준으로 train과 test 데이터를 나눕니다.

In [8]:
# Date-based split: rows up to and including the cutoff day form the training
# set, rows from later days are held out as the test set.
split_date = '2014-10-28'
is_train = click_df['datetime'] <= split_date
is_test = click_df['datetime'] > split_date

train_df = click_df[is_train]
test_df = click_df[is_test]

In [9]:
# Row counts of the two splits: train on the first line, test on the second.
print(len(train_df), len(test_df), sep='\n')

160052
39948


In [10]:
# xlearn reads FFM data as text, one sample per line:
#   <label> <field_1>:<index_1>:<value_1> <field_2>:<index_2>:<value_2> ...
# The label and feature columns are extracted as arrays once, instead of
# building a pandas Series per row with iterrows() and re-selecting the
# feature columns inside the loop.
train_labels = train_df['click'].values
train_features = train_df[feature_col].values

with open('./ffm_train.txt', 'w') as f:
    for label, row in zip(train_labels, train_features):
        # Field id = position of the column in feature_col; the features are
        # categorical indices, so the value is the constant 1.0.
        feature = [str(label)] + [str(field) + ':' + str(elem) + ':1.0' for field, elem in enumerate(row)]
        f.write(' '.join(feature) + '\n')

In [11]:
# Write the test split in the same "<label> <field>:<index>:<value> ..." text
# layout as the training file. Label and feature columns are extracted as
# arrays once, instead of building a pandas Series per row with iterrows().
test_labels = test_df['click'].values
test_features = test_df[feature_col].values

with open('./ffm_test.txt', 'w') as f:
    for label, row in zip(test_labels, test_features):
        # Field id = position of the column in feature_col; value is always 1.0.
        feature = [str(label)] + [str(field) + ':' + str(elem) + ':1.0' for field, elem in enumerate(row)]
        f.write(' '.join(feature) + '\n')

## 02. xlearn FFM
- 참고: xlearn 모델 관련 [하이퍼파라미터](https://xlearn-doc.readthedocs.io/en/latest/all_api/index.html?highlight=create_fm#xlearn-python-api)

In [12]:
# xlearn is already imported as `xl` in the setup cell at the top of the
# notebook, so it is not re-imported here.
# NOTE(review): hello() presumably prints the xlearn banner/version and so
# serves as an installation sanity check — confirm against the xlearn docs.
xl.hello()

### FFM 모델 선언

In [13]:
# Create the FFM learner and register its input files.
ffm_model = xl.create_ffm()
ffm_model.setTrain("./ffm_train.txt")
# NOTE(review): the held-out test file is also used as the validation set here.
ffm_model.setValidate("./ffm_test.txt")

# Additionally request a text-format model file next to the main model output.
ffm_model.setTXTModel("./ffm_model.txt")

# Hyperparameters for this first run
param = {
    'task': 'binary',
    'lr': 0.2,
    'lambda': 0.001,
    'k': 4,
    'epoch': 10,
}

# Train; the fitted model is stored at the given path.
ffm_model.fit(param, "./ffm_model.out")

### test 데이터를 사용하여 학습된 FFM 모델로 CTR 예측 => ffm_output.txt

In [14]:
# Score the test file with the model saved by the previous cell and write the
# predictions to ./ffm_output.txt.
# NOTE(review): setSigmoid() presumably maps raw scores into (0, 1) so the
# output reads as click probabilities — confirm against the xlearn docs.
ffm_model.setSigmoid()
ffm_model.setTest("./ffm_test.txt")
ffm_model.predict("./ffm_model.out", "./ffm_output.txt")

### 하이퍼 파라미터 튜닝

In [15]:
# Ground-truth labels of the test split, in the same row order in which
# ffm_test.txt was written.
test_click = test_df['click'].values

result = []

# Exhaustive grid over the 'k', 'lambda' and 'lr' settings.
# NOTE: every run writes to the same ./ffm_model.out, so after the loop that
# file holds the model of the LAST configuration, not the best one.
for k in [1,2,4,8]:
    for _lambda in [0.0005, 0.001, 0.002]:
        for lr in [0.1, 0.2, 0.3]:

            # train
            ffm_model = xl.create_ffm()
            ffm_model.setTrain("./ffm_train.txt")
            param = {'task':'binary', 'lr':lr, 'lambda':_lambda, 'k':k, 'epoch':30}
            ffm_model.fit(param, "./ffm_model.out")

            # test
            ffm_model.setTest("./ffm_test.txt")
            ffm_model.setSigmoid()
            pCTR = ffm_model.predict("./ffm_model.out")

            # Score once and reuse the value for both the record and the log line.
            logloss = log_loss(test_click, pCTR)
            result.append([k, _lambda, lr, logloss])
            print(k, _lambda, lr, logloss)

result_df = pd.DataFrame(result, columns = ['k', 'lambda', 'lr', 'logloss'])

1 0.0005 0.1 0.40261333778410474
1 0.0005 0.2 0.40232840013918575
1 0.0005 0.3 0.4037950609058351
1 0.001 0.1 0.40306513725853016
1 0.001 0.2 0.40260010922635936
1 0.001 0.3 0.40286295412915785
1 0.002 0.1 0.403394072077182
1 0.002 0.2 0.4028646173663193
1 0.002 0.3 0.40313756740004847
2 0.0005 0.1 0.40244530963170944
2 0.0005 0.2 0.40234301343794493
2 0.0005 0.3 0.40316539436077725
2 0.001 0.1 0.40292192734752047
2 0.001 0.2 0.402516469379529
2 0.001 0.3 0.40300637248110766
2 0.002 0.1 0.40347474866401306
2 0.002 0.2 0.4029353165266922
2 0.002 0.3 0.4032012219982563
4 0.0005 0.1 0.40221820364445715
4 0.0005 0.2 0.40220689870617216
4 0.0005 0.3 0.40336085913404995
4 0.001 0.1 0.40276647961719314
4 0.001 0.2 0.4026923652177786
4 0.001 0.3 0.403300737997598
4 0.002 0.1 0.40344214443182846
4 0.002 0.2 0.4029261692371683
4 0.002 0.3 0.40320229227810833
8 0.0005 0.1 0.40220095853620036
8 0.0005 0.2 0.4024861022002606
8 0.0005 0.3 0.40372346899171385
8 0.001 0.1 0.40295736240956487
8 0.001 0

In [16]:
# Rank the grid-search runs from lowest (best) to highest logloss.
result_df.sort_values('logloss', ascending=True)

Unnamed: 0,k,lambda,lr,logloss
27,8,0.0005,0.1,0.402201
19,4,0.0005,0.2,0.402207
18,4,0.0005,0.1,0.402218
1,1,0.0005,0.2,0.402328
10,2,0.0005,0.2,0.402343
9,2,0.0005,0.1,0.402445
28,8,0.0005,0.2,0.402486
13,2,0.001,0.2,0.402516
31,8,0.001,0.2,0.402568
4,1,0.001,0.2,0.4026


## 03. Feature Ablation
- 일반적인 regression, classification에 대해서도 어떤 피쳐가 가장 중요한 역할을 하는지 분석합니다
- Tree 모델의 경우 Feature Importance로 표현하기도 합니다
- 현업에서 FM, FFM 계열의 모델을 학습할 때 피쳐의 개수를 최대한 줄이고 서빙속도를 빠르게 하는 것이 목표이므로 다양한 피쳐의 조합에 대해서 실험을 하고 가장 중요한 피쳐들만 선택하여 최종 모델을 만듭니다

#### 우리가 사용한 피쳐 종류 => C14와 app_id의 피쳐를 제외하고 성능을 비교해봅시다

In [17]:
# Show the per-feature cardinalities again as a reference for the ablation below.
feature_dim

{'banner_pos': 7,
 'site_id': 1804,
 'site_domain': 1711,
 'site_category': 20,
 'app_id': 1776,
 'app_domain': 112,
 'app_category': 22,
 'device_model': 3751,
 'device_type': 4,
 'device_conn_type': 4,
 'C1': 7,
 'C14': 1934,
 'C15': 8,
 'C16': 9,
 'C17': 405,
 'C18': 4,
 'C19': 65,
 'C20': 159,
 'C21': 60}

### C14 피쳐를 제외했을 때 성능

In [18]:
# Ablation feature set: every column of feature_col except C14, in the same
# order. Derived from feature_col rather than hand-copied, so the two lists
# cannot drift apart.
no_c14 = [col for col in feature_col if col != 'C14']
assert len(no_c14) == len(feature_col) - 1, "expected exactly one column (C14) to be removed"

In [19]:
# Write the training split without C14 in "<label> <field>:<index>:<value> ..."
# text layout. Label and feature columns are extracted as arrays once, instead
# of building a pandas Series per row with iterrows().
no_c14_train_labels = train_df['click'].values
no_c14_train_features = train_df[no_c14].values

with open('./no_c14_train.txt', 'w') as f:
    for label, row in zip(no_c14_train_labels, no_c14_train_features):
        # Field id = position of the column in no_c14; value is always 1.0.
        feature = [str(label)] + [str(field) + ':' + str(elem) + ':1.0' for field, elem in enumerate(row)]
        f.write(' '.join(feature) + '\n')

In [20]:
# Write the test split without C14 in "<label> <field>:<index>:<value> ..."
# text layout. Label and feature columns are extracted as arrays once, instead
# of building a pandas Series per row with iterrows().
no_c14_test_labels = test_df['click'].values
no_c14_test_features = test_df[no_c14].values

with open('./no_c14_test.txt', 'w') as f:
    for label, row in zip(no_c14_test_labels, no_c14_test_features):
        # Field id = position of the column in no_c14; value is always 1.0.
        feature = [str(label)] + [str(field) + ':' + str(elem) + ':1.0' for field, elem in enumerate(row)]
        f.write(' '.join(feature) + '\n')

#### train

In [21]:
# Fit an FFM on the C14-ablated training file, using the configuration that
# ranked first in the grid search above (k=8, lambda=0.0005, lr=0.1).
param = {
    'task': 'binary',
    'lr': 0.1,
    'lambda': 0.0005,
    'k': 8,
    'epoch': 30,
}

ffm_model = xl.create_ffm()
ffm_model.setTrain("./no_c14_train.txt")
ffm_model.fit(param, "./no_c14_model.out")

#### test

In [22]:
# Predict on the C14-ablated test file and report logloss against the true
# test labels (test_click).
ffm_model.setSigmoid()
ffm_model.setTest("./no_c14_test.txt")
pCTR = ffm_model.predict("./no_c14_model.out")

no_c14_logloss = log_loss(test_click, pCTR)
print("log_loss: ", no_c14_logloss)

log_loss:  0.401695173245171


### app_id 피쳐를 제외했을 때 성능

In [23]:
# Ablation feature set: every column of feature_col except app_id, in the same
# order. Derived from feature_col rather than hand-copied, so the two lists
# cannot drift apart.
no_app_id = [col for col in feature_col if col != 'app_id']
assert len(no_app_id) == len(feature_col) - 1, "expected exactly one column (app_id) to be removed"

In [24]:
# Write the training split without app_id in "<label> <field>:<index>:<value> ..."
# text layout. Label and feature columns are extracted as arrays once, instead
# of building a pandas Series per row with iterrows().
no_app_id_train_labels = train_df['click'].values
no_app_id_train_features = train_df[no_app_id].values

with open('./no_app_id_train.txt', 'w') as f:
    for label, row in zip(no_app_id_train_labels, no_app_id_train_features):
        # Field id = position of the column in no_app_id; value is always 1.0.
        feature = [str(label)] + [str(field) + ':' + str(elem) + ':1.0' for field, elem in enumerate(row)]
        f.write(' '.join(feature) + '\n')

In [25]:
# Write the test split without app_id in "<label> <field>:<index>:<value> ..."
# text layout. Label and feature columns are extracted as arrays once, instead
# of building a pandas Series per row with iterrows().
no_app_id_test_labels = test_df['click'].values
no_app_id_test_features = test_df[no_app_id].values

with open('./no_app_id_test.txt', 'w') as f:
    for label, row in zip(no_app_id_test_labels, no_app_id_test_features):
        # Field id = position of the column in no_app_id; value is always 1.0.
        feature = [str(label)] + [str(field) + ':' + str(elem) + ':1.0' for field, elem in enumerate(row)]
        f.write(' '.join(feature) + '\n')

#### train 

In [26]:
# Fit an FFM on the app_id-ablated training file, using the configuration that
# ranked first in the grid search above (k=8, lambda=0.0005, lr=0.1).
param = {
    'task': 'binary',
    'lr': 0.1,
    'lambda': 0.0005,
    'k': 8,
    'epoch': 30,
}

ffm_model = xl.create_ffm()
ffm_model.setTrain("./no_app_id_train.txt")
ffm_model.fit(param, "./no_app_id_model.out")

#### test

In [27]:
# Predict on the app_id-ablated test file and report logloss against the true
# test labels (test_click).
ffm_model.setSigmoid()
ffm_model.setTest("./no_app_id_test.txt")
pCTR = ffm_model.predict("./no_app_id_model.out")

no_app_id_logloss = log_loss(test_click, pCTR)
print("log_loss: ", no_app_id_logloss)

log_loss:  0.40557123109208437
