# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Data Load

In [2]:
# Directory holding the competition's public CSV files. Kept in one place so
# the notebook can be pointed at a different location by editing a single line.
DATA_DIR = '跨境电商效果广告ROI预测挑战赛公开数据'

# Labelled training rows and the unlabelled "A" test split.
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/testA.csv')

In [3]:
# Preview the first five raw training rows.
train_data.head()

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,datetime,account_id,post_id_emb,post_type,countries,gender,...,cum_bounces,cum_sessions,cum_session_duration,cum_add_cart_num,cum_add_payment_info,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi
0,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 00:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0
1,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 01:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0
2,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 02:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0
3,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 03:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0
4,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 04:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0


In [4]:
# Count how many ad_id values appear in both the training and the test file.
train_ad_ids = set(train_data['ad_id'].unique().tolist())
test_ad_ids = set(test_data['ad_id'].unique().tolist())
len(train_ad_ids.intersection(test_ad_ids))

0

# Feature Engineering

In [5]:
# --- Time feature -----------------------------------------------------------
# Parse the timestamp, keep only the hour of day, then discard the raw column.
train_data['datetime'] = pd.to_datetime(train_data['datetime'])
test_data['datetime'] = pd.to_datetime(test_data['datetime'])
train_data['datetime_hour'] = train_data['datetime'].dt.hour
test_data['datetime_hour'] = test_data['datetime'].dt.hour

train_data = train_data.drop(columns='datetime')
test_data = test_data.drop(columns='datetime')

# --- Label encoding ---------------------------------------------------------
# Each encoder is fitted on train + test values together so that both frames
# share a single integer code space and transform() never meets an unseen label.
for id_col in ['ad_id', 'ad_set_id', 'campaign_id', 'product_id', 'account_id', 'post_id_emb', 'post_type', 'countries']:
    encoder = LabelEncoder()
    encoder.fit(list(train_data[id_col]) + list(test_data[id_col]))
    train_data[id_col] = encoder.transform(list(train_data[id_col]))
    test_data[id_col] = encoder.transform(list(test_data[id_col]))

# --- Target (mean-ROI) encoding ---------------------------------------------
# For every key column, add '<key>_roi_mean': the mean training ROI of the
# row's key value. The per-key mean is computed once and mapped onto both
# frames; test rows whose key never occurs in training get NaN from .map().
# NOTE(review): the means are taken over ALL training rows, before the dev
# hold-out is split off later in the notebook, so dev rows contribute their own
# target to these features — dev scores are likely optimistic.
for key_col in ['product_id', 'account_id', 'countries', 'datetime_hour']:
    roi_mean_by_key = train_data.groupby([key_col])['roi'].mean()
    train_data[f'{key_col}_roi_mean'] = train_data[key_col].map(roi_mean_by_key)
    test_data[f'{key_col}_roi_mean'] = test_data[key_col].map(roi_mean_by_key)

In [6]:
# Display the training frame after feature engineering (pandas truncates the view).
train_data

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,account_id,post_id_emb,post_type,countries,gender,age_min,...,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi,datetime_hour,product_id_roi_mean,account_id_roi_mean,countries_roi_mean,datetime_hour_roi_mean
0,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,0,1.606082,2.249979,1.563869,1.805425
1,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,1,1.606082,2.249979,1.563869,1.798679
2,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,2,1.606082,2.249979,1.563869,1.802189
3,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,3,1.606082,2.249979,1.563869,1.802148
4,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,4,1.606082,2.249979,1.563869,1.801525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656414,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,8,1.549928,1.502077,1.433427,1.829020
656415,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,9,1.549928,1.502077,1.433427,1.817643
656416,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,10,1.549928,1.502077,1.433427,1.809741
656417,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,11,1.549928,1.502077,1.433427,1.803130


In [7]:
# Session / bounce columns removed from the training frame before modelling.
# NOTE(review): test_data is left untouched here — drop the same columns there
# before predicting on it.
nan_columns = [
    'cum_bounces',
    'cum_sessions',
    'cum_session_duration',
    'bounces',
    'sessions',
    'session_duration',
]
train_data = train_data.drop(columns=nan_columns)

In [8]:
# Split at the ad level (not the row level): 80% of the distinct ad_id values
# go to training and the rest to the dev set, with a fixed seed.
unique_ad_ids = train_data['ad_id'].unique()
train_data_ids, dev_data_ids = train_test_split(
    unique_ad_ids,
    train_size=0.8,
    random_state=0,
)

In [9]:
# Number of distinct ad_ids assigned to the train and dev partitions.
train_data_ids.shape, dev_data_ids.shape

((5732,), (1433,))

In [10]:
# Materialise the ad-level split as two independent frames.
# .copy() matters: later cells add prediction/metric columns to dev_data, and
# assigning into a filtered slice triggers SettingWithCopyWarning.
# NOTE: train_data is overwritten with its own subset, so this cell is not
# idempotent — re-running it without re-running the cells above leaves dev_data empty.
is_dev_row = train_data['ad_id'].isin(dev_data_ids)
is_train_row = train_data['ad_id'].isin(train_data_ids)
dev_data = train_data[is_dev_row].copy()
train_data = train_data[is_train_row].copy()

In [11]:
# Display the training partition left after the ad_id-based split.
train_data

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,account_id,post_id_emb,post_type,countries,gender,age_min,...,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi,datetime_hour,product_id_roi_mean,account_id_roi_mean,countries_roi_mean,datetime_hour_roi_mean
0,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,0,1.606082,2.249979,1.563869,1.805425
1,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,1,1.606082,2.249979,1.563869,1.798679
2,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,2,1.606082,2.249979,1.563869,1.802189
3,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,3,1.606082,2.249979,1.563869,1.802148
4,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,4,1.606082,2.249979,1.563869,1.801525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656414,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,8,1.549928,1.502077,1.433427,1.829020
656415,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,9,1.549928,1.502077,1.433427,1.817643
656416,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,10,1.549928,1.502077,1.433427,1.809741
656417,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,11,1.549928,1.502077,1.433427,1.803130


In [12]:
# Display the held-out dev partition.
dev_data

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,account_id,post_id_emb,post_type,countries,gender,age_min,...,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi,datetime_hour,product_id_roi_mean,account_id_roi_mean,countries_roi_mean,datetime_hour_roi_mean
166,2,0,0,209,1,2249,2,19,0,18,...,0,0,0.0,0.0,1.521178,0,1.606082,2.249979,1.563869,1.805425
167,2,0,0,209,1,2249,2,19,0,18,...,0,0,0.0,0.0,1.521178,1,1.606082,2.249979,1.563869,1.798679
168,2,0,0,209,1,2249,2,19,0,18,...,0,0,0.0,0.0,1.521178,2,1.606082,2.249979,1.563869,1.802189
169,2,0,0,209,1,2249,2,19,0,18,...,0,0,0.0,0.0,1.521178,3,1.606082,2.249979,1.563869,1.802148
170,2,0,0,209,1,2249,2,19,0,18,...,0,0,0.0,0.0,1.521178,4,1.606082,2.249979,1.563869,1.801525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656383,7361,3794,2419,135,31,1608,3,10,2,25,...,1,0,0.0,0.0,2.482316,8,1.549928,1.502077,1.433427,1.829020
656384,7361,3794,2419,135,31,1608,3,10,2,25,...,1,0,0.0,0.0,2.482316,9,1.549928,1.502077,1.433427,1.817643
656385,7361,3794,2419,135,31,1608,3,10,2,25,...,1,0,0.0,0.0,2.482316,10,1.549928,1.502077,1.433427,1.809741
656386,7361,3794,2419,135,31,1608,3,10,2,25,...,1,0,0.0,0.0,2.482316,11,1.549928,1.502077,1.433427,1.803130


In [13]:
# Model inputs: every column of the training frame except the target 'roi'.
feature_columns = train_data.columns.drop('roi')
feature_columns

Index(['ad_id', 'ad_set_id', 'campaign_id', 'product_id', 'account_id',
       'post_id_emb', 'post_type', 'countries', 'gender', 'age_min', 'age_max',
       'spend', 'impressions', 'reach', 'clicks', 'engagement_nums',
       'post_shares', 'post_reactions', 'post_comments', 'post_saves',
       'watch15s', 'watch30s', 'watch_p25', 'watch_p50', 'watch_p75',
       'watch_p95', 'watch_p100', 'watches', 'add_cart_num',
       'add_payment_info', 'initiates_checkout', 'purchase', 'income_1',
       'income_2', 'cum_spend', 'cum_impressions', 'cum_reach', 'cum_clicks',
       'cum_engagement_nums', 'cum_post_shares', 'cum_post_reactions',
       'cum_post_comments', 'cum_post_saves', 'cum_watch15s', 'cum_watch30s',
       'cum_watch_p25', 'cum_watch_p50', 'cum_watch_p75', 'cum_watch_p95',
       'cum_watch_p100', 'cum_watches', 'cum_add_cart_num',
       'cum_add_payment_info', 'cum_initiates_checkout', 'cum_purchase',
       'cum_income_1', 'cum_income_2', 'datetime_hour', 'product_id_r

# Model

In [14]:
# lightgbm
from lightgbm import LGBMRegressor

# Label-encoded ID-like columns that LightGBM is told to treat as categorical.
lgbm_categorical_columns = ['ad_id', 'ad_set_id', 'campaign_id', 'product_id', 'account_id', 'post_id_emb', 'post_type', 'countries']

model = LGBMRegressor()
model.fit(
    train_data[feature_columns],
    train_data['roi'],
    categorical_feature=lgbm_categorical_columns,
)

dev_data['lgbm_y_pred'] = model.predict(dev_data[feature_columns])

# Hour-weighted absolute error per row: weight (hour + 1) / 13 grows with the
# hour of day. Despite the column name "wmse" this is an absolute error, not a
# squared one. Computed column-wise instead of with a row-wise
# DataFrame.apply, which is far slower and yields the same values.
dev_data['lgbm_wmse'] = (
    (dev_data['datetime_hour'] + 1) * 1.0 / 13
    * (dev_data['roi'] - dev_data['lgbm_y_pred']).abs()
)
np.mean(dev_data['lgbm_wmse'])
# df = pd.read_csv('提交示例.csv')
# df['roi'] = model.predict(test_data.iloc[:].drop('uuid', axis=1))
# df.to_csv('submit.csv', index=None)



0.819817340110646

In [15]:
# xgboost
import xgboost as xgb

# Default-parameter XGBoost regressor trained on the same features as above.
model = xgb.XGBRegressor()
model.fit(
    train_data[feature_columns],
    train_data['roi'],
)

dev_data['xgb_y_pred'] = model.predict(dev_data[feature_columns])

# Hour-weighted absolute error per row (weight = (hour + 1) / 13); the column
# keeps the "wmse" name used elsewhere in the notebook although no squaring is
# involved. Vectorised instead of a row-wise DataFrame.apply.
dev_data['xgb_wmse'] = (
    (dev_data['datetime_hour'] + 1) * 1.0 / 13
    * (dev_data['roi'] - dev_data['xgb_y_pred']).abs()
)
np.mean(dev_data['xgb_wmse'])

0.8635157501524727

In [16]:
# Show every column in wide outputs (None = no limit). The former preceding
# call that set the limit to 100 was immediately overridden and has been dropped.
pd.set_option('display.max_columns', None)
train_data.describe()

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,account_id,post_id_emb,post_type,countries,gender,age_min,age_max,spend,impressions,reach,clicks,engagement_nums,post_shares,post_reactions,post_comments,post_saves,watch15s,watch30s,watch_p25,watch_p50,watch_p75,watch_p95,watch_p100,watches,add_cart_num,add_payment_info,initiates_checkout,purchase,income_1,income_2,cum_spend,cum_impressions,cum_reach,cum_clicks,cum_engagement_nums,cum_post_shares,cum_post_reactions,cum_post_comments,cum_post_saves,cum_watch15s,cum_watch30s,cum_watch_p25,cum_watch_p50,cum_watch_p75,cum_watch_p95,cum_watch_p100,cum_watches,cum_add_cart_num,cum_add_payment_info,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi,datetime_hour,product_id_roi_mean,account_id_roi_mean,countries_roi_mean,datetime_hour_roi_mean
count,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0,528254.0
mean,2820.94287,1296.693469,817.733463,195.795432,36.483078,1390.733073,2.054824,14.082381,0.338898,26.328626,64.541113,4.057076,200.240769,182.690369,3.376681,6.524053,0.0723,0.392595,0.032371,0.089406,17.621118,9.263317,29.887495,17.305052,11.901517,8.368092,6.934431,135.581518,0.689405,0.070154,0.10918,0.069557,6.782267,2.538947,1948.637683,100953.8,85745.27,1807.538385,3225.259561,45.894517,229.794871,18.829294,53.178282,9692.279896,5168.809802,16532.83,9669.374801,6687.534216,4777.36978,3940.769855,68457.14,400.700809,44.236782,60.83051,42.698264,3829.523312,1024.033919,1.806684,5.953992,1.803841,1.805191,1.807133,1.806513
std,2226.900214,1130.650814,689.503957,153.053698,22.654295,801.833501,0.551702,2.551291,0.745426,8.870661,1.856124,9.409501,498.523223,468.583469,8.746035,21.538293,0.369989,1.184482,0.212698,0.404597,54.450619,29.989919,92.573627,54.282527,38.437133,27.017978,21.874851,389.790788,2.299218,0.345608,0.474671,0.331321,38.823869,26.638078,4250.325912,258467.5,199171.1,4459.499896,7747.852013,163.454052,682.402531,58.83665,174.46373,30613.410007,17788.6668,55010.74,32842.826265,23604.353652,17622.230928,14743.984184,176586.1,1129.555593,106.207249,133.43094,102.515843,8449.330908,3601.449915,2.673096,3.757249,0.472008,0.409653,0.202965,0.007516
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,34.0,-327.87,-15633.0,-14885.0,-342.0,-1267.0,-3.0,-38.0,-3.0,-11.0,-1577.0,-1111.0,-3184.0,-1991.0,-1548.0,-1173.0,-1104.0,-14025.0,-60.0,-9.0,-31.0,-7.0,-670.88,-97.29,-86.09,-7832.0,-7205.0,-184.0,-346.0,-2.0,-14.0,-2.0,-8.0,-679.0,-435.0,-1109.0,-755.0,-628.0,-460.0,-439.0,-5339.0,-30.0,-5.0,-8.0,-4.0,-409.99,0.0,-8.537391,0.0,0.0,0.0,0.0,1.798679
25%,892.0,313.0,235.0,43.0,13.0,631.0,2.0,15.0,0.0,18.0,65.0,0.2,10.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,104.07,4626.0,4002.0,80.0,148.0,0.0,8.0,0.0,1.0,117.0,62.0,205.0,116.0,78.0,56.0,48.0,982.0,7.0,1.0,2.0,2.0,139.98,0.0,0.0,3.0,1.600364,1.651574,1.859006,1.802189
50%,2355.0,995.0,662.0,139.0,38.0,1500.0,2.0,15.0,0.0,25.0,65.0,1.15,53.0,47.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,5.0,3.0,2.0,1.0,1.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,465.88,22296.5,19690.0,395.0,711.0,5.0,46.0,3.0,7.0,1398.0,693.0,2387.0,1336.0,892.0,628.0,533.0,11891.0,56.0,9.0,14.0,9.0,897.93,0.0,1.121474,6.0,1.804321,1.913957,1.859006,1.804534
75%,4581.0,2124.0,1315.0,350.0,56.0,2097.0,2.0,15.0,0.0,25.0,65.0,4.01,188.0,167.0,3.0,6.0,0.0,0.0,0.0,0.0,14.0,7.0,23.0,13.0,9.0,6.0,5.0,110.0,0.0,0.0,0.0,0.0,0.0,0.0,1754.9175,83995.0,75026.0,1502.0,2742.0,26.0,171.0,13.0,31.0,6675.75,3341.75,11187.0,6342.0,4239.0,2969.0,2506.0,54411.0,283.0,38.0,53.0,37.0,3566.6475,259.98,2.602371,9.0,2.214228,2.061597,1.859006,1.806698
max,7364.0,3799.0,2423.0,475.0,76.0,2668.0,6.0,20.0,2.0,55.0,65.0,400.36,56828.0,53350.0,636.0,2600.0,21.0,47.0,14.0,15.0,4268.0,2796.0,5763.0,3925.0,3277.0,2520.0,1682.0,22916.0,120.0,25.0,25.0,25.0,2798.0,2839.77,59060.8,3984341.0,2694399.0,60761.0,149439.0,2511.0,10731.0,983.0,3159.0,510359.0,319924.0,1008735.0,605718.0,443477.0,343171.0,296081.0,2547648.0,19401.0,1253.0,1668.0,1419.0,131966.09,58699.29,67.430117,12.0,5.315806,2.776974,2.512746,1.82902


In [17]:
# Null check (disabled): list the training rows that contain any missing value.
# train_data[train_data.isnull().any(axis=1)]

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,account_id,post_id_emb,post_type,countries,gender,age_min,age_max,spend,impressions,reach,clicks,engagement_nums,post_shares,post_reactions,post_comments,post_saves,watch15s,watch30s,watch_p25,watch_p50,watch_p75,watch_p95,watch_p100,watches,add_cart_num,add_payment_info,initiates_checkout,purchase,income_1,income_2,cum_spend,cum_impressions,cum_reach,cum_clicks,cum_engagement_nums,cum_post_shares,cum_post_reactions,cum_post_comments,cum_post_saves,cum_watch15s,cum_watch30s,cum_watch_p25,cum_watch_p50,cum_watch_p75,cum_watch_p95,cum_watch_p100,cum_watches,cum_add_cart_num,cum_add_payment_info,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi,datetime_hour,product_id_roi_mean,account_id_roi_mean,countries_roi_mean,datetime_hour_roi_mean


In [None]:
# random forest
from sklearn.ensemble import RandomForestRegressor

# random_state pins the bootstrap / feature sampling so the dev score is
# reproducible across runs (0 matches the seed used for the ad_id split);
# n_jobs=-1 builds trees on all cores and does not change the fitted model.
model = RandomForestRegressor(random_state=0, n_jobs=-1)
model.fit(
    train_data[feature_columns],
    train_data['roi'],
)

dev_data['rf_y_pred'] = model.predict(dev_data[feature_columns])

# Hour-weighted absolute error per row (weight = (hour + 1) / 13); named
# "wmse" for consistency with the other model cells although it is not squared.
# Vectorised instead of a row-wise DataFrame.apply.
dev_data['rf_wmse'] = (
    (dev_data['datetime_hour'] + 1) * 1.0 / 13
    * (dev_data['roi'] - dev_data['rf_y_pred']).abs()
)
np.mean(dev_data['rf_wmse'])

In [None]:
# Check whether PyTorch can see a CUDA device in this environment.
import torch
torch.cuda.is_available()

In [None]:
# Time Series

In [None]:
# Linear Regression