# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Data Load

In [2]:
train_data = pd.read_csv('跨境电商效果广告ROI预测挑战赛公开数据/train.csv')
test_data = pd.read_csv('跨境电商效果广告ROI预测挑战赛公开数据/testA.csv')

In [3]:
train_data.head()

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,datetime,account_id,post_id_emb,post_type,countries,gender,...,cum_bounces,cum_sessions,cum_session_duration,cum_add_cart_num,cum_add_payment_info,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi
0,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 00:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0
1,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 01:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0
2,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 02:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0
3,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 03:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0
4,23848303658498562,23848131439913714,23848131439304807,6fc8e657540e589d6f70a91b783e3f38,2022-10-01 04:00:00,196573582962558,d5f21fd53aca5bc7c6131e1bc8f2fcbd,2.0,"US,GB",0,...,,,,0,2,6,2,51.96,0.0,0.0


In [4]:
len(set(train_data['ad_id'].unique().tolist())&set(test_data['ad_id'].unique().tolist()))

0

# Feature Engineering

In [5]:
train_data['datetime'] = pd.to_datetime(train_data['datetime'])
test_data['datetime'] = pd.to_datetime(test_data['datetime'])
train_data['datetime_hour'] = train_data['datetime'].dt.hour
test_data['datetime_hour'] = test_data['datetime'].dt.hour

train_data.drop('datetime', axis=1, inplace=True)
test_data.drop('datetime', axis=1, inplace=True)

for col in ['ad_id', 'ad_set_id', 'campaign_id', 'product_id', 'account_id', 'post_id_emb', 'post_type', 'countries']:
    lbl = LabelEncoder()
    lbl.fit(list(train_data[col]) + list(test_data[col]))
    train_data[col] = lbl.transform(list(train_data[col]))
    test_data[col] = lbl.transform(list(test_data[col]))

train_data['product_id_roi_mean'] = train_data['product_id'].map(train_data.groupby(['product_id'])['roi'].mean())
test_data['product_id_roi_mean'] = test_data['product_id'].map(train_data.groupby(['product_id'])['roi'].mean())

train_data['account_id_roi_mean'] = train_data['account_id'].map(train_data.groupby(['account_id'])['roi'].mean())
test_data['account_id_roi_mean'] = test_data['account_id'].map(train_data.groupby(['account_id'])['roi'].mean())

train_data['countries_roi_mean'] = train_data['countries'].map(train_data.groupby(['countries'])['roi'].mean())
test_data['countries_roi_mean'] = test_data['countries'].map(train_data.groupby(['countries'])['roi'].mean())

train_data['datetime_hour_roi_mean'] = train_data['datetime_hour'].map(train_data.groupby(['datetime_hour'])['roi'].mean())
test_data['datetime_hour_roi_mean'] = test_data['datetime_hour'].map(train_data.groupby(['datetime_hour'])['roi'].mean())

In [6]:
train_data

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,account_id,post_id_emb,post_type,countries,gender,age_min,...,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi,datetime_hour,product_id_roi_mean,account_id_roi_mean,countries_roi_mean,datetime_hour_roi_mean
0,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,0,1.606082,2.249979,1.563869,1.805425
1,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,1,1.606082,2.249979,1.563869,1.798679
2,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,2,1.606082,2.249979,1.563869,1.802189
3,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,3,1.606082,2.249979,1.563869,1.802148
4,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,4,1.606082,2.249979,1.563869,1.801525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656414,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,8,1.549928,1.502077,1.433427,1.829020
656415,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,9,1.549928,1.502077,1.433427,1.817643
656416,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,10,1.549928,1.502077,1.433427,1.809741
656417,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,11,1.549928,1.502077,1.433427,1.803130


In [7]:
train_data_ids, dev_data_ids = train_test_split(train_data['ad_id'].unique(), train_size=0.8)

In [8]:
train_data_ids.shape, dev_data_ids.shape

((5732,), (1433,))

In [9]:
dev_data = train_data[train_data['ad_id'].isin(dev_data_ids)]
train_data = train_data[train_data['ad_id'].isin(train_data_ids)]

In [10]:
train_data

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,account_id,post_id_emb,post_type,countries,gender,age_min,...,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi,datetime_hour,product_id_roi_mean,account_id_roi_mean,countries_roi_mean,datetime_hour_roi_mean
0,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,0,1.606082,2.249979,1.563869,1.805425
1,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,1,1.606082,2.249979,1.563869,1.798679
2,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,2,1.606082,2.249979,1.563869,1.802189
3,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,3,1.606082,2.249979,1.563869,1.802148
4,1,0,0,209,1,2249,2,19,0,18,...,6,2,51.96,0.0,0.0,4,1.606082,2.249979,1.563869,1.801525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656414,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,8,1.549928,1.502077,1.433427,1.829020
656415,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,9,1.549928,1.502077,1.433427,1.817643
656416,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,10,1.549928,1.502077,1.433427,1.809741
656417,7364,3792,2416,135,31,1812,3,10,2,25,...,0,0,0.00,0.0,0.0,11,1.549928,1.502077,1.433427,1.803130


In [11]:
dev_data

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,account_id,post_id_emb,post_type,countries,gender,age_min,...,cum_initiates_checkout,cum_purchase,cum_income_1,cum_income_2,roi,datetime_hour,product_id_roi_mean,account_id_roi_mean,countries_roi_mean,datetime_hour_roi_mean
1587,5,1,2,144,44,1031,2,17,1,18,...,0,0,0.00,0.0,0.908068,0,1.202013,2.051953,0.627255,1.805425
1588,5,1,2,144,44,1031,2,17,1,18,...,0,0,0.00,0.0,0.908068,1,1.202013,2.051953,0.627255,1.798679
1589,5,1,2,144,44,1031,2,17,1,18,...,0,0,0.00,0.0,0.908068,2,1.202013,2.051953,0.627255,1.802189
1590,5,1,2,144,44,1031,2,17,1,18,...,0,0,0.00,0.0,0.908068,3,1.202013,2.051953,0.627255,1.802148
1591,5,1,2,144,44,1031,2,17,1,18,...,0,0,0.00,0.0,0.908068,4,1.202013,2.051953,0.627255,1.801525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656394,7362,3796,2420,135,31,147,3,10,2,25,...,2,1,84.22,0.0,3.438955,8,1.549928,1.502077,1.433427,1.829020
656395,7362,3796,2420,135,31,147,3,10,2,25,...,2,1,84.22,0.0,3.438955,9,1.549928,1.502077,1.433427,1.817643
656396,7362,3796,2420,135,31,147,3,10,2,25,...,2,1,84.22,0.0,3.438955,10,1.549928,1.502077,1.433427,1.809741
656397,7362,3796,2420,135,31,147,3,10,2,25,...,2,1,84.22,0.0,3.438955,11,1.549928,1.502077,1.433427,1.803130


# Model

In [12]:
# lightgbm
from lightgbm import LGBMRegressor
model = LGBMRegressor()

model.fit(
    train_data.iloc[:].drop('roi', axis=1),
    train_data.iloc[:]['roi'], categorical_feature=['ad_id', 'ad_set_id', 'campaign_id', 'product_id', 'account_id', 'post_id_emb', 'post_type', 'countries']
)

dev_data['y_pred'] = model.predict(dev_data.iloc[:].drop('roi', axis=1))
dev_data['wmse'] = dev_data.apply(lambda x: (x['datetime_hour']+1) * 1.0 / 13 * abs(x['roi'] - x['y_pred']), axis=1)
# df = pd.read_csv('提交示例.csv')
# df['roi'] = model.predict(test_data.iloc[:].drop('uuid', axis=1))
# df.to_csv('submit.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_data['y_pred'] = model.predict(dev_data.iloc[:].drop('roi', axis=1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_data['wmse'] = dev_data.apply(lambda x: (x['datetime_hour']+1) * 1.0 / 13 * abs(x['roi'] - x['y_pred']), axis=1)


In [13]:
dev_data

Unnamed: 0,ad_id,ad_set_id,campaign_id,product_id,account_id,post_id_emb,post_type,countries,gender,age_min,...,cum_income_1,cum_income_2,roi,datetime_hour,product_id_roi_mean,account_id_roi_mean,countries_roi_mean,datetime_hour_roi_mean,y_pred,wmse
1587,5,1,2,144,44,1031,2,17,1,18,...,0.00,0.0,0.908068,0,1.202013,2.051953,0.627255,1.805425,0.721599,0.014344
1588,5,1,2,144,44,1031,2,17,1,18,...,0.00,0.0,0.908068,1,1.202013,2.051953,0.627255,1.798679,0.721599,0.028688
1589,5,1,2,144,44,1031,2,17,1,18,...,0.00,0.0,0.908068,2,1.202013,2.051953,0.627255,1.802189,0.721599,0.043031
1590,5,1,2,144,44,1031,2,17,1,18,...,0.00,0.0,0.908068,3,1.202013,2.051953,0.627255,1.802148,0.721599,0.057375
1591,5,1,2,144,44,1031,2,17,1,18,...,0.00,0.0,0.908068,4,1.202013,2.051953,0.627255,1.801525,0.721599,0.071719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656394,7362,3796,2420,135,31,147,3,10,2,25,...,84.22,0.0,3.438955,8,1.549928,1.502077,1.433427,1.829020,1.387750,1.420065
656395,7362,3796,2420,135,31,147,3,10,2,25,...,84.22,0.0,3.438955,9,1.549928,1.502077,1.433427,1.817643,1.387750,1.577850
656396,7362,3796,2420,135,31,147,3,10,2,25,...,84.22,0.0,3.438955,10,1.549928,1.502077,1.433427,1.809741,1.376058,1.745528
656397,7362,3796,2420,135,31,147,3,10,2,25,...,84.22,0.0,3.438955,11,1.549928,1.502077,1.433427,1.803130,1.387750,1.893420


In [14]:
np.mean(dev_data['wmse'])

0.839711735374547

In [17]:
# Time Series

In [18]:
# Linear Regression