In [1]:
import sys
import random
import pandas as pd
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
pd.set_option('display.min_rows',None)
from tqdm import tqdm
import datetime, time
from pandarallel import pandarallel
import random
from tqdm import tqdm_notebook
from sklearn.metrics import r2_score
pandarallel.initialize()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
def sliding_window(df, end_date, day=7):
    """Return the rows of ``df`` whose ``date`` lies in ``(end_date - day, end_date]``.

    Args:
        df: DataFrame with a datetime-like ``date`` column.
        end_date: Inclusive upper bound of the window.
        day: Window length in days; the lower bound is exclusive.

    Returns:
        A filtered copy of ``df`` with a fresh 0..n-1 index.
    """
    window_start = end_date - datetime.timedelta(days=day)
    in_window = df['date'].le(end_date) & df['date'].gt(window_start)
    return df.loc[in_window].reset_index(drop=True)

def date_2_timestamp (date_time) :
    """Convert a ``YYYYmmddHHMMSS`` string into a Unix timestamp.

    The string is interpreted in the machine's local timezone
    (``time.mktime``), so the result depends on the host's TZ setting.

    Args:
        date_time: Date-time string formatted as ``%Y%m%d%H%M%S``.

    Returns:
        int: Seconds since the epoch.

    Raises:
        ValueError: If ``date_time`` does not match the expected format.
    """
    # Parse the string into a struct_time.
    time_struct = time.strptime(date_time, "%Y%m%d%H%M%S")
    # ``long`` only exists in Python 2; ``int`` is unbounded in Python 3.
    return int(time.mktime(time_struct))

def timestamp_to_date (timestamp) :
    """Format a Unix timestamp as a local-time ``YYYY-mm-dd HH:MM:SS`` string.

    Args:
        timestamp: Seconds since the epoch; anything ``int()`` accepts
            (fractional seconds are truncated).

    Returns:
        str: The formatted local date-time.
    """
    # Kept as a string (not a datetime) because callers split off the day part.
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(timestamp)))

In [3]:
%%time
# Load the raw behaviour log and the video metadata.
# NOTE(review): vid_info is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
user_behaviors = pd.read_csv('data/user_behaviors.csv')
vid_info = pd.read_csv('data/vid_info.csv')

# Step 1: epoch seconds -> "YYYY-mm-dd HH:MM:SS" string. timestamp_to_date uses
# time.localtime, so the result depends on the host's timezone.
user_behaviors['date'] = user_behaviors['timestamp'].parallel_apply(timestamp_to_date)
# Step 2: keep the calendar-day part. This must run while 'date' is still a
# string, i.e. before the to_datetime conversion below.
user_behaviors['date_day'] = user_behaviors['date'].parallel_apply(lambda x : x.split(' ')[0])
# Step 3: convert 'date' to datetime64 so it can be compared against window bounds.
user_behaviors['date'] = pd.to_datetime(user_behaviors['date'])

CPU times: user 2min 40s, sys: 29.2 s, total: 3min 9s
Wall time: 4min 20s


In [7]:
# Number of distinct calendar days present in the behaviour log.
user_behaviors['date_day'].nunique()

84

In [8]:
def get_label (df_behaviors, end_date, day=7) :
    """Split the behaviour log into a label window and the history before it.

    Args:
        df_behaviors: Behaviour log with ``did``, ``vid``, ``vts``, ``date``
            (datetime) and ``date_day`` (calendar-day string) columns.
        end_date: Inclusive end of the label window.
        day: Length of the label window in days.

    Returns:
        tuple: ``(df_label, user_history_behaviors, label_window)`` where
        ``df_label`` has one row per ``did`` seen in the label window with
        columns ``did``, ``active_days`` (distinct days), ``watch_nums``
        (row count) and ``watch_durations`` (sum of ``vts``);
        ``user_history_behaviors`` holds the rows up to the start of the
        label window; ``label_window`` holds the rows inside it.
    """
    # Label window: (end_date - day, end_date]. Honour the ``day`` argument
    # instead of a hard-coded 7 so the label and history windows never overlap.
    user_behaviors_label_windows = sliding_window (df_behaviors,
                                                   end_date,
                                                   day=day)
    # History: everything ending where the label window starts. The look-back
    # length is the number of distinct days in the log.
    # NOTE(review): if the log has missing calendar days this look-back is
    # shorter than the full span and the oldest rows are dropped — confirm.
    user_history_hebaviors = sliding_window (df_behaviors,
                                             end_date - datetime.timedelta(day),
                                             day=df_behaviors['date_day'].nunique())

    # Named aggregation yields the final column names directly, so no
    # positional renaming of the aggregated columns is needed.
    df_label = (
        user_behaviors_label_windows
        .groupby(['did'])
        .agg(active_days=('date_day', 'nunique'),
             watch_nums=('vid', 'count'),
             watch_durations=('vts', 'sum'))
        .reset_index()
    )
    return df_label, user_history_hebaviors, user_behaviors_label_windows

In [9]:
# Training labels: label window is the second-to-last week of the log,
# history is everything before it.
(df_train_label,
 user_train_history_behaviors,
 user_train_behaviors_label_windows) = get_label(
    user_behaviors,
    user_behaviors['date'].max() - datetime.timedelta(7),
    day=7,
)
# Validation labels: label window is the last week of the log.
(df_valid_label,
 user_valid_history_behaviors,
 user_valid_behaviors_label_windows) = get_label(
    user_behaviors,
    user_behaviors['date'].max(),
    day=7,
)

In [10]:
# Sanity check: earliest event in the training history (feature) window.
user_train_history_behaviors['date'].min()

Timestamp('2022-11-17 05:30:40')

In [11]:
# Sanity check: latest event in the training history window; it should precede the training label window.
user_train_history_behaviors['date'].max()

Timestamp('2023-01-25 23:57:52')

In [12]:
# Sanity check: earliest event in the training label window.
user_train_behaviors_label_windows['date'].min()

Timestamp('2023-01-26 00:00:00')

In [13]:
# Sanity check: latest event in the training label window.
user_train_behaviors_label_windows['date'].max()

Timestamp('2023-02-01 23:57:52')

In [14]:
# Sanity check: earliest event in the validation history (feature) window.
user_valid_history_behaviors['date'].min()

Timestamp('2022-11-17 05:30:40')

In [15]:
# Sanity check: latest event in the validation history window; it should precede the validation label window.
user_valid_history_behaviors['date'].max()

Timestamp('2023-02-01 23:57:52')

In [16]:
# Sanity check: earliest event in the validation label window.
user_valid_behaviors_label_windows['date'].min()

Timestamp('2023-02-02 00:00:00')

In [17]:
# Sanity check: latest event in the validation label window (end of the log).
user_valid_behaviors_label_windows['date'].max()

Timestamp('2023-02-08 23:57:52')

In [18]:
def make_feats (df, days) :
    """Aggregate per-device activity over the last ``days`` days of ``df``.

    The window ends at the latest ``date`` present in ``df``.

    Args:
        df: Behaviour rows with ``did``, ``vid``, ``vts``, ``date`` and
            ``date_day`` columns.
        days: Window length in days.

    Returns:
        DataFrame with one row per ``did`` active in the window and columns
        named ``<source>_last_<days>_<AGG>`` (e.g. ``vts_last_7_MEAN``).
    """
    recent = sliding_window(df, df['date'].max(), days)

    stats = recent.groupby(['did']).agg({
        'date_day' : 'nunique',
        'vid' : 'count',
        'vts' : ['mean', 'std', 'min', 'max', 'sum'],
    })
    # Flatten the (column, aggregation) MultiIndex into single-level names.
    tag = '_last_' + str(days) + "_"
    stats.columns = pd.Index([col + tag + func.upper() for col, func in stats.columns.tolist()])
    return stats.reset_index()


def create_sample (user_history_behaviors, df_label, windows=(1, 3, 7, 7 * 3)) :
    """Attach windowed activity features to a frame keyed by ``did``.

    Args:
        user_history_behaviors: History rows the features are computed from.
        df_label: Frame with a ``did`` column (labels for train/valid, or
            just the device ids for the test set). It is not modified.
        windows: Look-back lengths in days; one feature group is produced
            per entry. Defaults to the last 1, 3, 7 and 21 days.

    Returns:
        ``df_label`` left-joined with every feature group; devices with no
        activity inside a window get NaN for that window's columns.
    """
    sample = df_label
    for window_days in windows:
        # Build and merge one window at a time so only a single feature
        # frame is held in memory alongside the growing sample.
        window_feats = make_feats (user_history_behaviors, days=window_days)
        sample = sample.merge(window_feats, on='did', how='left')

    return sample

In [19]:
%%time
# Training samples: training labels + features from the training history.
df_train_data = create_sample(
    user_history_behaviors=user_train_history_behaviors,
    df_label=df_train_label,
)
# Validation samples: validation labels + features from the validation history.
df_valid_data = create_sample(
    user_history_behaviors=user_valid_history_behaviors,
    df_label=df_valid_label,
)
# Test features: every device in the full log, features from the full log (no labels).
df_test_data = create_sample(
    user_history_behaviors=user_behaviors,
    df_label=user_behaviors[['did']].drop_duplicates(),
)

CPU times: user 4min 20s, sys: 10.9 s, total: 4min 31s
Wall time: 4min 31s


In [20]:
# Preview the training samples (labels followed by the windowed features).
df_train_data

Unnamed: 0,did,active_days,watch_nums,watch_durations,date_day_last_1_NUNIQUE,vid_last_1_COUNT,vts_last_1_MEAN,vts_last_1_STD,vts_last_1_MIN,vts_last_1_MAX,...,vts_last_7_MIN,vts_last_7_MAX,vts_last_7_SUM,date_day_last_21_NUNIQUE,vid_last_21_COUNT,vts_last_21_MEAN,vts_last_21_STD,vts_last_21_MIN,vts_last_21_MAX,vts_last_21_SUM
0,00001bf314361c660531a8466264a81c,4,17,16292.0,,,,,,,...,182.0,3060.0,4578.0,4.0,8.0,890.000000,1184.183866,7.0,3060.0,7120.0
1,000026b09095e12af94c49ec1d7eb19a,5,37,49102.0,,,,,,,...,,,,1.0,1.0,14.000000,,14.0,14.0,14.0
2,000026f3c643ae6830dfe8f8f15d614d,7,42,28624.0,1.0,13.0,293.538462,678.869847,2.0,2098.0,...,2.0,2098.0,3816.0,5.0,66.0,435.212121,436.901239,2.0,2098.0,28724.0
3,00002ebb01d646d7c5d2a882665d6ab0,3,4,9885.0,,,,,,,...,,,,7.0,10.0,2258.100000,2286.917647,5.0,5770.0,22581.0
4,0000381af3c014c3155d39df3aa239a2,7,81,40621.0,1.0,4.0,641.250000,849.039997,3.0,1798.0,...,3.0,2493.0,13072.0,5.0,17.0,861.705882,976.918290,2.0,2493.0,14649.0
5,000050c11cf557bb665b5179b1110a22,1,1,13.0,,,,,,,...,,,,,,,,,,
6,00005d15cbee8a1161bf2960146ff781,7,40,57955.0,1.0,4.0,3158.750000,2898.482060,42.0,6678.0,...,11.0,6678.0,91515.0,20.0,88.0,2706.204545,2171.297404,5.0,8857.0,238146.0
7,00006463275024ca3555e5f01e61919b,4,9,9081.0,,,,,,,...,555.0,2006.0,3446.0,5.0,6.0,1338.000000,1371.634354,16.0,3801.0,8028.0
8,00006f81a9e9df226faadd2b5f2a903e,3,6,10887.0,,,,,,,...,6.0,6146.0,23290.0,6.0,16.0,2283.875000,2228.031325,6.0,6146.0,36542.0
9,000074de697627a242453e2f961d492f,3,4,6366.0,,,,,,,...,660.0,1919.0,4319.0,3.0,4.0,1499.750000,568.964190,660.0,1919.0,5999.0


In [21]:
# Preview the validation samples (labels followed by the windowed features).
df_valid_data

Unnamed: 0,did,active_days,watch_nums,watch_durations,date_day_last_1_NUNIQUE,vid_last_1_COUNT,vts_last_1_MEAN,vts_last_1_STD,vts_last_1_MIN,vts_last_1_MAX,...,vts_last_7_MIN,vts_last_7_MAX,vts_last_7_SUM,date_day_last_21_NUNIQUE,vid_last_21_COUNT,vts_last_21_MEAN,vts_last_21_STD,vts_last_21_MIN,vts_last_21_MAX,vts_last_21_SUM
0,00000e1872623bbb29b07c2bee94c345,1,18,3464.0,,,,,,,...,,,,6.0,17.0,733.882353,1246.193849,13.0,4839.0,12476.0
1,00001bf314361c660531a8466264a81c,2,11,15961.0,,,,,,,...,4.0,5924.0,16292.0,6.0,23.0,1012.869565,1575.759335,4.0,5924.0,23296.0
2,000026b09095e12af94c49ec1d7eb19a,7,39,48623.0,1.0,2.0,1166.000000,722.663130,655.0,1677.0,...,5.0,2820.0,49102.0,5.0,37.0,1327.081081,986.728331,5.0,2820.0,49102.0
3,000026f3c643ae6830dfe8f8f15d614d,3,19,12868.0,1.0,4.0,39.000000,42.926293,1.0,81.0,...,1.0,2473.0,28624.0,9.0,82.0,553.109756,676.930144,1.0,2473.0,45355.0
4,00002ebb01d646d7c5d2a882665d6ab0,5,11,14779.0,,,,,,,...,1.0,4954.0,9885.0,6.0,7.0,2283.285714,2180.875338,1.0,4954.0,15983.0
5,0000381af3c014c3155d39df3aa239a2,5,21,27117.0,1.0,34.0,463.617647,545.008146,2.0,2459.0,...,2.0,2459.0,40621.0,12.0,98.0,563.979592,761.124886,2.0,2493.0,55270.0
6,00003a751c54a251ef7b79199e3b8cef,1,4,4885.0,,,,,,,...,,,,,,,,,,
7,000050c11cf557bb665b5179b1110a22,2,6,7982.0,,,,,,,...,13.0,13.0,13.0,1.0,1.0,13.000000,,13.0,13.0,13.0
8,00005d15cbee8a1161bf2960146ff781,7,52,108351.0,1.0,3.0,2888.333333,493.275109,2434.0,3413.0,...,4.0,4045.0,57955.0,21.0,100.0,2259.010000,2050.016780,4.0,8857.0,225901.0
9,00006463275024ca3555e5f01e61919b,4,9,18080.0,1.0,3.0,1443.333333,1006.694757,341.0,2314.0,...,21.0,2314.0,9081.0,6.0,12.0,1043.916667,773.850521,21.0,2314.0,12527.0


In [22]:
# Preview the test features (no label columns yet).
df_test_data

Unnamed: 0,did,date_day_last_1_NUNIQUE,vid_last_1_COUNT,vts_last_1_MEAN,vts_last_1_STD,vts_last_1_MIN,vts_last_1_MAX,vts_last_1_SUM,date_day_last_3_NUNIQUE,vid_last_3_COUNT,...,vts_last_7_MIN,vts_last_7_MAX,vts_last_7_SUM,date_day_last_21_NUNIQUE,vid_last_21_COUNT,vts_last_21_MEAN,vts_last_21_STD,vts_last_21_MIN,vts_last_21_MAX,vts_last_21_SUM
0,ffca69a280fca4c50ab17f04a9db77bc,,,,,,,,1.0,1.0,...,5.0,2609.0,5369.0,8.0,18.0,1040.055556,994.231212,4.0,3191.0,18721.0
1,01cd0097edede0ca1872b58c2645ee50,1.0,1.0,1001.000000,,1001.0,1001.0,1001.0,2.0,2.0,...,29.0,12150.0,49221.0,20.0,83.0,1957.963855,2445.085147,3.0,12150.0,162511.0
2,a3b940c14f7429b27815b02c407540c5,1.0,1.0,5259.000000,,5259.0,5259.0,5259.0,1.0,1.0,...,5.0,5259.0,6265.0,5.0,22.0,880.227273,1748.860280,3.0,5259.0,19365.0
3,d325e8e381952b598a6382ea20fc2eea,1.0,5.0,872.600000,1585.273415,2.0,3687.0,4363.0,1.0,5.0,...,2.0,5429.0,17222.0,13.0,27.0,1684.185185,1705.319551,2.0,5429.0,45473.0
4,4c5e227a14abc08efd844c00620d781c,,,,,,,,,,...,,,,1.0,1.0,5.000000,,5.0,5.0,5.0
5,7feaf36c5ea92c90ad90bb753368a6fa,,,,,,,,1.0,2.0,...,72.0,178.0,250.0,14.0,77.0,919.428571,1176.180319,1.0,5059.0,70796.0
6,e22fd7b886ff97a1458ff76315a4627b,,,,,,,,1.0,2.0,...,4.0,1168.0,1460.0,3.0,14.0,105.500000,307.220730,4.0,1168.0,1477.0
7,bfd7ab873a72f35a004d2e6811e432e2,,,,,,,,2.0,38.0,...,5.0,2103.0,33380.0,5.0,45.0,835.200000,562.417493,5.0,2219.0,37584.0
8,a7976a469743b45c553e78382ffa034d,1.0,12.0,2701.416667,871.940621,1337.0,4188.0,32417.0,2.0,16.0,...,1337.0,4188.0,52971.0,3.0,22.0,2407.772727,756.129110,1337.0,4188.0,52971.0
9,ae5c0a2ff03c031d2c015eedb24ee38f,1.0,1.0,1330.000000,,1330.0,1330.0,1330.0,2.0,3.0,...,2.0,2632.0,11832.0,14.0,58.0,450.310345,693.091098,2.0,2632.0,26118.0


In [23]:
def train (df_train, df_valid, label, params, features) :
    """Fit a LightGBM regressor for ``label`` and score it on ``df_valid``.

    Side effects:
        * Adds/overwrites the column ``"<label>_preds"`` on ``df_valid``.
        * Writes feature importances to ``fold_importance_df.csv``; the file
          name is fixed, so each call overwrites the previous call's file.

    Args:
        df_train: Training samples containing ``features`` and ``label``.
        df_valid: Validation samples containing ``features`` and ``label``.
        label: Name of the target column.
        params: LightGBM parameter dict passed to ``lgb.train``.
        features: Sequence of feature column names.

    Returns:
        tuple: ``(booster, r2)`` — the trained ``lgb.Booster`` and the R^2
        of its predictions on ``df_valid``.
    """
    train_label = df_train[label].values
    train_feat = df_train[features]

    valid_label = df_valid[label].values
    valid_feat = df_valid[features]
    print (train_feat.columns )
    gc.collect()

    trn_data = lgb.Dataset(train_feat, label=train_label)
    val_data = lgb.Dataset(valid_feat, label=valid_label)
    # The ``early_stopping_rounds=`` / ``verbose_eval=`` keyword arguments were
    # removed from lgb.train in LightGBM 4.0; the callbacks below are the
    # supported equivalent (stop after 100 stale rounds, log every 50).
    clf = lgb.train(params,
                    trn_data,
                    valid_sets=[trn_data, val_data],
                    callbacks=[lgb.early_stopping(stopping_rounds=100),
                               lgb.log_evaluation(period=50)])

    # Split-count and gain importances, most-used features first.
    fold_importance_df = pd.DataFrame({
        "Feature": features,
        "importance": clf.feature_importance(),
        "importance_gain": clf.feature_importance(importance_type='gain'),
    }).sort_values(by='importance', ascending=False)
    fold_importance_df.to_csv("fold_importance_df.csv", index=False)

    preds_col = '{}_preds'.format(label)
    df_valid[preds_col] = clf.predict(valid_feat, num_iteration=clf.best_iteration)
    result = r2_score(df_valid[label], df_valid[preds_col])

    return clf, result

In [24]:
# Columns that must not be used as model inputs: the device id, the three
# targets, and the "<target>_preds" columns that train() adds to the
# validation frame.
useless_cols = ['did', 'active_days', 'active_days_preds', 
                'watch_nums', 'watch_nums_preds', 
                'watch_durations', 'watch_durations_preds']
# Everything else in the training frame is a feature.
features = df_train_data.columns[~df_train_data.columns.isin(useless_cols)].values
print (features)

# LightGBM parameters shared by the three regressors.
params = {
    'objective': 'regression', # objective function
    #'metric': {'auc', 'binary_logloss'},
    'metric': {'rmse'},
    'boosting_type' : 'gbdt',

    'learning_rate': 0.05,
    'max_depth' : 12,
    'num_leaves' : 2 ** 6,

    'feature_fraction' : 0.70,
    # NOTE(review): per the LightGBM docs, row subsampling only takes effect
    # when a bagging frequency (subsample_freq / bagging_freq) > 0 is also
    # set; none is set here — confirm whether bagging was intended.
    'subsample' : 0.75,
    'seed' : 114,
    # Upper bound on boosting rounds; train() also applies early stopping.
    'num_iterations' : 3000,
    'nthread' : -1,
    'verbose' : -1,
    #'scale_pos_weight':200
}

['date_day_last_1_NUNIQUE' 'vid_last_1_COUNT' 'vts_last_1_MEAN'
 'vts_last_1_STD' 'vts_last_1_MIN' 'vts_last_1_MAX' 'vts_last_1_SUM'
 'date_day_last_3_NUNIQUE' 'vid_last_3_COUNT' 'vts_last_3_MEAN'
 'vts_last_3_STD' 'vts_last_3_MIN' 'vts_last_3_MAX' 'vts_last_3_SUM'
 'date_day_last_7_NUNIQUE' 'vid_last_7_COUNT' 'vts_last_7_MEAN'
 'vts_last_7_STD' 'vts_last_7_MIN' 'vts_last_7_MAX' 'vts_last_7_SUM'
 'date_day_last_21_NUNIQUE' 'vid_last_21_COUNT' 'vts_last_21_MEAN'
 'vts_last_21_STD' 'vts_last_21_MIN' 'vts_last_21_MAX' 'vts_last_21_SUM']


In [25]:
# Offline validation: fit one model per target on the training samples and
# score it (R^2) on the validation samples.
active_days_valid_clf, active_days_valid_result = train(
    df_train=df_train_data, df_valid=df_valid_data,
    label='active_days', params=params, features=features,
)
watch_nums_valid_clf, watch_nums_valid_result = train(
    df_train=df_train_data, df_valid=df_valid_data,
    label='watch_nums', params=params, features=features,
)
watch_durations_valid_clf, watch_durations_valid_result = train(
    df_train=df_train_data, df_valid=df_valid_data,
    label='watch_durations', params=params, features=features,
)

Index(['date_day_last_1_NUNIQUE', 'vid_last_1_COUNT', 'vts_last_1_MEAN',
       'vts_last_1_STD', 'vts_last_1_MIN', 'vts_last_1_MAX', 'vts_last_1_SUM',
       'date_day_last_3_NUNIQUE', 'vid_last_3_COUNT', 'vts_last_3_MEAN',
       'vts_last_3_STD', 'vts_last_3_MIN', 'vts_last_3_MAX', 'vts_last_3_SUM',
       'date_day_last_7_NUNIQUE', 'vid_last_7_COUNT', 'vts_last_7_MEAN',
       'vts_last_7_STD', 'vts_last_7_MIN', 'vts_last_7_MAX', 'vts_last_7_SUM',
       'date_day_last_21_NUNIQUE', 'vid_last_21_COUNT', 'vts_last_21_MEAN',
       'vts_last_21_STD', 'vts_last_21_MIN', 'vts_last_21_MAX',
       'vts_last_21_SUM'],
      dtype='object')




Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 1.58312	valid_1's rmse: 1.55676
[100]	training's rmse: 1.57327	valid_1's rmse: 1.56205
Early stopping, best iteration is:
[46]	training's rmse: 1.5855	valid_1's rmse: 1.55645
Index(['date_day_last_1_NUNIQUE', 'vid_last_1_COUNT', 'vts_last_1_MEAN',
       'vts_last_1_STD', 'vts_last_1_MIN', 'vts_last_1_MAX', 'vts_last_1_SUM',
       'date_day_last_3_NUNIQUE', 'vid_last_3_COUNT', 'vts_last_3_MEAN',
       'vts_last_3_STD', 'vts_last_3_MIN', 'vts_last_3_MAX', 'vts_last_3_SUM',
       'date_day_last_7_NUNIQUE', 'vid_last_7_COUNT', 'vts_last_7_MEAN',
       'vts_last_7_STD', 'vts_last_7_MIN', 'vts_last_7_MAX', 'vts_last_7_SUM',
       'date_day_last_21_NUNIQUE', 'vid_last_21_COUNT', 'vts_last_21_MEAN',
       'vts_last_21_STD', 'vts_last_21_MIN', 'vts_last_21_MAX',
       'vts_last_21_SUM'],
      dtype='object')




Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 14.9178	valid_1's rmse: 14.6088
[100]	training's rmse: 14.8106	valid_1's rmse: 14.7565
Early stopping, best iteration is:
[24]	training's rmse: 15.2245	valid_1's rmse: 14.4461
Index(['date_day_last_1_NUNIQUE', 'vid_last_1_COUNT', 'vts_last_1_MEAN',
       'vts_last_1_STD', 'vts_last_1_MIN', 'vts_last_1_MAX', 'vts_last_1_SUM',
       'date_day_last_3_NUNIQUE', 'vid_last_3_COUNT', 'vts_last_3_MEAN',
       'vts_last_3_STD', 'vts_last_3_MIN', 'vts_last_3_MAX', 'vts_last_3_SUM',
       'date_day_last_7_NUNIQUE', 'vid_last_7_COUNT', 'vts_last_7_MEAN',
       'vts_last_7_STD', 'vts_last_7_MIN', 'vts_last_7_MAX', 'vts_last_7_SUM',
       'date_day_last_21_NUNIQUE', 'vid_last_21_COUNT', 'vts_last_21_MEAN',
       'vts_last_21_STD', 'vts_last_21_MIN', 'vts_last_21_MAX',
       'vts_last_21_SUM'],
      dtype='object')




Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 14283.7	valid_1's rmse: 14344.8
[100]	training's rmse: 14169	valid_1's rmse: 14354.6
[150]	training's rmse: 14109.8	valid_1's rmse: 14355.8
Early stopping, best iteration is:
[57]	training's rmse: 14255	valid_1's rmse: 14341.8


In [26]:
# Validation R^2 of the active_days model.
active_days_valid_result

0.3012553797732195

In [27]:
# Validation R^2 of the watch_nums model.
watch_nums_valid_result

0.17503635965526743

In [28]:
# Validation R^2 of the watch_durations model.
watch_durations_valid_result

0.347061816188191

In [29]:
# Combined offline score: R^2 values weighted 0.5 / 0.25 / 0.25.
# NOTE(review): presumably mirrors the task's official scoring weights — confirm.
active_days_valid_result * 0.5 + watch_nums_valid_result * 0.25 + watch_durations_valid_result * 0.25

0.28115223384747434

In [30]:
# Final models used for the test prediction: refit on the most recent
# (validation) samples, capping the number of rounds at the best iteration
# found during offline validation.
# The round cap is passed through a per-call copy of `params`, so the shared
# dict keeps its original num_iterations and earlier cells can be re-run
# with unchanged results.
# Note: train() overwrites the "<target>_preds" columns of df_valid_data
# with in-sample predictions here.
active_days_test_clf, _ = train (
    df_valid_data, df_valid_data, 'active_days',
    dict(params, num_iterations=active_days_valid_clf.best_iteration), features)

watch_nums_test_clf, _ = train (
    df_valid_data, df_valid_data, 'watch_nums',
    dict(params, num_iterations=watch_nums_valid_clf.best_iteration), features)

watch_durations_test_clf, _ = train (
    df_valid_data, df_valid_data, 'watch_durations',
    dict(params, num_iterations=watch_durations_valid_clf.best_iteration), features)

Index(['date_day_last_1_NUNIQUE', 'vid_last_1_COUNT', 'vts_last_1_MEAN',
       'vts_last_1_STD', 'vts_last_1_MIN', 'vts_last_1_MAX', 'vts_last_1_SUM',
       'date_day_last_3_NUNIQUE', 'vid_last_3_COUNT', 'vts_last_3_MEAN',
       'vts_last_3_STD', 'vts_last_3_MIN', 'vts_last_3_MAX', 'vts_last_3_SUM',
       'date_day_last_7_NUNIQUE', 'vid_last_7_COUNT', 'vts_last_7_MEAN',
       'vts_last_7_STD', 'vts_last_7_MIN', 'vts_last_7_MAX', 'vts_last_7_SUM',
       'date_day_last_21_NUNIQUE', 'vid_last_21_COUNT', 'vts_last_21_MEAN',
       'vts_last_21_STD', 'vts_last_21_MIN', 'vts_last_21_MAX',
       'vts_last_21_SUM'],
      dtype='object')




Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[46]	training's rmse: 1.51471	valid_1's rmse: 1.51471
Index(['date_day_last_1_NUNIQUE', 'vid_last_1_COUNT', 'vts_last_1_MEAN',
       'vts_last_1_STD', 'vts_last_1_MIN', 'vts_last_1_MAX', 'vts_last_1_SUM',
       'date_day_last_3_NUNIQUE', 'vid_last_3_COUNT', 'vts_last_3_MEAN',
       'vts_last_3_STD', 'vts_last_3_MIN', 'vts_last_3_MAX', 'vts_last_3_SUM',
       'date_day_last_7_NUNIQUE', 'vid_last_7_COUNT', 'vts_last_7_MEAN',
       'vts_last_7_STD', 'vts_last_7_MIN', 'vts_last_7_MAX', 'vts_last_7_SUM',
       'date_day_last_21_NUNIQUE', 'vid_last_21_COUNT', 'vts_last_21_MEAN',
       'vts_last_21_STD', 'vts_last_21_MIN', 'vts_last_21_MAX',
       'vts_last_21_SUM'],
      dtype='object')




Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[24]	training's rmse: 14.2788	valid_1's rmse: 14.2788
Index(['date_day_last_1_NUNIQUE', 'vid_last_1_COUNT', 'vts_last_1_MEAN',
       'vts_last_1_STD', 'vts_last_1_MIN', 'vts_last_1_MAX', 'vts_last_1_SUM',
       'date_day_last_3_NUNIQUE', 'vid_last_3_COUNT', 'vts_last_3_MEAN',
       'vts_last_3_STD', 'vts_last_3_MIN', 'vts_last_3_MAX', 'vts_last_3_SUM',
       'date_day_last_7_NUNIQUE', 'vid_last_7_COUNT', 'vts_last_7_MEAN',
       'vts_last_7_STD', 'vts_last_7_MIN', 'vts_last_7_MAX', 'vts_last_7_SUM',
       'date_day_last_21_NUNIQUE', 'vid_last_21_COUNT', 'vts_last_21_MEAN',
       'vts_last_21_STD', 'vts_last_21_MIN', 'vts_last_21_MAX',
       'vts_last_21_SUM'],
      dtype='object')




Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 14153.1	valid_1's rmse: 14153.1
Did not meet early stopping. Best iteration is:
[57]	training's rmse: 14120.9	valid_1's rmse: 14120.9


In [31]:
# Predict each target on the test features and collect the submission columns.
for target_name, target_clf in (
    ('active_days', active_days_test_clf),
    ('watch_nums', watch_nums_test_clf),
    ('watch_durations', watch_durations_test_clf),
):
    # `features` excludes the target columns, so adding them to df_test_data
    # does not change the inputs of later predictions.
    df_test_data[target_name] = target_clf.predict(
        df_test_data[features],
        num_iteration=target_clf.best_iteration,
    )
df_test_answer = df_test_data[['did', 'active_days', 'watch_nums', 'watch_durations']]

In [32]:
# Write the submission file (did + the three predicted targets, no index column).
df_test_answer.to_csv('df_test_answer_baseline_v1.csv', index=None)