## 上加上投放时间相关特征

In [2]:
import time
import json
import lightgbm as lgb
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

## 1.加载数据，构造lightgbm数据集

In [3]:
path='../../data/AD/da/'
train=pd.read_csv(path+"train1_without0_delete.csv",index_col=0)
testB_df_path = '../../data/AD/da/Btest_sample_new.dat'
test_name=['id','ad_id','create_time','ad_size','ad_trade_id','good_class','good_id','ad_acc_id','deliver_time','people','ad_bid']
testB_df = pd.read_csv(testB_df_path,delimiter = '\t',parse_dates = ['create_time'],header=None,names = test_name)

### 日期转星期

In [4]:
def week_day_1(x):   # 根据日期
    week = datetime.strptime(str(x), '%Y%m%d').weekday()
    return week+1
train['week'] = train['create_update_time'].apply(week_day_1)
train.head()

Unnamed: 0,ad_id,create_update_time,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,deliver_time,people,ad_bid,label,week
0,31,20190312,40,224,13,18683,12577.0,"281474976694272,281474976694272,28147497669427...",area:11442,100,14.0,2
10,32,20190218,40,136,13,32534,18752.0,"70368475742208,70368475742208,70368475742208,7...",area:7572,83,1.0,1
11,32,20190219,40,136,13,32534,18752.0,"70368475742208,70368475742208,70368475742208,7...",area:7572,83,11.0,2
33,32,20190313,40,136,13,32534,18752.0,"70368475742208,70368475742208,70368475742208,7...",area:7572,82,4.0,3
38,32,20190318,40,136,13,32534,18752.0,"70368475742208,70368475742208,70368475742208,7...",area:7572,80,4.0,1


In [5]:
#deliver_time 格式为281474976694272,281474976694272,28147497669427，281474976694272,281474976694272,28147497669427
#将deliver_time分开为六个时段，发现大多时段基本都是重合，保险起见选出最大的时段，例如变成7点到24点，那么构造三个特征，起始时间，结束时间，总计时长（deliver_start，deliver_end，deliver_long）
# 将时间转换测试
def convertOneStr2Interval(x):
    x = int(x)
    bin_str = bin(x)[2:]
    bin_len = len(bin_str)
    r_pos = bin_str.rfind('1')
    if bin_len % 2 == 0:
        end_date =bin_len//2
    else:
        end_date = bin_len//2+0.5
    
    interval = bin_len - r_pos - 1
    if interval % 2 == 0:
        begin_date = interval//2
    else:
        begin_date = interval//2
    return begin_date,end_date,end_date-begin_date

def convertStr2Interval(x):
    res_str = ''
    time_list = x.split(',')
    max=-10
    x_end=0
    y_end=0
    z_end=0
    for time in time_list:
        x,y,z=convertOneStr2Interval(time)
        if z>max:
            max=z
            x_end=x
            y_end=y
            z_end=z
        #res_str += convertOneStr2Interval(time) + ','
    return str(x_end)+','+str(y_end)+','+str(z_end)
train['deliver_time']=train['deliver_time'].apply(lambda x:convertStr2Interval(x))
train['deliver_start']=train['deliver_time'].apply(lambda x:x.split(',')[0])
train['deliver_end']=train['deliver_time'].apply(lambda x:x.split(',')[1])
train['deliver_long']=train['deliver_time'].apply(lambda x:x.split(',')[2])

In [6]:
train[['deliver_start','deliver_end','deliver_long']] = train[['deliver_start','deliver_end','deliver_long']].astype('float')
train.head(1)

Unnamed: 0,ad_id,create_update_time,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,deliver_time,people,ad_bid,label,week,deliver_start,deliver_end,deliver_long
0,31,20190312,40,224,13,18683,12577.0,72417,area:11442,100,14.0,2,7.0,24.0,17.0


In [8]:
train.dtypes

ad_id                   int64
create_update_time      int64
ad_size                 int64
ad_trade_id             int64
good_class              int64
good_id                 int64
ad_acc_id             float64
deliver_time           object
people                 object
ad_bid                  int64
label                 float64
week                    int64
deliver_start         float64
deliver_end           float64
deliver_long          float64
dtype: object

In [9]:
df_train=train[train.create_update_time!=20190318]
df_test=train[train.create_update_time==20190318]

In [10]:
df_train.head(1)

Unnamed: 0,ad_id,create_update_time,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,deliver_time,people,ad_bid,label,week,deliver_start,deliver_end,deliver_long
0,31,20190312,40,224,13,18683,12577.0,72417,area:11442,100,14.0,2,7.0,24.0,17.0


In [11]:
df_train.drop(['create_update_time','deliver_time','people'], axis=1, inplace=True)
df_train.drop_duplicates(keep='first', inplace=True)
df_test.drop(['create_update_time','deliver_time','people'], axis=1, inplace=True)
df_test.drop_duplicates(keep='first', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [13]:
df_train.head()

Unnamed: 0,ad_id,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,ad_bid,label,week,deliver_start,deliver_end,deliver_long
0,31,40,224,13,18683,12577.0,100,14.0,2,7.0,24.0,17.0
10,32,40,136,13,32534,18752.0,83,1.0,1,14.0,23.0,9.0
11,32,40,136,13,32534,18752.0,83,11.0,2,14.0,23.0,9.0
33,32,40,136,13,32534,18752.0,82,4.0,3,14.0,23.0,9.0
39,84,30,98,13,18194,789.0,100,1.0,3,10.0,21.0,11.0


In [14]:
y_train = df_train.label.values
y_test = df_test.label.values
X_train = df_train.drop(['ad_id','label'], axis=1).values
X_test = df_test.drop(['ad_id','label'], axis=1).values
train_data = lgb.Dataset(data=X_train,label=y_train)
test_data = lgb.Dataset(data=X_test,label=y_test)

##  2.填充最优参数并预测0318号的数据，预测SMAPE得分

In [15]:
params = {
        'task': 'train',
        'max_depth':16,
        'num_trees':413,
        'learning_rate':0.051011639607821485,
        'bagging_fraction':0.55 ,
        'num_leaves':85,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'feature_fraction': 0.7,  # 样本列采样
        'lambda_l1': 0,  # L1 正则化
        'lambda_l2': 0,  # L2 正则化
        'bagging_seed': 100,  # 随机种子,light中默认为100
        'silent':1,
        }
print('Start training...')
# train
gbm = lgb.train(params,
                train_data,
                num_boost_round=20,
                valid_sets=test_data,
                early_stopping_rounds=5)
print('Save model...')
# save model to file
gbm.save_model('model.txt')
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print(y_pred)

Start training...
[1]	valid_0's l2: 37428.8
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 36633.9
[3]	valid_0's l2: 35200.9
[4]	valid_0's l2: 34619.6
[5]	valid_0's l2: 33709.6
[6]	valid_0's l2: 33369.1
[7]	valid_0's l2: 33262.2
[8]	valid_0's l2: 33131.1
[9]	valid_0's l2: 33182.6
[10]	valid_0's l2: 33077.3
[11]	valid_0's l2: 33087.7
[12]	valid_0's l2: 32899.3
[13]	valid_0's l2: 33091.2
[14]	valid_0's l2: 33271.7
[15]	valid_0's l2: 33357.4
[16]	valid_0's l2: 33591.1
[17]	valid_0's l2: 34052.3
Early stopping, best iteration is:
[12]	valid_0's l2: 32899.3
Save model...


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Start predicting...
[ 31.88135897  31.13761896  28.73893087 ... 196.16923692  36.53943667
  31.88135897]


In [16]:
# 测试分数 (0-2之间,越小越好)
def getSMAPEScore (y_true, y_pred):
    #print(y_true)
    # 预测值小于0如何处理 y_pred=np.abs(y_pred)
    y_pred=(np.abs(y_pred) + y_pred) / 2
    SMAPE=2.0 * np.mean(np.abs(y_pred - y_true+0.0001) / (y_pred+ y_true+0.0001))
    return SMAPE,40*(1-SMAPE/2)

In [17]:
getSMAPEScore(y_test,y_pred)

(1.2563543663325818, 14.872912673348363)

## 5.测试集预测

In [18]:
testB_df.head(1)

Unnamed: 0,id,ad_id,create_time,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,deliver_time,people,ad_bid
0,1,152978,1480387159,30,118,3,-1,23511,"281474976710655,281474976710655,28147497671065...","age:182,202,739,340,536,287,690,988,187,367,47...",37


In [19]:
testB_df.dtypes

id               int64
ad_id            int64
create_time     object
ad_size          int64
ad_trade_id      int64
good_class       int64
good_id          int64
ad_acc_id        int64
deliver_time    object
people          object
ad_bid           int64
dtype: object

In [20]:
testB_df[['create_time']] = testB_df[['create_time']].astype('int')
testB_df.dtypes

id               int64
ad_id            int64
create_time      int64
ad_size          int64
ad_trade_id      int64
good_class       int64
good_id          int64
ad_acc_id        int64
deliver_time    object
people          object
ad_bid           int64
dtype: object

In [21]:
import time

def time2strftime(nowtime):   # 时间戳转时间函数
    timeArray = time.localtime(nowtime)
    timeChange = int(time.strftime('%Y%m%d%H%M%S', timeArray))
    return timeChange
testB_df['create_time_date'] = testB_df['create_time'].apply(time2strftime)
testB_df.head(1)

Unnamed: 0,id,ad_id,create_time,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,deliver_time,people,ad_bid,create_time_date
0,1,152978,1480387159,30,118,3,-1,23511,"281474976710655,281474976710655,28147497671065...","age:182,202,739,340,536,287,690,988,187,367,47...",37,20161129023919


In [22]:
# 函数YMD表示将数据转换成只有年月日的形式，便于统计一天的曝光量
def YMD(nowtime):   # 时间处理
    ymd = int(nowtime/1000000)
    return ymd
testB_df['YMD'] = testB_df['create_time_date'].apply(YMD)
testB_df.head(1)

Unnamed: 0,id,ad_id,create_time,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,deliver_time,people,ad_bid,create_time_date,YMD
0,1,152978,1480387159,30,118,3,-1,23511,"281474976710655,281474976710655,28147497671065...","age:182,202,739,340,536,287,690,988,187,367,47...",37,20161129023919,20161129


In [23]:
def week_day_2(timestamp):    # 根据时间戳计算week
    return time.strftime('%w',time.localtime(timestamp))
testB_df['weekday'] = testB_df['create_time'].apply(week_day_2)
testB_df.head(1)

Unnamed: 0,id,ad_id,create_time,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,deliver_time,people,ad_bid,create_time_date,YMD,weekday
0,1,152978,1480387159,30,118,3,-1,23511,"281474976710655,281474976710655,28147497671065...","age:182,202,739,340,536,287,690,988,187,367,47...",37,20161129023919,20161129,2


In [24]:
testB_df['deliver_time']=testB_df['deliver_time'].apply(lambda x:convertStr2Interval(x))
testB_df['deliver_start']=testB_df['deliver_time'].apply(lambda x:x.split(',')[0])
testB_df['deliver_end']=testB_df['deliver_time'].apply(lambda x:x.split(',')[1])
testB_df['deliver_long']=testB_df['deliver_time'].apply(lambda x:x.split(',')[2])
testB_df[['deliver_start','deliver_end','deliver_long']] = testB_df[['deliver_start','deliver_end','deliver_long']].astype('float')
testB_df.head(1)

Unnamed: 0,id,ad_id,create_time,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,deliver_time,people,ad_bid,create_time_date,YMD,weekday,deliver_start,deliver_end,deliver_long
0,1,152978,1480387159,30,118,3,-1,23511,2424,"age:182,202,739,340,536,287,690,988,187,367,47...",37,20161129023919,20161129,2,0.0,24.0,24.0


### week_day和week_day_2分别是按照日期和时间戳计算week的

In [25]:
testB_data = testB_df.drop(['create_time','create_time_date','deliver_time','people'], axis=1)
testB_data.head(1)

Unnamed: 0,id,ad_id,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,ad_bid,YMD,weekday,deliver_start,deliver_end,deliver_long
0,1,152978,30,118,3,-1,23511,37,20161129,2,0.0,24.0,24.0


In [26]:
exp_his = pd.read_csv(path + 'exp_his_label.csv', index_col=0)
exp_his.head(1)

Unnamed: 0,ad_id,request_time,label
0,415,20190217,6


In [27]:
# from datetime import datetime,timedelta
import datetime
def dayminis1(now):
    datestart=datetime.datetime.strptime(str(now),'%Y%m%d')
    datestart-=datetime.timedelta(days=+1)
    return datestart.strftime('%Y%m%d')
exp_his['request_time']=exp_his['request_time'].apply(lambda x:dayminis1(x))
exp_his.head(1)

Unnamed: 0,ad_id,request_time,label
0,415,20190216,6


In [29]:
exp_his_ad_id = list(set(exp_his.ad_id.tolist()))
testB_data_rule = testB_data[testB_data['ad_id'].isin(exp_his_ad_id)]
testB_data_predict = testB_data[~testB_data['ad_id'].isin(exp_his_ad_id)]
len(testB_data),len(testB_data_rule),len(testB_data_predict)

(38596, 14396, 24200)

In [30]:
testB_data_rule.head(1)

Unnamed: 0,id,ad_id,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,ad_bid,YMD,weekday,deliver_start,deliver_end,deliver_long
0,1,152978,30,118,3,-1,23511,37,20161129,2,0.0,24.0,24.0


In [31]:
testB_data_rule_ad_id = list(set(testB_data_rule.ad_id.tolist()))
len(testB_data_rule_ad_id)

1384

In [32]:
exp_his_rule = exp_his.groupby('ad_id').mean()
exp_his_rule.head(1)

Unnamed: 0_level_0,label
ad_id,Unnamed: 1_level_1
415,4.52


In [34]:
groupby_rule = testB_data_rule.groupby(['ad_id','ad_bid','YMD']).agg({'ad_bid':np.median})
groupby_rule.to_csv('groupby_rule.csv')
groupby_rule = pd.read_csv('groupby_rule.csv')
exp_his_rule.to_csv('exp_his_rule.csv')
exp_his_rule1 = pd.read_csv('exp_his_rule.csv')


In [35]:
merge_df = pd.merge(groupby_rule, exp_his_rule1)

In [36]:
merge_df.head(1)

Unnamed: 0,ad_id,ad_bid,YMD,ad_bid.1,label
0,415,34,20190118,34,4.52


In [37]:
merge_df.dtypes

ad_id         int64
ad_bid        int64
YMD           int64
ad_bid.1      int64
label       float64
dtype: object

In [38]:
merge_df_ad_id = sorted(list(set(merge_df.ad_id.tolist())))
len(merge_df_ad_id)

1384

In [39]:
merge_df.drop('ad_bid.1', axis=1, inplace=True)
merge_df.head()

Unnamed: 0,ad_id,ad_bid,YMD,label
0,415,34,20190118,4.52
1,415,347,20190118,4.52
2,415,387,20190118,4.52
3,415,457,20190118,4.52
4,415,565,20190118,4.52


In [40]:
merge_df['label_bais'] = (merge_df.ad_bid/10000).tolist()
merge_df.head()

Unnamed: 0,ad_id,ad_bid,YMD,label,label_bais
0,415,34,20190118,4.52,0.0034
1,415,347,20190118,4.52,0.0347
2,415,387,20190118,4.52,0.0387
3,415,457,20190118,4.52,0.0457
4,415,565,20190118,4.52,0.0565


In [41]:
label_new = list(np.zeros(merge_df.shape[0]))
merge_df['label_new'] = label_new
merge_df.head()

Unnamed: 0,ad_id,ad_bid,YMD,label,label_bais,label_new
0,415,34,20190118,4.52,0.0034,0.0
1,415,347,20190118,4.52,0.0347,0.0
2,415,387,20190118,4.52,0.0387,0.0
3,415,457,20190118,4.52,0.0457,0.0
4,415,565,20190118,4.52,0.0565,0.0


In [42]:
group_append = pd.DataFrame(columns = ['ad_id','ad_bid','YMD','label','label_bais','label_new'])
for name, group in merge_df.groupby('ad_id'):
    group_index = group.index.values
    g_len = group.shape[0]
    l_b = group.label_bais.tolist()
    median_val = int(g_len/2)
    val = group.label[group_index[0]]
    group['label_new'][median_val+group_index[0]] = val
    for i in range(group_index[0],(group_index[0]+median_val)):
        group['label_new'][i] = val - np.sum(l_b[(i-group_index[0]):median_val])
    for j in range((group_index[0]+median_val+1),(group_index[0]+g_len)):
        group['label_new'][j] = val + np.sum(l_b[(median_val+1):(j-group_index[0]+1)])
    # print(name)
    # print(group)
    group_append = group_append.append(group)
group_append.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,ad_id,ad_bid,YMD,label,label_bais,label_new
0,415,34,20190118,4.52,0.0034,4.341
1,415,347,20190118,4.52,0.0347,4.3444
2,415,387,20190118,4.52,0.0387,4.3791
3,415,457,20190118,4.52,0.0457,4.4178
4,415,565,20190118,4.52,0.0565,4.4635


In [43]:
group_append_info = group_append[['ad_id','ad_bid','label_new']]
group_append_info['ad_id_bid'] = group_append['ad_id'].map(str)+group_append['ad_bid'].map(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [46]:
testB_data_rule['ad_id_bid'] = testB_data_rule['ad_id'].map(str) + testB_data_rule['ad_bid'].map(str)
testB_data_rule_result = pd.merge(testB_data_rule,group_append_info,on='ad_id_bid')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [47]:
# testB_data_rule_fin变量是最终按照规则得到的label
testB_data_rule_fin = testB_data_rule_result.drop(['YMD','ad_id_bid','ad_id_y','ad_bid_y'], axis=1)
testB_data_rule_fin.columns = ['id','ad_id','ad_size','ad_trade_id','good_class','good_id','ad_acc_id','ad_bid','weekday','deliver_start','deliver_end','deliver_long','label']
testB_data_rule_fin.head()

Unnamed: 0,id,ad_id,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,ad_bid,weekday,deliver_start,deliver_end,deliver_long,label
0,1,152978,30,118,3,-1,23511,37,2,0.0,24.0,24.0,13.3196
1,2,507977,30,122,13,629,4211,54,2,7.0,23.0,16.0,28.2143
2,3,639943,34,84,13,2525,15582,161,4,7.0,24.0,17.0,3.416667
3,9,443852,30,122,13,4332,6678,66,3,7.0,24.0,17.0,14.293592
4,14,199900,40,122,13,4702,1387,46,6,9.0,22.0,13.0,5.131764


In [48]:
X_testB_pre = testB_data_predict.drop(['id','ad_id','YMD'], axis=1).values
y_testB_pre = gbm.predict(X_testB_pre, num_iteration=gbm.best_iteration)
# eval
print(y_testB_pre)
testB_data_predict['label'] = y_testB_pre
testB_data_predict.head()

[180.57011498 277.57470943  29.20387339 ... 632.63051852  67.05983676
  37.27547942]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,id,ad_id,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,ad_bid,YMD,weekday,deliver_start,deliver_end,deliver_long,label
3,4,605859,40,98,13,3808,10907,28,20190321,4,14.0,20.0,6.0,180.570115
4,5,742582,40,122,13,7017,15855,12,20190228,4,11.0,19.5,8.5,277.574709
5,6,742243,30,146,13,7904,19328,34,20180123,2,7.0,24.0,17.0,29.203873
6,7,306151,40,221,1,-1,6262,1,20190321,4,0.0,24.0,24.0,125.529275
7,8,739668,64,54,15,2396,2347,10,20190305,2,0.0,24.0,24.0,37.527144


In [49]:
testB_data_predict_groupby = testB_data_predict.groupby(['ad_id','ad_bid']).agg({'label':np.sort})
testB_data_predict_groupby.to_csv('testB_data_predict_groupby.csv')
testB_data_predict_groupby = pd.read_csv('testB_data_predict_groupby.csv')
testB_data_predict_groupby.head()

Unnamed: 0,ad_id,ad_bid,label
0,1030,21,34.23124
1,1030,96,28.460133
2,1030,123,28.460133
3,1030,130,28.460133
4,1030,200,28.460133


In [50]:
list_label = []
for name, group in testB_data_predict.groupby('ad_id'):
    list_label = list_label + sorted(group.label)
testB_data_predict_groupby['label_new'] = list_label
testB_data_predict_groupby['label_new_2'] = list(testB_data_predict_groupby.label_new + testB_data_predict_groupby.ad_bid/10000)
testB_data_predict_groupby_2 = testB_data_predict_groupby[['ad_id','ad_bid','label_new_2']]
testB_data_predict_groupby_2['ad_id_bid'] = testB_data_predict_groupby_2['ad_id'].map(str) + testB_data_predict_groupby_2['ad_bid'].map(str)
testB_data_predict_groupby_2.head()
testB_data_predict['ad_id_bid'] = testB_data_predict['ad_id'].map(str) + testB_data_predict['ad_bid'].map(str)
testB_data_predict_result = pd.merge(testB_data_predict,testB_data_predict_groupby_2,on='ad_id_bid')
testB_data_predict_info = testB_data_predict_result[[]]
# testB_data_predict_fin变量是最终按照模型预测得到的label
testB_data_predict_fin = testB_data_predict_result.drop(['YMD','label','ad_id_bid','ad_id_y','ad_bid_y'], axis=1)
testB_data_predict_fin.columns = ['id','ad_id','ad_size','ad_trade_id','good_class','good_id','ad_acc_id','ad_bid','weekday','deliver_start','deliver_end','deliver_long','label']
testB_data_predict_fin.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,id,ad_id,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,ad_bid,weekday,deliver_start,deliver_end,deliver_long,label
0,4,605859,40,98,13,3808,10907,28,4,14.0,20.0,6.0,221.98916
1,5,742582,40,122,13,7017,15855,12,4,11.0,19.5,8.5,31.882559
2,6,742243,30,146,13,7904,19328,34,2,7.0,24.0,17.0,29.207273
3,7,306151,40,221,1,-1,6262,1,4,0.0,24.0,24.0,40.998368
4,8,739668,64,54,15,2396,2347,10,2,0.0,24.0,24.0,37.401141


## 合并结果：testB_data_rule_fin和testB_data_predict_fin

In [51]:
testB_label = testB_data_rule_fin.append(testB_data_predict_fin)
testB_label = testB_label.sort_values(by='id')
testB_label.head(2)

Unnamed: 0,id,ad_id,ad_size,ad_trade_id,good_class,good_id,ad_acc_id,ad_bid,weekday,deliver_start,deliver_end,deliver_long,label
0,1,152978,30,118,3,-1,23511,37,2,0.0,24.0,24.0,13.3196
1,2,507977,30,122,13,629,4211,54,2,7.0,23.0,16.0,28.2143


In [64]:
submission = testB_label[['id','label']]
submission.to_csv('submission_week_deliver.csv', header=None, index=False)