In [1]:
import warnings
warnings.simplefilter('ignore')

import gc

import time
import datetime

import numpy as np
import pandas as pd
# Use the fully qualified option names: the abbreviated 'max_columns' /
# 'max_rows' patterns are ambiguous on newer pandas releases (they also match
# the styler.render.* options) and raise OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

import lightgbm as lgb

In [2]:
# Training log, ordered by DOTTING_TIME inside each queue.
train = (
    pd.read_csv('train.csv')
    .sort_values(by=['QUEUE_ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

# Evaluation set, ordered by DOTTING_TIME inside each sample ID.
test = (
    pd.read_csv('evaluation_public.csv')
    .sort_values(by=['ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

In [3]:
train.head()

Unnamed: 0,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,2,16,available,sql,x86_64,3,54,0,0,0,0,0,1590683100000,vm,20.0
1,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590683400000,vm,20.0
2,2,16,available,sql,x86_64,7,54,0,0,0,0,0,1590683700000,vm,20.0
3,2,16,available,sql,x86_64,4,54,0,0,0,0,0,1590684000000,vm,20.0
4,2,16,available,sql,x86_64,5,54,0,0,0,0,0,1590684120000,vm,20.0


In [4]:
test.head()

Unnamed: 0,ID,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,1,297,16,available,sql,x86_64,60,69,0,5,5,0,0,1662213420000,vm,9
1,1,297,16,available,sql,x86_64,58,69,0,9,4,0,0,1662213720000,vm,9
2,1,297,16,available,sql,x86_64,80,67,0,9,1,0,0,1662214020000,vm,9
3,1,297,16,available,sql,x86_64,100,65,0,7,2,0,1,1662214320000,vm,9
4,1,297,16,available,sql,x86_64,98,67,0,10,3,0,1,1662214620000,vm,9


In [5]:
train.shape, test.shape

((501730, 15), (14980, 16))

In [6]:
# These columns hold a single value in the test set, so drop them outright
# from both frames.
for frame in (train, test):
    for constant_col in ('STATUS', 'PLATFORM', 'RESOURCE_TYPE'):
        del frame[constant_col]

In [7]:
# Label-encode QUEUE_TYPE: learn the classes on train only, then apply the
# same mapping to both frames.
le = LabelEncoder()
le.fit(train['QUEUE_TYPE'].astype(str))

for frame in (train, test):
    frame['QUEUE_TYPE'] = le.transform(frame['QUEUE_TYPE'].astype(str))

In [8]:
train.isnull().sum()

QUEUE_ID                  0
CU                        0
QUEUE_TYPE                0
CPU_USAGE                 0
MEM_USAGE                 0
LAUNCHING_JOB_NUMS        0
RUNNING_JOB_NUMS          0
SUCCEED_JOB_NUMS          0
CANCELLED_JOB_NUMS        0
FAILED_JOB_NUMS           0
DOTTING_TIME              0
DISK_USAGE            33095
dtype: int64

In [9]:
def local_time(timestamp, tz=None):
    """Convert a millisecond epoch timestamp to a ``datetime``.

    Parameters
    ----------
    timestamp : int or float
        Epoch time in milliseconds (it is divided by 1000 before conversion).
    tz : datetime.tzinfo, optional
        Target time zone. With the default ``None`` the result is a naive
        datetime in the machine's local time zone, so the derived ``day`` /
        ``hour`` values depend on where the notebook runs. Pass an explicit
        ``tzinfo`` to make the conversion reproducible.

    Returns
    -------
    datetime.datetime
        Naive when ``tz`` is ``None``, otherwise aware in ``tz``.
    """
    # fromtimestamp() expects seconds, hence the division by 1000.
    return datetime.datetime.fromtimestamp(timestamp / 1000, tz=tz)

# Attach a datetime column derived from the millisecond DOTTING_TIME stamps.
for frame in (train, test):
    frame['date_time'] = frame['DOTTING_TIME'].map(local_time)

In [10]:
# Calendar attributes pulled out of date_time as separate integer columns.
time_features = ['day', 'hour']

for frame in (train, test):
    for feat in time_features:
        frame[feat] = frame['date_time'].map(lambda stamp, attr=feat: getattr(stamp, attr))

In [11]:
train.head()

Unnamed: 0,QUEUE_ID,CU,QUEUE_TYPE,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,DISK_USAGE,date_time,day,hour
0,2,16,2,3,54,0,0,0,0,0,1590683100000,20.0,2020-05-29 00:25:00,29,0
1,2,16,2,2,54,0,0,0,0,0,1590683400000,20.0,2020-05-29 00:30:00,29,0
2,2,16,2,7,54,0,0,0,0,0,1590683700000,20.0,2020-05-29 00:35:00,29,0
3,2,16,2,4,54,0,0,0,0,0,1590684000000,20.0,2020-05-29 00:40:00,29,0
4,2,16,2,5,54,0,0,0,0,0,1590684120000,20.0,2020-05-29 00:42:00,29,0


In [12]:
# Keep the queue descriptors (QUEUE_ID, CU, QUEUE_TYPE), the CPU / memory
# usage readings, the day / hour features and the five job counters.
# Everything else (DOTTING_TIME, date_time, DISK_USAGE) is discarded here.

keep_cols_train = ['QUEUE_ID', 'CU', 'QUEUE_TYPE', 'MEM_USAGE', 'CPU_USAGE', 'day', 'hour', 'LAUNCHING_JOB_NUMS',
                   'RUNNING_JOB_NUMS', 'SUCCEED_JOB_NUMS', 'CANCELLED_JOB_NUMS', 'FAILED_JOB_NUMS']
# The test frame additionally carries the sample ID.
keep_cols_test = ['ID', 'QUEUE_ID', 'CU', 'QUEUE_TYPE', 'MEM_USAGE', 'CPU_USAGE', 'day', 'hour', 'LAUNCHING_JOB_NUMS',
                  'RUNNING_JOB_NUMS', 'SUCCEED_JOB_NUMS', 'CANCELLED_JOB_NUMS', 'FAILED_JOB_NUMS']

# Take explicit copies: later cells add columns to these frames, and writing
# into a bare column subset is a chained assignment (SettingWithCopy) that is
# only silent here because warnings are globally ignored.
train = train[keep_cols_train].copy()
test = test[keep_cols_test].copy()

In [13]:
train.head()

Unnamed: 0,QUEUE_ID,CU,QUEUE_TYPE,MEM_USAGE,CPU_USAGE,day,hour,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS
0,2,16,2,54,3,29,0,0,0,0,0,0
1,2,16,2,54,2,29,0,0,0,0,0,0
2,2,16,2,54,7,29,0,0,0,0,0,0
3,2,16,2,54,4,29,0,0,0,0,0,0
4,2,16,2,54,5,29,0,0,0,0,0,0


In [14]:
test.head()

Unnamed: 0,ID,QUEUE_ID,CU,QUEUE_TYPE,MEM_USAGE,CPU_USAGE,day,hour,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS
0,1,297,16,2,69,60,3,21,0,5,5,0,0
1,1,297,16,2,69,58,3,22,0,9,4,0,0
2,1,297,16,2,67,80,3,22,0,9,1,0,0
3,1,297,16,2,65,100,3,22,0,7,2,0,1
4,1,297,16,2,67,98,3,22,0,10,3,0,1


In [15]:
# Per-row columns that the following cells unroll into suffixed copies
# (<name>_1 .. <name>_5) via grouped shifts, after which the unsuffixed
# originals are dropped.
merge_cols = ['MEM_USAGE', 'CPU_USAGE', 'day', 'hour', 'LAUNCHING_JOB_NUMS', 
              'RUNNING_JOB_NUMS', 'SUCCEED_JOB_NUMS', 'CANCELLED_JOB_NUMS', 'FAILED_JOB_NUMS']

In [16]:
# Build the train windows.

# Inputs: suffix k (1..5) holds the reading found k - 1 rows further down
# within the same QUEUE_ID, so suffix 1 is the row's own reading.
for i in range(1, 6):
    train[[f"{col}_{i}" for col in merge_cols]] = (
        train.groupby('QUEUE_ID')[merge_cols].shift(1 - i)
    )

# Targets: cpu_k is the CPU_USAGE found k + 4 rows further down, i.e. the
# five readings that directly follow the input window.
for i in range(1, 6):
    train[f'cpu_{i}'] = train.groupby('QUEUE_ID')['CPU_USAGE'].shift(-(i + 4))

# The unsuffixed originals are now redundant with the *_1 copies.
train = train.drop(columns=merge_cols)

In [17]:
train.head()

Unnamed: 0,QUEUE_ID,CU,QUEUE_TYPE,MEM_USAGE_1,CPU_USAGE_1,day_1,hour_1,LAUNCHING_JOB_NUMS_1,RUNNING_JOB_NUMS_1,SUCCEED_JOB_NUMS_1,CANCELLED_JOB_NUMS_1,FAILED_JOB_NUMS_1,MEM_USAGE_2,CPU_USAGE_2,day_2,hour_2,LAUNCHING_JOB_NUMS_2,RUNNING_JOB_NUMS_2,SUCCEED_JOB_NUMS_2,CANCELLED_JOB_NUMS_2,FAILED_JOB_NUMS_2,MEM_USAGE_3,CPU_USAGE_3,day_3,hour_3,LAUNCHING_JOB_NUMS_3,RUNNING_JOB_NUMS_3,SUCCEED_JOB_NUMS_3,CANCELLED_JOB_NUMS_3,FAILED_JOB_NUMS_3,MEM_USAGE_4,CPU_USAGE_4,day_4,hour_4,LAUNCHING_JOB_NUMS_4,RUNNING_JOB_NUMS_4,SUCCEED_JOB_NUMS_4,CANCELLED_JOB_NUMS_4,FAILED_JOB_NUMS_4,MEM_USAGE_5,CPU_USAGE_5,day_5,hour_5,LAUNCHING_JOB_NUMS_5,RUNNING_JOB_NUMS_5,SUCCEED_JOB_NUMS_5,CANCELLED_JOB_NUMS_5,FAILED_JOB_NUMS_5,cpu_1,cpu_2,cpu_3,cpu_4,cpu_5
0,2,16,2,54,3,29,0,0,0,0,0,0,54.0,2.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,7.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,4.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,5.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,2.0,5.0,6.0
1,2,16,2,54,2,29,0,0,0,0,0,0,54.0,7.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,4.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,5.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,3.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,5.0,6.0,2.0
2,2,16,2,54,7,29,0,0,0,0,0,0,54.0,4.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,5.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,3.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,2.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,6.0,2.0,3.0
3,2,16,2,54,4,29,0,0,0,0,0,0,54.0,5.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,3.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,2.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,2.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,6.0,2.0,3.0,10.0
4,2,16,2,54,5,29,0,0,0,0,0,0,55.0,3.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,2.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,2.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,5.0,29.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,2.0,3.0,10.0,6.0


In [18]:
# Build the test windows: suffix k (1..5) holds the reading found k - 1 rows
# further down within the same (ID, QUEUE_ID) group.
for i in range(1, 6):
    test[[f"{col}_{i}" for col in merge_cols]] = (
        test.groupby(['ID', 'QUEUE_ID'])[merge_cols].shift(1 - i)
    )

# The unsuffixed originals are now redundant with the *_1 copies.
test = test.drop(columns=merge_cols)

In [19]:
test.head()

Unnamed: 0,ID,QUEUE_ID,CU,QUEUE_TYPE,MEM_USAGE_1,CPU_USAGE_1,day_1,hour_1,LAUNCHING_JOB_NUMS_1,RUNNING_JOB_NUMS_1,SUCCEED_JOB_NUMS_1,CANCELLED_JOB_NUMS_1,FAILED_JOB_NUMS_1,MEM_USAGE_2,CPU_USAGE_2,day_2,hour_2,LAUNCHING_JOB_NUMS_2,RUNNING_JOB_NUMS_2,SUCCEED_JOB_NUMS_2,CANCELLED_JOB_NUMS_2,FAILED_JOB_NUMS_2,MEM_USAGE_3,CPU_USAGE_3,day_3,hour_3,LAUNCHING_JOB_NUMS_3,RUNNING_JOB_NUMS_3,SUCCEED_JOB_NUMS_3,CANCELLED_JOB_NUMS_3,FAILED_JOB_NUMS_3,MEM_USAGE_4,CPU_USAGE_4,day_4,hour_4,LAUNCHING_JOB_NUMS_4,RUNNING_JOB_NUMS_4,SUCCEED_JOB_NUMS_4,CANCELLED_JOB_NUMS_4,FAILED_JOB_NUMS_4,MEM_USAGE_5,CPU_USAGE_5,day_5,hour_5,LAUNCHING_JOB_NUMS_5,RUNNING_JOB_NUMS_5,SUCCEED_JOB_NUMS_5,CANCELLED_JOB_NUMS_5,FAILED_JOB_NUMS_5
0,1,297,16,2,69,60,3,21,0,5,5,0,0,69.0,58.0,3.0,22.0,0.0,9.0,4.0,0.0,0.0,67.0,80.0,3.0,22.0,0.0,9.0,1.0,0.0,0.0,65.0,100.0,3.0,22.0,0.0,7.0,2.0,0.0,1.0,67.0,98.0,3.0,22.0,0.0,10.0,3.0,0.0,1.0
1,1,297,16,2,69,58,3,22,0,9,4,0,0,67.0,80.0,3.0,22.0,0.0,9.0,1.0,0.0,0.0,65.0,100.0,3.0,22.0,0.0,7.0,2.0,0.0,1.0,67.0,98.0,3.0,22.0,0.0,10.0,3.0,0.0,1.0,,,,,,,,,
2,1,297,16,2,67,80,3,22,0,9,1,0,0,65.0,100.0,3.0,22.0,0.0,7.0,2.0,0.0,1.0,67.0,98.0,3.0,22.0,0.0,10.0,3.0,0.0,1.0,,,,,,,,,,,,,,,,,,
3,1,297,16,2,65,100,3,22,0,7,2,0,1,67.0,98.0,3.0,22.0,0.0,10.0,3.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1,297,16,2,67,98,3,22,0,10,3,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [20]:
# The grouped shifts leave NaN wherever a group has no later row to pull
# from, so dropna() keeps only rows whose window is complete. Judging by the
# test preview above, that is the first row of each test ID.
train = train.dropna()
test = test.dropna()

In [21]:
# Remove the day/hour copies for steps 2-4. range(2, 5) stops before 5, so
# day_1 / hour_1 and day_5 / hour_5 remain in both frames.
middle_time_cols = [f'{unit}_{i}' for unit in ('day', 'hour') for i in range(2, 5)]

train = train.drop(columns=middle_time_cols)
test = test.drop(columns=middle_time_cols)

In [22]:
train.shape, test.shape

((501343, 47), (2996, 43))

In [23]:
# Lower-case prefixes of the *_JOB_NUMS counters; the aggregation cell
# upper-cases them to rebuild the column names and reuses them as the prefix
# of the derived <prefix>_mean / _std / _max / _diff features.
job_num_cols = ['launching', 'running', 'succeed', 'cancelled', 'failed']

In [24]:
'launching'.upper()

'LAUNCHING'

In [25]:
# Summary statistics over each five-step window, added to both frames.
# Each entry is (feature prefix, column-name template, statistics in the
# order their columns are created). The creation order is kept exactly as
# before because the model's feature list follows train.columns.
window_steps = range(1, 6)
stat_layout = [
    ('cpu', 'CPU_USAGE_{}', ('mean', 'std', 'diff', 'max')),
    ('mem', 'MEM_USAGE_{}', ('mean', 'std', 'max', 'diff')),
]
stat_layout += [
    (job_num_col, job_num_col.upper() + '_JOB_NUMS_{}', ('mean', 'std', 'max', 'diff'))
    for job_num_col in job_num_cols
]

for frame in (train, test):
    for prefix, template, stat_order in stat_layout:
        window = frame[[template.format(i) for i in window_steps]]
        for stat in stat_order:
            if stat == 'diff':
                # Change across the window: last step minus first step.
                values = frame[template.format(5)] - frame[template.format(1)]
            else:
                # Row-wise mean / std / max across the five steps.
                values = getattr(window, stat)(axis=1)
            frame[prefix + '_' + stat] = values

In [26]:
['LAUNCHING_JOB_NUMS_{}'.format(i) for i in range(1,6)]

['LAUNCHING_JOB_NUMS_1',
 'LAUNCHING_JOB_NUMS_2',
 'LAUNCHING_JOB_NUMS_3',
 'LAUNCHING_JOB_NUMS_4',
 'LAUNCHING_JOB_NUMS_5']

In [27]:
def run_lgb_qid(train, test, target, qid):
    """Fit a 5-fold LightGBM regressor for one queue and one target column.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing ``QUEUE_ID``, ``CU``, ``QUEUE_TYPE``, the
        target columns ``cpu_1`` .. ``cpu_5`` and the feature columns.
    test : pandas.DataFrame
        Frame to predict on; must contain ``ID``, ``QUEUE_ID`` and every
        feature column present in ``train``.
    target : str
        Name of the column in ``train`` to regress on.
    qid
        Queue identifier; only rows whose ``QUEUE_ID`` equals it are used.

    Returns
    -------
    prediction : pandas.DataFrame
        Columns ``ID``, ``QUEUE_ID`` and ``target``; the target column is the
        mean of the five fold models' predictions on the queue's test rows.
    mse_score : float
        Mean squared error of the out-of-fold predictions on the queue's
        training rows.
    """
    # Features are every train column except the queue descriptors and the
    # five targets. Order follows train.columns.
    excluded = {'QUEUE_ID', 'CU', 'QUEUE_TYPE'} | {f'cpu_{i}' for i in range(1, 6)}
    feature_names = [col for col in train.columns if col not in excluded]

    # Restrict both frames to the requested queue.
    train = train[train.QUEUE_ID == qid]
    test = test[test.QUEUE_ID == qid]

    print(f"QUEUE_ID:{qid}, target:{target}, train:{len(train)}, test:{len(test)}")

    # Model hyper-parameters (tunable).
    model = lgb.LGBMRegressor(num_leaves=20,
                              max_depth=4,
                              learning_rate=0.08,
                              n_estimators=10000,
                              subsample=0.9,
                              feature_fraction=0.8,
                              reg_alpha=0.6,
                              reg_lambda=1.2,
                              random_state=42)
    oof = []
    # Copy so the accumulator column is written to an independent frame
    # rather than to a slice of `test` (chained assignment).
    prediction = test[['ID', 'QUEUE_ID']].copy()
    prediction[target] = 0.0

    # Shuffling is off, so the folds are contiguous and deterministic and
    # random_state has no effect on them. Recent scikit-learn releases raise
    # ValueError when random_state is passed without shuffle=True, so it is
    # left out; the resulting splits are unchanged.
    kfold = KFold(n_splits=5)
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train, train[target])):

        X_train = train.iloc[trn_idx][feature_names]
        Y_train = train.iloc[trn_idx][target]
        X_val = train.iloc[val_idx][feature_names]
        Y_val = train.iloc[val_idx][target]

        # NOTE(review): the `verbose` and `early_stopping_rounds` fit kwargs
        # were removed in LightGBM 4.0 (replaced by callbacks) - confirm the
        # pinned LightGBM version before upgrading.
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=0,
                              eval_metric='mse',
                              early_stopping_rounds=20)

        # Out-of-fold predictions, kept alongside the true target for scoring.
        pred_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)
        df_oof = train.iloc[val_idx][[target, 'QUEUE_ID']].copy()
        df_oof['pred'] = pred_val
        oof.append(df_oof)

        # Each fold contributes 1 / n_splits of the final test prediction.
        pred_test = lgb_model.predict(test[feature_names], num_iteration=lgb_model.best_iteration_)
        prediction[target] += pred_test / kfold.n_splits

        del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
        gc.collect()

    df_oof = pd.concat(oof)
    mse_score = mean_squared_error(df_oof[target], df_oof['pred'])
    print('MSE:', mse_score)

    return prediction, mse_score

In [28]:
# One model per (queue, target) pair. The five target predictions of a queue
# are merged side by side on (ID, QUEUE_ID) before being collected.
predictions = []
mse_scores = []
target_names = [f'cpu_{i}' for i in range(1, 6)]

for qid in tqdm(test.QUEUE_ID.unique()):
    df = None
    for t in target_names:
        prediction, mse_score = run_lgb_qid(train, test, target=t, qid=qid)
        if df is None:
            # First target seeds the per-queue frame.
            df = prediction.copy()
        else:
            df = df.merge(prediction, on=['ID', 'QUEUE_ID'], how='left')
        mse_scores.append(mse_score)

    predictions.append(df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=23.0), HTML(value='')))

QUEUE_ID:297, target:cpu_1, train:21175, test:1142
MSE: 111.69076044694305
QUEUE_ID:297, target:cpu_2, train:21175, test:1142
MSE: 159.96984125170025
QUEUE_ID:297, target:cpu_3, train:21175, test:1142
MSE: 168.8512736418839
QUEUE_ID:297, target:cpu_4, train:21175, test:1142
MSE: 166.80462760402904
QUEUE_ID:297, target:cpu_5, train:21175, test:1142
MSE: 141.36899601638567
QUEUE_ID:85153, target:cpu_1, train:14344, test:390
MSE: 160.0328663690351
QUEUE_ID:85153, target:cpu_2, train:14344, test:390
MSE: 191.42966977056395
QUEUE_ID:85153, target:cpu_3, train:14344, test:390
MSE: 196.88079143677598
QUEUE_ID:85153, target:cpu_4, train:14344, test:390
MSE: 200.6743315710563
QUEUE_ID:85153, target:cpu_5, train:14344, test:390
MSE: 202.66305340741712
QUEUE_ID:291, target:cpu_1, train:8875, test:57
MSE: 62.7869037186376
QUEUE_ID:291, target:cpu_2, train:8875, test:57
MSE: 127.7366517984226
QUEUE_ID:291, target:cpu_3, train:8875, test:57
MSE: 182.0291625006363
QUEUE_ID:291, target:cpu_4, train:88

MSE: 7.997464183031941
QUEUE_ID:298, target:cpu_4, train:20372, test:2
MSE: 8.1652925468169
QUEUE_ID:298, target:cpu_5, train:20372, test:2
MSE: 8.192430925315778



In [29]:
print('mean MSE score: ', np.mean(mse_scores))

mean MSE score:  49.660801578160324


In [30]:
# Stack the per-queue predictions, restore ID order and drop the queue key.
sub = (
    pd.concat(predictions)
    .sort_values(by='ID')
    .reset_index(drop=True)
    .drop(columns=['QUEUE_ID'])
)
# Remaining columns are ID followed by cpu_1 .. cpu_5; rename by position.
sub.columns = ['ID'] + [f'CPU_USAGE_{i}' for i in range(1, 6)]

# Filling every LAUNCHING_JOB_NUMS target with 0 scored better than the
# trained predictions did.
for col in [f'LAUNCHING_JOB_NUMS_{i}' for i in range(1, 6)]:
    sub[col] = 0

# Interleave the columns step by step: CPU_USAGE_k, LAUNCHING_JOB_NUMS_k.
submission_order = ['ID']
for i in range(1, 6):
    submission_order += [f'CPU_USAGE_{i}', f'LAUNCHING_JOB_NUMS_{i}']

# Copy so the next cell's column assignments write to an independent frame
# rather than to a column subset (chained assignment).
sub = sub[submission_order].copy()

print(sub.shape)
sub.head()

(2996, 11)


Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,86.877882,0,83.155746,0,90.305449,0,88.914952,0,91.15839,0
1,2,35.6659,0,26.68277,0,33.307783,0,41.008974,0,24.891951,0
2,3,50.224369,0,61.664568,0,25.867571,0,13.456061,0,5.609942,0
3,4,28.728412,0,18.279899,0,7.363384,0,5.538564,0,6.175134,0
4,5,2.419956,0,14.575281,0,17.88465,0,17.101124,0,12.846067,0


In [31]:
# Note: the submission requires predictions to be non-negative integers, and
# ID must be an integer as well.

sub['ID'] = sub['ID'].astype(int)

value_cols = [name for name in sub.columns if name != 'ID']
for col in value_cols:
    # Round down, clamp negatives to zero, then cast to int.
    sub[col] = np.floor(sub[col]).clip(lower=0).astype(int)

sub.head(10)

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,86,0,83,0,90,0,88,0,91,0
1,2,35,0,26,0,33,0,41,0,24,0
2,3,50,0,61,0,25,0,13,0,5,0
3,4,28,0,18,0,7,0,5,0,6,0
4,5,2,0,14,0,17,0,17,0,12,0
5,6,8,0,17,0,16,0,14,0,13,0
6,7,12,0,8,0,6,0,7,0,7,0
7,8,0,0,0,0,0,0,0,0,0,0
8,9,3,0,3,0,3,0,3,0,3,0
9,10,15,0,7,0,8,0,7,0,7,0


In [32]:
sub.to_csv('submission_1114_2.csv', index=False)