In [1]:
import warnings
warnings.simplefilter('ignore')

import gc

import numpy as np
import pandas as pd
# Show up to 100 columns / rows when a DataFrame is displayed.
# The fully qualified option names are used because the short forms
# ('max_columns', 'max_rows') match more than one option in recent pandas
# releases and then raise OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

In [2]:
# Load the raw files. Training rows are ordered by queue and then by
# DOTTING_TIME; test rows are ordered by sample ID and then by DOTTING_TIME.
# The index is rebuilt so that it follows the sorted order.
train = (
    pd.read_csv('raw_data/train.csv')
    .sort_values(by=['QUEUE_ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

test = (
    pd.read_csv('raw_data/evaluation_public.csv')
    .sort_values(by=['ID', 'DOTTING_TIME'])
    .reset_index(drop=True)
)

sub_sample = pd.read_csv('raw_data/submit_example.csv')

In [3]:
train.head(10)

Unnamed: 0,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,2,16,available,sql,x86_64,3,54,0,0,0,0,0,1590683100000,vm,20.0
1,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590683400000,vm,20.0
2,2,16,available,sql,x86_64,7,54,0,0,0,0,0,1590683700000,vm,20.0
3,2,16,available,sql,x86_64,4,54,0,0,0,0,0,1590684000000,vm,20.0
4,2,16,available,sql,x86_64,5,54,0,0,0,0,0,1590684120000,vm,20.0
5,2,16,available,sql,x86_64,3,55,0,0,0,0,0,1590684420000,vm,20.0
6,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590684720000,vm,20.0
7,2,16,available,sql,x86_64,2,54,0,0,0,0,0,1590685020000,vm,20.0
8,2,16,available,sql,x86_64,5,54,0,0,0,0,0,1590685320000,vm,20.0
9,2,16,available,sql,x86_64,6,54,0,0,0,0,0,1590685620000,vm,20.0


In [4]:
test.head(10)

Unnamed: 0,ID,QUEUE_ID,CU,STATUS,QUEUE_TYPE,PLATFORM,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DOTTING_TIME,RESOURCE_TYPE,DISK_USAGE
0,1,297,16,available,sql,x86_64,60,69,0,5,5,0,0,1662213420000,vm,9
1,1,297,16,available,sql,x86_64,58,69,0,9,4,0,0,1662213720000,vm,9
2,1,297,16,available,sql,x86_64,80,67,0,9,1,0,0,1662214020000,vm,9
3,1,297,16,available,sql,x86_64,100,65,0,7,2,0,1,1662214320000,vm,9
4,1,297,16,available,sql,x86_64,98,67,0,10,3,0,1,1662214620000,vm,9
5,2,85153,64,available,general,x86_64,56,91,0,0,0,0,0,1613655960000,vm,20
6,2,85153,64,available,general,x86_64,48,78,0,1,1,0,0,1613656260000,vm,20
7,2,85153,64,available,general,x86_64,23,35,0,0,0,0,0,1613656560000,vm,20
8,2,85153,64,available,general,x86_64,68,61,0,0,0,0,0,1613656860000,vm,20
9,2,85153,64,available,general,x86_64,38,74,0,0,0,0,0,1613657160000,vm,20


In [5]:
sub_sample.head()

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0


In [6]:
# These columns hold only a single value in the test set, so they are
# removed from both frames.
for constant_col in ('STATUS', 'PLATFORM', 'RESOURCE_TYPE'):
    del train[constant_col]
    del test[constant_col]

In [7]:
# The timestamp was only needed for sorting; it is of no further use.
for frame in (train, test):
    del frame['DOTTING_TIME']

In [8]:
# Label Encoding
# The encoder is fitted on the union of the train and test labels so that a
# QUEUE_TYPE value present only in the test set does not make transform()
# raise ValueError. LabelEncoder orders its classes by sorted label value, so
# when the test set contains no unseen labels the resulting codes are the
# same as when fitting on the training labels alone.

le = LabelEncoder()
le.fit(pd.concat([train['QUEUE_TYPE'], test['QUEUE_TYPE']]).astype(str))
train['QUEUE_TYPE'] = le.transform(train['QUEUE_TYPE'].astype(str))
test['QUEUE_TYPE'] = le.transform(test['QUEUE_TYPE'].astype(str))

In [9]:
# Store the current index as an explicit id column to simplify later steps.
for frame in (train, test):
    frame['myid'] = frame.index

In [10]:
# Build the target columns: for every row, the CPU_USAGE and
# LAUNCHING_JOB_NUMS values found 5 rows later within the same QUEUE_ID.
# The last 5 rows of each queue have no such value and receive NaN.
# NOTE(review): the "next25mins" naming presumes consecutive rows are
# 5 minutes apart — confirm against DOTTING_TIME before relying on it.
#
# A grouped shift replaces the per-queue loop that grew df_train with
# DataFrame.append (removed in pandas 2.0, and quadratic when used in a loop)
# and that assigned into a filtered slice of `train`.

df_train = train.copy()
by_queue = train.groupby('QUEUE_ID', sort=False)
df_train['CPU_USAGE_next25mins'] = by_queue['CPU_USAGE'].shift(-5)
df_train['LAUNCHING_JOB_NUMS_next25mins'] = by_queue['LAUNCHING_JOB_NUMS'].shift(-5)

HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [11]:
# Keep only rows that have a CPU target (rows for which no value 5 steps
# ahead exists were left as NaN by the shift).
df_train = df_train.dropna(subset=['CPU_USAGE_next25mins'])
# Disabled experiment: rescale the CPU target.
# df_train['CPU_USAGE_next25mins'] /= 100

print(df_train.shape)
df_train.head()

(501515, 14)


Unnamed: 0,QUEUE_ID,CU,QUEUE_TYPE,CPU_USAGE,MEM_USAGE,LAUNCHING_JOB_NUMS,RUNNING_JOB_NUMS,SUCCEED_JOB_NUMS,CANCELLED_JOB_NUMS,FAILED_JOB_NUMS,DISK_USAGE,myid,CPU_USAGE_next25mins,LAUNCHING_JOB_NUMS_next25mins
0,2,16,2,3,54,0,0,0,0,0,20.0,0,3.0,0.0
1,2,16,2,2,54,0,0,0,0,0,20.0,1,2.0,0.0
2,2,16,2,7,54,0,0,0,0,0,20.0,2,2.0,0.0
3,2,16,2,4,54,0,0,0,0,0,20.0,3,5.0,0.0
4,2,16,2,5,54,0,0,0,0,0,20.0,4,6.0,0.0


In [41]:
def run_lgb(df_train, df_test, target):
    """Train LightGBM regressors with 5-fold GroupKFold and predict the test set.

    Folds are grouped by ``QUEUE_ID``, so rows of one queue never end up in
    both the training and the validation part of the same fold.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame holding the feature columns, ``QUEUE_ID``, ``myid``
        and the ``target`` column.
    df_test : pandas.DataFrame
        Test frame holding the same feature columns plus ``ID``,
        ``QUEUE_ID`` and ``myid``.
    target : str
        Name of the column of ``df_train`` to regress on.

    Returns
    -------
    oof : list of pandas.DataFrame
        One frame per fold with the true target, ``myid``, ``QUEUE_ID`` and
        the out-of-fold prediction in column ``pred``.
    prediction : pandas.DataFrame
        ``ID``, ``QUEUE_ID`` and ``myid`` of the test rows plus, in column
        ``target``, the mean of the per-fold test predictions.
    df_importance_list : list of pandas.DataFrame
        Per-fold feature importances (columns ``column`` and ``importance``).
    """
    # Everything except the two targets and the identifier columns is a feature.
    excluded = {'CPU_USAGE_next25mins', 'LAUNCHING_JOB_NUMS_next25mins', 'QUEUE_ID', 'myid'}
    feature_names = [col for col in df_train.columns if col not in excluded]

    # NOTE(review): per the LightGBM parameter docs, `subsample`
    # (bagging_fraction) only takes effect when `subsample_freq` > 0, which is
    # not set here — confirm whether row bagging was intended. Left unchanged
    # so that results stay comparable.
    model = lgb.LGBMRegressor(num_leaves=32,
                              max_depth=6,
                              learning_rate=0.08,
                              n_estimators=10000,
                              subsample=0.8,
                              feature_fraction=0.8,
                              reg_alpha=0.5,
                              reg_lambda=0.8,
                              random_state=2020)

    # The `early_stopping_rounds` / `verbose` arguments of fit() were removed in
    # LightGBM 4.0; callbacks work on old and new releases alike.
    # log_evaluation() was called print_evaluation() in older releases.
    make_log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    oof = []
    # Explicit copy: the accumulator column must not be written into a slice of df_test.
    prediction = df_test[['ID', 'QUEUE_ID', 'myid']].copy()
    prediction[target] = 0.0
    df_importance_list = []

    kfold = GroupKFold(n_splits=5)
    splits = kfold.split(df_train, df_train[target], df_train['QUEUE_ID'])
    for fold_id, (trn_idx, val_idx) in enumerate(splits):

        X_train = df_train.iloc[trn_idx][feature_names]
        Y_train = df_train.iloc[trn_idx][target]
        X_val = df_train.iloc[val_idx][feature_names]
        Y_val = df_train.iloc[val_idx][target]

        print('\nFold_{} Training ================================\n'.format(fold_id+1))
        # Fresh callbacks per fold: stop after 20 rounds without improvement
        # and log the metrics every 10 rounds.
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              eval_metric='mse',
                              callbacks=[lgb.early_stopping(20),
                                         make_log_callback(10)])

        # Out-of-fold predictions at the best iteration found by early stopping.
        pred_val = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)
        df_oof = df_train.iloc[val_idx][[target, 'myid', 'QUEUE_ID']].copy()
        df_oof['pred'] = pred_val
        oof.append(df_oof)

        # Each fold contributes 1/n_splits to the averaged test prediction.
        pred_test = lgb_model.predict(df_test[feature_names], num_iteration=lgb_model.best_iteration_)
        prediction[target] += pred_test / kfold.n_splits

        df_importance = pd.DataFrame({
            'column': feature_names,
            'importance': lgb_model.feature_importances_,
        })
        df_importance_list.append(df_importance)

        # Release the per-fold objects before the next fold starts.
        del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
        gc.collect()

    return oof, prediction, df_importance_list

In [42]:
# Train on the CPU-usage target; keep the per-fold OOF frames, the averaged
# test prediction and the per-fold feature importances.
oof1, prediction1, df_importance_list1 = run_lgb(df_train, test, target='CPU_USAGE_next25mins')



Training until validation scores don't improve for 20 rounds
[10]	train's l2: 70.6213	valid's l2: 43.2251
[20]	train's l2: 51.011	valid's l2: 43.5256
[30]	train's l2: 45.6821	valid's l2: 42.938
Early stopping, best iteration is:
[12]	train's l2: 64.1332	valid's l2: 42.3578


Training until validation scores don't improve for 20 rounds
[10]	train's l2: 73.7756	valid's l2: 34.0144
[20]	train's l2: 53.6048	valid's l2: 37.1839
Early stopping, best iteration is:
[5]	train's l2: 103.612	valid's l2: 32.0698


Training until validation scores don't improve for 20 rounds
[10]	train's l2: 44.8343	valid's l2: 263.153
[20]	train's l2: 35.0573	valid's l2: 246.156
[30]	train's l2: 32.2759	valid's l2: 256.397
[40]	train's l2: 30.8942	valid's l2: 259.654
Early stopping, best iteration is:
[24]	train's l2: 33.6089	valid's l2: 245.896


Training until validation scores don't improve for 20 rounds
[10]	train's l2: 69.0314	valid's l2: 52.5163
[20]	train's l2: 49.9745	valid's l2: 48.0681
[30]	train's l2:

In [43]:
# Average each feature's importance over the folds, highest first.
df_importance1 = (
    pd.concat(df_importance_list1)
    .groupby('column')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance1

Unnamed: 0,column,importance
0,MEM_USAGE,156.0
1,CPU_USAGE,150.6
2,DISK_USAGE,125.6
3,CU,104.8
4,RUNNING_JOB_NUMS,76.8
5,SUCCEED_JOB_NUMS,73.0
6,QUEUE_TYPE,34.6
7,LAUNCHING_JOB_NUMS,6.0
8,FAILED_JOB_NUMS,3.8
9,CANCELLED_JOB_NUMS,0.4


In [44]:
# Out-of-fold MSE for the CPU-usage target, computed over all folds together.
df_oof1 = pd.concat(oof1)
score = mean_squared_error(y_true=df_oof1['CPU_USAGE_next25mins'],
                           y_pred=df_oof1['pred'])
print('MSE:', score)

MSE: 94.94211232399769


In [49]:
prediction1.CPU_USAGE_next25mins.describe()

count    14980.000000
mean        10.603659
std          7.586795
min          1.965355
25%          4.789789
50%          9.142856
75%         12.838920
max         56.264859
Name: CPU_USAGE_next25mins, dtype: float64

In [45]:
# Train on the launching-job-count target; keep the per-fold OOF frames, the
# averaged test prediction and the per-fold feature importances.
oof2, prediction2, df_importance_list2 = run_lgb(df_train, test, target='LAUNCHING_JOB_NUMS_next25mins')



Training until validation scores don't improve for 20 rounds
[10]	train's l2: 2.1918	valid's l2: 11.0655
[20]	train's l2: 1.99441	valid's l2: 10.6562
[30]	train's l2: 1.9289	valid's l2: 10.3935
[40]	train's l2: 1.89266	valid's l2: 10.2175
[50]	train's l2: 1.87113	valid's l2: 10.1394
[60]	train's l2: 1.85669	valid's l2: 10.0907
[70]	train's l2: 1.84423	valid's l2: 10.0616
[80]	train's l2: 1.83686	valid's l2: 10.0659
[90]	train's l2: 1.82871	valid's l2: 10.0306
[100]	train's l2: 1.82232	valid's l2: 10.0473
[110]	train's l2: 1.81907	valid's l2: 10.0447
Early stopping, best iteration is:
[93]	train's l2: 1.82633	valid's l2: 10.0222


Training until validation scores don't improve for 20 rounds
[10]	train's l2: 3.94307	valid's l2: 2.24021
[20]	train's l2: 3.56212	valid's l2: 2.15194
[30]	train's l2: 3.41018	valid's l2: 2.13478
[40]	train's l2: 3.33111	valid's l2: 2.14066
[50]	train's l2: 3.29182	valid's l2: 2.13955
Early stopping, best iteration is:
[35]	train's l2: 3.36385	valid's l2: 2.

In [46]:
# Average each feature's importance over the folds, highest first.
df_importance2 = (
    pd.concat(df_importance_list2)
    .groupby('column')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance2

Unnamed: 0,column,importance
0,LAUNCHING_JOB_NUMS,263.2
1,MEM_USAGE,187.8
2,RUNNING_JOB_NUMS,166.0
3,CPU_USAGE,104.6
4,SUCCEED_JOB_NUMS,98.2
5,DISK_USAGE,74.2
6,CU,47.0
7,CANCELLED_JOB_NUMS,35.0
8,QUEUE_TYPE,31.0
9,FAILED_JOB_NUMS,24.6


In [48]:
# Out-of-fold MSE for the launching-job-count target, computed over all folds together.
df_oof2 = pd.concat(oof2)
score = mean_squared_error(y_true=df_oof2['LAUNCHING_JOB_NUMS_next25mins'],
                           y_pred=df_oof2['pred'])
print('MSE:', score)

MSE: 4.000169005621965


In [50]:
prediction2.LAUNCHING_JOB_NUMS_next25mins.describe()

count    14980.000000
mean         0.329041
std          1.649481
min         -0.097474
25%          0.069156
50%          0.092879
75%          0.130210
max         34.811653
Name: LAUNCHING_JOB_NUMS_next25mins, dtype: float64

In [51]:
sub_sample.head()

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0


In [58]:
# Put both targets' test predictions side by side, matched on myid.
launching_pred = prediction2[['myid', 'LAUNCHING_JOB_NUMS_next25mins']]
prediction = prediction1.copy().merge(launching_pred, on='myid')

prediction.head(10)

Unnamed: 0,ID,QUEUE_ID,myid,CPU_USAGE_next25mins,LAUNCHING_JOB_NUMS_next25mins
0,1,297,0,10.538106,0.064479
1,1,297,1,10.260576,0.201328
2,1,297,2,15.082348,0.198362
3,1,297,3,55.178834,0.468154
4,1,297,4,33.661207,0.211956
5,2,85153,5,20.234766,0.098941
6,2,85153,6,18.480754,0.131637
7,2,85153,7,13.490999,0.113829
8,2,85153,8,19.160936,0.116536
9,2,85153,9,20.191499,0.103389


In [59]:
# Note: the submission requires predictions to be non-negative integers.
# Each target is rounded down, negatives are raised to 0, then cast to int.

for target_col in ('CPU_USAGE_next25mins', 'LAUNCHING_JOB_NUMS_next25mins'):
    floored = np.floor(prediction[target_col])
    prediction[target_col] = floored.clip(lower=0).astype(int)

prediction.head(10)

Unnamed: 0,ID,QUEUE_ID,myid,CPU_USAGE_next25mins,LAUNCHING_JOB_NUMS_next25mins
0,1,297,0,10,0
1,1,297,1,10,0
2,1,297,2,15,0
3,1,297,3,55,0
4,1,297,4,33,0
5,2,85153,5,20,0
6,2,85153,6,18,0
7,2,85153,7,13,0
8,2,85153,8,19,0
9,2,85153,9,20,0


In [60]:
sub_sample.head()

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0


In [63]:
# Turn the long prediction table into one submission row per ID:
# [ID, CPU step 1, LAUNCHING step 1, CPU step 2, LAUNCHING step 2, ...],
# with the steps ordered by myid.
value_cols = ['CPU_USAGE_next25mins', 'LAUNCHING_JOB_NUMS_next25mins']
expected_len = len(sub_sample.columns)

preds = []

# One groupby pass replaces re-filtering the whole frame for every ID;
# sort=False keeps the IDs in their order of first appearance.
for id_, rows in tqdm(prediction.groupby('ID', sort=False)):
    rows = rows.sort_values(by='myid')
    # Row-major flattening interleaves the two targets step by step.
    items = [id_] + rows[value_cols].to_numpy().ravel().tolist()
    # Fail loudly instead of letting a short row be padded with NaN later.
    if len(items) != expected_len:
        raise ValueError(
            'ID {} yields {} values, but the sample submission has {} columns'.format(
                id_, len(items), expected_len))
    preds.append(items)

HBox(children=(FloatProgress(value=0.0, max=2996.0), HTML(value='')))




In [65]:
# Assemble the submission frame, taking the column names from the sample submission.
sub = pd.DataFrame(preds, columns=sub_sample.columns)

sub.head(10)

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,10,0,10,0,15,0,55,0,33,0
1,2,20,0,18,0,13,0,19,0,20,0
2,3,8,0,10,0,11,0,13,0,12,0
3,4,12,0,10,0,14,0,11,0,12,0
4,5,5,0,3,0,4,0,3,0,3,0
5,6,9,0,7,0,8,0,8,0,8,0
6,7,8,0,20,0,12,0,9,0,11,0
7,8,2,11,4,11,2,13,2,13,3,13
8,9,4,0,4,0,4,0,4,0,4,0
9,10,7,0,13,0,9,0,12,0,15,0


In [68]:
sub.shape, sub_sample.shape

((2996, 11), (2996, 11))

In [69]:
# Write the submission to CSV without the DataFrame index column.
sub.to_csv('baseline_202010141435.csv', index=False)