In [1]:
# -*- coding: utf-8 -*-

# Created on 202208181620
# Author:    zhuoyin94 <zhuoyin94@163.com>
# Github:    https://github.com/MichaelYin1994

'''
分析特征工程结果数据。
'''

import gc
import multiprocessing as mp
import os
from functools import partial

import numpy as np
import pandas as pd
from numba import jit, njit
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

from utils.io_utils import LoadSave

# Hook tqdm into pandas so the progress-aware pandas helpers become available.
tqdm.pandas()

# Raise the pandas display limits so wide / long frames render in full
# inside the notebook instead of being truncated.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 300),
):
    pd.set_option(option_name, option_value)

In [8]:
# Load the raw data (cached feature-engineering CSV files)
# *************
f_dir = '../cached_data/train_feats'

# os.listdir returns entries in arbitrary order; sort the names so the row
# order of the concatenated frame is reproducible from run to run.
f_names_list = sorted(
    f_name for f_name in os.listdir(f_dir) if f_name.endswith('.csv')
)
if not f_names_list:
    # pd.concat would raise a generic "No objects to concatenate" ValueError;
    # raise the same exception type with a message that names the directory.
    raise ValueError('No .csv feature files found in {}'.format(f_dir))

# Read every feature file in full (nrows=None) and stack them row-wise.
train_feats_df_list = [
    pd.read_csv(os.path.join(f_dir, f_name), nrows=None)
    for f_name in f_names_list
]
train_feats_df = pd.concat(train_feats_df_list, axis=0, ignore_index=True)

# Convert the parsed datetimes to whole epoch seconds.
# - 'int64' is spelled out because a bare `int` is platform-dependent and the
#   datetime -> integer cast is only supported for 64-bit integers.
# - Integer floor division avoids the float round-trip of `/ 10**9`.
# NOTE(review): assumes pd.to_datetime yields nanosecond resolution here, as
# the original `/ 10**9` scaling did -- confirm on newer pandas versions.
train_feats_df['unix_ts'] = (
    pd.to_datetime(train_feats_df['unix_ts']).astype('int64') // 10**9
)

train_feats_df.head(5)

Unnamed: 0,kpi_id,unix_ts,label,value,window_val_sum_w300,window_val_min_w300,window_val_max_w300,window_val_mean_w300,window_val_sum_w600,window_val_min_w600,window_val_max_w600,window_val_mean_w600,window_val_sum_w1200,window_val_min_w1200,window_val_max_w1200,window_val_mean_w1200,window_val_sum_w3600,window_val_min_w3600,window_val_max_w3600,window_val_mean_w3600
0,2,1491041280,0,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47,33.47
1,2,1491041340,0,32.25,65.72,32.25,33.47,32.86,65.72,32.25,33.47,32.86,65.72,32.25,33.47,32.86,65.72,32.25,33.47,32.86
2,2,1491041400,0,31.82,97.54,31.82,33.47,32.513333,97.54,31.82,33.47,32.513333,97.54,31.82,33.47,32.513333,97.54,31.82,33.47,32.513333
3,2,1491041460,0,32.94,130.48,31.82,33.47,32.62,130.48,31.82,33.47,32.62,130.48,31.82,33.47,32.62,130.48,31.82,33.47,32.62
4,2,1491041520,0,33.2,163.68,31.82,33.47,32.736,163.68,31.82,33.47,32.736,163.68,31.82,33.47,32.736,163.68,31.82,33.47,32.736


In [9]:
# Print the column dtypes and memory usage of the concatenated feature frame.
train_feats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3004066 entries, 0 to 3004065
Data columns (total 20 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   kpi_id                 object 
 1   unix_ts                int64  
 2   label                  object 
 3   value                  float64
 4   window_val_sum_w300    float64
 5   window_val_min_w300    float64
 6   window_val_max_w300    float64
 7   window_val_mean_w300   float64
 8   window_val_sum_w600    float64
 9   window_val_min_w600    float64
 10  window_val_max_w600    float64
 11  window_val_mean_w600   float64
 12  window_val_sum_w1200   float64
 13  window_val_min_w1200   float64
 14  window_val_max_w1200   float64
 15  window_val_mean_w1200  float64
 16  window_val_sum_w3600   float64
 17  window_val_min_w3600   float64
 18  window_val_max_w3600   float64
 19  window_val_mean_w3600  float64
dtypes: float64(17), int64(1), object(2)
memory usage: 458.4+ MB


In [10]:
# Load the competition train_df and test_df and analyze their timestamps
# *************
file_handler = LoadSave(dir_name='../cached_data')
train_df = file_handler.load_data(file_name='train_df.pkl')
test_df = file_handler.load_data(file_name='test_df.pkl')

# Scale unix_ts down by 10**3 (presumably milliseconds -> seconds; TODO confirm
# against the code that writes the pickles). Floor division keeps the
# arithmetic integral and 'int64' avoids the platform-dependent width of `int`.
train_df['unix_ts'] = (train_df['unix_ts'] // 10**3).astype('int64')
test_df['unix_ts'] = (test_df['unix_ts'] // 10**3).astype('int64')

# For every KPI seen in the training set, print the covered time range of the
# train and test splits and the gap between the end of train and start of test.
unique_kpi_id_list = train_df['kpi_id'].unique().tolist()
for kpi_id in unique_kpi_id_list:
    print('*************')
    print('Curr {}:'.format(kpi_id))

    # Select rows with a boolean mask rather than formatting kpi_id into a
    # query string: a formatted query only parses for numeric ids and breaks
    # for string-typed ones.
    train_df_tmp = train_df[train_df['kpi_id'] == kpi_id]
    test_df_tmp = test_df[test_df['kpi_id'] == kpi_id]

    train_ts_min = train_df_tmp['unix_ts'].min()
    train_ts_max = train_df_tmp['unix_ts'].max()
    test_ts_min = test_df_tmp['unix_ts'].min()
    test_ts_max = test_df_tmp['unix_ts'].max()

    print('train range: {} -- {}'.format(train_ts_min, train_ts_max))
    print('test range: {} -- {}'.format(test_ts_min, test_ts_max))
    print('test min - train max: {}'.format(test_ts_min - train_ts_max))


[INFO] 2022-08-24 16:14:37 LoadSave: Load from dir ../cached_data with name train_df.pkl
[INFO] 2022-08-24 16:14:37 LoadSave: Load from dir ../cached_data with name test_df.pkl
*************
Curr 20:
train range: 1467302400 -- 1471044600
test range: 1471044900 -- 1474787100
test min - train max: 300
*************
Curr 8:
train range: 1469203200 -- 1471881300
test range: 1471881600 -- 1474559700
test min - train max: 300
*************
Curr 19:
train range: 1469203200 -- 1471881300
test range: 1471881600 -- 1474559700
test min - train max: 300
*************
Curr 26:
train range: 1469376000 -- 1472010900
test range: 1472011200 -- 1474646100
test min - train max: 300
*************
Curr 23:
train range: 1469894400 -- 1472529300
test range: 1472529600 -- 1475164500
test min - train max: 300
*************
Curr 1:
train range: 1472918400 -- 1475553300
test range: 1475553600 -- 1478188500
test min - train max: 300
*************
Curr 3:
train range: 1474646400 -- 1477281300
test range: 147728160

In [6]:
# Preview the first five rows of train_df (unix_ts has already been divided
# by 10**3 in the cell that loads it).
train_df.head(5)

Unnamed: 0,unix_ts,value,label,kpi_id,row_count
0,1467302400,0.143619,0,20,1
1,1467302700,0.087151,0,20,1
2,1467303000,0.028282,0,20,1
3,1467303300,-0.02141,0,20,1
4,1467303600,-0.043751,0,20,1


In [7]:
# Preview the first five rows of test_df (unix_ts has already been divided
# by 10**3 in the cell that loads it).
test_df.head(5)

Unnamed: 0,unix_ts,value,label,kpi_id,row_count
0,1471044900,-1.436408,0,20,1
1,1471045200,-1.436584,0,20,1
2,1471045500,-1.436055,0,20,1
3,1471045800,-1.433126,0,20,1
4,1471046100,-1.432773,0,20,1
