In [3]:
# -*- coding: utf-8 -*-

# Created on 202208181620
# Author:    zhuoyin94 <zhuoyin94@163.com>
# Github:    https://github.com/MichaelYin1994

'''
分析特征工程结果数据。
'''

import gc
import multiprocessing as mp
import os
from functools import partial

import numpy as np
import pandas as pd
from numba import jit, njit
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

tqdm.pandas()
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 300)

In [4]:
# 载入原始数据
# *************
f_dir = '../cached_data/train_feats'
f_names_list = [f_name for f_name in os.listdir(f_dir) if f_name.endswith('.csv')]

train_feats_df_list = []
for f_name in f_names_list:
    train_feats_df_list.append(pd.read_csv(os.path.join(f_dir, f_name)))

train_feats_df = pd.concat(train_feats_df_list, ignore_index=True)
train_feats_df['unix_ts'] = (pd.to_datetime(train_feats_df['unix_ts']).astype(int) / 10**9).astype(int)
train_feats_df.head(20)

Unnamed: 0,kpi_id,unix_ts,label,value,window_val_sum_w300,window_val_min_w300,window_val_max_w300,window_val_mean_w300,window_val_sum_w600,window_val_min_w600,window_val_max_w600,window_val_mean_w600,window_val_sum_w1200,window_val_min_w1200,window_val_max_w1200,window_val_mean_w1200,window_val_sum_w3600,window_val_min_w3600,window_val_max_w3600,window_val_mean_w3600
0,23,2016-07-30T16:00:00.000Z,0,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053,0.216053
1,23,2016-07-30T16:05:00.000Z,0,0.226608,0.442661,0.216053,0.226608,0.22133,0.442661,0.216053,0.226608,0.22133,0.442661,0.216053,0.226608,0.22133,0.442661,0.216053,0.226608,0.22133
2,23,2016-07-30T16:10:00.000Z,0,0.218363,0.444971,0.218363,0.226608,0.222485,0.661023,0.216053,0.226608,0.220341,0.661023,0.216053,0.226608,0.220341,0.661023,0.216053,0.226608,0.220341
3,23,2016-07-30T16:15:00.000Z,0,0.218225,0.436588,0.218225,0.218363,0.218294,0.663196,0.218225,0.226608,0.221065,0.879249,0.216053,0.226608,0.219812,0.879249,0.216053,0.226608,0.219812
4,23,2016-07-30T16:20:00.000Z,0,0.205969,0.424194,0.205969,0.218225,0.212097,0.642557,0.205969,0.218363,0.214186,1.085217,0.205969,0.226608,0.217043,1.085217,0.205969,0.226608,0.217043
5,23,2016-07-30T16:25:00.000Z,0,0.201512,0.407481,0.201512,0.205969,0.20374,0.625706,0.201512,0.218225,0.208569,1.070677,0.201512,0.226608,0.214135,1.286729,0.201512,0.226608,0.214455
6,23,2016-07-30T16:30:00.000Z,0,0.20725,0.408762,0.201512,0.20725,0.204381,0.614731,0.201512,0.20725,0.20491,1.051319,0.201512,0.218363,0.210264,1.493979,0.201512,0.226608,0.213426
7,23,2016-07-30T16:35:00.000Z,0,0.192847,0.400097,0.192847,0.20725,0.200048,0.601609,0.192847,0.20725,0.200536,1.025803,0.192847,0.218225,0.205161,1.686826,0.192847,0.226608,0.210853
8,23,2016-07-30T16:40:00.000Z,0,0.193712,0.386559,0.192847,0.193712,0.193279,0.593809,0.192847,0.20725,0.197936,1.00129,0.192847,0.20725,0.200258,1.880538,0.192847,0.226608,0.208949
9,23,2016-07-30T16:45:00.000Z,0,0.199283,0.392996,0.193712,0.199283,0.196498,0.585842,0.192847,0.199283,0.195281,0.994604,0.192847,0.20725,0.198921,2.079822,0.192847,0.226608,0.207982


In [8]:
(pd.to_datetime(train_feats_df.iloc[:10]['unix_ts']).astype(int) / 10**9).astype(int)

0    1469894400
1    1469894700
2    1469895000
3    1469895300
4    1469895600
5    1469895900
6    1469896200
7    1469896500
8    1469896800
9    1469897100
Name: unix_ts, dtype: int64

In [8]:
train_feats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3004066 entries, 0 to 3004065
Data columns (total 20 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   kpi_id                 object 
 1   unix_ts                object 
 2   label                  object 
 3   value                  float64
 4   window_val_sum_w300    float64
 5   window_val_min_w300    float64
 6   window_val_max_w300    float64
 7   window_val_mean_w300   float64
 8   window_val_sum_w600    float64
 9   window_val_min_w600    float64
 10  window_val_max_w600    float64
 11  window_val_mean_w600   float64
 12  window_val_sum_w1200   float64
 13  window_val_min_w1200   float64
 14  window_val_max_w1200   float64
 15  window_val_mean_w1200  float64
 16  window_val_sum_w3600   float64
 17  window_val_min_w3600   float64
 18  window_val_max_w3600   float64
 19  window_val_mean_w3600  float64
dtypes: float64(17), object(3)
memory usage: 458.4+ MB


In [12]:
(pd.to_datetime(train_feats_df.iloc[:10]['unix_ts'].values).astype(int) / 10**9).astype(int)

Int64Index([1491041280, 1491041340, 1491041400, 1491041460, 1491041520,
            1491041580, 1491041640, 1491041700, 1491041760, 1491041820],
           dtype='int64')

In [52]:
train_feats_df['kpi_id'].value_counts()

9    13026
Name: kpi_id, dtype: int64

In [53]:
train_feats_df['label'].value_counts()

0    12676
1      350
Name: label, dtype: int64