In [1]:
import gc
import time
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
import warnings
from tqdm import tqdm
# Register tqdm's pandas integration; this is what provides the
# `progress_apply` method used by the label-mode cells of this notebook.
tqdm.pandas(desc='pandas bar')
# NOTE(review): this silences every warning category for the whole session,
# so deprecation warnings raised by pandas/numpy are hidden as well.
warnings.filterwarnings('ignore')

In [3]:
def reduce_mem(df, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected. Every other column (int8, unsigned integers, objects, ...)
    is left untouched, and so are columns whose min or max is null (for
    example an all-NaN column). Range checks are inclusive, so a column that
    spans exactly [-128, 127] is stored as int8.

    The frame is modified in place and also returned. A one-line summary of
    the resulting memory usage is printed.

    Args:
        df: DataFrame to shrink.
        use_float16: When True (default), float columns whose range fits in
            float16 are stored as float16. float16 keeps only about three
            significant decimal digits; pass False to use float32 as the
            narrowest float type instead.

    Returns:
        The same DataFrame object, with its numeric columns downcast.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits is used.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        # min/max skip NaN, so a null here means there is no usable range.
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range (e.g. +/-inf) fits nothing narrower.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'
          .format(end_mem, reduction, (time.time()-starttime)/60))
    return df

In [4]:
# Load the attribute table from CSV and preview its first rows.
# NOTE(review): `attr` is not referenced again in the visible part of this
# notebook — confirm whether it is still needed.
attr = pd.read_csv('data/attr.csv')
attr.head()

Unnamed: 0,link_id,length,direction,path_class,speed_class,lane_num,speed_limit,level,width
0,0,19,1,5,7,1,4.168,5,30
1,1,19,1,5,7,1,4.168,5,30
2,2,16,1,5,7,1,4.168,5,30
3,3,16,1,5,7,1,4.168,5,30
4,4,17,1,5,7,1,4.168,5,30


In [5]:
# Load the traffic log table from a pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
traffic = pd.read_pickle('data/traffic_logs.pkl')
traffic.head()

Unnamed: 0,date,link_id,cur_time,pred_time,label,rec_speed_0,rec_speed_1,rec_speed_2,rec_speed_3,rec_speed_4,rec_eta_speed_0,rec_eta_speed_1,rec_eta_speed_2,rec_eta_speed_3,rec_eta_speed_4,rec_car_count_0,rec_car_count_1,rec_car_count_2,rec_car_count_3,rec_car_count_4,rec_label_0,rec_label_1,rec_label_2,rec_label_3,rec_label_4,his_speed_0_0,his_speed_0_1,his_speed_0_2,his_speed_0_3,his_speed_0_4,his_speed_1_0,his_speed_1_1,his_speed_1_2,his_speed_1_3,his_speed_1_4,his_speed_2_0,his_speed_2_1,his_speed_2_2,his_speed_2_3,his_speed_2_4,his_speed_3_0,his_speed_3_1,his_speed_3_2,his_speed_3_3,his_speed_3_4,his_eta_speed_0_0,his_eta_speed_0_1,his_eta_speed_0_2,his_eta_speed_0_3,his_eta_speed_0_4,his_eta_speed_1_0,his_eta_speed_1_1,his_eta_speed_1_2,his_eta_speed_1_3,his_eta_speed_1_4,his_eta_speed_2_0,his_eta_speed_2_1,his_eta_speed_2_2,his_eta_speed_2_3,his_eta_speed_2_4,his_eta_speed_3_0,his_eta_speed_3_1,his_eta_speed_3_2,his_eta_speed_3_3,his_eta_speed_3_4,his_car_count_0_0,his_car_count_0_1,his_car_count_0_2,his_car_count_0_3,his_car_count_0_4,his_car_count_1_0,his_car_count_1_1,his_car_count_1_2,his_car_count_1_3,his_car_count_1_4,his_car_count_2_0,his_car_count_2_1,his_car_count_2_2,his_car_count_2_3,his_car_count_2_4,his_car_count_3_0,his_car_count_3_1,his_car_count_3_2,his_car_count_3_3,his_car_count_3_4,his_label_0_0,his_label_0_1,his_label_0_2,his_label_0_3,his_label_0_4,his_label_1_0,his_label_1_1,his_label_1_2,his_label_1_3,his_label_1_4,his_label_2_0,his_label_2_1,his_label_2_2,his_label_2_3,his_label_2_4,his_label_3_0,his_label_3_1,his_label_3_2,his_label_3_3,his_label_3_4
0,1,1049,258,288,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,43.1875,38.1875,33.59375,27.703125,23.5,38.5,38.5,39.40625,32.59375,32.59375,13.398438,13.398438,13.398438,28.90625,39.3125,30.0,30.0,33.1875,30.0,30.0,23.0,23.0,25.90625,22.90625,17.59375,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,11.296875,11.296875,9.601562,9.296875,9.296875,18.90625,17.0,17.0,1,1,2,3,2,1,1,1,1,1,3,3,3,3,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,2,2,2,1,1,0,0,1,0,0
1,1,1049,261,290,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,33.59375,27.703125,23.5,27.796875,27.796875,39.40625,32.59375,32.59375,32.59375,32.59375,13.398438,28.90625,39.3125,39.3125,39.3125,33.1875,30.0,30.0,7.699219,7.699219,25.90625,22.90625,17.59375,27.703125,27.703125,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,9.601562,9.601562,9.601562,18.90625,17.0,17.0,17.0,17.0,2,3,2,3,3,1,1,1,1,1,3,3,1,1,1,1,2,2,2,2,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,0,0,3,3
2,1,1049,264,288,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,43.1875,38.1875,33.59375,27.703125,23.5,38.5,38.5,39.40625,32.59375,32.59375,13.398438,13.398438,13.398438,28.90625,39.3125,30.0,30.0,33.1875,30.0,30.0,23.0,23.0,25.90625,22.90625,17.59375,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,11.296875,11.296875,9.601562,9.296875,9.296875,18.90625,17.0,17.0,1,1,2,3,2,1,1,1,1,1,3,3,3,3,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,2,2,2,1,1,0,0,1,0,0
3,1,1049,266,290,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,33.59375,27.703125,23.5,27.796875,27.796875,39.40625,32.59375,32.59375,32.59375,32.59375,13.398438,28.90625,39.3125,39.3125,39.3125,33.1875,30.0,30.0,7.699219,7.699219,25.90625,22.90625,17.59375,27.703125,27.703125,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,9.601562,9.601562,9.601562,18.90625,17.0,17.0,17.0,17.0,2,3,2,3,3,1,1,1,1,1,3,3,1,1,1,1,2,2,2,2,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,0,0,3,3
4,1,1049,272,290,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,33.59375,27.703125,23.5,27.796875,27.796875,39.40625,32.59375,32.59375,32.59375,32.59375,13.398438,28.90625,39.3125,39.3125,39.3125,33.1875,30.0,30.0,7.699219,7.699219,25.90625,22.90625,17.59375,27.703125,27.703125,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,9.601562,9.601562,9.601562,18.90625,17.0,17.0,17.0,17.0,2,3,2,3,3,1,1,1,1,1,3,3,1,1,1,1,2,2,2,2,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,0,0,3,3


In [6]:
# Print the index range, column count, dtype breakdown and memory footprint.
traffic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15687465 entries, 0 to 15687464
Columns: 105 entries, date to his_label_3_4
dtypes: float16(50), int16(27), int32(1), int8(27)
memory usage: 2.7 GB


In [7]:
# Start the feature table from an independent copy of five base columns,
# so that adding feature columns to `df_feats` never touches `traffic`.
df_feats = traffic[['date', 'link_id', 'cur_time', 'pred_time', 'label']].copy()

In [8]:
# Compact summary: verbose=False omits the per-column listing.
df_feats.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15687465 entries, 0 to 15687464
Columns: 5 entries, date to label
dtypes: int16(2), int32(1), int8(2)
memory usage: 149.6 MB


## 当前时间片段的统计特征

In [None]:
# Row-wise summary statistics over the five rec_speed_* columns.
# The column subset is selected once instead of once per statistic.
rec_speed_cols = ['rec_speed_{}'.format(i) for i in range(5)]
rec_speed = traffic[rec_speed_cols]
rec_speed_mean = rec_speed.mean(axis=1)

df_feats['rec_speed_min'] = rec_speed.min(axis=1)
df_feats['rec_speed_max'] = rec_speed.max(axis=1)
df_feats['rec_speed_mean'] = rec_speed_mean
df_feats['rec_speed_std'] = rec_speed.std(axis=1)
# DataFrame.mad was removed in pandas 2.0; this is the same mean absolute
# deviation around the row mean, written out explicitly.
df_feats['rec_speed_mad'] = rec_speed.sub(rec_speed_mean, axis=0).abs().mean(axis=1)
df_feats['rec_speed_median'] = rec_speed.median(axis=1)

# Drop the temporaries so they do not stay alive in the kernel namespace.
del rec_speed, rec_speed_mean

In [10]:
# Row-wise summary statistics over the five rec_eta_speed_* columns.
# The column subset is selected once instead of once per statistic.
rec_eta_speed_cols = ['rec_eta_speed_{}'.format(i) for i in range(5)]
rec_eta_speed = traffic[rec_eta_speed_cols]
rec_eta_speed_mean = rec_eta_speed.mean(axis=1)

df_feats['rec_eta_speed_min'] = rec_eta_speed.min(axis=1)
df_feats['rec_eta_speed_max'] = rec_eta_speed.max(axis=1)
df_feats['rec_eta_speed_mean'] = rec_eta_speed_mean
df_feats['rec_eta_speed_std'] = rec_eta_speed.std(axis=1)
# DataFrame.mad was removed in pandas 2.0; this is the same mean absolute
# deviation around the row mean, written out explicitly.
df_feats['rec_eta_speed_mad'] = rec_eta_speed.sub(rec_eta_speed_mean, axis=0).abs().mean(axis=1)
df_feats['rec_eta_speed_median'] = rec_eta_speed.median(axis=1)

# Drop the temporaries so they do not stay alive in the kernel namespace.
del rec_eta_speed, rec_eta_speed_mean

In [11]:
# Row-wise summary statistics over the five rec_car_count_* columns.
# The column subset is selected once instead of once per statistic.
rec_car_count_cols = ['rec_car_count_{}'.format(i) for i in range(5)]
rec_car_count = traffic[rec_car_count_cols]
rec_car_count_mean = rec_car_count.mean(axis=1)

df_feats['rec_car_count_sum'] = rec_car_count.sum(axis=1)
df_feats['rec_car_count_min'] = rec_car_count.min(axis=1)
df_feats['rec_car_count_max'] = rec_car_count.max(axis=1)
df_feats['rec_car_count_mean'] = rec_car_count_mean
df_feats['rec_car_count_std'] = rec_car_count.std(axis=1)
# DataFrame.mad was removed in pandas 2.0; this is the same mean absolute
# deviation around the row mean, written out explicitly.
df_feats['rec_car_count_mad'] = rec_car_count.sub(rec_car_count_mean, axis=0).abs().mean(axis=1)
df_feats['rec_car_count_median'] = rec_car_count.median(axis=1)

# Drop the temporaries so they do not stay alive in the kernel namespace.
del rec_car_count, rec_car_count_mean

In [12]:
# Row-wise summary statistics over the five rec_label_* columns.
# The column subset is selected once instead of once per statistic.
rec_label_cols = ['rec_label_{}'.format(i) for i in range(5)]
rec_label = traffic[rec_label_cols]
rec_label_mean = rec_label.mean(axis=1)

# Most frequent label per row. On ties, Counter.most_common keeps the label
# that appears first in column order. This is a row-by-row Python apply, so
# it dominates the runtime of this cell.
df_feats['rec_label_mode'] = rec_label.progress_apply(
    lambda row: Counter(row).most_common()[0][0], axis=1)
df_feats['rec_label_min'] = rec_label.min(axis=1)
df_feats['rec_label_max'] = rec_label.max(axis=1)
df_feats['rec_label_mean'] = rec_label_mean
df_feats['rec_label_std'] = rec_label.std(axis=1)
# DataFrame.mad was removed in pandas 2.0; this is the same mean absolute
# deviation around the row mean, written out explicitly.
df_feats['rec_label_mad'] = rec_label.sub(rec_label_mean, axis=0).abs().mean(axis=1)
df_feats['rec_label_median'] = rec_label.median(axis=1)

# Drop the temporaries so they do not stay alive in the kernel namespace.
del rec_label, rec_label_mean

pandas bar: 100%|██████████| 15687465/15687465 [04:37<00:00, 56621.75it/s]


In [13]:
# Downcast the numeric columns accumulated so far; reduce_mem modifies the
# frame in place and returns it.
df_feats = reduce_mem(df_feats)

-- Mem. usage decreased to 882.68 Mb (50.8% reduction),time spend:0.15 min


## 历史时间片段的统计特征

In [16]:
# Row-wise summary statistics over the 4 x 5 grid of his_speed_{i}_{j} columns.
# The column subset is selected once instead of once per statistic.
his_speed_cols = ['his_speed_{}_{}'.format(i, j) for i in range(4) for j in range(5)]
his_speed = traffic[his_speed_cols]
his_speed_mean = his_speed.mean(axis=1)

df_feats['his_speed_min'] = his_speed.min(axis=1)
df_feats['his_speed_max'] = his_speed.max(axis=1)
df_feats['his_speed_mean'] = his_speed_mean
df_feats['his_speed_std'] = his_speed.std(axis=1)
# DataFrame.mad was removed in pandas 2.0; this is the same mean absolute
# deviation around the row mean, written out explicitly.
df_feats['his_speed_mad'] = his_speed.sub(his_speed_mean, axis=0).abs().mean(axis=1)
df_feats['his_speed_median'] = his_speed.median(axis=1)

# Drop the temporaries so they do not stay alive in the kernel namespace.
del his_speed, his_speed_mean

In [17]:
# Row-wise summary statistics over the 4 x 5 grid of his_eta_speed_{i}_{j} columns.
# The column subset is selected once instead of once per statistic.
his_eta_speed_cols = ['his_eta_speed_{}_{}'.format(i, j) for i in range(4) for j in range(5)]
his_eta_speed = traffic[his_eta_speed_cols]
his_eta_speed_mean = his_eta_speed.mean(axis=1)

df_feats['his_eta_speed_min'] = his_eta_speed.min(axis=1)
df_feats['his_eta_speed_max'] = his_eta_speed.max(axis=1)
df_feats['his_eta_speed_mean'] = his_eta_speed_mean
df_feats['his_eta_speed_std'] = his_eta_speed.std(axis=1)
# DataFrame.mad was removed in pandas 2.0; this is the same mean absolute
# deviation around the row mean, written out explicitly.
df_feats['his_eta_speed_mad'] = his_eta_speed.sub(his_eta_speed_mean, axis=0).abs().mean(axis=1)
df_feats['his_eta_speed_median'] = his_eta_speed.median(axis=1)

# Drop the temporaries so they do not stay alive in the kernel namespace.
del his_eta_speed, his_eta_speed_mean

In [18]:
# Row-wise summary statistics over the 4 x 5 grid of his_car_count_{i}_{j} columns.
# The column subset is selected once instead of once per statistic.
his_car_count_cols = ['his_car_count_{}_{}'.format(i, j) for i in range(4) for j in range(5)]
his_car_count = traffic[his_car_count_cols]
his_car_count_mean = his_car_count.mean(axis=1)

df_feats['his_car_count_sum'] = his_car_count.sum(axis=1)
df_feats['his_car_count_min'] = his_car_count.min(axis=1)
df_feats['his_car_count_max'] = his_car_count.max(axis=1)
df_feats['his_car_count_mean'] = his_car_count_mean
df_feats['his_car_count_std'] = his_car_count.std(axis=1)
# DataFrame.mad was removed in pandas 2.0; this is the same mean absolute
# deviation around the row mean, written out explicitly.
df_feats['his_car_count_mad'] = his_car_count.sub(his_car_count_mean, axis=0).abs().mean(axis=1)
df_feats['his_car_count_median'] = his_car_count.median(axis=1)

# Drop the temporaries so they do not stay alive in the kernel namespace.
del his_car_count, his_car_count_mean

In [19]:
# Row-wise summary statistics over the 4 x 5 grid of his_label_{i}_{j} columns.
# The column subset is selected once instead of once per statistic.
his_label_cols = ['his_label_{}_{}'.format(i, j) for i in range(4) for j in range(5)]
his_label = traffic[his_label_cols]
his_label_mean = his_label.mean(axis=1)

# Most frequent label per row. On ties, Counter.most_common keeps the label
# that appears first in column order. This is a row-by-row Python apply, so
# it dominates the runtime of this cell.
df_feats['his_label_mode'] = his_label.progress_apply(
    lambda row: Counter(row).most_common()[0][0], axis=1)
df_feats['his_label_min'] = his_label.min(axis=1)
df_feats['his_label_max'] = his_label.max(axis=1)
df_feats['his_label_mean'] = his_label_mean
df_feats['his_label_std'] = his_label.std(axis=1)
# DataFrame.mad was removed in pandas 2.0; this is the same mean absolute
# deviation around the row mean, written out explicitly.
df_feats['his_label_mad'] = his_label.sub(his_label_mean, axis=0).abs().mean(axis=1)
df_feats['his_label_median'] = his_label.median(axis=1)

# Drop the temporaries so they do not stay alive in the kernel namespace.
del his_label, his_label_mean

pandas bar: 100%|██████████| 15687465/15687465 [05:06<00:00, 51203.56it/s]


In [21]:
# Downcast again now that the his_* feature columns have been added;
# reduce_mem modifies the frame in place and returns it.
df_feats = reduce_mem(df_feats)

-- Mem. usage decreased to 1615.76 Mb (36.1% reduction),time spend:0.29 min


In [22]:
# Preview the first rows of the finished feature table.
df_feats.head()

Unnamed: 0,date,link_id,cur_time,pred_time,label,rec_speed_min,rec_speed_max,rec_speed_mean,rec_speed_std,rec_speed_mad,rec_speed_median,rec_eta_speed_min,rec_eta_speed_max,rec_eta_speed_mean,rec_eta_speed_std,rec_eta_speed_mad,rec_eta_speed_median,rec_car_count_sum,rec_car_count_min,rec_car_count_max,rec_car_count_mean,rec_car_count_std,rec_car_count_mad,rec_car_count_median,rec_label_mode,rec_label_min,rec_label_max,rec_label_mean,rec_label_std,rec_label_mad,rec_label_median,his_speed_min,his_speed_max,his_speed_mean,his_speed_std,his_speed_mad,his_speed_median,his_eta_speed_min,his_eta_speed_max,his_eta_speed_mean,his_eta_speed_std,his_eta_speed_mad,his_eta_speed_median,his_car_count_sum,his_car_count_min,his_car_count_max,his_car_count_mean,his_car_count_std,his_car_count_mad,his_car_count_median,his_label_mode,his_label_min,his_label_max,his_label_mean,his_label_std,his_label_mad,his_label_median
0,1,1049,258,288,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,13.398438,43.1875,30.46875,8.789062,6.4375,31.296875,9.296875,30.09375,19.46875,8.046875,7.019531,18.25,34,1,3,1.700195,0.864746,0.77002,1.0,1,0,2,0.950195,0.60498,0.379883,1.0
1,1,1049,261,290,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,7.699219,39.40625,28.953125,9.476562,6.707031,31.296875,9.601562,30.09375,20.53125,7.894531,6.992188,18.25,36,1,3,1.799805,0.833496,0.720215,2.0,1,0,3,1.150391,0.745117,0.455078,1.0
2,1,1049,264,288,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,13.398438,43.1875,30.46875,8.789062,6.4375,31.296875,9.296875,30.09375,19.46875,8.046875,7.019531,18.25,34,1,3,1.700195,0.864746,0.77002,1.0,1,0,2,0.950195,0.60498,0.379883,1.0
3,1,1049,266,290,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,7.699219,39.40625,28.953125,9.476562,6.707031,31.296875,9.601562,30.09375,20.53125,7.894531,6.992188,18.25,36,1,3,1.799805,0.833496,0.720215,2.0,1,0,3,1.150391,0.745117,0.455078,1.0
4,1,1049,272,290,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,7.699219,39.40625,28.953125,9.476562,6.707031,31.296875,9.601562,30.09375,20.53125,7.894531,6.992188,18.25,36,1,3,1.799805,0.833496,0.720215,2.0,1,0,3,1.150391,0.745117,0.455078,1.0


In [28]:
# Write the feature table to disk as a pickle file.
df_feats.to_pickle('data/feats.pkl')