In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Data directories, given relative to the notebook's working directory.
# train_path is not referenced by the cells visible here; presumably it feeds
# a training-set run of the same feature pipeline -- TODO confirm.
train_path = './hy_round1_train_20200102'
# Directory whose files are listed and turned into features by the processing loop.
test_path = './hy_round1_testA_20200102'

In [8]:
def group_feature(df, key, target, aggs):
    """Aggregate one column per group and return the result as a flat frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``key`` and ``target``.
    key : str or list of str
        Column(s) to group by.
    target : str
        Column whose values are aggregated within each group.
    aggs : iterable of str
        Aggregation names understood by pandas (e.g. ``'max'``, ``'skew'``).

    Returns
    -------
    pandas.DataFrame
        The group key column(s) followed by one column per aggregation,
        named ``f'{target}_{agg}'`` and ordered as in ``aggs``.
    """
    # Use keyword-based named aggregation (output_name=func). Passing a
    # {new_name: func} dict to SeriesGroupBy.agg is the "nested renamer" form
    # that pandas removed in 1.0 and now rejects with SpecificationError.
    named_aggs = {f'{target}_{ag}': ag for ag in aggs}
    return df.groupby(key)[target].agg(**named_aggs).reset_index()

In [13]:
def data_processing(Input, isTrain = True):
    """Turn one trajectory CSV file into a single row of summary features.

    The file is read, its rows are reversed, and the columns are renamed
    positionally to ``ID, x, y, speed, direct, time``. Summary statistics are
    then computed both on the raw columns and on their row-to-row differences.

    Parameters
    ----------
    Input : str
        Path of the CSV file. The value in the first row / first column is
        used as the ID of the whole file (assumes one ID per file -- TODO
        confirm against the data).
    isTrain : bool, default True
        When True the file must carry a ``type`` column as its last column;
        its first-row value is stored as ``label`` and the column is dropped
        before feature extraction.

    Returns
    -------
    pandas.DataFrame
        One row per ID with, in order: ``ID``; max/min/mean/std/skew/sum of
        ``x``, ``y``, ``speed``, ``direct``; the same statistics of
        ``x_diff``, ``y_diff``, ``speed_diff``, ``direct_diff`` and
        ``time_seconds_diff``; ``x_span``, ``y_span``, ``speed_span``,
        ``direct_span``, ``slope``, ``area``; and ``label`` when ``isTrain``.
        If the file has a single row the difference statistics are NaN
        instead of the row being dropped.
    """
    stats = ['max', 'min', 'mean', 'std', 'skew', 'sum']
    raw_cols = ['x', 'y', 'speed', 'direct']
    diff_cols = ['x_diff', 'y_diff', 'speed_diff', 'direct_diff', 'time_seconds_diff']

    df = pd.read_csv(Input)
    ID = df.iloc[0, 0]
    if isTrain:
        label = df.iloc[0, -1]
        df = df.drop('type', axis=1)

    # Reverse the row order before differencing (presumably the files are
    # stored newest-first -- TODO confirm). Copy so the column writes below
    # act on an independent frame rather than on a slice.
    df = df.loc[::-1].copy()
    df.columns = ['ID', 'x', 'y', 'speed', 'direct', 'time']
    # The timestamps carry no year; parsing fills in the default year, which
    # is irrelevant here because only differences between rows are used.
    df['time'] = pd.to_datetime(df['time'], format="%m%d %H:%M:%S")

    # Row-to-row differences; the first row has no predecessor and is dropped.
    df_diff = df.diff(1).iloc[1:].copy()
    df_diff['ID'] = ID  # diff() turned the ID column into zeros; restore it
    df_diff.columns = ['ID', 'x_diff', 'y_diff', 'speed_diff', 'direct_diff', 'time_diff']
    df_diff['time_seconds_diff'] = df_diff['time_diff'].dt.total_seconds()

    raw_parts = [group_feature(df, 'ID', col, stats).set_index('ID') for col in raw_cols]
    # Align the diff statistics on the IDs found in the raw data. A plain
    # inner merge would silently drop the whole row when the file has a
    # single record (empty diff frame); reindexing keeps it with NaN values.
    anchor = raw_parts[0].index
    diff_parts = [
        group_feature(df_diff, 'ID', col, stats).set_index('ID').reindex(anchor)
        for col in diff_cols
    ]
    feature = pd.concat(raw_parts + diff_parts, axis=1).reset_index()

    # Value ranges over the whole file.
    feature['x_span'] = df['x'].max() - df['x'].min()
    feature['y_span'] = df['y'].max() - df['y'].min()
    feature['speed_span'] = df['speed'].max() - df['speed'].min()
    feature['direct_span'] = df['direct'].max() - df['direct'].min()
    # Guard against division by zero when x never changes.
    feature['slope'] = feature['y_span'] / np.where(feature['x_span'] == 0, 0.001, feature['x_span'])
    feature['area'] = feature['y_span'] * feature['x_span']

    if isTrain:
        feature['label'] = label
    return feature

In [14]:
# Build the test feature table: one feature row per file in test_path.
# os.listdir returns entries in arbitrary order, so sort them to make the
# row order of the resulting table reproducible between runs.
files = sorted(os.listdir(test_path))
feature_frames = []
count = 0
for file in files:
    file_path = os.path.join(test_path, file)
    # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints,
    # .DS_Store) that would otherwise make the CSV reader fail.
    if file.startswith('.') or not os.path.isfile(file_path):
        continue
    feature_frames.append(data_processing(file_path, False))
    count += 1
    if count % 100 == 0: print('处理第' + str(count) + '条数据')
# Concatenate once at the end; growing the frame inside the loop re-copies
# all previously collected rows on every iteration.
df = pd.concat(feature_frames) if feature_frames else None

处理第100条数据
处理第200条数据
处理第300条数据
处理第400条数据
处理第500条数据
处理第600条数据
处理第700条数据
处理第800条数据
处理第900条数据
处理第1000条数据
处理第1100条数据
处理第1200条数据
处理第1300条数据
处理第1400条数据
处理第1500条数据
处理第1600条数据
处理第1700条数据
处理第1800条数据
处理第1900条数据
处理第2000条数据


In [15]:
# Create the output directory first; to_csv does not create missing
# directories and would raise if ./test is absent on a fresh checkout.
os.makedirs('./test', exist_ok=True)
# index=False leaves the row index out of the file.
df.to_csv('./test/test.csv', index = False, encoding = 'utf-8')