In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import warnings
from pylab import *

In [2]:
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas/numpy deprecation notices; consider
# narrowing the filter to specific categories once the notebook runs clean.
warnings.simplefilter('ignore')
def extract_dt(df):
    """Parse the ``time`` column and add calendar columns derived from it.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column of strings in ``'%m%d %H:%M:%S'``
        form (month, day, then clock time; the format carries no year).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``time`` converted to datetimes and the new
        columns ``date``, ``hour`` and ``weekday`` appended in that order.
    """
    parsed = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df['time'] = parsed
    stamp = df['time'].dt
    derived = (
        ('date', stamp.date),
        ('hour', stamp.hour),
        ('weekday', stamp.weekday),
    )
    for column, values in derived:
        df[column] = values
    return df

In [3]:
# Adapted from code written by others.
def group_feature(df, key, target, aggs):
    """Aggregate ``df[target]`` per ``key`` group, one column per aggregation.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    key : str or list of str
        Column(s) to group by.
    target : str
        Column whose values are aggregated.
    aggs : iterable
        Aggregations accepted by ``SeriesGroupBy.agg`` (e.g. ``'max'``).

    Returns
    -------
    pandas.DataFrame
        ``key`` as ordinary column(s) followed by one column per aggregation,
        named ``f'{target}_{agg}'`` and ordered as in ``aggs``.
    """
    agg_dict = {f'{target}_{ag}': ag for ag in aggs}
    print(agg_dict)
    # Passing a {new_name: func} dict to SeriesGroupBy.agg was removed in
    # pandas 1.0 (SpecificationError: nested renamer is not supported).
    # Aggregate with a plain list and rename the columns positionally instead;
    # this works on both old and new pandas.
    t = df.groupby(key)[target].agg(list(agg_dict.values()))
    t.columns = list(agg_dict.keys())
    return t.reset_index()

def extract_feature(df, train_feature):
    """Append per-ship summary features computed from ``df`` to ``train_feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level records. Reads the columns ``ship``, ``x``, ``y``, ``v``,
        ``d``, ``hour``, ``date`` and ``time`` (``time`` must be datetime).
    train_feature : pandas.DataFrame
        One row per ship with a ``ship`` column; features are left-joined
        onto it. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``train_feature`` plus, per ship: max/min/mean/std/skew/sum of
        ``x``, ``y``, ``v``, ``d``; ``x_count``; bounding-box extents,
        ``slope`` and ``area``; ``mode_hour``, ``hour_max``, ``hour_min``,
        ``hour_nunique``, ``date_nunique``; and the observed time span as
        ``diff_time`` / ``diff_day`` / ``diff_second``.
    """
    stats = ['max', 'min', 'mean', 'std', 'skew', 'sum']
    # Same sequence of merges as before, so the resulting column order is unchanged.
    plan = (('x', stats), ('x', ['count']), ('y', stats), ('v', stats), ('d', stats))
    for target, aggs in plan:
        t = group_feature(df, 'ship', target, aggs)
        train_feature = pd.merge(train_feature, t, on='ship', how='left')

    # Bounding-box style features.
    train_feature['x_max_x_min'] = train_feature['x_max'] - train_feature['x_min']
    train_feature['y_max_y_min'] = train_feature['y_max'] - train_feature['y_min']
    train_feature['y_max_x_min'] = train_feature['y_max'] - train_feature['x_min']
    train_feature['x_max_y_min'] = train_feature['x_max'] - train_feature['y_min']
    # A zero x-extent is replaced by 0.001 to avoid dividing by zero.
    x_extent = np.where(train_feature['x_max_x_min'] == 0, 0.001, train_feature['x_max_x_min'])
    train_feature['slope'] = train_feature['y_max_y_min'] / x_extent
    train_feature['area'] = train_feature['x_max_x_min'] * train_feature['y_max_y_min']

    by_ship = df.groupby('ship')

    # Most frequent hour per ship (first entry of value_counts).
    mode_hour = by_ship['hour'].agg(lambda x: x.value_counts().index[0]).to_dict()
    train_feature['mode_hour'] = train_feature['ship'].map(mode_hour)

    t = group_feature(df, 'ship', 'hour', ['max', 'min'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')

    hour_nunique = by_ship['hour'].nunique().to_dict()
    date_nunique = by_ship['date'].nunique().to_dict()
    train_feature['hour_nunique'] = train_feature['ship'].map(hour_nunique)
    train_feature['date_nunique'] = train_feature['ship'].map(date_nunique)

    # Time span per ship. The previous agg({'diff_time': lambda ...}) form used
    # the dict renamer that pandas 1.0 removed; max - min is equivalent.
    span = by_ship['time'].max() - by_ship['time'].min()
    t = span.rename('diff_time').reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    # .dt.seconds is the seconds component within the last day, not the total.
    t['diff_second'] = t['diff_time'].dt.seconds
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    return train_feature

In [4]:
# Returns the sequence shifted forward by n: the first n items are dropped,
# leaving len(list_in) - n items.
def translation_forward(list_in,n):
    """Return a new list holding ``list_in`` without its first ``n`` elements.

    The input is left untouched. The earlier implementation popped items off
    the caller's list in place, so ``a`` and ``list(translation_forward(a, n))``
    ended up with identical contents, which defeats the purpose of a shift.

    Parameters
    ----------
    list_in : sequence
        Source values.
    n : int
        Number of leading elements to skip. Values <= 0 skip nothing;
        values larger than the length yield an empty list.

    Returns
    -------
    list
        A fresh list of length ``max(len(list_in) - n, 0)``.
    """
    # max() guards against a negative n being read as "count from the end".
    return list(list_in[max(n, 0):])

def area_tri(groupbied):
    """Triangle features from track points spaced 16 and 48 rows apart.

    For every row ``i`` that has a row ``i + 48`` after it, the triangle is
    formed by the points at rows ``i`` (p1), ``i + 16`` (p2) and ``i + 48`` (p3).

    The earlier version built the shifted lists with an in-place pop helper,
    which aliased p2 with p3 (and the two timestamps with each other), so every
    area and ratio came out as 0. Explicit slices fix that.

    Parameters
    ----------
    groupbied : pandas.DataFrame
        Rows of one group, in the order the triangles should follow, with
        numeric ``x`` / ``y`` columns and a datetime ``time`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``area_3`` (triangle area), ``Line_2`` (|p1p2| + |p2p3|),
        ``eff_a_l`` (area / Line_2) and ``eff_a_t`` (area / hours elapsed
        between p1 and p3); ratios are 0 where the denominator is 0.
        Groups with 49 rows or fewer yield a single all-zero row.
    """
    mid_lag, far_lag = 16, 48
    if len(groupbied) <= 49:
        return pd.DataFrame({'area_3':[0],'Line_2':[0],'eff_a_l':[0],'eff_a_t':[0]})

    xs = np.asarray(groupbied['x'], dtype=float)
    ys = np.asarray(groupbied['y'], dtype=float)
    ts = groupbied['time'].values
    n_tri = len(xs) - far_lag

    x1, y1 = xs[:n_tri], ys[:n_tri]
    x2, y2 = xs[mid_lag:mid_lag + n_tri], ys[mid_lag:mid_lag + n_tri]
    x3, y3 = xs[far_lag:], ys[far_lag:]

    a = np.hypot(x2 - x3, y2 - y3)  # |p2p3|
    c = np.hypot(x1 - x2, y1 - y2)  # |p1p2|
    line_2 = a + c
    # Cross-product area relative to p1. Unlike Heron's formula it never takes
    # the square root of a slightly negative number for near-collinear points.
    area = 0.5 * np.abs((x2 - x1) * (y3 - y1) - (x3 - x1) * (y2 - y1))
    diff_t = (ts[far_lag:] - ts[:n_tri]) / np.timedelta64(1, 'h')

    eff_a_l = np.divide(area, line_2, out=np.zeros_like(area), where=line_2 != 0)
    eff_a_t = np.divide(area, diff_t, out=np.zeros_like(area), where=diff_t != 0)

    return pd.DataFrame({
        'area_3': area,
        'Line_2': line_2,
        'eff_a_l': eff_a_l,
        'eff_a_t': eff_a_t,
    })

In [5]:
#add FFT
def fft_v(groupbied):
    """Summarise the FFT magnitude of ``v`` in four bands for one group.

    Parameters
    ----------
    groupbied : pandas.DataFrame
        Rows of one group with a numeric ``v`` column and a ``diff_time``
        column (time step between consecutive rows; its mean is used as the
        sampling interval).

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the ``describe()`` statistics of the
        magnitudes in each band, prefixed ``fft_low_``, ``fft_mid_l_``,
        ``fft_mid_h_`` and ``fft_high_`` (32 columns). A band that contains
        no bins yields count 0 and NaN for the other statistics.
    """
    nSampleNum = len(groupbied)
    sampleTime = groupbied['diff_time'].mean()
    ncount = (nSampleNum - 1) * sampleTime
    # A zero or NaN total duration gives inf/NaN here rather than an exception.
    with np.errstate(divide='ignore', invalid='ignore'):
        delta_f = np.float64(nSampleNum) / np.float64(ncount)
    # The unused time-axis np.linspace(0, sampleTime, ncount) was dropped:
    # ncount is a float and recent numpy rejects a non-integer `num`.
    freqLine = nSampleNum
    v = groupbied['v'].values
    # Magnitude of the full FFT output.
    # NOTE(review): for real-valued v the upper half mirrors the lower half -
    # confirm that keeping both halves is intended.
    spectrum = np.abs(np.fft.fft(v))[0:freqLine]
    with np.errstate(invalid='ignore'):
        fftx = np.linspace(0, delta_f * freqLine, freqLine)  # frequency-axis coordinate of each bin

    # NOTE(review): band edges are fractions of the sample count compared
    # against the frequency axis; kept as-is so feature values do not change.
    q1, q2, q3 = freqLine * 0.25, freqLine * 0.5, freqLine * 0.75
    bands = (
        ('fft_low_', fftx < q1),
        ('fft_mid_l_', (fftx >= q1) & (fftx < q2)),
        ('fft_mid_h_', (fftx >= q2) & (fftx < q3)),
        ('fft_high_', fftx >= q3),
    )
    # Series.describe() copes with an empty band; DataFrame.describe() raised.
    parts = [
        pd.Series(spectrum[mask], dtype=float).describe().to_frame().T.add_prefix(prefix)
        for prefix, mask in bands
    ]
    FFT_ = pd.concat(parts, axis=1)
    return FFT_

In [19]:
def feature_version2(df,feature_label):
    """Add per-ship FFT speed features computed from ``df`` to ``feature_label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level records with ``ship``, ``time`` (datetime), ``x``, ``y``
        and ``v`` columns. Not modified.
    feature_label : pandas.DataFrame
        One row per ship with a ``ship`` column.

    Returns
    -------
    pandas.DataFrame
        ``feature_label`` left-joined with the output of ``fft_v`` per ship
        (including the ``level_1`` column produced by ``reset_index``).
    """
    # Order rows by ship, then chronologically within each ship. A stable
    # two-key sort replaces groupby.apply(sort_values), whose index layout
    # and grouping-column handling vary between pandas versions.
    df_order = df.sort_values(['ship', 'time'], kind='mergesort')
    df_order.index.name = 'inner_rev_index'  # keep the original row label as a column
    df_order = df_order.reset_index(drop=False)  # fresh 0..N-1 index
    # Original index values assigned by position, as before.
    # NOTE(review): purpose unclear, unused below - confirm it is still needed.
    df_order['inner_ord_index'] = df.index.values

    # Index-aligned per-ship differences; the first row of each ship is NaN.
    by_ship = df_order.groupby('ship')
    df_order['diff_time'] = by_ship['time'].diff() / np.timedelta64(1, 'h')
    df_order['diff_dist'] = np.sqrt(by_ship['x'].diff() ** 2 + by_ship['y'].diff() ** 2)
    df_order['est_v'] = df_order['diff_dist'] / df_order['diff_time']
    # NOTE(review): 1852 looks like metres per nautical mile (v in knots?) - confirm.
    df_order['est_diff_dist'] = df_order['v'] * df_order['diff_time'] * 1852

    # Bug fix: this used the global `train_order`, so the result ignored `df`.
    # Only the columns fft_v reads are passed to apply.
    v_feature = df_order.groupby('ship')[['diff_time', 'v']].apply(fft_v).reset_index()
    feature_label = pd.merge(feature_label, v_feature, on='ship', how='left')
    return feature_label

In [None]:
train = pd.read_hdf('../input/train.h5')
test = pd.read_hdf('../input/test.h5')
# One row per ship (first occurrence). Taken before extract_dt, so its `time`
# column keeps the raw string. .copy() makes it an independent frame so the
# column assignment below is not applied to a derived slice of `train`.
train_label = train.drop_duplicates('ship').copy()
# Encode each distinct type as 0..k-1 in order of first appearance. The range
# is sized from the data instead of a hard-coded 3, so an unexpected extra
# class is not silently mapped to NaN.
ship_types = train_label['type'].unique()
type_map = dict(zip(ship_types, np.arange(len(ship_types))))
# rev means a reversed map relation
type_map_rev = {v:k for k,v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)
train = extract_dt(train)
test = extract_dt(test)

In [7]:
# Add more information about the distribution of distance and time.
# Order rows by ship, then chronologically within each ship. A stable two-key
# sort replaces groupby.apply(sort_values): apply operating on the grouping
# column is deprecated since pandas 2.2 and its index layout is version-dependent.
train_order = train.sort_values(['ship', 'time'], kind='mergesort')
train_order.index.name = 'inner_rev_index'  # keep the original row label as a column
train_order = train_order.reset_index(drop=False)  # fresh 0..N-1 index
print(train_order.index)
# Original index values assigned by position, as before.
# NOTE(review): purpose unclear - confirm this column is still needed.
train_order['inner_ord_index'] = train.index.values


RangeIndex(start=0, stop=2699638, step=1)


In [8]:
# Per-ship step features. groupby().diff() returns values aligned on
# train_order's own index (first row of each ship is NaN), so nothing depends
# on the positional order of a groupby.apply(...).reset_index() result.
train_order['diff_time'] = train_order.groupby('ship')['time'].diff() / np.timedelta64(1, 'h')  # hours
train_order['diff_dist'] = np.sqrt(
    train_order.groupby('ship')['x'].diff() ** 2 + train_order.groupby('ship')['y'].diff() ** 2
)
train_order['est_v'] = train_order['diff_dist'] / train_order['diff_time']
# NOTE(review): 1852 looks like metres per nautical mile (v in knots?) - confirm.
train_order['est_diff_dist'] = train_order['v'] * train_order['diff_time'] * 1852

In [9]:
# FFT band statistics of `v` for every ship; reset_index turns the group key
# into a `ship` column and the inner row label into `level_1`.
v_feature = (
    train_order
    .groupby('ship')
    .apply(fft_v)
    .pipe(pd.DataFrame)
    .reset_index()
)
v_feature

Unnamed: 0,ship,level_1,fft_low_count,fft_low_mean,fft_low_std,fft_low_min,fft_low_25%,fft_low_50%,fft_low_75%,fft_low_max,...,fft_mid_h_75%,fft_mid_h_max,fft_high_count,fft_high_mean,fft_high_std,fft_high_min,fft_high_25%,fft_high_50%,fft_high_75%,fft_high_max
0,0,0,18.0,83.887142,20.625293,47.175858,67.579416,88.184464,101.971765,110.11,...,13.395191,20.741462,360.0,11.629517,17.445930,0.359979,4.511088,7.045699,10.342505,107.015931
1,1,0,18.0,147.012768,153.170569,30.443463,74.020849,99.350370,146.822637,619.05,...,17.370267,26.003670,331.0,14.728231,33.473458,0.545854,5.235012,8.332708,12.514176,466.919493
2,2,0,18.0,57.564297,22.687140,29.253883,46.006162,55.918018,62.849070,138.67,...,59.624705,64.354406,179.0,51.045664,10.059779,27.399145,43.608261,51.484586,58.068341,73.207900
3,3,0,18.0,126.425416,118.286181,28.043566,50.211547,88.676866,154.085054,492.90,...,28.903377,35.314139,281.0,24.332056,29.207781,1.324192,11.032982,20.093767,26.913541,336.832017
4,4,0,18.0,123.666232,125.777353,7.531766,56.821201,91.643426,137.658744,566.30,...,37.834128,89.315933,347.0,32.401890,27.448425,1.809252,17.553343,25.929871,36.939714,262.391041
5,5,0,18.0,152.375111,186.085091,19.272286,49.139904,70.594554,133.495176,642.68,...,28.891941,34.058088,321.0,23.905038,41.160528,1.723069,11.805141,17.936838,23.478767,533.047879
6,6,0,18.0,103.670156,343.094106,3.943101,16.548537,21.650586,29.035464,1477.81,...,21.624657,33.631489,340.0,12.764592,7.182024,0.200111,7.437246,11.667549,16.700356,46.042178
7,7,0,15.0,269.208814,403.150860,36.177491,74.082905,126.441049,213.185323,1561.66,...,52.661721,62.156858,303.0,30.190478,53.769670,1.377165,13.030203,20.926804,32.271687,787.845823
8,8,0,18.0,94.020859,282.215158,8.456955,14.788280,26.157642,40.810715,1222.94,...,27.445733,44.944910,312.0,20.560298,11.760064,1.261237,11.461559,18.137247,28.883798,65.612300
9,9,0,18.0,0.949064,1.034886,0.251119,0.575736,0.678352,0.921383,4.97,...,0.654633,1.439506,343.0,0.473611,0.257234,0.022489,0.274523,0.474169,0.602427,1.439506


In [20]:
# Per-ship summary features joined onto the label table.
train_feature = extract_feature(df=train, train_feature=train_label)

{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum'}
{'x_count': 'count'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum'}
{'d_max': 'max', 'd_min': 'min', 'd_mean': 'mean', 'd_std': 'std', 'd_skew': 'skew', 'd_sum': 'sum'}
{'hour_max': 'max', 'hour_min': 'min'}


In [21]:
# Left-join the FFT features onto the per-ship table.
# NOTE(review): re-running this cell merges a second time and yields
# suffixed duplicate columns - re-run the extract_feature cell first.
train_feature = train_feature.merge(v_feature, how='left', on='ship')

In [22]:
train_feature
# Disabled experiment: keep only rows where diff_dist != 0, then compute the
# triangle features (area_tri) per ship.
# train_moved=train_order.loc[train_order.groupby('ship').apply(lambda x:x['diff_dist']!=0 ).reset_index()['diff_dist']]
# # df.loc[df.groupby(['CATEGORY'])['VALUE'].diff(1)!=0] This can drop consecutive identical items
# temp=pd.DataFrame(train_moved.groupby('ship').apply(area_tri)).reset_index()
# temp[temp['area_3']!=0]

Unnamed: 0,ship,x,y,v,d,time,type,x_max,x_min,x_mean,...,fft_mid_h_75%,fft_mid_h_max,fft_high_count,fft_high_mean,fft_high_std,fft_high_min,fft_high_25%,fft_high_50%,fft_high_75%,fft_high_max
0,0,6.152038e+06,5.124873e+06,2.59,102,1110 11:58:19,0,6.152038e+06,6.118352e+06,6.119351e+06,...,1.339519e+01,2.074146e+01,360.0,1.162952e+01,1.744593e+01,0.359979,4.511088e+00,7.045699e+00,1.034251e+01,1.070159e+02
1,1,6.076254e+06,5.061743e+06,3.99,278,1110 11:40:21,0,6.102450e+06,6.049472e+06,6.091460e+06,...,1.737027e+01,2.600367e+01,331.0,1.472823e+01,3.347346e+01,0.545854,5.235012e+00,8.332708e+00,1.251418e+01,4.669195e+02
2,10,6.321032e+06,5.242805e+06,4.48,213,1110 11:49:36,0,6.346913e+06,6.246119e+06,6.262484e+06,...,2.432696e+01,3.124821e+01,343.0,2.235766e+01,3.195142e+01,1.392921,1.085780e+01,1.589844e+01,2.453043e+01,4.065747e+02
3,100,6.102751e+06,5.112534e+06,0.00,0,1030 23:50:05,0,6.151439e+06,6.102326e+06,6.123711e+06,...,2.476046e+01,3.123140e+01,357.0,1.738014e+01,2.066054e+01,0.769223,8.513133e+00,1.266351e+01,1.954562e+01,2.065805e+02
4,1000,6.843713e+06,5.480538e+06,2.00,216,1106 23:42:30,1,6.844414e+06,6.748890e+06,6.807536e+06,...,5.033149e+01,1.014254e+02,323.0,2.888895e+01,3.050078e+01,2.200777,1.384692e+01,2.113300e+01,2.956473e+01,2.500879e+02
5,1001,6.246424e+06,5.241153e+06,0.32,279,1117 11:53:50,0,6.275264e+06,6.246229e+06,6.253311e+06,...,3.416412e+01,4.733928e+01,364.0,2.536922e+01,3.041609e+01,1.634451,1.334161e+01,2.091369e+01,2.895814e+01,4.681524e+02
6,1002,6.403154e+06,5.383851e+06,2.70,359,1117 11:58:35,1,6.443405e+06,6.359137e+06,6.399824e+06,...,3.828503e+01,7.393997e+01,352.0,2.443222e+01,2.315053e+01,0.547302,1.247065e+01,1.911090e+01,2.856273e+01,2.318391e+02
7,1003,7.059754e+06,6.104156e+06,0.00,342,1120 23:59:54,1,7.060459e+06,7.013211e+06,7.030812e+06,...,2.801426e+01,5.139731e+01,342.0,2.653348e+01,2.010927e+01,1.150801,1.371237e+01,2.209159e+01,3.331190e+01,1.514083e+02
8,1004,6.596109e+06,6.049142e+06,3.02,30,1103 11:54:52,0,6.655167e+06,6.356943e+06,6.485758e+06,...,4.956202e+01,7.374579e+01,315.0,3.519376e+01,5.196774e+01,1.069541,1.848354e+01,2.801728e+01,4.197086e+01,8.466499e+02
9,1005,6.233508e+06,5.251609e+06,0.22,0,1106 23:51:36,0,6.272278e+06,6.233407e+06,6.234547e+06,...,1.900114e+01,2.998286e+01,332.0,1.546711e+01,2.565770e+01,0.224413,5.886438e+00,9.181089e+00,1.348359e+01,1.689531e+02


In [11]:
# t = np.arange(0, 1.0, 1.0/100)
# y = t+0.25
# a=pd.DataFrame(t[(y>0.5) & (y<0.6)])
# a.describe().add_prefix('prefix').T
# b=pd.concat([a, a], axis=1)
# b
# DataFrame
# temp1=(y>0.5)
# temp2=(y<0.6)
# temp3=temp1 and temp2
# temp3

In [12]:
# a=fft_v(train_order[train_order['ship']==0])
# a['ship']=1
# a