In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import warnings
from pylab import *

In [2]:
warnings.filterwarnings('ignore')


def extract_dt(df):
    """Parse the ``time`` column and add calendar columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column of strings shaped like ``'%m%d %H:%M:%S'``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``time`` converted to
        datetimes and ``date``, ``hour`` and ``weekday`` columns added.
    """
    # NOTE(review): the format string carries no year component, so the parsed
    # year (and therefore the absolute weekday) is whatever pandas defaults to
    # -- confirm this is acceptable for the weekday feature.
    timestamps = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df['time'] = timestamps

    parts = timestamps.dt
    df['date'] = parts.date
    df['hour'] = parts.hour
    df['weekday'] = parts.weekday
    return df

In [3]:
#cited from others
def group_feature(df, key, target, aggs):
    """Aggregate one column per group and name the results ``{target}_{agg}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    key : str or list of str
        Column(s) to group by.
    target : str
        Column to aggregate.
    aggs : iterable
        Aggregations understood by ``GroupBy.agg`` (e.g. ``'max'``, ``'std'``).
        Duplicates collapse to a single output column.

    Returns
    -------
    pandas.DataFrame
        One row per group: the key column(s) followed by one column per
        aggregation, named ``f'{target}_{agg}'``.
    """
    agg_dict = {}
    for ag in aggs:
        agg_dict[f'{target}_{ag}'] = ag
    print(agg_dict)
    # Passing a renaming dict to SeriesGroupBy.agg raises SpecificationError
    # ("nested renamer is not supported") on pandas >= 1.0, so aggregate with a
    # plain list and assign the output names afterwards (same order as the dict).
    t = df.groupby(key)[target].agg(list(agg_dict.values()))
    t.columns = list(agg_dict.keys())
    return t.reset_index()

def extract_feature(df, train_feature):
    """Merge per-ship summary features computed from ``df`` onto ``train_feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Track rows with ``ship``, ``x``, ``y``, ``v``, ``d``, ``time`` (datetime),
        ``hour`` and ``date`` columns.
    train_feature : pandas.DataFrame
        Frame with a ``ship`` column; every feature is left-joined onto it.

    Returns
    -------
    pandas.DataFrame
        ``train_feature`` plus: max/min/mean/std/skew/sum of x, y, v and d, the
        row count (``x_count``), bounding-box differences, ``slope``, ``area``,
        hour statistics, distinct hour/date counts and the observed time span
        (``diff_time``, ``diff_day``, ``diff_second``).
    """
    stats = ['max', 'min', 'mean', 'std', 'skew', 'sum']
    # (column, aggregations) in the order the resulting columns are appended.
    plan = [
        ('x', stats),
        ('x', ['count']),
        ('y', stats),
        ('v', stats),
        ('d', stats),
    ]
    for target, aggs in plan:
        t = group_feature(df, 'ship', target, aggs)
        train_feature = pd.merge(train_feature, t, on='ship', how='left')

    train_feature['x_max_x_min'] = train_feature['x_max'] - train_feature['x_min']
    train_feature['y_max_y_min'] = train_feature['y_max'] - train_feature['y_min']
    train_feature['y_max_x_min'] = train_feature['y_max'] - train_feature['x_min']
    train_feature['x_max_y_min'] = train_feature['x_max'] - train_feature['y_min']
    # A zero x-extent is replaced by 0.001 so the slope stays finite.
    x_extent = np.where(train_feature['x_max_x_min'] == 0, 0.001, train_feature['x_max_x_min'])
    train_feature['slope'] = train_feature['y_max_y_min'] / x_extent
    train_feature['area'] = train_feature['x_max_x_min'] * train_feature['y_max_y_min']

    # Most frequent hour per ship (first entry of value_counts).
    mode_hour = df.groupby('ship')['hour'].agg(lambda x: x.value_counts().index[0]).to_dict()
    train_feature['mode_hour'] = train_feature['ship'].map(mode_hour)

    t = group_feature(df, 'ship', 'hour', ['max', 'min'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')

    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train_feature['hour_nunique'] = train_feature['ship'].map(hour_nunique)
    train_feature['date_nunique'] = train_feature['ship'].map(date_nunique)

    # Time span per ship. The previous agg({'diff_time': lambda ...}) form is a
    # renaming dict, which pandas >= 1.0 rejects with SpecificationError.
    time_by_ship = df.groupby('ship')['time']
    t = (time_by_ship.max() - time_by_ship.min()).rename('diff_time').reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    # .dt.seconds is the seconds component left over after whole days.
    t['diff_second'] = t['diff_time'].dt.seconds
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    return train_feature

In [4]:
# Returns a copy of the input with its first n items dropped (length len(list_in) - n).
def translation_forward(list_in,n):
    """Return ``list_in`` shifted forward by ``n`` positions, as a new list.

    The input is left untouched. The earlier implementation popped items off
    the caller's list, so every "shifted" copy aliased the same shrinking
    data and the original sequence was lost.

    Parameters
    ----------
    list_in : sequence
        Values to shift.
    n : int
        Number of leading items to drop. Values <= 0 drop nothing; values
        larger than the length give an empty list.

    Returns
    -------
    list
        ``list_in[n:]`` as a fresh list.
    """
    return list(list_in)[max(n, 0):]

def area_tri(groupbied):
    """Triangle features over sliding triples of track points.

    For every row ``i`` with at least 48 rows after it, a triangle is formed
    from the points at rows ``i``, ``i + 16`` and ``i + 48``.

    Parameters
    ----------
    groupbied : pandas.DataFrame
        Rows of one track with numeric ``x``, ``y`` and datetime ``time``
        columns. NOTE(review): rows are presumably already time-ordered --
        confirm at the call site.

    Returns
    -------
    pandas.DataFrame
        Columns ``area_3`` (Heron area), ``Line_2`` (length of side 1-2 plus
        side 2-3), ``eff_a_l`` (area / Line_2, 0 when Line_2 is 0) and
        ``eff_a_t`` (area / hours between rows i and i+48, 0 when that is 0).
        One row per triangle; a single all-zero row when the input has fewer
        than 50 rows.
    """
    mid_lag, end_lag = 16, 48
    columns = ['area_3', 'Line_2', 'eff_a_l', 'eff_a_t']

    if len(groupbied) <= 49:
        return pd.DataFrame({'area_3':[0],'Line_2':[0],'eff_a_l':[0],'eff_a_t':[0]})

    xs = np.asarray(groupbied['x'], dtype=float)
    ys = np.asarray(groupbied['y'], dtype=float)
    times = groupbied['time'].values
    n_tri = len(xs) - end_lag

    # Independent slices of the original arrays. The earlier list-popping
    # version mutated its inputs, so vertex 2 always equalled vertex 3 and the
    # two time lists were identical -- every area and time delta came out 0.
    x1, y1 = xs[:n_tri], ys[:n_tri]
    x2, y2 = xs[mid_lag:mid_lag + n_tri], ys[mid_lag:mid_lag + n_tri]
    x3, y3 = xs[end_lag:], ys[end_lag:]

    a = np.hypot(x2 - x3, y2 - y3)
    b = np.hypot(x1 - x3, y1 - y3)
    c = np.hypot(x1 - x2, y1 - y2)
    s = (a + b + c) / 2
    line_2 = a + c
    # Rounding can push Heron's radicand slightly below zero for degenerate
    # (collinear) triangles; clamp so the result is 0 instead of NaN.
    area = np.sqrt(np.clip(s * (s - a) * (s - b) * (s - c), 0, None))

    diff_t = (times[end_lag:] - times[:n_tri]) / np.timedelta64(1, 'h')

    with np.errstate(divide='ignore', invalid='ignore'):
        eff_a_l = np.where(line_2 != 0, area / line_2, 0.0)
        eff_a_t = np.where(diff_t != 0, area / diff_t, 0.0)

    return pd.DataFrame(
        {'area_3': area, 'Line_2': line_2, 'eff_a_l': eff_a_l, 'eff_a_t': eff_a_t},
        columns=columns,
    )

In [None]:
#add FFT
def fft_v(groupbied):
    """Summarise the FFT magnitude of ``v`` in four bands.

    Parameters
    ----------
    groupbied : pandas.DataFrame
        Rows of one track with a ``v`` column and a ``diff_time`` column
        (the mean of ``diff_time`` is used as the sampling interval).

    Returns
    -------
    pandas.DataFrame
        A single row holding ``describe()`` statistics of the FFT magnitudes
        in each band, with column prefixes ``fft_low_``, ``fft_mid_l_``,
        ``fft_mid_h_`` and ``fft_high_``.
    """
    n_samples = len(groupbied)
    sample_time = groupbied['diff_time'].mean()
    span = (n_samples - 1) * sample_time
    # A single-row track or an all-zero diff_time makes ``span`` 0 or NaN; let
    # the division produce inf/NaN quietly instead of raising.
    with np.errstate(divide='ignore', invalid='ignore'):
        delta_f = np.float64(n_samples) / np.float64(span)

    # The time-axis array built here previously was never used, and its
    # np.linspace call was given a float ``num``, which raises TypeError on
    # current NumPy; it has been removed.
    magnitudes = np.abs(np.fft.fft(groupbied['v'].values))
    with np.errstate(invalid='ignore'):
        freq_axis = np.linspace(0, delta_f * n_samples, n_samples)

    # NOTE(review): the band edges compare frequency-axis values against
    # fractions of the sample count -- confirm this mix of units is intended.
    bands = [
        ('fft_low_', freq_axis < n_samples * 0.25),
        ('fft_mid_l_', (freq_axis >= n_samples * 0.25) & (freq_axis < n_samples * 0.5)),
        ('fft_mid_h_', (freq_axis >= n_samples * 0.5) & (freq_axis < n_samples * 0.75)),
        ('fft_high_', freq_axis >= n_samples * 0.75),
    ]
    summaries = [
        pd.DataFrame(magnitudes[mask]).describe().T.add_prefix(prefix)
        for prefix, mask in bands
    ]
    return pd.concat(summaries, axis=1)

In [None]:
def feature_version2(df,feature_label):
    """Add FFT-of-speed features and start/end coordinates per ship.

    Parameters
    ----------
    df : pandas.DataFrame
        Track rows with ``ship``, ``time`` (datetime), ``x``, ``y`` and ``v``.
        The frame is not modified.
    feature_label : pandas.DataFrame
        Frame with a ``ship`` column; the new features are left-joined onto it.

    Returns
    -------
    pandas.DataFrame
        ``feature_label`` plus the ``fft_*`` columns produced by ``fft_v`` and
        ``start_x``/``start_y`` (earliest row per ship) and ``end_x``/``end_y``
        (latest row per ship).

    Notes
    -----
    The earlier version found the start/end rows through the values of the
    incoming index (``inner_rev_index`` / ``inner_ord_index``), which only
    works when ``df`` is grouped by ship and each ship's index restarts at 0.
    The rows are now picked by time order, independent of the index.
    """
    df_order = df.sort_values(['ship', 'time']).reset_index(drop=True)

    # Per-ship step features; the first row of every ship is NaN.
    by_ship = df_order.groupby('ship', sort=False)
    step_hours = by_ship['time'].diff() / np.timedelta64(1, 'h')
    step_dist = np.sqrt(by_ship['x'].diff() ** 2 + by_ship['y'].diff() ** 2)
    df_order['diff_time'] = step_hours
    df_order['diff_dist'] = step_dist
    df_order['est_v'] = df_order['diff_dist'] / df_order['diff_time']
    # NOTE(review): 1852 looks like a nautical-mile-to-metre factor -- confirm
    # the unit of ``v``.
    df_order['est_diff_dist'] = df_order['v'] * df_order['diff_time'] * 1852

    # One FFT summary row per ship, tagged with its ship id.
    spectra = []
    for ship, track in df_order.groupby('ship'):
        summary = fft_v(track)
        summary.insert(0, 'ship', ship)
        spectra.append(summary)
    if spectra:
        v_feature = pd.concat(spectra, ignore_index=True)
        feature_label = pd.merge(feature_label, v_feature, on='ship', how='left')

    coords = ['ship', 'x', 'y']
    temp = (
        df_order.drop_duplicates('ship', keep='first')[coords]
        .reset_index(drop=True)
        .rename(columns={'x':'start_x', 'y':'start_y'})
    )
    feature_label = pd.merge(feature_label, temp, on='ship', how='left')
    temp = (
        df_order.drop_duplicates('ship', keep='last')[coords]
        .reset_index(drop=True)
        .rename(columns={'x':'end_x', 'y':'end_y'})
    )
    feature_label = pd.merge(feature_label, temp, on='ship', how='left')
    return feature_label

In [None]:
train = pd.read_hdf('../input/train.h5')
test = pd.read_hdf('../input/test.h5')
# One row per ship carries that ship's label. Copy so the column assignment
# below writes to an independent frame rather than a slice of `train`.
train_label = train.drop_duplicates('ship').copy()
# Size the code range from the data: a hard-coded 3 would silently leave any
# additional type unmapped (NaN after .map).
ship_types = train_label['type'].unique()
type_map = dict(zip(ship_types, np.arange(len(ship_types))))
# rev means a reversed map relation
type_map_rev = {v:k for k,v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)
train = extract_dt(train)
test = extract_dt(test)

In [None]:
# Build a per-ship, time-ascending copy of `train` for distance/time features.
_sorted_per_ship = train.groupby('ship').apply(
    lambda track: track.sort_values('time', ascending=True)
)
train_order = (
    _sorted_per_ship
    .reset_index(level=0, drop=True)   # drop the ship level, keep the original row labels
    .rename_axis('inner_rev_index')    # name those labels ...
    .reset_index(drop=False)           # ... and move them into a column; index becomes 0..N-1
)
print(train_order.index)
# Positional assignment of the original index values onto the sorted rows.
# NOTE(review): this presumably relies on `train` being grouped by ship with a
# per-ship 0..n-1 index -- confirm against the input file.
train_order['inner_ord_index'] = train.index


RangeIndex(start=0, stop=2699638, step=1)


In [None]:
# Per-ship step features. GroupBy.diff is index-aligned, so no groupby-apply /
# reset_index / positional column pick is needed; the first row of every ship is NaN.
ship_groups = train_order.groupby('ship', sort=False)
train_order['diff_time'] = ship_groups['time'].diff() / np.timedelta64(1, 'h')
train_order['diff_dist'] = np.sqrt(ship_groups['x'].diff() ** 2 + ship_groups['y'].diff() ** 2)
train_order['est_v'] = train_order['diff_dist'] / train_order['diff_time']
train_order['est_diff_dist'] = train_order['v'] * train_order['diff_time'] * 1852

In [None]:
# FFT band statistics of `v`, computed by fft_v once per ship.
v_feature = pd.DataFrame(
    train_order
    .groupby('ship')
    .apply(fft_v)
)

In [None]:
# Move the outermost index level of v_feature into a regular column.
temp = v_feature.reset_index(level=0, drop=False)

In [None]:
# Preview only: rendering the full per-ship frame bloats the notebook output.
temp.head()

In [None]:
train_feature = extract_feature(train, train_label)
# Chain on train_feature: passing train_label here would throw away every
# column extract_feature just added.
train_feature = feature_version2(train, train_feature)
len(train_feature.columns)

In [None]:
# train_moved=train_order.loc[train_order.groupby('ship').apply(lambda x:x['diff_dist']!=0 ).reset_index()['diff_dist']]
# # df.loc[df.groupby(['CATEGORY'])['VALUE'].diff(1)!=0]  -- this drops consecutive repeated rows
# temp=pd.DataFrame(train_moved.groupby('ship').apply(area_tri)).reset_index()
# temp[temp['area_3']!=0]

In [None]:
# Coordinates of the row flagged inner_ord_index == 0 for every ship,
# renamed so they can be merged as start_x / start_y.
_start_rows = train_order.loc[train_order['inner_ord_index'] == 0, ['ship', 'x', 'y']]
temp_2 = (
    _start_rows
    .reset_index(drop=True)
    .rename(columns={'x':'start_x', 'y':'start_y'})
)
temp_2

In [None]:
# t = np.arange(0, 1.0, 1.0/100)
# y = t+0.25
# a=pd.DataFrame(t[(y>0.5) & (y<0.6)])
# a.describe().add_prefix('prefix').T
# b=pd.concat([a, a], axis=1)
# b
# DataFrame
# temp1=(y>0.5)
# temp2=(y<0.6)
# temp3=temp1 and temp2
# temp3

In [None]:
# a=fft_v(train_order[train_order['ship']==0])
# a['ship']=1
# a