In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import warnings

In [2]:
warnings.filterwarnings('ignore')
def extract_dt(df):
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    # df['month'] = df['time'].dt.month
    # df['day'] = df['time'].dt.day
    df['date'] = df['time'].dt.date
    df['hour'] = df['time'].dt.hour
    # df = df.drop_duplicates(['ship','month'])
    df['weekday'] = df['time'].dt.weekday
    return df

In [3]:
#cited from others
def group_feature(df, key, target, aggs):   
    agg_dict = {}
    for ag in aggs:
        agg_dict[f'{target}_{ag}'] = ag
    print(agg_dict)
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
    return t

def extract_feature(df, train_feature):
    t = group_feature(df, 'ship','x',['max','min','mean','std','skew','sum'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    t = group_feature(df, 'ship','x',['count'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    t = group_feature(df, 'ship','y',['max','min','mean','std','skew','sum'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    t = group_feature(df, 'ship','v',['max','min','mean','std','skew','sum'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    t = group_feature(df, 'ship','d',['max','min','mean','std','skew','sum'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    train_feature['x_max_x_min'] = train_feature['x_max'] - train_feature['x_min']
    train_feature['y_max_y_min'] = train_feature['y_max'] - train_feature['y_min']
    train_feature['y_max_x_min'] = train_feature['y_max'] - train_feature['x_min']
    train_feature['x_max_y_min'] = train_feature['x_max'] - train_feature['y_min']
    train_feature['slope'] = train_feature['y_max_y_min'] / np.where(train_feature['x_max_x_min']==0, 0.001, train_feature['x_max_x_min'])
    train_feature['area'] = train_feature['x_max_x_min'] * train_feature['y_max_y_min']
    
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()
    train_feature['mode_hour'] = train_feature['ship'].map(mode_hour)
    
    t = group_feature(df, 'ship','hour',['max','min'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    
    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train_feature['hour_nunique'] = train_feature['ship'].map(hour_nunique)
    train_feature['date_nunique'] = train_feature['ship'].map(date_nunique)

    t = df.groupby('ship')['time'].agg({'diff_time':lambda x:np.max(x)-np.min(x)}).reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    t['diff_second'] = t['diff_time'].dt.seconds
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    return train_feature

In [4]:
#this function will get an n-2 length series
def translation_forward(list_in,n):
    for i in range(n):
        list_in.pop(0)
    return list_in

def area_tri(groupbied):
    # area_triangle=[]change to dataframe
    # 创建空DataFrame
    # ship_id=groupbied['ship'].unique().item(0)
    if len(groupbied) > 129:
        x1_list=list(groupbied['x'])
        y1_list=list(groupbied['y'])
        x2_list=list(translation_forward(x1_list,64))
        y2_list=list(translation_forward(y1_list,64))
        x3_list=list(translation_forward(x2_list,64))
        y3_list=list(translation_forward(y2_list,64))
        # t=list(groupbied['time'])
        delta_t1=list(groupbied['time'])
        delta_t2=list(translation_forward(delta_t1,128))
        area_list=[]
        line_2_list=[]
        eff_a_l_list=[]
        eff_a_t_list=[]
        for x1, y1, x2, y2, x3, y3, diff_t1, diff_t2 in zip(x1_list,y1_list,x2_list,y2_list,x3_list,y3_list,delta_t1,delta_t2):
            a=np.sqrt((x2-x3)**2+(y2-y3)**2)
            b=np.sqrt((x1-x3)**2+(y1-y3)**2)
            c=np.sqrt((x1-x2)**2+(y1-y2)**2)
            s = (a + b + c) / 2
            l=a+c
            area=(s*(s-a)*(s-b)*(s-c)) ** 0.5
            diff_t=(diff_t2 - diff_t1)/np.timedelta64(1, 'h')
            eff_area_line=area/l if l!=0 else 0
            eff_area_diff_t=area /diff_t if diff_t!=0 else 0
            # area_triangle.append(s)
            area_list.append(area)
            line_2_list.append(l)
            eff_a_l_list.append(eff_area_line)
            eff_a_t_list.append(eff_area_diff_t)
        area_triangle = pd.DataFrame(list(zip(area_list, line_2_list, eff_a_l_list, eff_a_t_list)))
        area_triangle.columns = ['area_3','Line_2','eff_a_l','eff_a_t']  
        # area_triangle = area_triangle.append(temp_df,ignore_index=True)
    else:
        area_triangle=pd.DataFrame({'area_3':[0],'Line_2':[0],'eff_a_l':[0],'eff_a_t':[0]})
    return area_triangle


In [5]:
train = pd.read_hdf('../input/train.h5')
test = pd.read_hdf('../input/test.h5')
train_label = train.drop_duplicates('ship')
type_map = dict(zip(train_label['type'].unique(), np.arange(3)))
# rev means a reversed map relation
type_map_rev = {v:k for k,v in type_map.items()} 
train_label['type'] = train_label['type'].map(type_map)
train = extract_dt(train)
test = extract_dt(test)

In [6]:
#add more information about distribution on dist and time
train_order=train.groupby('ship').apply(lambda x: x.sort_values('time',ascending=True))
train_order.reset_index(level=0,drop=True,inplace=True)# save the inner index number step#1
train_order.index.name ='inner_rev_index'# save the inner index number step#2
train_order.reset_index(drop=False,inplace=True)#add 0-N index
print(train_order.index)
train_order['inner_ord_index']=train.index#use the former subIndex to Mark the t0-tN


RangeIndex(start=0, stop=2699638, step=1)


In [7]:
train_order['diff_time']=pd.DataFrame(train_order.groupby('ship').apply(lambda x:x['time'].diff()/np.timedelta64(1, 'h'))).reset_index()['time']
train_order['diff_dist']=pd.DataFrame(train_order.groupby('ship').apply(lambda x:np.sqrt(x['x'].diff()**2+x['y'].diff()**2))).reset_index()[0]
train_order['est_v']=pd.DataFrame(train_order.groupby('ship').apply(lambda x:x['diff_dist']/x['diff_time'])).reset_index()[0]
train_order['est_diff_dist']=pd.DataFrame(train_order.groupby('ship').apply(lambda x:x['v']*x['diff_time']*1852)).reset_index()[0]

In [8]:
train_moved=train_order.loc[train_order.groupby('ship').apply(lambda x:x['diff_dist']!=0 ).reset_index()['diff_dist']]
# train_moved['diff_time_between_moving']=pd.DataFrame(train_moved.groupby('ship').apply(lambda x:x['time'].diff()/np.timedelta64(1, 'h'))).reset_index()['time']
# train_moved
# 试着在外部先去掉相邻重复项 this is very important  train_order.groupby('ship').apply(lambda x:x['diff_dist']!=0 )
# df.loc[df.groupby(['CATEGORY'])['VALUE'].diff(1)!=0]This can drop off 连贯相同的项
# train_order=train_order.loc[train_order.groupby('ship').apply(lambda x:x['diff_dist']!=0 ).reset_index()['diff_dist']]
# train_order_drop

In [16]:
temp=pd.DataFrame(train_moved.groupby('ship').apply(area_tri)).reset_index()
# temp=train_order[train_order['ship']==1]['ship'].unique().item(0)
# temp=train_order.groupby('ship').apply(area_tri)
temp[temp['area_3']!=0]

Unnamed: 0,ship,level_1,area_3,Line_2,eff_a_l,eff_a_t


In [15]:
temp

Unnamed: 0,ship,level_1,area_3,Line_2,eff_a_l,eff_a_t
0,0,0,0.0,0.000000,0.0,0
1,1,0,0.0,7145.098121,0.0,0
2,1,1,0.0,7397.410623,0.0,0
3,1,2,0.0,7521.181457,0.0,0
4,1,3,0.0,7851.215858,0.0,0
5,1,4,0.0,8202.967113,0.0,0
6,1,5,0.0,8722.684613,0.0,0
7,1,6,0.0,9740.272351,0.0,0
8,1,7,0.0,10452.732758,0.0,0
9,1,8,0.0,9749.740423,0.0,0


In [10]:
# temp.describe()

In [11]:
# train_order[train_order['ship']==0]

In [12]:
# temp_test=train_order.groupby('ship').apply(lambda x:x['diff_dist']!=0 )
# temp_test=pd.DataFrame(temp)
# temp_test1=train_order.groupby('ship')['x'].diff()!=0
# temp_test1

In [13]:
# train_moved=train_order.groupby('ship').apply(lambda x:x['diff_dist']!=0 ).reset_index()['diff_dist']
# # temp_test3=temp_test2.reset_index(level=0,drop=True)
# temp_test2
# temp_test2=train_order[train_order['ship']==1]
# temp_test2

In [14]:
# b=len(train_moved)
# b
# drop_duplicates(subset=['ship','x','y'],keep='first',inplace=True).