In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import warnings

In [2]:
warnings.filterwarnings('ignore')
train = pd.read_hdf('../input/train.h5')
train_label = train.drop_duplicates('ship')
type_map = dict(zip(train_label['type'].unique(), np.arange(3)))
# rev means a reversed map relation
type_map_rev = {v:k for k,v in type_map.items()} 
train_label['type'] = train_label['type'].map(type_map)

def extract_dt(df):
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    # df['month'] = df['time'].dt.month
    # df['day'] = df['time'].dt.day
    df['date'] = df['time'].dt.date
    df['hour'] = df['time'].dt.hour
    # df = df.drop_duplicates(['ship','month'])
    df['weekday'] = df['time'].dt.weekday
    return df
train = extract_dt(train)

In [3]:
def group_feature(df, key, target, aggs):   
    agg_dict = {}
    for ag in aggs:
        agg_dict[f'{target}_{ag}'] = ag
    print(agg_dict)
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
    return t

def extract_feature(df, train_feature):
    t = group_feature(df, 'ship','x',['max','min','mean','std','skew','sum'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    t = group_feature(df, 'ship','x',['count'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    t = group_feature(df, 'ship','y',['max','min','mean','std','skew','sum'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    t = group_feature(df, 'ship','v',['max','min','mean','std','skew','sum'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    t = group_feature(df, 'ship','d',['max','min','mean','std','skew','sum'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    train_feature['x_max_x_min'] = train_feature['x_max'] - train_feature['x_min']
    train_feature['y_max_y_min'] = train_feature['y_max'] - train_feature['y_min']
    train_feature['y_max_x_min'] = train_feature['y_max'] - train_feature['x_min']
    train_feature['x_max_y_min'] = train_feature['x_max'] - train_feature['y_min']
    train_feature['slope'] = train_feature['y_max_y_min'] / np.where(train_feature['x_max_x_min']==0, 0.001, train_feature['x_max_x_min'])
    train_feature['area'] = train_feature['x_max_x_min'] * train_feature['y_max_y_min']
    
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()
    train_feature['mode_hour'] = train_feature['ship'].map(mode_hour)
    
    t = group_feature(df, 'ship','hour',['max','min'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    
    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train_feature['hour_nunique'] = train_feature['ship'].map(hour_nunique)
    train_feature['date_nunique'] = train_feature['ship'].map(date_nunique)

    t = df.groupby('ship')['time'].agg({'diff_time':lambda x:np.max(x)-np.min(x)}).reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    t['diff_second'] = t['diff_time'].dt.seconds
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    return train_feature

In [82]:
#this function will get an n-2 length series
def translation_forward(list_in,n):
    for i in range(n):
        list_in.pop(0)
    return list_in

def area_tri(groupbied):
    area_triangle=[]
    x1_list=list(groupbied['x'])
    y1_list=list(groupbied['y'])
    x2_list=list(translation_forward(x1_list,1))
    y2_list=list(translation_forward(y1_list,1))
    x3_list=list(translation_forward(x2_list,1))
    y3_list=list(translation_forward(y2_list,1))
    for x1, y1, x2, y2, x3, y3 in zip(x1_list,y1_list,x2_list,y2_list,x3_list,y3_list):
        a=np.sqrt((x2-x3)**2+(y2-y3)**2)
        b=np.sqrt((x1-x3)**2+(y1-y3)**2)
        c=np.sqrt((x1-x2)**2+(y1-y2)**2)
        s = (a + b + c) / 2
        area_triangle.append(s)
    return area_triangle


In [4]:
train = pd.read_hdf('../input/train.h5')
test = pd.read_hdf('../input/test.h5')
train = extract_dt(train)
test = extract_dt(test)

In [5]:
#add more information about distribution on dist and time
train_order=train.groupby('ship').apply(lambda x: x.sort_values('time',ascending=True))
train_order.reset_index(level=0,drop=True,inplace=True)# save the inner index number step#1
train_order.index.name ='inner_rev_index'# save the inner index number step#2
train_order.reset_index(drop=False,inplace=True)#add 0-N index
print(train_order.index)
train_order['inner_ord_index']=train.index#use the former subIndex to Mark the t0-tN


RangeIndex(start=0, stop=2699638, step=1)


In [16]:
train_order['diff_time']=pd.DataFrame(train_order.groupby('ship').apply(lambda x:x['time'].diff()/np.timedelta64(1, 'h'))).reset_index()['time']
train_order['diff_dist']=pd.DataFrame(train_order.groupby('ship').apply(lambda x:np.sqrt(x['x'].diff()**2+x['y'].diff()**2))).reset_index()[0]
train_order['est_v']=pd.DataFrame(train_order.groupby('ship').apply(lambda x:x['diff_dist']/x['diff_time'])).reset_index()[0]
train_order['est_diff_dist']=pd.DataFrame(train_order.groupby('ship').apply(lambda x:x['v']*x['diff_time']*1852)).reset_index()[0]

In [22]:
temp=pd.DataFrame(train_order.groupby('ship').apply(lambda x:len(x))).reset_index()

In [31]:
temp.iloc[1]  #

ship      1
0       385
Name: 1, dtype: int64

In [84]:
temp=train_order.groupby('ship').apply(area_tri)

In [86]:
temp

ship
0       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3       [0.0, 0.0, 1162.9152806630425, 992.94362556649...
4       [0.0, 93.27571855040479, 93.27571855040479, 0....
5       [0.0, 0.0, 0.0, 0.0, 98.09530262414417, 98.095...
6       [1157.6321676505852, 1338.9765813810914, 1004....
7       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
8       [1061.1585352788256, 1190.4300084463305, 1153....
9       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
10      [0.0, 109.42747206936704, 109.42747206936704, ...
11      [0.0, 0.0, 109.46282580257194, 0.0, 0.0, 0.0, ...
12      [954.8002989585325, 954.8168001355841, 1045.40...
13      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
14      [0.0, 109.42851863034547, 0.0, 109.42851863034...
15      [148.39487907083213, 98.21489180117148, 148.39...
16      [775.3958867179202, 768.9367076392092, 768.951...
17      [