In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import warnings

In [2]:
# Silence every warning for the rest of the session. This also hides any
# deprecation / chained-assignment warnings raised by the cells below.
warnings.filterwarnings('ignore')
def extract_dt(df):
    """Parse the ``time`` column and derive calendar columns from it.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column of strings laid out as
        ``'%m%d %H:%M:%S'`` (month, day, then clock time).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``time`` converted to datetimes and the columns
        ``date``, ``hour`` and ``weekday`` added (in that order).

    Notes
    -----
    The format carries no year, so every parsed timestamp falls in 1900;
    ``weekday`` is therefore the weekday of the 1900 calendar date.
    """
    parsed = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df['time'] = parsed

    # Pull each derived field off the datetime accessor, keeping the
    # column order date -> hour -> weekday.
    dt_fields = parsed.dt
    for column in ('date', 'hour', 'weekday'):
        df[column] = getattr(dt_fields, column)
    return df

In [3]:
def group_feature(df, key, target, aggs):
    """Aggregate one column per group and name the results ``<target>_<agg>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    key : str or list of str
        Column(s) to group by.
    target : str
        Column whose values are aggregated.
    aggs : iterable
        Aggregations understood by ``GroupBy.agg`` (e.g. ``'max'``, ``'std'``).

    Returns
    -------
    pandas.DataFrame
        One row per group: the ``key`` column(s) followed by one column per
        aggregation, named ``f'{target}_{agg}'``, in the order given.
    """
    agg_dict = {f'{target}_{ag}': ag for ag in aggs}
    print(agg_dict)
    # Passing a {new_name: func} dict to SeriesGroupBy.agg ("nested renamer")
    # was removed in pandas 1.0; keyword-style named aggregation produces the
    # same output columns and is the supported spelling.
    t = df.groupby(key)[target].agg(**agg_dict).reset_index()
    return t

def extract_feature(df, train_feature):
    """Build per-ship summary features from ``df`` and join them onto ``train_feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level records. The columns read here are ``ship``, ``x``, ``y``,
        ``v``, ``d``, ``hour``, ``date`` and ``time`` (``time`` must already be
        a datetime column, since time differences are taken on it).
    train_feature : pandas.DataFrame
        One row per ship with a ``ship`` column; every feature is left-joined
        onto it by ``ship``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``train_feature`` plus, in order, the x/y/v/d statistics,
        bounding-box features, hour/date features and time-span features.
    """
    basic_stats = ['max', 'min', 'mean', 'std', 'skew', 'sum']
    # (column, aggregations) pairs, listed in the order their feature columns
    # must appear in the result.
    stat_specs = [
        ('x', basic_stats),
        ('x', ['count']),
        ('y', basic_stats),
        ('v', basic_stats),
        ('d', basic_stats),
    ]
    for target, aggs in stat_specs:
        t = group_feature(df, 'ship', target, aggs)
        train_feature = pd.merge(train_feature, t, on='ship', how='left')

    # Extents of the box spanned by each ship's x/y values, plus two
    # cross-axis differences.
    train_feature['x_max_x_min'] = train_feature['x_max'] - train_feature['x_min']
    train_feature['y_max_y_min'] = train_feature['y_max'] - train_feature['y_min']
    train_feature['y_max_x_min'] = train_feature['y_max'] - train_feature['x_min']
    train_feature['x_max_y_min'] = train_feature['x_max'] - train_feature['y_min']
    # A zero x-extent is replaced by 0.001 so the ratio never divides by zero.
    safe_x_extent = np.where(train_feature['x_max_x_min'] == 0, 0.001, train_feature['x_max_x_min'])
    train_feature['slope'] = train_feature['y_max_y_min'] / safe_x_extent
    train_feature['area'] = train_feature['x_max_x_min'] * train_feature['y_max_y_min']

    # Most frequent hour per ship (first entry of value_counts()).
    mode_hour = df.groupby('ship')['hour'].agg(lambda x: x.value_counts().index[0]).to_dict()
    train_feature['mode_hour'] = train_feature['ship'].map(mode_hour)

    t = group_feature(df, 'ship', 'hour', ['max', 'min'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')

    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train_feature['hour_nunique'] = train_feature['ship'].map(hour_nunique)
    train_feature['date_nunique'] = train_feature['ship'].map(date_nunique)

    # Time span covered by each ship. The previous ``.agg({'diff_time': func})``
    # dict-renaming form was removed in pandas 1.0, so the span is computed
    # directly from the grouped max and min.
    ship_time = df.groupby('ship')['time']
    t = (ship_time.max() - ship_time.min()).rename('diff_time').reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    # ``.dt.seconds`` is the seconds component left over after whole days
    # (0..86399), not the total number of seconds in the span.
    t['diff_second'] = t['diff_time'].dt.seconds
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    return train_feature

In [4]:
# Shift helper: returns the input without its first n elements (length len - n).
def translation_forward(list_in, n):
    """Return a new list holding ``list_in`` without its first ``n`` elements.

    The input is left untouched. The earlier implementation popped from
    ``list_in`` itself, so a caller that kept using the original list after
    the call silently saw it shifted as well.

    Parameters
    ----------
    list_in : sequence
        Values to shift.
    n : int
        Number of leading elements to drop. Values <= 0 drop nothing.

    Returns
    -------
    list
        A fresh list of length ``max(len(list_in) - n, 0)``; empty when ``n``
        is at least the length of the input.
    """
    items = list(list_in)
    if n <= 0:
        return items
    return items[n:]

def area_tri(groupbied):
    """Area of the triangle formed by every three consecutive points of a track.

    Rows are used in the order they arrive, so the caller is responsible for
    sorting them (e.g. by time) beforehand.

    Parameters
    ----------
    groupbied : pandas.DataFrame
        Rows of one group with numeric ``x`` and ``y`` columns and a ``time``
        column.

    Returns
    -------
    pandas.DataFrame
        Columns ``time`` and ``area_3`` with ``len(groupbied) - 2`` rows. Row
        ``i`` describes the triangle built from input rows ``i``, ``i + 1`` and
        ``i + 2``, and ``time`` is the value of the first of those rows
        (NOTE(review): the original never wired up which timestamp to use --
        confirm the first vertex is the intended one). Fewer than three input
        rows give an empty frame with the same columns.
    """
    x = np.asarray(groupbied['x'], dtype=float)
    y = np.asarray(groupbied['y'], dtype=float)
    if len(x) < 3:
        return pd.DataFrame(columns=['time', 'area_3'])

    # Three views of the same track, offset by 0, 1 and 2 rows. Slicing never
    # modifies x / y, so the three vertices really are three different points.
    x1, x2, x3 = x[:-2], x[1:-1], x[2:]
    y1, y2, y3 = y[:-2], y[1:-1], y[2:]

    # Shoelace formula: an actual area (zero for collinear points), computed
    # for all triangles at once instead of appending one row per iteration.
    area = 0.5 * np.abs(x1 * (y2 - y3) + x2 * (y3 - y1) + x3 * (y1 - y2))

    return pd.DataFrame({
        'time': groupbied['time'].iloc[:-2].values,
        'area_3': area,
    })


In [5]:
# Load the raw train / test tables.
train = pd.read_hdf('../input/train.h5')
test = pd.read_hdf('../input/test.h5')

# One row per ship (the first row seen for it) carries that ship's label.
# .copy() gives an independent frame, so encoding the label column below
# never writes through to (or warns about) a view of `train`.
train_label = train.drop_duplicates('ship').copy()

# Encode each distinct label as an integer, in order of first appearance.
# The code range is sized from the data: a fixed np.arange(3) would leave any
# label beyond the third out of the map and turn it into NaN.
type_names = train_label['type'].unique()
type_map = dict(zip(type_names, np.arange(len(type_names))))
# rev means a reversed map relation
type_map_rev = {v: k for k, v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)

# Parse timestamps and add the date / hour / weekday columns.
train = extract_dt(train)
test = extract_dt(test)

In [6]:
#add more information about distribution on dist and time
# Add more information about the distribution of distance and time.
# Sort each ship's rows by time, then flatten the result:
#   1. drop the outer index level (level 0) added by the grouped apply,
#   2. name the remaining row labels 'inner_rev_index',
#   3. move them into a column, leaving a fresh 0..N-1 index.
train_order = (
    train.groupby('ship')
    .apply(lambda ship_rows: ship_rows.sort_values('time', ascending=True))
    .reset_index(level=0, drop=True)
    .rename_axis('inner_rev_index')
    .reset_index(drop=False)
)
print(train_order.index)
# Copy train's index values over positionally (row i receives train.index[i]).
# NOTE(review): this only marks t0..tN within each ship if train's rows are
# already grouped by ship in the same ship order as train_order -- confirm.
train_order['inner_ord_index'] = train.index


RangeIndex(start=0, stop=2699638, step=1)


In [7]:
# Step features between consecutive rows of the same ship. GroupBy.diff keeps
# train_order's own index, so results line up row-for-row without the
# apply -> DataFrame -> reset_index round trip (which depended on row order and
# on the auto-generated column name 0), and without a Python call per ship.
ship_groups = train_order.groupby('ship')

# Hours elapsed since the same ship's previous row (NaN on a ship's first row).
train_order['diff_time'] = ship_groups['time'].diff() / np.timedelta64(1, 'h')
# Straight-line distance from the previous row, in the units of x / y.
train_order['diff_dist'] = np.sqrt(ship_groups['x'].diff() ** 2 + ship_groups['y'].diff() ** 2)
# Row-wise ratios need no grouping; a zero time step yields inf / NaN here.
train_order['est_v'] = train_order['diff_dist'] / train_order['diff_time']
# NOTE(review): 1852 is presumably metres per nautical mile, i.e. v is assumed
# to be in knots -- confirm against the data description.
train_order['est_diff_dist'] = train_order['v'] * train_order['diff_time'] * 1852

In [8]:
# Tried removing adjacent duplicate points beforehand (outside the function) -- not needed
# temp=pd.DataFrame(train_order.groupby('ship').apply(area_tri)).reset_index()

In [9]:
# temp.describe()

In [10]:
# temp=train_order[train_order['ship']==1]['ship'].unique().item(0)
# Run area_tri once per ship and collect the per-ship results in `temp`.
# The recorded output of this cell is a KeyboardInterrupt, i.e. the saved run
# was stopped before it finished.
temp=train_order.groupby('ship').apply(area_tri)

KeyboardInterrupt: 

In [None]:
# Display the result of the per-ship area_tri run.
temp

In [None]:
# temp_test=pd.DataFrame(temp)
# temp_test

In [None]:
# temp_test2=train_order[train_order['ship']==1]
# temp_test2

In [None]:
# drop_duplicates(subset=['ship','x','y'],keep='first',inplace=True).