In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import sys
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt

In [3]:
test_path = '../input/hy_round1_testA_20200102'
# os.listdir returns entries in arbitrary, platform-dependent order; sort them so
# the row order of `test` (and everything derived from it) is reproducible.
test_files = sorted(os.listdir(test_path))
ret = []
for file in tqdm(test_files):
    df = pd.read_csv(os.path.join(test_path, file))
    ret.append(df)
# The per-file row index is deliberately kept (no ignore_index): feature_version2
# reads it back as each row's position inside its source file.
df = pd.concat(ret)
# Positional rename: assumes every file has exactly these six columns in this
# order — TODO confirm against the raw CSV header.
df.columns = ['ship','x','y','v','d','time']
test=df

100%|██████████| 2000/2000 [00:35<00:00, 56.72it/s]


In [None]:
def group_feature(df, key, target, aggs):
    """Aggregate one column per group and name the results ``{target}_{agg}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; must contain ``key`` and ``target``.
    key : str or list of str
        Column(s) to group by.
    target : str
        Column whose values are aggregated.
    aggs : iterable of str
        Aggregation names understood by pandas (e.g. ``'max'``, ``'skew'``).

    Returns
    -------
    pandas.DataFrame
        One row per group: the key column(s) followed by one column per
        aggregation, in the order given, named ``f'{target}_{agg}'``.
    """
    aggs = list(aggs)
    agg_dict = {f'{target}_{ag}': ag for ag in aggs}
    print(agg_dict)
    # Passing a {new_name: func} dict to SeriesGroupBy.agg was removed in
    # pandas 1.0 (SpecificationError); aggregate with the plain list and set
    # the output names explicitly, which works on old and new pandas alike.
    t = df.groupby(key)[target].agg(aggs)
    t.columns = [f'{target}_{ag}' for ag in aggs]
    return t.reset_index()

def extract_feature(df, train_feature):
    """Attach per-ship aggregate features to ``train_feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Point records with columns ``ship``, ``x``, ``y``, ``v``, ``d``,
        ``hour``, ``date`` and a datetime ``time`` column.
    train_feature : pandas.DataFrame
        Table with a ``ship`` column that the features are left-joined onto.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``train_feature`` plus, in this order, the x/y/v/d
        statistics (with ``x_count`` right after the x statistics), the
        bounding-box features, the hour/date features and the time-span
        features (``diff_time``, ``diff_day``, ``diff_second``).
    """
    stats = ['max','min','mean','std','skew','sum']
    # (column, aggregations) pairs in the exact order the output columns must appear.
    plan = [('x', stats), ('x', ['count']), ('y', stats), ('v', stats), ('d', stats)]
    for column, aggs in plan:
        t = group_feature(df, 'ship', column, aggs)
        train_feature = pd.merge(train_feature, t, on='ship', how='left')

    # Bounding-box extents and cross differences of the coordinate ranges.
    train_feature['x_max_x_min'] = train_feature['x_max'] - train_feature['x_min']
    train_feature['y_max_y_min'] = train_feature['y_max'] - train_feature['y_min']
    train_feature['y_max_x_min'] = train_feature['y_max'] - train_feature['x_min']
    train_feature['x_max_y_min'] = train_feature['x_max'] - train_feature['y_min']
    # A zero x-extent is replaced by 0.001 so the slope stays finite.
    x_extent = np.where(train_feature['x_max_x_min']==0, 0.001, train_feature['x_max_x_min'])
    train_feature['slope'] = train_feature['y_max_y_min'] / x_extent
    train_feature['area'] = train_feature['x_max_x_min'] * train_feature['y_max_y_min']

    # Most frequent hour per ship (value_counts sorts by frequency, so index[0] is the mode).
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()
    train_feature['mode_hour'] = train_feature['ship'].map(mode_hour)

    t = group_feature(df, 'ship','hour',['max','min'])
    train_feature = pd.merge(train_feature, t, on='ship', how='left')

    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train_feature['hour_nunique'] = train_feature['ship'].map(hour_nunique)
    train_feature['date_nunique'] = train_feature['ship'].map(date_nunique)

    # Observation span per ship. The former agg({'diff_time': func}) renaming
    # form raises SpecificationError on pandas >= 1.0, so compute max - min
    # directly and name the resulting series.
    time_by_ship = df.groupby('ship')['time']
    t = (time_by_ship.max() - time_by_ship.min()).rename('diff_time').reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    t['diff_second'] = t['diff_time'].dt.seconds
    train_feature = pd.merge(train_feature, t, on='ship', how='left')
    return train_feature

# Parse the raw time strings into datetimes and derive the date, hour and weekday columns.
def extract_dt(df):
    """Parse ``df['time']`` and add ``date``, ``hour`` and ``weekday`` columns.

    The frame is modified in place and the same object is returned. Time
    strings are parsed with the format ``%m%d %H:%M:%S``.
    """
    parsed = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df['time'] = parsed
    parts = parsed.dt
    df['date'] = parts.date
    df['hour'] = parts.hour
    df['weekday'] = parts.weekday
    return df

In [None]:
# FFT features: summary statistics of the speed spectrum in four frequency bands.
def fft_v(groupbied):
    """Summarise the magnitude spectrum of one ship's speed signal.

    Parameters
    ----------
    groupbied : pandas.DataFrame
        Rows of a single group with columns ``v`` and ``diff_time`` (the
        time step between consecutive rows).

    Returns
    -------
    pandas.DataFrame
        A single row holding the ``describe()`` statistics of the FFT
        magnitudes in four bands, with column prefixes ``fft_low_``,
        ``fft_mid_l_``, ``fft_mid_h_`` and ``fft_high_``.
    """
    n_samples = len(groupbied)
    mean_step = groupbied['diff_time'].mean()
    duration = (n_samples - 1) * mean_step
    delta_f = n_samples / duration
    # The former time-axis array `np.linspace(0, sampleTime, ncount)` was never
    # used and passed a float as `num`, which raises TypeError on current NumPy;
    # it has been removed.
    speed = groupbied['v'].values
    # Magnitude of the discrete Fourier transform of the speed samples.
    spectrum = np.abs(np.fft.fft(speed))[0:n_samples]
    # x-axis of the spectrum.
    freq_axis = np.linspace(0, delta_f * n_samples, n_samples)
    # Band edges are fractions of the sample count, exactly as in the original
    # feature definition. NOTE(review): the axis is in frequency units while the
    # edges are in sample counts — kept as-is so feature values do not change.
    bands = [
        ('fft_low_', freq_axis < n_samples * 0.25),
        ('fft_mid_l_', (freq_axis >= n_samples * 0.25) & (freq_axis < n_samples * 0.5)),
        ('fft_mid_h_', (freq_axis >= n_samples * 0.5) & (freq_axis < n_samples * 0.75)),
        ('fft_high_', freq_axis >= n_samples * 0.75),
    ]
    summaries = [
        pd.DataFrame(spectrum[mask]).describe().T.add_prefix(prefix)
        for prefix, mask in bands
    ]
    FFT_ = pd.concat(summaries, axis=1)
    return FFT_

In [None]:
def feature_version2(df,feature_label):
    """Add FFT speed features and track start/end coordinates per ship.

    Parameters
    ----------
    df : pandas.DataFrame
        Point records with columns ``ship``, ``x``, ``y``, ``v`` and a
        datetime ``time``. The frame's index is read as each row's position
        inside its source file and stored as ``inner_rev_index``.
    feature_label : pandas.DataFrame
        Table with a ``ship`` column that the features are left-joined onto.

    Returns
    -------
    pandas.DataFrame
        ``feature_label`` plus the ``fft_v`` columns (preceded by the
        ``level_1`` column that ``reset_index`` produces), then
        ``start_x``/``start_y`` and ``end_x``/``end_y``.

    Notes
    -----
    The start point is the chronologically first row of each ship. It was
    previously found by copying ``df.index`` positionally onto the sorted
    frame, which only lines up when ``df`` happens to be stored in sorted
    ship order. NOTE(review): models trained with the old logic may have seen
    different ``start_x``/``start_y`` values — confirm against the training
    notebook.
    """
    # Chronological order within each ship; the incoming index is kept as a column.
    df_order = (
        df.sort_values(['ship', 'time'])
          .rename_axis('inner_rev_index')
          .reset_index()
    )
    by_ship = df_order.groupby('ship')
    # 0..n-1 position of each row inside its ship's time-ordered track.
    df_order['inner_ord_index'] = by_ship.cumcount()
    # Step between consecutive points of the same ship, in hours (NaN on the first row).
    df_order['diff_time'] = by_ship['time'].diff() / np.timedelta64(1, 'h')
    df_order['diff_dist'] = np.sqrt(by_ship['x'].diff() ** 2 + by_ship['y'].diff() ** 2)
    df_order['est_v'] = df_order['diff_dist'] / df_order['diff_time']
    # presumably v is in knots and 1852 converts nautical miles to metres — TODO confirm
    df_order['est_diff_dist'] = df_order['v'] * df_order['diff_time'] * 1852

    # fft_v returns a one-row frame per ship; reset_index therefore yields the
    # columns 'ship' and 'level_1' ahead of the FFT statistics. 'level_1' is
    # kept so the output column layout stays unchanged.
    v_feature = df_order.groupby('ship').apply(fft_v).reset_index()
    feature_label = pd.merge(feature_label, v_feature, on='ship', how='left')

    start_rows = df_order.loc[df_order['inner_ord_index'] == 0, ['ship', 'x', 'y']]
    temp = start_rows.reset_index(drop=True).rename(columns={'x':'start_x', 'y':'start_y'})
    feature_label = pd.merge(feature_label, temp, on='ship', how='left')

    # End point: the row whose incoming index was 0.
    # presumably source files list the latest record first — verify against the data
    end_rows = df_order.loc[df_order['inner_rev_index'] == 0, ['ship', 'x', 'y']]
    temp = end_rows.reset_index(drop=True).rename(columns={'x':'end_x', 'y':'end_y'})
    feature_label = pd.merge(feature_label, temp, on='ship', how='left')
    return feature_label

In [None]:
def read_model_param(conf_path):
    """Load model parameters from a text file holding a Python expression.

    Parameters
    ----------
    conf_path : str
        Path of the parameter file.

    Returns
    -------
    object
        Whatever the file's content evaluates to.
    """
    # `with` closes the handle even when reading or evaluating fails.
    with open(conf_path, "r") as f:
        content = f.read()
    # SECURITY: eval executes arbitrary code from the file. Only use this on
    # trusted parameter files; if the file is a plain literal,
    # ast.literal_eval would be a safe replacement.
    model_param = eval(content)
    return model_param

In [None]:
# Parse the timestamps (extract_dt modifies `test` in place and returns it),
# then keep the first row seen for each ship as the base of the feature table.
test = extract_dt(test)
test_label = test.drop_duplicates(subset=['ship'], keep='first')

In [None]:
# Label encoding: fishing-gear type name -> class id.
type_map={'拖网': 0, '围网': 1, '刺网': 2}
# Inverse lookup (class id -> type name), derived so the two maps cannot drift apart.
type_map_rev = {class_id: type_name for type_name, class_id in type_map.items()}
# Build the per-ship feature table. Re-running this cell without first re-running
# the cell that creates `test_label` merges the same columns a second time.
test_label = extract_feature(test, test_label)
test_label = feature_version2(test, test_label)

In [None]:
# Columns that are identifiers, the label or raw time values rather than model inputs.
non_feature_cols = {'ship','type','time','diff_time','date'}
features = [col for col in test_label.columns if col not in non_feature_cols]
target = 'type'
print(len(features), ','.join(features))

In [None]:
method = 2
for i in range(method):
    modle_dir=f'../model/testModelV{i}'
    models=[]
    param = None
    # Walk the directory's entries. Iterating over `modle_dir` itself would
    # loop over the characters of the path string. Sorted for a reproducible
    # load order.
    for file in tqdm(sorted(os.listdir(modle_dir))):
        file_path = os.path.join(modle_dir, file)
        if file=='param.txt':
            param = read_model_param(file_path)
        elif file=='result.csv' or file.startswith('.') or not os.path.isfile(file_path):
            # Skip the submission written by an earlier run, hidden files and sub-directories.
            continue
        else:
            # LightGBM has no lgb.load_model; a saved model is restored via Booster(model_file=...).
            models.append(lgb.Booster(model_file=file_path))
    if not models:
        # Without this guard the all-zero score matrix would silently predict class 0 for every ship.
        raise FileNotFoundError(f'no model files found in {modle_dir}')
    # Average the class-probability matrices of all models, then take the best class.
    pred = np.zeros((len(test_label),3))
    for model in models:
        test_pred = model.predict(test_label[features])
        pred += test_pred/len(models)
    pred = np.argmax(pred, axis=1)
    # Copy so that adding the prediction column does not write into a slice of test_label.
    sub = test_label[['ship']].copy()
    sub['pred'] = pred
    print(sub['pred'].value_counts(normalize=True))
    sub['pred'] = sub['pred'].map(type_map_rev)
    sub.to_csv(f'{modle_dir}/result.csv', index=None, header=None)