## 学习目标

- 学习特征工程的基本概念
- 学习topline代码的特征工程构造方法，实现构建有意义的特征工程
- 完成相应学习打卡任务

## 内容介绍

@[TOC]()

## 特征工程概述

特征工程大体可分为3部分，特征构建、特征提取和特征选择

### 特征构建
+ 探索性数据分析
+ 数值特征
+ 类别特征
+ 时间特征
+ 文本特征

In [1]:
import gc
import multiprocessing as mp
import os
import pickle
import time
import warnings
from collections import Counter
from copy import deepcopy
from datetime import datetime
from functools import partial
from glob import glob

import geopandas as gpd
# import lightgbm as lgb
from itertools import product
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import FastText, Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pyproj import Proj
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

os.environ['PYTHONHASHSEED'] = '0'
warnings.filterwarnings('ignore')

In [2]:
# 不直接对DataFrame做append操作，提升运行速度
def get_data(file_path,max_lines = 2000):
    paths = os.listdir(file_path)
    tmp = []
    for t in tqdm(range(len(paths))):
        if len(tmp) > max_lines:break
            
        p = paths[t]
        with open('{}/{}'.format(file_path, p), encoding='utf-8') as f:
            next(f)
            for line in f.readlines():
                tmp.append(line.strip().split(','))
                if len(tmp) > max_lines:break
                    
    tmp_df = pd.DataFrame(tmp)
    tmp_df.columns = ['渔船ID', 'x', 'y', '速度', '方向', 'time', 'type']
    return tmp_df
data_path = '../team-learning-data-mining-master/wisdomOcean/data/'
TRAIN_PATH = f"{data_path}/hy_round1_train_20200102/"
# 采样数据行数
max_lines = 3000
df = get_data(TRAIN_PATH,max_lines=max_lines)

  0%|          | 0/7000 [00:00<?, ?it/s]


In [3]:
# 基本预处理
label_dict1 = {'拖网': 0, '围网': 1, '刺网': 2}
label_dict2 = {0: '拖网', 1: '围网', 2: '刺网'}
name_dict = {'渔船ID': 'id', '速度': 'v', '方向': 'dir', 'type': 'label'}

df.rename(columns = name_dict, inplace = True)
df['label'] = df['label'].map(label_dict1)
cols = ['x','y','v']
for col in cols:
    df[col] = df[col].astype('float')
df['dir'] = df['dir'].astype('int')
df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')

df.head()

Unnamed: 0,id,x,y,v,dir,time,label
0,6966,6265902.0,5279254.0,0.11,306,1900-11-06 23:58:16,1
1,6966,6265902.0,5279254.0,0.0,0,1900-11-06 23:48:21,1
2,6966,6265902.0,5279254.0,0.0,0,1900-11-06 23:38:19,1
3,6966,6265902.0,5279254.0,0.0,0,1900-11-06 23:28:36,1
4,6966,6265902.0,5279254.0,0.32,130,1900-11-06 23:08:17,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3001 entries, 0 to 3000
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   id      3001 non-null   object        
 1   x       3001 non-null   float64       
 2   y       3001 non-null   float64       
 3   v       3001 non-null   float64       
 4   dir     3001 non-null   int64         
 5   time    3001 non-null   datetime64[ns]
 6   label   3001 non-null   int64         
dtypes: datetime64[ns](1), float64(3), int64(2), object(1)
memory usage: 164.2+ KB


#### 赛题特征工程

##### 距离特征

    - 构造各点的(x、y)坐标与特定点(6165599,5202660)的距离

In [5]:
df['x_dis_diff'] = (df['x'] - 6165599).abs()
df['y_dis_diff'] = (df['y'] - 5202660).abs()
df['base_dis_diff'] = ((df['x_dis_diff']**2)+(df['y_dis_diff']**2))**0.5    
del df['x_dis_diff'],df['y_dis_diff'] 
df['base_dis_diff'].head()

0    126203.102299
1    126203.102299
2    126203.102299
3    126203.102299
4    126203.102299
Name: base_dis_diff, dtype: float64

##### 时间特征
    - 对时间，小时进行白天、黑天进行划分，5-20为白天1，其余为黑天0
    - 周
    - 季度

In [6]:

df['date'] = df['time'].dt.date
df['hour'] = df['time'].dt.hour
df['month'] = df['time'].dt.month
df['weekday'] = df['time'].dt.weekday
df['day_nig'] = 0
df.loc[(df['hour'] > 5) & (df['hour'] < 20),'day_nig'] = 1

3. 动态特征
    - 动态速度，速度变化，角度变化，xy相似性等特征
    - 统计每个ship的对应速度等级的个数
    - 方位进行16均分
    - 统计速度为0的个数，以及速度不为0的统计量
    - 加入x，v，d，y的中位数和各种位数
    - 以ship、time为主键进行排序
        - 通过shift求相邻差异值,时间差分，角度差分，x， y差分

In [7]:
temp = df.copy()
temp.rename(columns={'id':'ship','dir':'d'},inplace=True)

# 给速度一个等级
# def v_cut(v):
#     if v < 0.1:
#         return 0
#     elif v < 0.5:
#         return 1
#     elif v < 1:
#         return 2
#     elif v < 2.5:
#         return 3
#     elif v < 5:
#         return 4
#     elif v < 10:
#         return 5
#     elif v < 20:
#         return 5
#     else:
#         return 6


# 统计每个ship的对应速度等级的个数
def get_v_fea(df):

#     df['v_cut'] = df['v'].apply(lambda x: v_cut(x))
    bins = [-np.inf,0.1,0.5,1,2.5,5,20,np.inf]
    df['v_cut'] = pd.cut(df['v'],bins = bins,labels = np.arange(len(bins)-1)).fillna(6)
#     tmp = df.groupby(['ship', 'v_cut'], as_index=False)['v_cut'].agg({'v_cut_count': 'count'})
    tmp = temp.groupby(['ship', 'v_cut']).agg({'x': 'count'}).reset_index().rename(columns = {'x':'v_cut_count'})
    # 通过pivot构建透视表
    tmp = tmp.pivot_table(index='ship', columns='v_cut', values='v_cut_count')

    new_col_nm = ['v_cut_' + str(col) for col in tmp.columns.tolist()]
    tmp.columns = new_col_nm
    tmp = tmp.reset_index()  # 把index恢复成data

    return tmp

c1 = get_v_fea(temp)

In [8]:
# 方位进行16均分
def add_direction(df):
#     df['d16'] = df['d'].apply(lambda x: int((x / 22.5) + 0.5) % 16 if not np.isnan(x) else np.nan)
#     df['d16'] = (df['d']/ 22.5 +0.5)% 16//1
    bins = np.arange(17)*382.5/17
    df['d16'] = pd.cut(df['d'],bins = bins,labels = np.arange(len(bins)-1))
    return df
def get_d_cut_count_fea(df):
    df = add_direction(df)
#     print(df.columns)
    tmp = df.groupby(['ship', 'd16']).agg({'x': 'count'}).reset_index().rename(columns = {'x':'d16_count'})
    tmp = tmp.pivot(index='ship', columns='d16', values='d16_count')
    new_col_nm = ['d16_' + str(col) for col in tmp.columns.tolist()]
    tmp.columns = new_col_nm
    tmp = tmp.reset_index()
    return tmp

c2 = get_d_cut_count_fea(temp)

In [9]:
def get_v0_fea(df):
    # 统计速度为0的个数，以及速度不为0的统计量
    df_zero_count = df.query("v==0")[['ship', 'v']].groupby('ship', as_index=False)['v'].agg(
        {'num_zero_v': 'count'})
    df_not_zero_agg = df.query("v!=0")[['ship', 'v']].groupby('ship', as_index=False)['v'].agg(
        {'v_max_drop_0': 'max',
         'v_min_drop_0': 'min',
         'v_mean_drop_0': 'mean',
         'v_std_drop_0': 'std',
         'v_median_drop_0': 'median',
         'v_skew_drop_0': 'skew'})
    tmp = df_zero_count.merge(df_not_zero_agg, on='ship', how='left')

    return tmp

c3 = get_v0_fea(temp)

In [10]:
def get_percentiles_fea(df_raw):
    key = ['x', 'y', 'v', 'd']
    temp = df_raw[['ship']].drop_duplicates('ship')
    for i in range(len(key)):
        # 加入x，v，d，y的中位数和各种位数
        tmp_dscb = df_raw.groupby('ship')[key[i]].describe(
            percentiles=[0.05] + [ii / 1000 for ii in range(125, 1000, 125)] + [0.95])
        raw_col_nm = tmp_dscb.columns.tolist()
        new_col_nm = [key[i] + '_' + col for col in raw_col_nm]
        tmp_dscb.columns = new_col_nm
        tmp_dscb = tmp_dscb.reset_index()
        # 删掉多余的统计特征
        tmp_dscb = tmp_dscb.drop([f'{key[i]}_count', f'{key[i]}_mean', f'{key[i]}_std',
                                  f'{key[i]}_min', f'{key[i]}_max'], axis=1)

        temp = temp.merge(tmp_dscb, on='ship', how='left')
    return temp

c4 = get_percentiles_fea(temp)

In [11]:
def get_d_change_rate_fea(df):
    import math
    import time
    temp = df.copy()
    # 以ship、time为主键进行排序
    temp.sort_values(['ship', 'time'], ascending=True, inplace=True)
    # 通过shift求相邻差异值，注意学习.shift(-1,1)的含义
    temp['timenext'] = temp.groupby('ship')['time'].shift(-1)
    temp['ynext'] = temp.groupby('ship')['y'].shift(-1)
    temp['xnext'] = temp.groupby('ship')['x'].shift(-1)
    # 将shift得到的差异量进行填充，为什么会有空值NaN？
    # 因为shift的起始位置是没法比较的，故用空值来代替
    temp['ynext'] = temp['ynext'].fillna(method='ffill')
    temp['xnext'] = temp['xnext'].fillna(method='ffill')
    # 这里笔者的理解是ynext/xnext，而不需要减去y和x，因为ynext和xnext本身就是偏移量了
    temp['angle_next'] = (temp['ynext'] - temp['y']) / (temp['xnext'] - temp['x'])
    temp['angle_next'] = np.arctan(temp['angle_next']) / math.pi * 180
    temp['angle_next_next'] = temp.groupby('ship')['angle_next'].shift(-1)
    
    temp['timediff'] = np.abs(temp['timenext'] - temp['time'])
    temp['timediff'] = temp['timediff'].fillna(method='ffill')
    
    temp['hc_xy'] = abs(temp['angle_next_next'] - temp['angle_next'])
    # 对于hc_xy这列的值>180度的，进行修改成360度求差，仅考虑与水平线的角度
    temp.loc[temp['hc_xy'] > 180, 'hc_xy'] = (360 - temp.loc[temp['hc_xy'] > 180, 'hc_xy'])
    temp['hc_xy_s'] = temp.apply(lambda x: x['hc_xy'] / x['timediff'].total_seconds(), axis=1)

    temp['d_next'] = temp.groupby('ship')['d'].shift(-1)
    temp['hc_d'] = abs(temp['d_next'] - temp['d'])
    temp.loc[temp['hc_d'] > 180, 'hc_d'] = 360 - temp.loc[temp['hc_d'] > 180, 'hc_d']
    temp['hc_d_s'] = temp.apply(lambda x: x['hc_d'] / x['timediff'].total_seconds(), axis=1)

    temp1 = temp[['ship', 'hc_xy_s', 'hc_d_s']]
#     xy_d_rate = temp1.groupby('ship')['hc_xy_s'].agg({'hc_xy_s_max': 'max',})
#     xy_d_rate = xy_d_rate.reset_index()
#     d_d_rate = temp1.groupby('ship')['hc_d_s'].agg({'hc_d_s_max': 'max',})
#     d_d_rate = d_d_rate.reset_index()
#     tmp = xy_d_rate.merge(d_d_rate, on='ship', how='left')
    d_d_rate = temp1.groupby('ship').agg({'hc_xy_s':'max','hc_d_s':'max'}).reset_index()
    d_d_rate.rename(columns = {'hc_xy_s':'hc_xy_s_max','hc_d_s':'hc_d_s_max'})

    return d_d_rate

c5 = get_d_change_rate_fea(temp)

In [12]:
f1 = temp.merge(c1,on='ship',how='left')
f1 = f1.merge(c2,on='ship',how='left')
f1 = f1.merge(c3,on='ship',how='left')
f1 = f1.merge(c4,on='ship',how='left')
f1 = f1.merge(c5,on='ship',how='left')

#### 分箱特征
##### v、x、y的分箱特征


In [13]:
# v、x、y的分箱特征
pre_cols = df.columns

df['v_bin'] = pd.qcut(df['v'], 200, duplicates='drop') # 速度进行 200分位数分箱
df['v_bin'] = df['v_bin'].map(dict(zip(df['v_bin'].unique(), range(df['v_bin'].nunique())))) # 分箱后映射编码
for f in ['x', 'y']:
    df[f + '_bin1'] = pd.qcut(df[f], 1000, duplicates='drop') # x,y位置分箱1000
    df[f + '_bin1'] = df[f + '_bin1'].map(dict(zip(df[f + '_bin1'].unique(), range(df[f + '_bin1'].nunique()))))#编码
    df[f + '_bin2'] = df[f] // 10000 # 取整操作
    df[f + '_bin1_count'] = df[f + '_bin1'].map(df[f + '_bin1'].value_counts()) #x,y不同分箱的数量映射
    df[f + '_bin2_count'] = df[f + '_bin2'].map(df[f + '_bin2'].value_counts()) #数量映射
    df[f + '_bin1_id_nunique'] = df.groupby(f + '_bin1')['id'].transform('nunique')#基于分箱1 id数量映射
    df[f + '_bin2_id_nunique'] = df.groupby(f + '_bin2')['id'].transform('nunique')#基于分箱2 id数量映射
    
for i in [1, 2]:
    # 特征交叉x_bin1（2）,y_bin1（2） 形成类别 统计每类数量映射到列  
    df['x_y_bin{}'.format(i)] = df['x_bin{}'.format(i)].astype('str') + '_' + df['y_bin{}'.format(i)].astype('str')
    df['x_y_bin{}'.format(i)] = df['x_y_bin{}'.format(i)].map(
        dict(zip(df['x_y_bin{}'.format(i)].unique(), range(df['x_y_bin{}'.format(i)].nunique())))
    )
    df['x_bin{}_y_bin{}_count'.format(i, i)] = df['x_y_bin{}'.format(i)].map(df['x_y_bin{}'.format(i)].value_counts())
for stat in ['max', 'min']:
    # 统计x_bin1 y_bin1的最大最小值
    df['x_y_{}'.format(stat)] = df['y'] - df.groupby('x_bin1')['y'].transform(stat)
    df['y_x_{}'.format(stat)] = df['x'] - df.groupby('y_bin1')['x'].transform(stat)

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()

Unnamed: 0,v_bin,x_bin1,x_bin2,x_bin1_count,x_bin2_count,x_bin1_id_nunique,x_bin2_id_nunique,y_bin1,y_bin2,y_bin1_count,...,y_bin1_id_nunique,y_bin2_id_nunique,x_y_bin1,x_bin1_y_bin1_count,x_y_bin2,x_bin2_y_bin2_count,x_y_max,y_x_max,x_y_min,y_x_min
0,0,0,626.0,302,345,1,1,0,527.0,304,...,2,2,0,302,0,345,0.0,-49224.816068,0.0,0.0
1,1,0,626.0,302,345,1,1,0,527.0,304,...,2,2,0,302,0,345,0.0,-49224.816068,0.0,0.0
2,1,0,626.0,302,345,1,1,0,527.0,304,...,2,2,0,302,0,345,0.0,-49224.816068,0.0,0.0
3,1,0,626.0,302,345,1,1,0,527.0,304,...,2,2,0,302,0,345,0.0,-49224.816068,0.0,0.0
4,2,0,626.0,302,345,1,1,0,527.0,304,...,2,2,0,302,0,345,0.0,-49224.816068,0.0,0.0


##### 将x、y进行分箱并构造区域

In [14]:

def traj_to_bin(traj=None, x_min=12031967.16239096, x_max=14226964.881853,
                y_min=1623579.449434373, y_max=4689471.1780792,
                row_bins=4380, col_bins=3136):

    # Establish bins on x direction and y direction
    x_bins = np.linspace(x_min, x_max, endpoint=True, num=col_bins + 1)
    y_bins = np.linspace(y_min, y_max, endpoint=True, num=row_bins + 1)

    # Determine each x coordinate belong to which bin
    traj.sort_values(by='x', inplace=True)
    x_res = np.zeros((len(traj), ))
    j = 0
    for i in range(1, col_bins + 1):
        low, high = x_bins[i-1], x_bins[i]
        while( j < len(traj)):
            # low - 0.001 for numeric stable.
            if (traj["x"].iloc[j] <= high) & (traj["x"].iloc[j] > low - 0.001):
                x_res[j] = i
                j += 1
            else:
                break
    traj["x_grid"] = x_res
    traj["x_grid"] = traj["x_grid"].astype(int)
    traj["x_grid"] = traj["x_grid"].apply(str)

    # Determine each y coordinate belong to which bin
    traj.sort_values(by='y', inplace=True)
    y_res = np.zeros((len(traj), ))
    j = 0
    for i in range(1, row_bins + 1):
        low, high = y_bins[i-1], y_bins[i]
        while( j < len(traj)):
            # low - 0.001 for numeric stable.
            if (traj["y"].iloc[j] <= high) & (traj["y"].iloc[j] > low - 0.001):
                y_res[j] = i
                j += 1
            else:
                break
    traj["y_grid"] = y_res
    traj["y_grid"] = traj["y_grid"].astype(int)
    traj["y_grid"] = traj["y_grid"].apply(str)

    # Determine which bin each coordinate belongs to.
    traj["no_bin"] = [i + "_" + j for i, j in zip(
        traj["x_grid"].values.tolist(), traj["y_grid"].values.tolist())]
    traj.sort_values(by='time', inplace=True)
    return traj

bin_size = 800
col_bins = int((14226964.881853 - 12031967.16239096) / bin_size)
row_bins = int((4689471.1780792 - 1623579.449434373) / bin_size)

In [15]:
pre_cols = df.columns
# 特征x_grid,y_grid,no_bin
df = traj_to_bin(df)

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols]

Unnamed: 0,x_grid,y_grid,no_bin
2320,0,0,0_0
1200,0,0,0_0
2319,0,0,0_0
1199,0,0,0_0
2318,0,0,0_0
...,...,...,...
1205,0,4238,0_4238
1204,0,4239,0_4239
1203,0,4239,0_4239
1202,0,4240,0_4240


#### 统计特征
##### count计数值


In [16]:
# def find_save_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
#     """Find and save the visit frequency of each bin."""
#     visit_count_df = traj_data_df.groupby(["no_bin"]).agg({'x':'count'}).reset_index()[["no_bin", "x"]].rename({"x":"visit_count"})
# #     visit_count_df = visit_count_df[["no_bin", "x"]]
# #     visit_count_df.rename({"x":"visit_count"}, axis=1, inplace=True)
#     return visit_count_df

# def find_save_unique_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
#     """Find and save the unique boat visit count of each bin."""
#     unique_boat_count_df = traj_data_df.groupby(["no_bin"]).agg({'id':'nunique'}).reset_index().rename({"id":"visit_boat_count"})
# #     unique_boat_count_df.rename({"id":"visit_boat_count"}, axis=1, inplace=True)

#     unique_boat_count_df_save = pd.merge(bin_to_coord_df, unique_boat_count_df,
#                                          on="no_bin", how="left")
#     return unique_boat_count_df

# traj_df = df[["id","x", "y",'time',"no_bin"]]
# bin_to_coord_df = traj_df.groupby(["no_bin"]).median().reset_index()
def find_save_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the visit frequency of each bin."""
    visit_count_df = traj_data_df.groupby(["no_bin"]).count().reset_index()
    visit_count_df = visit_count_df[["no_bin", "x"]]
    visit_count_df.rename({"x":"visit_count"}, axis=1, inplace=True)
    return visit_count_df

def find_save_unique_visit_count_table(traj_data_df=None, bin_to_coord_df=None):
    """Find and save the unique boat visit count of each bin."""
    unique_boat_count_df = traj_data_df.groupby(["no_bin"])["id"].nunique().reset_index()
    unique_boat_count_df.rename({"id":"visit_boat_count"}, axis=1, inplace=True)

    unique_boat_count_df_save = pd.merge(bin_to_coord_df, unique_boat_count_df,
                                         on="no_bin", how="left")
    return unique_boat_count_df

traj_df = df[["id","x", "y",'time',"no_bin"]]
bin_to_coord_df = traj_df.groupby(["no_bin"]).median().reset_index()

In [17]:
pre_cols = df.columns

# DataFrame tmp for finding POIs
visit_count_df = find_save_visit_count_table(
    traj_df, bin_to_coord_df)
unique_boat_count_df = find_save_unique_visit_count_table(
    traj_df, bin_to_coord_df)

# # 特征'visit_count','visit_boat_count'
df = df.merge(visit_count_df,on='no_bin',how='left')
df = df.merge(unique_boat_count_df,on='no_bin',how='left')

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()


Unnamed: 0,visit_count,visit_boat_count
0,2705,7
1,2705,7
2,2705,7
3,2705,7
4,2705,7


##### shift偏移量特征


In [18]:
# shift偏移量特征
pre_cols = df.columns

g = df.groupby('id')
for f in ['x', 'y']:
    #对x,y坐标进行时间平移 1 -1 2
    df[f + '_prev_diff'] = df[f] - g[f].shift(1)
    df[f + '_next_diff'] = df[f] - g[f].shift(-1)
    df[f + '_prev_next_diff'] = g[f].shift(1) - g[f].shift(-1)
    ## 三角形求解上时刻1距离  下时刻-1距离 2距离 
df['dist_move_prev'] = np.sqrt(np.square(df['x_prev_diff']) + np.square(df['y_prev_diff']))
df['dist_move_next'] = np.sqrt(np.square(df['x_next_diff']) + np.square(df['y_next_diff']))
df['dist_move_prev_next'] = np.sqrt(np.square(df['x_prev_next_diff']) + np.square(df['y_prev_next_diff']))
df['dist_move_prev_bin'] = pd.qcut(df['dist_move_prev'], 50, duplicates='drop')# 2时刻距离等频分箱50
df['dist_move_prev_bin'] = df['dist_move_prev_bin'].map(
    dict(zip(df['dist_move_prev_bin'].unique(), range(df['dist_move_prev_bin'].nunique())))
) #上一时刻映射编码

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()


Unnamed: 0,x_prev_diff,x_next_diff,x_prev_next_diff,y_prev_diff,y_next_diff,y_prev_next_diff,dist_move_prev,dist_move_next,dist_move_prev_next,dist_move_prev_bin
0,,0.0,,,0.0,,,0.0,,
1,,-2379.515712,,,928.13647,,,2554.120657,,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2379.515712,-2289.136947,-4668.652659,-928.13647,1261.126019,2189.262489,2554.120657,2613.539133,5156.470488,2.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
pre_cols = df.columns

def start(x):
    try:
        return x[0]
    except:
        return None

def end(x):
    try:
        return x[-1]
    except:
        return None


def mode(x):
    try:
        return pd.Series(x).value_counts().index[0]
    except:
        return None
for f in ['dist_move_prev_bin', 'v_bin']:
    # 上一时刻类别 速度类别映射处理
    df[f + '_sen'] = df['id'].map(df.groupby('id')[f].agg(lambda x: ','.join(x.astype(str))))
    # 一系列基本统计量特征 每列执行相应的操作
g = df.groupby('id').agg({
    'id': ['count'], 
    'x_bin1': [mode], 
    'y_bin1': [mode], 
    'x_bin2': [mode], 
    'y_bin2': [mode], 
    'x_y_bin1': [mode],
    'x': ['mean', 'max', 'min', 'std', np.ptp, start, end],
    'y': ['mean', 'max', 'min', 'std', np.ptp, start, end],
    'v': ['mean', 'max', 'min', 'std', np.ptp], 
    'dir': ['mean'],
    'x_bin1_count': ['mean'], 
    'y_bin1_count': ['mean', 'max', 'min'],
    'x_bin2_count': ['mean', 'max', 'min'], 
    'y_bin2_count': ['mean', 'max', 'min'],
    'x_bin1_y_bin1_count': ['mean', 'max', 'min'],
    'dist_move_prev': ['mean', 'max', 'std', 'min', 'sum'],
    'x_y_min': ['mean', 'min'], 
    'y_x_min': ['mean', 'min'],
    'x_y_max': ['mean', 'min'], 
    'y_x_max': ['mean', 'min'],
}).reset_index()
g.columns = ['_'.join(col).strip() for col in g.columns] #提取列名
g.rename(columns={'id_': 'id'}, inplace=True) #重命名id_
cols = [f for f in g.keys() if f != 'id'] #特征列名提取    

In [20]:
df = df.merge(g,on='id',how='left')
new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()

Unnamed: 0,dist_move_prev_bin_sen,v_bin_sen,id_count,x_bin1_mode,y_bin1_mode,x_bin2_mode,y_bin2_mode,x_y_bin1_mode,x_mean,x_max,...,dist_move_prev_min,dist_move_prev_sum,x_y_min_mean,x_y_min_min,y_x_min_mean,y_x_min_min,x_y_max_mean,x_y_max_min,y_x_max_mean,y_x_max_min
0,"nan,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1....","1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",405,430,385,610.0,511.0,914,6111712.0,6146439.0,...,0.0,167403.635587,3951.751016,0.0,2010.203566,0.0,-1994.8752,-27993.134668,-5527.254366,-103239.952092
1,"nan,2.0,3.0,2.0,2.0,4.0,3.0,2.0,3.0,3.0,4.0,2....","22,20,19,15,68,66,15,66,52,61,18,17,18,17,17,1...",386,94,93,675.0,543.0,221,6806687.0,6965640.0,...,0.0,539855.200175,3285.973541,0.0,35750.598766,0.0,-1703.494682,-46893.002685,-32262.980249,-318563.512015
2,"nan,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1....","1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",405,430,385,610.0,511.0,914,6111712.0,6146439.0,...,0.0,167403.635587,3951.751016,0.0,2010.203566,0.0,-1994.8752,-27993.134668,-5527.254366,-103239.952092
3,"nan,2.0,3.0,2.0,2.0,4.0,3.0,2.0,3.0,3.0,4.0,2....","22,20,19,15,68,66,15,66,52,61,18,17,18,17,17,1...",386,94,93,675.0,543.0,221,6806687.0,6965640.0,...,0.0,539855.200175,3285.973541,0.0,35750.598766,0.0,-1703.494682,-46893.002685,-32262.980249,-318563.512015
4,"nan,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1....","1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",405,430,385,610.0,511.0,914,6111712.0,6146439.0,...,0.0,167403.635587,3951.751016,0.0,2010.203566,0.0,-1994.8752,-27993.134668,-5527.254366,-103239.952092


In [21]:
# def group_feature(df, key, target, aggs,flag):   
#     """通过字典的形式来构建方法和重命名"""
#     agg_dict = {}
#     for ag in aggs:
#         agg_dict['{}_{}_{}'.format(target,ag,flag)] = ag
#     print(agg_dict)
#     t = df.groupby(key, as_index=False)[target].agg(agg_dict).reset_index()
#     return t
def group_feature(df, key, target, aggs,flag):   
    """通过字典的形式来构建方法和重命名"""
    agg_dict = {}
    agg_list = []
    for ag in aggs:
#         agg_dict['{}_{}_{}'.format(target,ag,flag)] = ag
        agg_list.append(['{}_{}_{}'.format(target,ag,flag), ag])
#     print(agg_dict)
#     t = df.groupby(key)[target].agg(agg_dict).reset_index()
    t = df.groupby(key)[target].agg(agg_list).reset_index()
    return t

def extract_feature(df, train, flag):
    '''
    统计feature
    注意理解group_feature的使用和效果
    '''
    if (flag == 'on_night') or (flag == 'on_day'): 
        t = group_feature(df, 'ship','speed',['max','mean','median','std','skew'],flag)
        train = pd.merge(train, t, on='ship', how='left')
        # return train
    
    
    if flag == "0":
        t = group_feature(df, 'ship','direction',['max','median','mean','std','skew'],flag)
        train = pd.merge(train, t, on='ship', how='left')  
    elif flag == "1":
        t = group_feature(df, 'ship','speed',['max','mean','median','std','skew'],flag)
        train = pd.merge(train, t, on='ship', how='left')
        t = group_feature(df, 'ship','direction',['max','median','mean','std','skew'],flag)
        train = pd.merge(train, t, on='ship', how='left') 
        # .nunique().to_dict() 将nunique得到的对应唯一值统计量做成字典
        # to_dict() 与 map的使用可以很方便地构建一些统计量映射特征，如CTR（分类）问题中的转化率
        # 提问： 如果根据训练集给定的label(0,1)来构建训练集+测试集的转化率特征，注：测试集与训练集存在部分id相同
        hour_nunique = df.groupby('ship')['speed'].nunique().to_dict()
        train['speed_nunique_{}'.format(flag)] = train['ship'].map(hour_nunique)   
        hour_nunique = df.groupby('ship')['direction'].nunique().to_dict()
        train['direction_nunique_{}'.format(flag)] = train['ship'].map(hour_nunique)  

    t = group_feature(df, 'ship','x',['max','min','mean','median','std','skew'],flag)
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','y',['max','min','mean','median','std','skew'],flag)
    train = pd.merge(train, t, on='ship', how='left')
    t = group_feature(df, 'ship','base_dis_diff',['max','min','mean','std','skew'],flag)
    train = pd.merge(train, t, on='ship', how='left')

       
    train['x_max_x_min_{}'.format(flag)] = train['x_max_{}'.format(flag)] - train['x_min_{}'.format(flag)]
    train['y_max_y_min_{}'.format(flag)] = train['y_max_{}'.format(flag)] - train['y_min_{}'.format(flag)]
    train['y_max_x_min_{}'.format(flag)] = train['y_max_{}'.format(flag)] - train['x_min_{}'.format(flag)]
    train['x_max_y_min_{}'.format(flag)] = train['x_max_{}'.format(flag)] - train['y_min_{}'.format(flag)]
    train['slope_{}'.format(flag)] = train['y_max_y_min_{}'.format(flag)] / np.where(train['x_max_x_min_{}'.format(flag)]==0, 0.001, train['x_max_x_min_{}'.format(flag)])
    train['area_{}'.format(flag)] = train['x_max_x_min_{}'.format(flag)] * train['y_max_y_min_{}'.format(flag)] 
    
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()
    train['mode_hour_{}'.format(flag)] = train['ship'].map(mode_hour)
    train['slope_median_{}'.format(flag)] = train['y_median_{}'.format(flag)] / np.where(train['x_median_{}'.format(flag)]==0, 0.001, train['x_median_{}'.format(flag)])

    return train

In [22]:
data  = df.copy()
data.rename(columns={
    'id':'ship',
    'v':'speed',
    'dir':'direction'
},inplace=True)
# 去重
data_label = data.drop_duplicates(['ship'],keep = 'first')

data_1 = data[data['speed']==0]
data_2 = data[data['speed']!=0]
data_label = extract_feature(data_1, data_label,"0")
data_label = extract_feature(data_2, data_label,"1")

data_1 = data[data['day_nig'] == 0]
data_2 = data[data['day_nig'] == 1]
data_label = extract_feature(data_1, data_label,"on_night")
data_label = extract_feature(data_2, data_label,"on_day")
data_label.rename(columns={'ship':'id','speed':'v','direction':'dir'},inplace=True)

In [23]:
new_cols = [i for i in data_label.columns if i not in df.columns]
df = df.merge(data_label[new_cols+['id']],on='id',how='left')

df[new_cols].head()

Unnamed: 0,direction_max_0,direction_median_0,direction_mean_0,direction_std_0,direction_skew_0,x_max_0,x_min_0,x_mean_0,x_median_0,x_std_0,...,base_dis_diff_std_on_day,base_dis_diff_skew_on_day,x_max_x_min_on_day,y_max_y_min_on_day,y_max_x_min_on_day,x_max_y_min_on_day,slope_on_day,area_on_day,mode_hour_on_day,slope_median_on_day
0,0.0,0.0,0.0,0.0,0.0,6102853.0,6102751.0,6102817.0,6102853.0,48.919553,...,8609.844843,0.945951,43889.138,36438.288957,-989901.6,1070229.0,0.830235,1599245000.0,19,0.837728
1,301.0,40.0,85.764706,103.134942,0.817325,6963742.0,6739805.0,6823112.0,6756708.0,103589.588732,...,90377.633369,1.022255,325691.519166,149044.4476,-1057917.0,1532653.0,0.457625,48542510000.0,18,0.812848
2,0.0,0.0,0.0,0.0,0.0,6102853.0,6102751.0,6102817.0,6102853.0,48.919553,...,8609.844843,0.945951,43889.138,36438.288957,-989901.6,1070229.0,0.830235,1599245000.0,19,0.837728
3,301.0,40.0,85.764706,103.134942,0.817325,6963742.0,6739805.0,6823112.0,6756708.0,103589.588732,...,90377.633369,1.022255,325691.519166,149044.4476,-1057917.0,1532653.0,0.457625,48542510000.0,18,0.812848
4,0.0,0.0,0.0,0.0,0.0,6102853.0,6102751.0,6102817.0,6102853.0,48.919553,...,8609.844843,0.945951,43889.138,36438.288957,-989901.6,1070229.0,0.830235,1599245000.0,19,0.837728


##### 分组统计特征
##### 划分数据后进行统计

In [24]:

temp = df.copy()
temp.rename(columns={'id':'ship','dir':'d'},inplace=True)

def coefficient_of_variation(x):
    x = x.values
    if np.mean(x) == 0:
        return 0
    return np.std(x) / np.mean(x)

def max_2(x):
    x = list(x.values)
    x.sort(reverse=True)
    return x[1]

def max_3(x):
    x = list(x.values)
    x.sort(reverse=True)
    return x[2]

def diff_abs_mean(x):  # 统计特征 deta绝对值均值
    return np.mean(np.abs(np.diff(x)))

f1 = pd.DataFrame()
for col in ['x', 'y', 'v', 'd']:
    features = temp.groupby('ship', as_index=False)[col].agg({
        '{}_min'.format(col): 'min',
        '{}_max'.format(col): 'max',
        '{}_mean'.format(col): 'mean',
        '{}_median'.format(col): 'median',
        '{}_std'.format(col): 'std',
        '{}_skew'.format(col): 'skew',
        '{}_sum'.format(col): 'sum',
        '{}_diff_abs_mean'.format(col): diff_abs_mean,
        '{}_mode'.format(col): lambda x: x.value_counts().index[0],
        '{}_coefficient_of_variation'.format(col): coefficient_of_variation,
        '{}_max2'.format(col): max_2,
        '{}_max3'.format(col): max_3
    })
    if f1.shape[0] == 0:
        f1 = features
    else:
        f1 = f1.merge(features, on='ship', how='left')

f1['x_max_x_min'] = f1['x_max'] - f1['x_min']
f1['y_max_y_min'] = f1['y_max'] - f1['y_min']
f1['y_max_x_min'] = f1['y_max'] - f1['x_min']
f1['x_max_y_min'] = f1['x_max'] - f1['y_min']
f1['slope'] = f1['y_max_y_min'] / np.where(f1['x_max_x_min'] == 0, 0.001, f1['x_max_x_min'])
f1['area'] = f1['x_max_x_min'] * f1['y_max_y_min']
f1['dis_max_min'] = (f1['x_max_x_min'] ** 2 + f1['y_max_y_min'] ** 2) ** 0.5
f1['dis_mean'] = (f1['x_mean'] ** 2 + f1['y_mean'] ** 2) ** 0.5
f1['area_d_dis_max_min'] = f1['area'] / f1['dis_max_min']

# 加速度
temp.sort_values(['ship', 'time'], ascending=True, inplace=True)
temp['ynext'] = temp.groupby('ship')['y'].shift(-1)
temp['xnext'] = temp.groupby('ship')['x'].shift(-1)
temp['ynext'] = temp['ynext'].fillna(method='ffill')
temp['xnext'] = temp['xnext'].fillna(method='ffill')
temp['timenext'] = temp.groupby('ship')['time'].shift(-1)
temp['timediff'] = np.abs(temp['timenext'] - temp['time'])
temp['a_y'] = temp.apply(lambda x: (x['ynext'] - x['y']) / x['timediff'].total_seconds(), axis=1)
temp['a_x'] = temp.apply(lambda x: (x['xnext'] - x['x']) / x['timediff'].total_seconds(), axis=1)
for col in ['a_y', 'a_x']:
    f2 = temp.groupby('ship', as_index=False)[col].agg({
        '{}_max'.format(col): 'max',
        '{}_mean'.format(col): 'mean',
        '{}_min'.format(col): 'min',
        '{}_median'.format(col): 'median',
        '{}_std'.format(col): 'std'})
    f1 = f1.merge(f2, on='ship', how='left')

# 曲率
temp['y_pre'] = temp.groupby('ship')['y'].shift(1)
temp['x_pre'] = temp.groupby('ship')['x'].shift(1)
temp['y_pre'] = temp['y_pre'].fillna(method='bfill')
temp['x_pre'] = temp['x_pre'].fillna(method='bfill')
temp['d_pre'] = ((temp['x'] - temp['x_pre']) ** 2 + (temp['y'] - temp['y_pre']) ** 2) ** 0.5
temp['d_next'] = ((temp['xnext'] - temp['x']) ** 2 + (temp['ynext'] - temp['y']) ** 2) ** 0.5
temp['d_pre_next'] = ((temp['xnext'] - temp['x_pre']) ** 2 + (temp['ynext'] - temp['y_pre']) ** 2) ** 0.5
temp['curvature'] = (temp['d_pre'] + temp['d_next']) / temp['d_pre_next']

f2 = temp.groupby('ship', as_index=False)['curvature'].agg({
    'curvature_max': 'max',
    'curvature_mean': 'mean',
    'curvature_min': 'min',
    'curvature_median': 'median',
    'curvature_std': 'std'})
f1 = f1.merge(f2, on='ship', how='left')

#### embedding特征
#####  Word2vec构造词向量

In [25]:
def traj_cbow_embedding(traj_data_corpus=None, embedding_size=70,
                        iters=40, min_count=3, window_size=25,
                        seed=9012, num_runs=5, word_feat="no_bin"):
    """CBOW embedding for trajectory data."""
    boat_id = traj_data_corpus['id'].unique()
    sentences, embedding_df_list, embedding_model_list = [], [], []
    for i in boat_id:
        traj = traj_data_corpus[traj_data_corpus['id']==i]
        sentences.append(traj[word_feat].values.tolist())

    print("\n@Start CBOW word embedding at {}".format(datetime.now()))
    print("-------------------------------------------")
    for i in tqdm(range(num_runs)):
        model = Word2Vec(sentences, size=embedding_size,
                                  min_count=min_count,
                                  workers=mp.cpu_count(),
                                  window=window_size,
                                  seed=seed, iter=iters, sg=0)

        # Sentance vector
        embedding_vec = []
        for ind, seq in enumerate(sentences):
            seq_vec, word_count = 0, 0
            for word in seq:
                if word not in model:
                    continue
                else:
                    seq_vec += model[word]
                    word_count += 1
            if word_count == 0:
                embedding_vec.append(embedding_size * [0])
            else:
                embedding_vec.append(seq_vec / word_count)
        embedding_vec = np.array(embedding_vec)
        embedding_cbow_df = pd.DataFrame(embedding_vec, 
            columns=["embedding_cbow_{}_{}".format(word_feat, i) for i in range(embedding_size)])
        embedding_cbow_df["id"] = boat_id
        embedding_df_list.append(embedding_cbow_df)
        embedding_model_list.append(model)
    print("-------------------------------------------")
    print("@End CBOW word embedding at {}".format(datetime.now()))
    return embedding_df_list, embedding_model_list

In [26]:
embedding_size=70
iters=70
min_count=3
window_size=25
num_runs=1

df_list, model_list = traj_cbow_embedding(df,
                                          embedding_size=embedding_size,
                                          iters=iters, min_count=min_count,
                                          window_size=window_size,
                                          seed=9012,
                                          num_runs=num_runs,
                                          word_feat="no_bin")

train_embedding_df_list = [d.reset_index(drop=True) for d in df_list]
fea = train_embedding_df_list[0]
fea = pd.DataFrame(fea)

  0%|          | 0/1 [00:00<?, ?it/s]


@Start CBOW word embedding at 2021-04-20 08:40:52.612425
-------------------------------------------


100%|██████████| 1/1 [00:00<00:00,  1.79it/s]

-------------------------------------------
@End CBOW word embedding at 2021-04-20 08:40:53.179347





In [27]:
pre_cols = df.columns
df = df.merge(fea,on='id',how='left')


new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()

Unnamed: 0,embedding_cbow_no_bin_0,embedding_cbow_no_bin_1,embedding_cbow_no_bin_2,embedding_cbow_no_bin_3,embedding_cbow_no_bin_4,embedding_cbow_no_bin_5,embedding_cbow_no_bin_6,embedding_cbow_no_bin_7,embedding_cbow_no_bin_8,embedding_cbow_no_bin_9,...,embedding_cbow_no_bin_60,embedding_cbow_no_bin_61,embedding_cbow_no_bin_62,embedding_cbow_no_bin_63,embedding_cbow_no_bin_64,embedding_cbow_no_bin_65,embedding_cbow_no_bin_66,embedding_cbow_no_bin_67,embedding_cbow_no_bin_68,embedding_cbow_no_bin_69
0,-0.296518,-1.381415,-1.751447,-2.614199,-1.398765,1.045716,1.585713,0.765186,-0.252884,1.851986,...,-0.825406,-0.207794,-0.659545,1.122984,-0.139747,1.99854,-1.693542,0.77139,1.579915,0.400014
1,-0.296518,-1.381416,-1.751446,-2.614197,-1.398766,1.045716,1.585714,0.765185,-0.252885,1.851985,...,-0.825406,-0.207794,-0.659545,1.122984,-0.139747,1.99854,-1.693542,0.77139,1.579916,0.400014
2,-0.296518,-1.381415,-1.751447,-2.614199,-1.398765,1.045716,1.585713,0.765186,-0.252884,1.851986,...,-0.825406,-0.207794,-0.659545,1.122984,-0.139747,1.99854,-1.693542,0.77139,1.579915,0.400014
3,-0.296518,-1.381416,-1.751446,-2.614197,-1.398766,1.045716,1.585714,0.765185,-0.252885,1.851985,...,-0.825406,-0.207794,-0.659545,1.122984,-0.139747,1.99854,-1.693542,0.77139,1.579916,0.400014
4,-0.296518,-1.381415,-1.751447,-2.614199,-1.398765,1.045716,1.585713,0.765186,-0.252884,1.851986,...,-0.825406,-0.207794,-0.659545,1.122984,-0.139747,1.99854,-1.693542,0.77139,1.579915,0.400014


In [28]:
boat_id = df['id'].unique()
total_embedding = pd.DataFrame(boat_id, columns=["id"])
traj_data = df[['v','dir','id']].rename(columns = {'v':'speed','dir':'direction'})

# Step 1: Construct the words
traj_data_corpus = []
traj_data["speed_str"]     = traj_data["speed"].apply(lambda x: str(int(x*100)))
traj_data["direction_str"] = traj_data["direction"].apply(str)
traj_data["speed_dir_str"] = traj_data["speed_str"] + "_" + traj_data["direction_str"]
traj_data_corpus = traj_data[["id", "speed_str",
                                  "direction_str", "speed_dir_str"]]
print("\n@Round 2 speed embedding:")
df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                          embedding_size=10,
                                          iters=40, min_count=3,
                                          window_size=25, seed=9102,
                                          num_runs=1, word_feat="speed_str")
speed_embedding = df_list[0].reset_index(drop=True)
total_embedding = pd.merge(total_embedding, speed_embedding,
                           on="id", how="left")


print("\n@Round 2 direction embedding:")
df_list, model_list = traj_cbow_embedding(traj_data_corpus,
                                          embedding_size=12,
                                          iters=70, min_count=3,
                                          window_size=25, seed=9102,
                                          num_runs=1, word_feat="speed_dir_str")
speed_dir_embedding = df_list[0].reset_index(drop=True)
total_embedding = pd.merge(total_embedding, speed_dir_embedding,
                           on="id", how="left")

  0%|          | 0/1 [00:00<?, ?it/s]


@Round 2 speed embedding:

@Start CBOW word embedding at 2021-04-20 08:41:06.286387
-------------------------------------------


100%|██████████| 1/1 [00:00<00:00,  3.09it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

-------------------------------------------
@End CBOW word embedding at 2021-04-20 08:41:06.619359

@Round 2 direction embedding:

@Start CBOW word embedding at 2021-04-20 08:41:06.646096
-------------------------------------------


100%|██████████| 1/1 [00:00<00:00,  2.35it/s]

-------------------------------------------
@End CBOW word embedding at 2021-04-20 08:41:07.084008





In [29]:
pre_cols = df.columns
df = df.merge(total_embedding,on='id',how='left')

new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()

Unnamed: 0,embedding_cbow_speed_str_0,embedding_cbow_speed_str_1,embedding_cbow_speed_str_2,embedding_cbow_speed_str_3,embedding_cbow_speed_str_4,embedding_cbow_speed_str_5,embedding_cbow_speed_str_6,embedding_cbow_speed_str_7,embedding_cbow_speed_str_8,embedding_cbow_speed_str_9,...,embedding_cbow_speed_dir_str_2,embedding_cbow_speed_dir_str_3,embedding_cbow_speed_dir_str_4,embedding_cbow_speed_dir_str_5,embedding_cbow_speed_dir_str_6,embedding_cbow_speed_dir_str_7,embedding_cbow_speed_dir_str_8,embedding_cbow_speed_dir_str_9,embedding_cbow_speed_dir_str_10,embedding_cbow_speed_dir_str_11
0,0.918525,-0.505222,-2.032478,0.745203,1.776894,3.214431,1.974173,-4.381144,-3.217552,-2.600746,...,-0.343606,3.504519,-0.923818,2.080787,2.996411,-1.263102,1.427308,-3.428284,-2.485857,1.193597
1,0.41086,2.054793,-1.519732,-2.662427,0.996046,0.974712,0.801571,0.363235,-1.279728,0.106097,...,-0.737868,-0.164093,-1.127899,-1.022281,0.105744,0.084507,-0.400303,-2.012783,-0.368905,-0.106053
2,0.918525,-0.505222,-2.032478,0.745203,1.776894,3.214431,1.974173,-4.381144,-3.217552,-2.600746,...,-0.343606,3.504519,-0.923818,2.080787,2.996411,-1.263102,1.427308,-3.428284,-2.485857,1.193597
3,0.41086,2.054793,-1.519732,-2.662427,0.996046,0.974712,0.801571,0.363235,-1.279728,0.106097,...,-0.737868,-0.164093,-1.127899,-1.022281,0.105744,0.084507,-0.400303,-2.012783,-0.368905,-0.106053
4,0.918525,-0.505222,-2.032478,0.745203,1.776894,3.214431,1.974173,-4.381144,-3.217552,-2.600746,...,-0.343606,3.504519,-0.923818,2.080787,2.996411,-1.263102,1.427308,-3.428284,-2.485857,1.193597


#####  NMF提取文本的主题分布

In [30]:
class nmf_list(object):
    def __init__(self,data,by_name,to_list,nmf_n,top_n):
        self.data = data
        self.by_name = by_name
        self.to_list = to_list
        self.nmf_n = nmf_n
        self.top_n = top_n

    def run(self,tf_n):
        df_all = self.data.groupby(self.by_name)[self.to_list].apply(lambda x :'|'.join(x)).reset_index()
        self.data =df_all.copy()

        print('bulid word_fre')
        # 词频的构建
        def word_fre(x):
            word_dict = []
            x = x.split('|')
            docs = []
            for doc in x:
                doc = doc.split()
                docs.append(doc)
                word_dict.extend(doc)
            word_dict = Counter(word_dict)
            new_word_dict = {}
            for key,value in word_dict.items():
                new_word_dict[key] = [value,0]
            del word_dict  
            del x
            for doc in docs:
                doc = Counter(doc)
                for word in doc.keys():
                    new_word_dict[word][1] += 1
            return new_word_dict 
        self.data['word_fre'] = self.data[self.to_list].apply(word_fre)

        print('bulid top_' + str(self.top_n))
        # 设定100个高频词
        def top_100(word_dict):
            return sorted(word_dict.items(),key = lambda x:(x[1][1],x[1][0]),reverse = True)[:self.top_n]
        self.data['top_'+str(self.top_n)] = self.data['word_fre'].apply(top_100)
        def top_100_word(word_list):
            words = []
            for i in word_list:
                i = list(i)
                words.append(i[0])
            return words 
        self.data['top_'+str(self.top_n)+'_word'] = self.data['top_' + str(self.top_n)].apply(top_100_word)
        # print('top_'+str(self.top_n)+'_word的shape')
        print(self.data.shape)

        word_list = []
        for i in self.data['top_'+str(self.top_n)+'_word'].values:
            word_list.extend(i)
        word_list = Counter(word_list)
        word_list = sorted(word_list.items(),key = lambda x:x[1],reverse = True)
        user_fre = []
        for i in word_list:
            i = list(i)
            user_fre.append(i[1]/self.data[self.by_name].nunique())
        stop_words = []
        for i,j in zip(word_list,user_fre):
            if j>0.5:
                i = list(i)
                stop_words.append(i[0])

        print('start title_feature')
        # 讲融合后的taglist当作一句话进行文本处理
        self.data['title_feature'] = self.data[self.to_list].apply(lambda x: x.split('|'))
        self.data['title_feature'] = self.data['title_feature'].apply(lambda line: [w for w in line if w not in stop_words])
        self.data['title_feature'] = self.data['title_feature'].apply(lambda x: ' '.join(x))

        print('start NMF')
        # 使用tfidf对元素进行处理
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(tf_n,tf_n))
        tfidf = tfidf_vectorizer.fit_transform(self.data['title_feature'].values)
        #使用nmf算法，提取文本的主题分布
        text_nmf = NMF(n_components=self.nmf_n).fit_transform(tfidf)


        # 整理并输出文件
        name = [str(tf_n) + self.to_list + '_' +str(x) for x in range(1,self.nmf_n+1)]
        tag_list = pd.DataFrame(text_nmf)
        print(tag_list.shape)
        tag_list.columns = name
        tag_list[self.by_name] = self.data[self.by_name]
        column_name = [self.by_name] + name
        tag_list = tag_list[column_name]
        return tag_list

In [31]:
data = df.copy()
data.rename(columns={'v':'speed','id':'ship'},inplace=True)
for j in range(1,4):
    print('********* {} *******'.format(j))
    for i in ['speed','x','y']:
        data[i + '_str'] = data[i].astype(str)
        nmf = nmf_list(data,'ship',i + '_str',8,2)
        nmf_a = nmf.run(j)
        nmf_a.rename(columns={'ship':'id'},inplace=True)
        data_label = data_label.merge(nmf_a,on = 'id',how = 'left')
        
new_cols = [i for i in data_label.columns if i not in df.columns]
df = df.merge(data_label[new_cols+['id']],on='id',how='left')
df[new_cols].head()

********* 1 *******
bulid word_fre
bulid top_2
(8, 5)
start title_feature
start NMF
(8, 8)
bulid word_fre
bulid top_2
(8, 5)
start title_feature
start NMF
(8, 8)
bulid word_fre
bulid top_2
(8, 5)
start title_feature
start NMF
(8, 8)
********* 2 *******
bulid word_fre
bulid top_2
(8, 5)
start title_feature
start NMF
(8, 8)
bulid word_fre
bulid top_2
(8, 5)
start title_feature
start NMF
(8, 8)
bulid word_fre
bulid top_2
(8, 5)
start title_feature
start NMF
(8, 8)
********* 3 *******
bulid word_fre
bulid top_2
(8, 5)
start title_feature
start NMF
(8, 8)
bulid word_fre
bulid top_2
(8, 5)
start title_feature
start NMF
(8, 8)
bulid word_fre
bulid top_2
(8, 5)
start title_feature
start NMF
(8, 8)


Unnamed: 0,1speed_str_1,1speed_str_2,1speed_str_3,1speed_str_4,1speed_str_5,1speed_str_6,1speed_str_7,1speed_str_8,1x_str_1,1x_str_2,...,3x_str_7,3x_str_8,3y_str_1,3y_str_2,3y_str_3,3y_str_4,3y_str_5,3y_str_6,3y_str_7,3y_str_8
0,0.0,0.190613,0.0,0.0,0.843149,0.0,0.0,2.2e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.952014
1,0.429569,0.0,0.0,0.058269,9e-06,0.761026,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.190613,0.0,0.0,0.843149,0.0,0.0,2.2e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.952014
3,0.429569,0.0,0.0,0.058269,9e-06,0.761026,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.190613,0.0,0.0,0.843149,0.0,0.0,2.2e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.952014


#### 按类别特征编码

##### 均值编码

In [159]:

class MeanEncoder:
    from sklearn.model_selection import StratifiedKFold
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode

        :param n_splits: the number of splits used in mean encoding

        :param target_type: str, 'regression' or 'classification'

        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """

        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))

    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()

#         col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})
        col_avg_y = X_train.groupby(variable)['pred_temp'].agg(['mean','size']).rename(columns={'mean':'mean','size':'beta'})
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)

        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values

        return nf_train, nf_test, prior, col_avg_y

    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        print(self.target_type)
        if self.target_type == 'classification':
            
            skf = StratifiedKFold(self.n_splits, shuffle=True,random_state=6666)
        else:
            skf = KFold(self.n_splits)

        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X_new, list(y)):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target,
                        self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X_new, list(y)):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None,
                        self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()

        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits

        return X_new

##### 减少内存

In [160]:

def reduce_mem_usage(df, verbose=True):
    print('Reduce mem usage....')
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('     Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


In [161]:
df = df.convert_dtypes()
# df = reduce_mem_usage(df, verbose=True)
# fea_columns_df = pd.DataFrame(df.dtypes).reset_index()
# fea_columns_df.columns = ['columns_','type_']

In [202]:

class_list = ['v_bin', 'x_bin1', 'y_bin1', 'x_end', 'y_end','id', 'x_grid', 'y_grid', 'no_bin', 'dist_move_prev_bin_sen', 'v_bin_sen']
for i in class_list:
    df[i] = df[i].astype(str)
df['label'] = df['label'].astype(int)
MeanEnocodeFeature = class_list
ME = MeanEncoder(MeanEnocodeFeature, target_type='classification')
dfaa = ME.fit_transform(df, df['label'])

classification


##### K-fold mean-target 编码

In [210]:
from category_encoders.target_encoder import TargetEncoder 
from sklearn import base
from sklearn.model_selection import KFold

df = pd.DataFrame({'Feature':['A','B','B','B','B', 'A','B','A','A','B','A','A','B','A','A','B','B','B','A','A'],\
                   'Target':[1,0,0,1,1, 1,0,0,0,0,1, 0,1, 0,1,0,0,0,1,1]})

class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):

    def __init__(self, colnames,targetName,n_fold=5,verbosity=True,discardOriginal_col=False):

        self.colnames   = colnames
        self.targetName = targetName
        self.n_fold     = n_fold
        self.verbosity  = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        return self


    def transform(self,X):

        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)

        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold, shuffle = True, random_state=2019)



        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind] 
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())

        X[col_mean_name].fillna(mean_of_target, inplace = True)

        if self.verbosity:

            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
                                                                                      self.targetName,
                                                                                      np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
            

        return X
    
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self,train,colNames,encodedName):
        
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName
        
        
    def fit(self, X, y=None):
        return self

    def transform(self,X):


        mean = self.train[[self.colNames,self.encodedName]].groupby(self.colNames).mean().reset_index() 
        
        dd = {}
        for index, row in mean.iterrows():
            dd[row[self.colNames]] = row[self.encodedName]

        
        X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: dd})

        return X

In [211]:
targetc   = KFoldTargetEncoderTrain('Feature','Target',n_fold=5)
new_train = targetc.fit_transform(df)
new_train

Correlation between the new feature, Feature_Kfold_Target_Enc and, Target is 0.18053954978064135.


Unnamed: 0,Feature,Target,Feature_Kfold_Target_Enc
0,A,1,0.571429
1,B,0,0.375
2,B,0,0.333333
3,B,1,0.25
4,B,1,0.333333
5,A,1,0.571429
6,B,0,0.333333
7,A,0,0.625
8,A,0,0.571429
9,B,0,0.375


In [215]:
20000*0.08

1600.0

##### WOE编码1

In [170]:
df.loc[~(df['label'] ==0),'label_0'] = 0 
df.loc[(df['label'] ==0),'label_0'] = 1 
df.loc[~(df['label'] ==1),'label_1'] = 0 
df.loc[(df['label'] ==1),'label_1'] = 1 
df.loc[~(df['label'] ==2),'label_2'] = 0 
df.loc[(df['label'] ==2),'label_2'] = 1 

In [171]:
from category_encoders import WOEEncoder 
enc = WOEEncoder(cols=class_list) 
df_label_0 = enc.fit_transform(df, df['label_0'])
df_label_1 = enc.fit_transform(df, df['label_1'])
df_label_2 = enc.fit_transform(df, df['label_2'])


In [174]:
df_label_0[class_list]

Unnamed: 0,v_bin,x_bin1,y_bin1,x_end,y_end,id,x_grid,y_grid,no_bin,dist_move_prev_bin_sen,v_bin_sen
0,-0.036669,5.149837,5.156617,-0.000107,-0.000107,6.165758,-0.000107,-0.241137,-0.241137,6.165758,6.165758
1,-2.479652,-1.226889,-1.226889,-0.000107,-0.000107,-5.799020,-0.000107,-0.241137,-0.241137,-5.799020,-5.799020
2,-0.036669,5.149837,5.156617,-0.000107,-0.000107,6.165758,-0.000107,-0.241137,-0.241137,6.165758,6.165758
3,-2.037820,-1.226889,-1.226889,-0.000107,-0.000107,-5.799020,-0.000107,-0.241137,-0.241137,-5.799020,-5.799020
4,-0.036669,5.149837,5.156617,-0.000107,-0.000107,6.165758,-0.000107,-0.241137,-0.241137,6.165758,6.165758
...,...,...,...,...,...,...,...,...,...,...,...
2996,2.299471,1.545699,1.545699,-0.000107,-0.000107,5.853137,-0.000107,2.461990,2.461990,5.853137,5.853137
2997,2.140406,1.545699,1.545699,-0.000107,-0.000107,5.853137,-0.000107,2.238846,2.238846,5.853137,5.853137
2998,2.299471,1.545699,1.545699,-0.000107,-0.000107,5.853137,-0.000107,2.238846,2.238846,5.853137,5.853137
2999,2.916245,1.545699,1.545699,-0.000107,-0.000107,5.853137,-0.000107,1.951164,1.951164,5.853137,5.853137


#### Weight of evidence

In [None]:
'''
    代码摘自：https://github.com/Sundar0989/WOE-and-IV
'''
import os
import pandas as pd
import numpy as np


 
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):
    
    df1      = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss  = df1[['X','Y']][df1.X.notnull()]
    r        = 0
    
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1 
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n    = force_bin         
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)}) 
        d2 = d1.groupby('Bucket', as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"]     = d2.count().Y
    d3["EVENT"]     = d2.sum().Y  # 正样本
    d3["NONEVENT"]  = d2.count().Y - d2.sum().Y # 负样本
    d3              = d3.reset_index(drop=True)
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"]     = d3.EVENT/d3.COUNT       # 正样本类内百分比
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT    # 负样本类内百分比
    
    d3["DIST_EVENT"]     = d3.EVENT/d3.sum().EVENT # 正的样本占所有正样本百分比
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT # 负的样本占所有负样本百分比
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]       
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    
    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    
    return(d3) 


def data_vars(df1, target):
    
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    
    x = df1.dtypes.index
    count = -1
    
    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i            
                count = count + 1
                
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv,ignore_index=True)
     
    return iv_df 
# df = pd.read_csv('./data/bank.csv',sep=';')
# dic = {'yes':1, 'no':0}
# df['target'] = df['y'].map(dic)
# df = df.drop('y',axis=1) 

final_iv = data_vars(df[class_list],df['label'])

##### 直方图编码


In [178]:

def histogram_encoding(X,y):
    category=list(set(X))
    labels=list(set(y))
    data=pd.concat([X,pd.DataFrame(y)],axis=1)
    data.columns=['data','labels']
    dictionary={}
    for item in category:
        temp=data[data['data']==item]
        tp=temp['labels'].value_counts()
        if tp.shape[0]<len(labels):
            for label in labels:
                if label not in tp.index:
                    tp[label]=0
        nums=tp.tolist()
        sums=sum(nums)
        nums=[items*1.0/sums for items in nums] ### 这里sums如果-1就是one leave out的分类问题形式
        ##其实问题差别不是很大，数据量一般都是至少几十万的级别的这么一个数据点的删除与否没什么大影响
        dictionary[item]=nums
    hs_enc=X.copy()
    hs_enc=hs_enc.values.tolist()
    for i in range(len(hs_enc)):
        hs_enc[i]=dictionary[hs_enc[i]]
    return hs_enc,dictionary

In [183]:
df.columns

Index(['id', 'x', 'y', 'v', 'dir', 'time', 'label', 'base_dis_diff', 'date',
       'hour',
       ...
       '3y_str_2', '3y_str_3', '3y_str_4', '3y_str_5', '3y_str_6', '3y_str_7',
       '3y_str_8', 'label_0', 'label_1', 'label_2'],
      dtype='object', length=397)

In [196]:
hs_enc,dictionary = histogram_encoding(df[class_list[0]],df['label'])

In [191]:
len(hs_enc)

3001

##### target 编码

In [197]:
from category_encoders import *
import pandas as pd
# from sklearn.datasets import load_boston
# bunch = load_boston()
# y = bunch.target
# X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
enc = TargetEncoder(cols=['x']).fit(df['x'],df['label'])
numeric_dataset = enc.transform(df['x'])
print(numeric_dataset.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3001 entries, 0 to 3000
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       3001 non-null   float64
dtypes: float64(1)
memory usage: 126.9 KB
None


## 参考文献

[特征编码方法总结—part1](https://www.zhihu.com/search?type=content&q=WOEEncoder)

[Python - pandas - groupby+agg聚合重命名解决办法](https://blog.csdn.net/qq_24256877/article/details/108732042)