In [77]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

import matplotlib as mpl
import seaborn as sns

import time
from functools import wraps

import pysnooper


def fn_timer(function):
  @wraps(function)
  def function_timer(*args, **kwargs):
    t0 = time.time()
    result = function(*args, **kwargs)
    t1 = time.time()
    print ("Total time running %s: %s seconds" %
        (function.__name__, str(t1-t0)))
    return result
  return function_timer

%matplotlib inline
dataset = pd.read_csv("Dataset/trainData/ANTIF_TRAIN_SET.csv",header=0,index_col=0)
dataset_x=dataset.drop(['target'],axis=1)
dataset_y=dataset['target']
X_train,X_test,Y_train,Y_test=train_test_split(dataset_x,dataset_y,test_size=0.1,random_state=0)

def balance(x,y):
    columns=x.columns
    sm=SMOTE(random_state=0)
    X,Y=sm.fit_sample(x,y)
    X=pd.DataFrame(X)
    X_train.columns=columns
    Y=pd.Series(Y)
    Y.name='target'
    return X,Y

def split_list(x,y):
    for i in range(len(x.columns)):
        x.iloc[:,i]=pd.cut(x.iloc[:,i],5,retbins=False)
    return x,y

def convert2one_hot(x,y):
    x=pd.get_dummies(x)
    return x,y



In [99]:
import woe.feature_process as fp
import woe.eval as eval

def split_list(x,y,path='',isread=False):
    #%% woe分箱, iv and transform
    x=x.copy()
    if(isread==False):
        splited_list=pd.DataFrame([],columns=['split_list','feature_name'])
        data=x.copy() #用于存储所有数据的woe值
        data['target']=y
        data_woe=data
        civ_list = []
        n_positive = sum(y)
        n_negtive = len(y) - n_positive
        min_sample=0.03*len(data)
        alpha=0.01  #用于自动计算分箱时的一个标准，默认0.01.如果iv_划分>iv_不划分*（1+alpha)则划分。
        for column in x.columns:
            if x[column].dtypes == 'object':
                #如果这列是标签列，则使用proc_woe_discrete
                civ = fp.proc_woe_discrete(data, column, n_positive, n_negtive, min_sample, alpha=0.01)
            else:     
                #如果这列是连续变量，则使用proc_woe_continuous
                civ = fp.proc_woe_continuous(data, column, n_positive, n_negtive, min_sample, alpha=0.01)
            df=pd.DataFrame([float('-inf')]+civ.split_list+[float('inf')],columns=['split_list'])
            df['feature_name']=column
            splited_list=pd.concat([splited_list,df],axis=0)
            x[column]=pd.cut(x[column],[float('-inf')]+civ.split_list+[float('inf')])

        if(path!=''):
            splited_list.to_csv(path)
    else:
        splited_list=pd.read_csv(path,header=0,index_col=0)
        for column in x.columns:
            x[column]=pd.cut(x[column],splited_list[splited_list['feature_name']==column]['split_list'])
        
    
    '''civ_list.append(civ)
        data_woe[column] = fp.woe_trans(data[column], civ)

    civ_df = eval.eval_feature_detail(civ_list,'woe特征分箱.csv')
    #删除iv值过小的变量
    iv_thre = 0.001
    iv = civ_df[['var_name','split_list','iv']].drop_duplicates()
    x_columns = iv[iv.iv > iv_thre]'''
    
    return x,y,splited_list

x,y,splited_list=split_list(X_train.iloc[:,0:2],Y_train,'分桶/woe分桶.csv',True)

0          -inf
1     11.531420
2     13.235880
3     14.270040
4     14.990680
5     15.761000
6     16.497600
7     17.058200
8     17.782400
9     18.358376
10    18.931800
11    19.510218
12    20.353800
13    20.919400
14    21.636816
15    22.169000
16    22.688956
17    23.246540
18    24.080072
19    24.932482
20    25.752976
21    26.597800
22    27.598400
23    28.530400
24    29.728890
25    33.402556
26          inf
Name: split_list, dtype: float64
0          -inf
1    -16.227800
2    -14.737150
3    -13.039000
4    -11.660400
5    -10.214400
6     -8.857284
7     -7.912800
8     -6.819800
9     -5.935200
10    -4.618192
11    -3.876448
12    -3.175112
13    -2.278804
14    -1.484960
15    -0.412800
16     0.328360
17     1.118760
18     2.176400
19     3.701008
20     5.019588
21     6.026654
22     7.167246
23    10.928608
24          inf
Name: split_list, dtype: float64


# 训练数据保存

In [29]:
X_train,Y_train=balance(X_train,Y_train)
X_train,Y_train=split_list(X_train,Y_train)
X_train,Y_train=convert2one_hot(X_train,Y_train)
print(X_train.shape)

X_test,Y_test=split_list(X_test,Y_test)
X_test,Y_test=convert2one_hot(X_test,Y_test)
print(X_test.shape)

dataset_x,dataset_y=balance(dataset_x,dataset_y)
dataset_x,dataset_y=split_list(dataset_x,dataset_y)
dataset_x,dataset_y=convert2one_hot(dataset_x,dataset_y)
print(dataset_x.shape)

dataset_save=dataset_x
dataset_save['target']=dataset_y
dataset_train_save=X_train
dataset_train_save['target']=Y_train
dataset_test_save=X_test
dataset_test_save['target']=Y_test 

print(dataset_save.shape)
print(dataset_train_save.shape)
print(dataset_test_save.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/

(215882, 1000)
(194306, 1000)
(12000, 1000)


In [30]:
dataset_save.to_csv("Dataset/trainData/ANTIF_TRAIN_SET_(分桶+onehot+平衡)全量.csv")
dataset_train_save.to_csv("Dataset/trainData/ANTIF_TRAIN_SET_(分桶+onehot+平衡).csv")
dataset_test_save.to_csv("Dataset/trainData/ANTIF_TRAIN_SET_(分桶+onehot+平衡)线下测试集.csv")
pd.read_csv("Dataset/trainData/ANTIF_TRAIN_SET_(分桶+onehot+平衡).csv",header=0,index_col=0)

In [38]:
pd.read_csv("Dataset/trainData/ANTIF_TRAIN_SET_(分桶+onehot+平衡)线下测试集.csv",header=0,index_col=0).shape

(12000, 1001)

# 1.数据平衡

## 1.1下采样

In [32]:
df_y0=dataset_y[dataset_y==0].iloc[0:12000]
df_y1=dataset_y[dataset_y==1]
df_x0=dataset_x[dataset_y==0].iloc[0:12000]
df_x1=dataset_x[dataset_y==1]
dataset_x=pd.concat([df_x0,df_x1],axis=0)
dataset_y=pd.concat([df_y0,df_y1],axis=0)

## 1.2 SMOTE

In [6]:
def balance(x,y):
    sm=SMOTE(random_state=0)
    X,Y=sm.fit_sample(x,y)
    return pd.DataFrame(X),pd.Series(Y)

#X_train,Y_train=smote(X_train,Y_train)


# 2.分箱
对应的分箱方法：
A. 无监督： (1) 等宽 (2) 等频  (3) 聚类

B. 有监督 (1) 卡方分箱法(ChiMerge)  (2) ID3、C4.5、CART等单变量决策树算法  (3)  信用评分建模的IV最大化分箱 等

以下基于CART算法对连续变量进行最优分箱

由于CART是决策树分类算法，所以相当于是单变量决策树分类。

简单介绍下理论：

CART是二叉树，每次仅进行二元分类，对于连续性变量，方法是依次计算相邻两元素值的中位数，将数据集一分为二，计算该点作为切割点时的基尼值较分割前的基尼值下降程度，每次切分时，选择基尼下降程度最大的点为最优切分点，再将切分后的数据集按同样原则切分，直至终止条件为止。

关于CART分类的终止条件：视实际情况而定，以下设置为 a.每个叶子节点的样本量>=总样本量的5%   b.内部节点再划分所需的最小样本数>=总样本量的10%

## 2.1 CART分箱

In [None]:
def calc_score_median(sample_set, var):
    '''
    计算相邻评分的中位数，以便进行决策树二元切分
    param sample_set: 待切分样本
    param var: 分割变量名称
    '''
    var_list = list(np.unique(sample_set[var]))
    var_median_list = []
    for i in range(len(var_list) -1):
        var_median = (var_list[i] + var_list[i+1]) / 2
        var_median_list.append(var_median)
    return var_median_list

#var表示需要进行分箱的变量名，返回一个样本变量中位数的list
def choose_best_split(sample_set, var, min_sample):
    '''
    使用CART分类决策树选择最好的样本切分点
    返回切分点
    param sample_set: 待切分样本
    param var: 分割变量名称
    param min_sample: 待切分样本的最小样本量(限制条件)
    '''
    # 根据样本评分计算相邻不同分数的中间值
    score_median_list = calc_score_median(sample_set, var)
    median_len = len(score_median_list)
    sample_cnt = sample_set.shape[0]
    sample1_cnt = sum(sample_set['target'])
    sample0_cnt =  sample_cnt- sample1_cnt
    Gini = 1 - np.square(sample1_cnt / sample_cnt) - np.square(sample0_cnt / sample_cnt)
    
    bestGini = 0.0; bestSplit_point = 0.0; bestSplit_position = 0.0
    for i in range(median_len):
        left = sample_set[sample_set[var] < score_median_list[i]]
        right = sample_set[sample_set[var] > score_median_list[i]]
        
        left_cnt = left.shape[0]; right_cnt = right.shape[0]
        left1_cnt = sum(left['target']); right1_cnt = sum(right['target'])
        left0_cnt =  left_cnt - left1_cnt; right0_cnt =  right_cnt - right1_cnt
        left_ratio = left_cnt / sample_cnt; right_ratio = right_cnt / sample_cnt
        
        if left_cnt < min_sample or right_cnt < min_sample:
            continue
        
        Gini_left = 1 - np.square(left1_cnt / left_cnt) - np.square(left0_cnt / left_cnt)
        Gini_right = 1 - np.square(right1_cnt / right_cnt) - np.square(right0_cnt / right_cnt)
        Gini_temp = Gini - (left_ratio * Gini_left + right_ratio * Gini_right)
        if Gini_temp > bestGini:
            bestGini = Gini_temp; bestSplit_point = score_median_list[i]
            if median_len > 1:
                bestSplit_position = i / (median_len - 1)
            else:
                bestSplit_position = i / median_len
        else:
            continue
               
    Gini = Gini - bestGini
    return bestSplit_point, bestSplit_position

#min_sample 参数为最小叶子节点的样本阈值，如果小于该阈值则不进行切分，如前面所述设置为整体样本量的5%
#返回的结果我这里只返回了最优分割点，如果需要返回其他的比如GINI值，可以自行添加。
def bining_data_split(sample_set, var, min_sample, split_list):
    '''
    划分数据找到最优分割点list
    param sample_set: 待切分样本
    param var: 分割变量名称
    param min_sample: 待切分样本的最小样本量(限制条件)
    param split_list: 最优分割点list
    '''
    split, position = choose_best_split(sample_set, var, min_sample)
    if split != 0.0:
        split_list.append(split)
    # 根据分割点划分数据集，继续进行划分
    sample_set_left = sample_set[sample_set[var] < split]
    sample_set_right = sample_set[sample_set[var] > split]
    # 如果左子树样本量超过2倍最小样本量，且分割点不是第一个分割点，则切分左子树
    if len(sample_set_left) >= min_sample * 2 and position not in [0.0, 1.0]:
        bining_data_split(sample_set_left, var, min_sample, split_list)
    else:
        None
    # 如果右子树样本量超过2倍最小样本量，且分割点不是最后一个分割点，则切分右子树
    if len(sample_set_right) >= min_sample * 2 and position not in [0.0, 1.0]:
        bining_data_split(sample_set_right, var, min_sample, split_list)
    else:
        None
        
#split_list 参数是用来保存返回的切分点，每次切分后返回的切分点存入该list
#在这里判断切分点分割的左子树和右子树是否满足“内部节点再划分所需的最小样本数>=总样本量的10%”的条件，
#如果满足则进行递归调用。
@fn_timer
def get_bestsplit_list(sample_set, var):
    '''
    根据分箱得到最优分割点list
    param sample_set: 待切分样本
    param var: 分割变量名称
    '''
    # 计算最小样本阈值（终止条件）
    min_df = sample_set.shape[0] * 0.05
    split_list = []
    # 计算第一个和最后一个分割点
    bining_data_split(sample_set, var, min_df, split_list)
    return split_list


split_list=get_bestsplit_list(dataset,['attr_0'])
split_list

## 2.2 等宽分箱
https://www.cnblogs.com/sench/p/10128216.html


In [22]:
def split_list(x,y):
    for i in range(len(x.columns)):
        x.iloc[:,i]=pd.cut(x.iloc[:,i],5,retbins=False)
    return x,y
#X_train,Y_train=split_list(X_train,Y_train)


# 3.one-hot

In [5]:
def convert2one_hot(x,y):
    x=pd.get_dummies(x)
    return x,y
#X_train,Y_train=convert2one_hot(X_train,Y_train)
