In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils.utils import *

# 设置单元格所有行全部输出模式
from IPython.core.interactiveshell import InteractiveShell

# 设置InteractiveShell对象的ast_node_interactivity的值为all，即表示每一个单元格所有有输出的代码行全部需要输出
InteractiveShell.ast_node_interactivity = 'all'

In [69]:
df = pd.read_csv('./data/loan_status-3.csv')

In [70]:
# 展示基本的分箱过程
def Binning(df,method='cut',bins=2):
    '''
    分箱方法：离散化实现
    df:参与分箱的数据集
    method:选用的分箱方法，默认为cut，表示默认采用等宽分箱；可选参数qcut，表示采用等频分箱
    bins:分箱个数，默认为2
    '''
    if method == 'cut':
        
        for col in df.columns:
            value = pd.cut(df[col],bins=bins) # retbins参数会多让cut方法多返回一个结果值
            tmp = value.to_frame()
            tmp.columns = [col]
            df[col] = tmp
    elif method == 'qcut':
        
         for col in df.columns:
            value = pd.qcut(x=df[col],q=bins,duplicates='drop')
            tmp = value.to_frame()
            tmp.columns = [col]
            df[col] = tmp
    
    return df
    
# df[['ApplicantIncome','CoapplicantIncome','LoanAmount']] = Binning(df[['ApplicantIncome','CoapplicantIncome','LoanAmount']],bins=4)

In [40]:
# 等宽分箱，指定bins个数，完成分箱
df[['ApplicantIncome','CoapplicantIncome','LoanAmount']] = Binning(df[['ApplicantIncome','CoapplicantIncome','LoanAmount']],method='cut',bins=4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = tmp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = tmp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = tmp


In [41]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status-N,Loan_Status-Y
0,1,1,1,0,0,"(0.0437, 1.493]","(0.14, 1.191]","(0.564, 1.67]",8,1,0,1,0
1,1,1,0,0,1,"(-1.406, 0.0437]","(-0.915, 0.14]","(-1.647, -0.542]",8,1,2,0,1
2,1,1,0,1,0,"(-1.406, 0.0437]","(0.14, 1.191]","(-0.542, 0.564]",8,1,2,0,1
3,1,0,0,0,0,"(1.493, 2.943]","(-0.915, 0.14]","(0.564, 1.67]",8,1,2,0,1
4,1,1,0,1,0,"(-1.406, 0.0437]","(0.14, 1.191]","(-0.542, 0.564]",8,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,0,0,0,0,0,"(1.493, 2.943]","(-0.915, 0.14]","(-0.542, 0.564]",8,0,0,1,0
538,0,0,1,0,0,"(0.0437, 1.493]","(-0.915, 0.14]","(-0.542, 0.564]",8,1,0,1,0
539,1,1,1,0,0,"(-1.406, 0.0437]","(0.14, 1.191]","(-0.542, 0.564]",8,0,2,1,0
540,0,0,0,0,0,"(0.0437, 1.493]","(-0.915, 0.14]","(-0.542, 0.564]",8,1,0,1,0


In [49]:
# 针对分箱完毕的特征，使用标签编码转换为数值表示，完成最终的离散化
for col in ['ApplicantIncome','CoapplicantIncome','LoanAmount']:
    
    # 通过encoder_feature方法完成特征编码
    df[[col]] = encoder_feature(df[[col]]).reshape(-1,1)
    
    # 验证
    df[col].value_counts()


  y = column_or_1d(y, warn=True)


1    314
2    171
3     38
0     19
Name: ApplicantIncome, dtype: int64

  y = column_or_1d(y, warn=True)


0    318
1    169
2     46
3      9
Name: CoapplicantIncome, dtype: int64

  y = column_or_1d(y, warn=True)


2    236
3    161
1     98
0     47
Name: LoanAmount, dtype: int64

In [50]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status-N,Loan_Status-Y
0,1,1,1,0,0,2,1,3,8,1,0,1,0
1,1,1,0,0,1,1,0,1,8,1,2,0,1
2,1,1,0,1,0,1,1,2,8,1,2,0,1
3,1,0,0,0,0,3,0,3,8,1,2,0,1
4,1,1,0,1,0,1,1,2,8,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,0,0,0,0,0,3,0,2,8,0,0,1,0
538,0,0,1,0,0,2,0,2,8,1,0,1,0
539,1,1,1,0,0,1,1,2,8,0,2,1,0
540,0,0,0,0,0,2,0,2,8,1,0,1,0


In [51]:
df.to_csv('./data/loan_status-4.csv',index=False)

细节化的说明：关于分箱后是否还需继续进行标签编码将区间转换为数值；
- 如果进行了分箱处理的特征在分箱完毕后表达为了一种无序的分类特征，则可以选择不再进行标签编码；
- 但是：如果进行了分箱处理的特征在分箱完毕后仍然会表达为一种有序分类特征，或者说模型建模要求中明确给定了：分箱后必须编码，则需要进行标签编码；


In [71]:
# 等频分箱，指定bins个数，完成分箱
df[['ApplicantIncome','CoapplicantIncome','LoanAmount']] = 
Binning(df[['ApplicantIncome','CoapplicantIncome','LoanAmount']],method='qcut',bins=3)
# 针对分箱完毕的特征，使用标签编码转换为数值表示，完成最终的离散化
for col in ['ApplicantIncome','CoapplicantIncome','LoanAmount']:
    
    # 通过encoder_feature方法完成特征编码
    df[[col]] = encoder_feature(df[[col]]).reshape(-1,1)
    
    # 验证
    df[col].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = tmp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = tmp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = tmp
  y = column_or_1d(y, warn=True)


2    181
0    181
1    180
Name: ApplicantIncome, dtype: int64

  y = column_or_1d(y, warn=True)


0    361
1    181
Name: CoapplicantIncome, dtype: int64

  y = column_or_1d(y, warn=True)


1    193
0    182
2    167
Name: LoanAmount, dtype: int64

In [72]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status-N,Loan_Status-Y
0,1,1,1,0,0,2,0,2,8,1,0,1,0
1,1,1,0,0,1,1,0,0,8,1,2,0,1
2,1,1,0,1,0,0,1,1,8,1,2,0,1
3,1,0,0,0,0,2,0,2,8,1,2,0,1
4,1,1,0,1,0,0,0,0,8,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,0,0,0,0,0,2,0,1,8,0,0,1,0
538,0,0,1,0,0,2,0,1,8,1,0,1,0
539,1,1,1,0,0,0,1,1,8,0,2,1,0
540,0,0,0,0,0,2,0,1,8,1,0,1,0
