# 数据预处理

## 数据导入

In [3]:
import numpy as np
import pandas as pd

df = pd.read_csv('./Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## 数据清理

In [4]:
repl_columns = [ 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection','TechSupport', 'StreamingTV', 'StreamingMovies']
for i in repl_columns:
    df[i] = df[i].replace({ 'No internet service': 'No'})
df['MultipleLines'] = df['MultipleLines'].replace({'No phone service': 'No'})

# 替换值SeniorCitizen
df["SeniorCitizen"] = df["SeniorCitizen"].replace({1: "Yes", 0: "No"}) 

# 替换值TotalCharges
df[ 'TotalCharges'] = df[ 'TotalCharges'].replace( ' ', np.nan)
# TotalCharges空值：数据量小，直接删除
df = df.dropna(subset=[ 'TotalCharges'])
df.reset_index(drop= True, inplace= True) # 重置索引

## 数据的选择与转换

### 数据选择

区分离散和连续变量

**改版说明**
这里不再对使用期限做手动的划分,并添加新的属性进去，
这对于整体的数据来说是冗余数据，而冗余数据对于大部分的分类模型来说都有不好的影响，
因此我在这里删掉了这一步，交由模型自行使用原始的数值变量值做完整的判断。

后面我们会进行对比实验：加入这个认为构造的属性和不加如这个认为构造的属性对于性能的影响

In [5]:
df[ 'TotalCharges'] = df[ 'TotalCharges'].astype( 'float')
Id_col = ['customerID']
target_col = ['Churn']
cat_cols = df.nunique()[df.nunique() < 10].index.tolist()
num_cols = [i for i in df.columns if i not in cat_cols + Id_col]

print( '类别型字段：n', cat_cols)
print( '-' * 30)
print( '数值型字段：n', num_cols)

类别型字段：n ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']
------------------------------
数值型字段：n ['tenure', 'MonthlyCharges', 'TotalCharges']


### 数据转换

In [6]:
from sklearn.preprocessing import LabelEncoder
df_model = df
Id_col = ['customerID']
target_col = ['Churn']
# 分类型
cat_cols = df_model.nunique()[df_model.nunique() < 10].index.tolist()
# 二分类属性
binary_cols = df_model.nunique()[df_model.nunique() == 2].index.tolist()
# 多分类属性
multi_cols = [i for i in cat_cols if i not in binary_cols]
# 数值型
num_cols = [i for i in df_model.columns if i not in cat_cols + Id_col]
# 二分类-标签编码
le = LabelEncoder()
for i in binary_cols:
    df_model[i] = le.fit_transform(df_model[i])
# 多分类-哑变量转换
df_model = pd.get_dummies(data=df_model, columns=multi_cols)
df_model.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,0,1,0,1,0,0,0,1,...,True,False,False,True,False,False,False,False,True,False
1,5575-GNVDE,1,0,0,0,34,1,0,1,0,...,True,False,False,False,True,False,False,False,False,True
2,3668-QPYBK,1,0,0,0,2,1,0,1,1,...,True,False,False,True,False,False,False,False,False,True
3,7795-CFOCW,1,0,0,0,45,0,0,1,0,...,True,False,False,False,True,False,True,False,False,False
4,9237-HQITU,0,0,0,0,2,1,0,0,0,...,False,True,False,True,False,False,False,False,True,False


### 将转换后的数据定义为自变量和标签值

这个数据是没有经过特征筛选的，因为不是所有的模型都需要经过特征筛选才能使用，有一些的特征选择是在模型中自己隐形选择的，因此我们在这里要保留一份最完整特征版的数据

In [47]:
X = df_model.copy().drop(['customerID','Churn'], axis=1)
print(X.shape)
y = df_model[target_col]

(7032, 26)


### 数据归一化

In [49]:
from sklearn.preprocessing import StandardScaler

st= StandardScaler()

X_data_norm = pd.DataFrame(st.fit_transform(X[num_cols]), columns=num_cols)
X_data_norm = pd.concat([X.drop(num_cols, axis= 1), X_data_norm], axis= 1)
X = X_data_norm

## 数据平衡性改善

因为我们发现流失客户和未流失客户的样本数量是不平衡的，因此使用上采样的方式增加流失客户的数据量从而使数据平衡性得到改善

In [50]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X.shape)
print(X_resampled.shape)
print(y['Churn'].value_counts())
print(y_resampled['Churn'].value_counts())

(7032, 26)
(10326, 26)
Churn
0    5163
1    1869
Name: count, dtype: int64
Churn
0    5163
1    5163
Name: count, dtype: int64


观察上述记过我们可以看到经过上采样之后，我们的y对应的两种标签值的数据数量一致了，具体观察对应的y值分布变化可以看出来是将流失客户数据进行了补齐，补齐到了和未流失客户数据两相同的程度

## 数据压缩

### PCA主成分分析法对数据实现不同程度的无监督压缩

In [52]:
from sklearn.decomposition import PCA

pca5 = PCA(n_components=5,copy=True,whiten=False,svd_solver="auto",
tol=0.0,iterated_power="auto",random_state=None)

pca10 = PCA(n_components=10,copy=True,whiten=False,svd_solver="auto",
tol=0.0,iterated_power="auto",random_state=None)

pca20 = PCA(n_components=20,copy=True,whiten=False,svd_solver="auto",
tol=0.0,iterated_power="auto",random_state=None)

pca_Compress_to_5X = pca5.fit_transform(X)
pca_Compress_to_10X = pca10.fit_transform(X)
pca_Compress_to_20X = pca20.fit_transform(X)

### 使用特征检定的方式进行特征筛选

In [53]:
from sklearn.feature_selection import SelectKBest, f_classif

fs5 = SelectKBest(score_func=f_classif, k=5) 
fs10 = SelectKBest(score_func=f_classif, k=10) 
fs20 = SelectKBest(score_func=f_classif, k=20) 

X_train_fs5 = fs5.fit_transform(X, y)
X_train_fs10 = fs10.fit_transform(X, y)
X_train_fs20 = fs20.fit_transform(X, y)

def SelectName(feature_data, model):
    scores = model.scores_
    indices = np.argsort(scores)[::-1]
    return list(feature_data.columns.values[indices[0:model.k]])

# 输出选择变量名称
fea_name = [i for i in X.columns if i in SelectName(X, fs5)]
st_Compress_to_5X = pd.DataFrame(X_train_fs5, columns = fea_name)
fea_name = [i for i in X.columns if i in SelectName(X, fs10)]
st_Compress_to_10X = pd.DataFrame(X_train_fs10, columns = fea_name)
fea_name = [i for i in X.columns if i in SelectName(X, fs20)]
st_Compress_to_20X = pd.DataFrame(X_train_fs20, columns = fea_name)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [54]:
st_Compress_to_20X.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,OnlineSecurity,OnlineBackup,TechSupport,PaperlessBilling,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.280248,-1.161694,-0.994194
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.064303,-0.260878,-0.17374
2,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.239504,-0.363923,-0.959649
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.512486,-0.74785,-0.195248
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.239504,0.196178,-0.940457


# 探索性分析

了解数据的基本分布情况，从而更好的对数据进行挖掘

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff

#目标变量Churn分布
df['Churn'].value_counts()

trace0 = go.Pie(labels=['未流失客户','流失客户'],
#                 labels=df['Churn'].value_counts().index, 
                values=df['Churn'].value_counts().values,
                hole=.5,
                rotation=90,
                marker=dict(colors=['rgb(154,203,228)', 'rgb(191,76,81)'], 
                            line=dict(color='white', width=1.3))
               )
data = [trace0] 
layout = go.Layout(title='目标变量Churn分布', font=dict(size=26))

fig = go.Figure(data=data, layout=layout)
py.offline.plot(fig, filename= '整体流失情况分布.html', auto_open=False)

'整体流失情况分布.html'

In [6]:
def plot_bar(input_col: str, target_col: str, title_name: str):
    cross_table = round(pd.crosstab(df[input_col], df[target_col], normalize='index')*100, 2)

    # 索引
    index_0 = cross_table.columns.tolist()[0] 
    index_1 = cross_table.columns.tolist()[1] 

    # 绘图轨迹
    trace0 = go.Bar(x=cross_table.index.tolist(), 
                    y=cross_table[index_0].values.tolist(), 
#                     name=index_0,
                    marker=dict(color='rgb(154,203,228)'),
                    name='未流失客户'
                   ) 
    trace1 = go.Bar(x=cross_table.index.tolist(), 
                    y=cross_table[index_1].values.tolist(), 
#                     name=index_1,
                    marker=dict(color='rgb(191,76,81)'),
                    name='流失客户'
                   ) 

    data = [trace0, trace1] 
    # 布局
    layout = go.Layout(title=title_name, bargap=0.4, barmode='stack', font=dict(size=26))
    
    # 画布
    fig = go.Figure(data=data, layout=layout)
    # 绘图
    py.offline.plot(fig, filename=f'./html/category_relation/{title_name}.html',auto_open=False) 

# 绘制数值型的数据在不同分布区间内流失和未流失客户的直方图可视化
def plot_histogram(input_col: str, title_name: str):
    churn_num = df[df['Churn'] == 'Yes'][input_col]
    not_churn_num = df[df['Churn'] == 'No'][input_col] 
    
    # 图形轨迹
    trace0 = go.Histogram(x=churn_num, 
                          bingroup=25,
                          histnorm='percent',
                          name='流失客户',
                          marker=dict(color='rgb(191,76,81)')
                         )
    trace1 = go.Histogram(x=not_churn_num, 
                          bingroup=25,
                          histnorm='percent',
                          name='未流失客户',
                          marker=dict(color='rgb(154,203,228)')
                         )

    data = [trace0, trace1]
    layout = go.Layout(title=title_name, font=dict(size=26))

    fig = go.Figure(data=data, layout=layout)
    py.offline.plot(fig, filename=f'./html/num_relation/{title_name}.html',auto_open=False) 

## 绘制不同类别型数据与是否流失的关系

In [7]:
#类别型数据与是否流失的关系
for char in cat_cols:
    plot_bar(input_col=char, target_col='Churn', title_name='{}与是否流失的关系'.format(char))
#数值类数据与是否流失的关系
for char in num_cols :
    plot_histogram(input_col=char, title_name='{}与是否流失的关系'.format(char))

# 建立模型评估的框架

## k-折交叉验证

In [222]:
from sklearn.model_selection import cross_val_score

def model_report_k_cross_val(model, X, y, name, k=10) :
    """
    Input: 
    model: 输入的模型实例
    X: 训练的自变量
    y: 与X一一对应的标签值
    name: 模型名字，打表用
    k: k折交叉验证

    Output:
    使用k折交叉验证给出的模型的指标结果,是DataFrame类型的数据
    默认使用10折
    """
    y = y.values.reshape(-1)
    accuracy = np.mean(cross_val_score(model, X, y, cv=k, scoring='accuracy'))
    recallscore = np.mean(cross_val_score(model, X, y, cv=k, scoring='recall'))
    precision = np.mean(cross_val_score(model, X, y, cv=k, scoring='precision'))
    # roc_auc = np.mean(cross_val_score(model, X, y, cv=k, scoring='roc_auc'))
    f1score = np.mean(cross_val_score(model, X, y, cv=k, scoring='f1'))

    # 保存数据
    df = pd.DataFrame({"Model"           : [name],
                       "Accuracy_score"  : [accuracy],
                       "Recall_score"    : [recallscore],
                       "Precision"       : [precision],
                       "f1_score"        : [f1score],
                    #    "Area_under_curve": [roc_auc],
                      })
    return df

# 模型声明与定义

这里先定义了最基础的未经调参的模型，用这些基础模型先对比各自的效果，并且控制这些基础模型变量不变，对比上面经过不同的数据压缩和选择算法之后的效果，下面将进行对比实验

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# 实例化模型

# 训练时间较短的1-5s左右
logit = LogisticRegression() 
decision_tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)  
knn = KNeighborsClassifier(n_neighbors=5) # knn的系数
gnb = GaussianNB()

# 训练时间中等短的5-10s

# lgbm 是一种决策树的集成算法，跟xgboost处于同一地位
lgbm_c = LGBMClassifier(boosting_type='gbdt', n_estimators=100, random_state=0) 

# 时间训练时间偏长的1min左右
xgc = XGBClassifier(n_estimators=100, eta=0.02, max_depth=15, random_state=0, learning_rate=0.001) # 1min
rfc = RandomForestClassifier(n_estimators=100, random_state=0) 
mlp_model = MLPClassifier(hidden_layer_sizes=(8,), alpha=0.05, max_iter=50000, 
                          activation='logistic', random_state=0)

# 训练时间特长的3mins往上
svc_lin  = SVC(kernel='linear', random_state=0, probability=True) 
svc_rbf  = SVC(kernel='rbf', random_state=0, probability=True) 

In [114]:
import plotly.figure_factory as ff
import plotly as py

# 定义绘制结果表格的函数
def draw_result(reports):
    model_performances = pd.concat(reports, axis=0).reset_index()
    model_performances = model_performances.drop(columns="index",axis =1)
    table  = ff.create_table(np.round(model_performances, 5))
    py.offline.iplot(table) 

## 对比不同数据压缩和变化在同一个模型上的效果

为了查看各种数据降维的效果以及数据上采样后的效果，我们进行了如下实验的设计，统一采用训练时间较快的基于朴素贝叶斯原理的模型GaussianNB做数据处理效果的对比

In [115]:

def data_preprocess_test(model):
    """
    这个函数输入一个模型，可以用这个模型在压缩过的数据和上采样过的数据上进行训练，
    给出各种经过预处理数据训练结果，以及最基础的未经预处理的数据训练的结果
    """
    # 对比各种不同压缩方法的效果

    ## pca不同程度的压缩
    pca5_report = model_report_k_cross_val(model, pca_Compress_to_5X, y, 'pac5')
    pca10_report = model_report_k_cross_val(model, pca_Compress_to_10X, y, 'pac10')
    pca20_report = model_report_k_cross_val(model, pca_Compress_to_20X, y, 'pac20')

    # 特征提取不同程度的压缩
    fs5_report = model_report_k_cross_val(model, st_Compress_to_5X, y, 'fs5')
    fs10_report = model_report_k_cross_val(model, st_Compress_to_10X, y, 'fs10')
    fs20_report = model_report_k_cross_val(model, st_Compress_to_20X, y, 'fs20')

    ## 对比上采样后的数据和未上采样的数据在同一个模型上的效果
    resample_report = model_report_k_cross_val(model, X_resampled, y_resampled, 'resampled')
    base_report = model_report_k_cross_val(model, X, y, 'base')

    draw_result([base_report, pca5_report, pca10_report, pca20_report,
                                    fs5_report, fs10_report, fs20_report,
                                    resample_report])

In [101]:
data_preprocess_test(gnb)

In [102]:
data_preprocess_test(logit)

In [103]:
data_preprocess_test(decision_tree)

通过上述实验结果我们可以看出来，通过上采样对于模型的效果提升最好，在所有的指标上有提升；同时我们也发现，两种数据降维的压缩方式对于模型反而有了负面效果，至少在贝叶斯分类器上是这样的效果，或许是由于朴素贝叶斯分类器对于冗余数据的抗干扰性较强，或是这些属性本身的相关程度就不大，因此自变量被减少之后，导致了信息量的真实损失，而不是解除了维度诅咒，因此导致效果变差。

因此得出结论，指导我在之后的操作过程中直接使用上采样的平衡数据，不要采用降维方法应该会得到更好的效果

# 使用上采样的数据将各种模型都跑一遍10折交叉验证对比效果

In [104]:
logit_report = model_report_k_cross_val(logit, X_resampled, y_resampled, 'logit')
svm_linear_report = model_report_k_cross_val(svc_lin, X_resampled, y_resampled, 'svc_linear')
svm_rbf_report = model_report_k_cross_val(svc_rbf, X_resampled, y_resampled, 'svc_rbf')
mlp_report = model_report_k_cross_val(mlp_model, X_resampled, y_resampled, 'mlp_model')
gnb_report = model_report_k_cross_val(gnb, X_resampled, y_resampled, 'Naive Bayes')
decision_report = model_report_k_cross_val(decision_tree, X_resampled, y_resampled, 'decision_tree')
rgc_report = model_report_k_cross_val(rfc, X_resampled, y_resampled, 'Random Forest Classifier')
xgc_report = model_report_k_cross_val(xgc, X_resampled, y_resampled, 'XGBoost Classifier')
knn_report = model_report_k_cross_val(knn, X_resampled, y_resampled, 'KNN Classifier')
lgbm_report = model_report_k_cross_val(lgbm_c, X_resampled, y_resampled, 'LGBM Classifier')


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_spar

In [116]:
draw_result([logit_report, svm_linear_report, svm_rbf_report,
                                mlp_report, gnb_report, decision_report,
                                rgc_report, lgbm_report, xgc_report])

观察结果我们发现使用了集成思想的后面三个模型的所有指标都显著高于前面的那些单一模型，尤其是随机森林的效果最好，召回率和auc面积都达到了97%以上的精确率，可以说是非常高的水平了，改良的效果很好,由于我们使用的是10折交叉验证，因此这个结果具有可信度，不是随机导致的某次结果很好产生的效果。


和未经数据上采样的各项指标对比，我们会发现各个模型要么有很大的提升，要么会有很小的提升，但是都没有出现负面的效果，而其中提升最显著的就是决策树相关的四种算法，说明决策树对于数据不平衡较为敏感，且在数据平衡后，能产生非常好的效果。

# 手动实现集成的过程

我们知道决策树是一种不稳定的算法，虽然skl库中已经提供了他的很多集成算法模型，但是为了手动体验这个过程，我打算自己实现一遍集成的算法看看能否提高性能，具体来说，我采用的bagging方法，实现了一个满足sklearn库模型接口的类my_bagging_decision_tree，这个自己实现的模型类中使用了bagging的方式集成了多个决策树模型，使用最大数投票原则给出了最终的分类结果，同时，实现了10折交叉验证的分数计算接口，实现了和打框架的特匹配和统一，可以直接调用cross_val_score函数进行交叉验证，给出模型效果指标。

In [224]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score, recall_score

class my_bagging_decision_tree(ClassifierMixin, BaseEstimator):
    def __init__(self, bagging_num=10, sample_rate=0.67):
        self.decision_tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)  
        self.bagging_num = bagging_num
        self.sample_rate = sample_rate
        self.bagging_models = []
        self.isPredicted = False
        super().__init__()

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.classes_ = sorted(set(y))
        for i in range(self.bagging_num):
            X_sample, _, y_sample, _ = train_test_split(X, y, test_size=(1 - self.sample_rate))
            self.bagging_models.append(self.decision_tree.fit(X_sample, y_sample))
        return self

    # 多数投票原则，每一个bagging的模型都跑一边得到结果
    def predict(self, X):
        self.result = np.zeros(len(X))
        for model in self.bagging_models:
            self.result = np.vstack((self.result, model.predict(X)))
        self.result = np.sum(self.result, axis=0) / self.bagging_num
        self.result = np.where(self.result > 0.5, 1, 0)
        self.isPredicted = True
        return self.result

    def precision(self, X, y):
        # 精确度计算代码
        if self.isPredicted == False:
            self.predictions = self.predict(X)
        return precision_score(y, self.predictions)

    def accuracy(self, X, y):
        # 精确度计算代码
        if self.isPredicted == False:
            self.predictions = self.predict(X)
        return accuracy_score(y, self.predictions)

    def f1(self, X, y):
        # 精确度计算代码
        if self.isPredicted == False:
            self.predictions = self.predict(X)
        return f1_score(y, self.predictions)

    def recall(self, X, y):
        if self.isPredicted == False:
            self.predictions = self.predict(X)
        return recall_score(y, self.predictions)

    def roc_auc(self, X, y):
        if self.isPredicted == False:
            self.predictions = self.predict(X)
        return roc_auc_score(y, self.predictions)

    def predict_proba(self, X):
        # 使用基础模型获取类别的概率
        self.prob = np.zeros(len(X))
        for model in self.bagging_models:
            self.prob = np.vstack((self.prob, model.predict_proba(X)))
        self.prob = np.sum(self.prob, axis=0) / self.bagging_num

        return self.prob

In [None]:
mbdt = my_bagging_decision_tree(bagging_num=10, sample_rate=0.67)
mbdt_report = model_report_k_cross_val(mbdt, X_resampled, y_resampled, "my_bagging_decision_tree")

In [219]:
print(mbdt_report)

                      Model  Accuracy_score  Recall_score  Precision  f1_score
0  my_bagging_decision_tree        0.764482      0.804393   0.747789  0.772631


# 集成不同类别的分类器

In [226]:
from sklearn.ensemble import VotingClassifier

# 创建一个投票分类器
ensemble_classifier = VotingClassifier(estimators=[
    ('logistic_regression', logit),
    ('decision_tree', decision_tree),
    ('random_forest', rfc)
], voting='soft')

mulit_classifier_report = model_report_k_cross_val(ensemble_classifier, X_resampled, y_resampled, "ensemble")
print(mulit_classifier_report)


      Model  Accuracy_score  Recall_score  Precision  f1_score  \
0  ensemble         0.84215      0.913416   0.799747  0.852614   

   Area_under_curve  
0           0.92651  


通过上面的实验，我们可以发现，如果集成的分类器有稳定的也有不稳定的，不稳定的分类器反而会将本来效果很好的稳定的分类器的性能给拉低

# 决策树调参

In [117]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import  GridSearchCV
parameters = { 'splitter': ( 'best', 'random'),
              'criterion': ( "gini", "entropy"),
              "max_depth": [* range( 3, 20)],
}

clf = DecisionTreeClassifier(random_state= 25)

GS = GridSearchCV(clf, parameters, scoring= 'f1', cv= 10)

refine_decision_tree_report = model_report_k_cross_val(GS, X_resampled, y_resampled, 'refined Decision Tree')

In [119]:
print(refine_decision_tree_report)

                   Model  Accuracy_score  Recall_score  Precision  f1_score  \
0  refined Decision Tree        0.877016      0.959704   0.823505  0.886037   

   Area_under_curve  
0          0.884341  


# 使用自己写的神经网络来完成这个任务

不需要再做单独的数据特征提取了，只需要将预处理好的所有特征都输入到网络中，然后让网络直接给出结果即可

## 多层感知机预测模型

In [None]:
# 定义数据集
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# 步骤1：准备数据集
class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        y = self.labels[idx]
        return x, y


X_data = df_model.drop(columns=["customerID", "Churn"])
X_label = df_model["Churn"]

X_data_norm = pd.DataFrame(st.fit_transform(X_data[num_cols]), columns=num_cols)
X_data_norm = pd.concat([X_data.drop(num_cols, axis= 1), X_data_norm], axis= 1)

import random

# 假设 data 是你的数据集，labels 是对应的标签
data = X_data_norm.values.astype(float)  # numpy强制类型转换
labels = X_label.values.astype(int)
# 将数据类型统一
print(type(data))
data = torch.from_numpy(data).float()
labels = torch.from_numpy(labels).long()

# 首先，确保 data 和 labels 一一对应

# 随机打乱数据集
random.seed(42)  # 设置随机种子以确保可重复性
print(len(data))
shuffled_indices = list(range(len(data)))
random.shuffle(shuffled_indices)

shuffled_data = []
shuffled_labels = []

for i in range(len(data)):
    shuffled_data.append(data[shuffled_indices[i]])
    shuffled_labels.append(labels[shuffled_indices[i]])

# 划分数据集
total_samples = len(shuffled_data)
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

train_split = int(total_samples * train_ratio)
val_split = train_split + int(total_samples * val_ratio)

train_data = shuffled_data[:train_split]
train_labels = shuffled_labels[:train_split]

val_data = shuffled_data[train_split:val_split]
val_labels = shuffled_labels[train_split:val_split]

test_data = shuffled_data[val_split:]
test_labels = shuffled_labels[val_split:]

# 创建一个CustomDataset实例来表示训练数据集
train_dataset = CustomDataset(train_data, train_labels)
valid_dataset = CustomDataset(val_data, val_labels)
test_dataset = CustomDataset(test_data, test_labels)

# 步骤2：创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)
valid_loader = DataLoader(valid_dataset, batch_size=64)


<class 'numpy.ndarray'>
7032


In [None]:
# 步骤3：定义模型
class SimpleModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, int(hidden_size / 2))
        self.fc3 = nn.Linear(int(hidden_size / 2), int(hidden_size / 4))
        self.fc4 = nn.Linear(int(hidden_size / 4), num_classes)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        # x = self.relu(x)
        # x = self.fc4(x)
        return x

model = SimpleModel(input_size=32, hidden_size=100, num_classes=2)

# 步骤4：定义损失函数
criterion = nn.CrossEntropyLoss()

# 步骤5：选择优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 步骤6：训练模型
num_epochs = 1000
last_f1 = 0.0

for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model.forward(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    correct = 0
    total = 0

    real_one = 0
    pre_one_true = 0 
    pre_one_false = 0
    pre_zero_true = 0
    pre_zero_false = 0

    # 使用f1值作为过拟合点判断
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model.forward(inputs)
            _, predicted = torch.max(outputs, 1)
            for pre in predicted:
                if pre == 1:
                    pre_one_false += (labels != predicted).sum().item()
                    pre_one_true += (labels == predicted).sum().item()
                else:
                    pre_zero_false += (labels != predicted).sum().item()
                    pre_zero_true += (labels == predicted).sum().item()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    recall = pre_one_true / (pre_one_true + pre_zero_false)
    precision = pre_one_true / (pre_one_true + pre_one_false)
    f1_score = 2 * precision * recall / (precision + recall)
    if last_f1 > f1_score:
        torch.save(model.state_dict(), 'model.pkl')
        print(f'last epoch : {epoch}, Accuracy on the valid set: {100 * accuracy:.2f}%')
        print(f'Accuracy on the test set: {100 * accuracy:.2f}%')
        print(f'Recall on the test set: {100 * recall:.2f}%')
        print(f'Precision on the test set: {100 * precision:.2f}%')
        print(f'F1-score on the test set: {100 * f1_score:.2f}%')
        break
    last_f1 = f1_score
    print(f'epoch : {epoch}')
    print(f'Accuracy set: {100 * accuracy:.2f}%')
    print(f'Recall set: {100 * recall:.2f}%')
    print(f'Precision set: {100 * precision:.2f}%')
    print(f'F1-score set: {100 * f1_score:.2f}%')

print("-"*10)

# 步骤7：评估模型
# 我们认为1是流失了，就是检测出来的一个用户，这个看成正例

real_one = 0
pre_one_true = 0 
pre_one_false = 0
pre_zero_true = 0
pre_zero_false = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model.forward(inputs)
        _, predicted = torch.max(outputs, 1)
        for pre in predicted:
            if pre == 1:
                pre_one_false += (labels != predicted).sum().item()
                pre_one_true += (labels == predicted).sum().item()
            else:
                pre_zero_false += (labels != predicted).sum().item()
                pre_zero_true += (labels == predicted).sum().item()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
recall = pre_one_true / (pre_one_true + pre_zero_false)
precision = pre_one_true / (pre_one_true + pre_one_false)
f1_score = 2 * precision * recall / (precision + recall)
print(f'Accuracy on the test set: {100 * accuracy:.2f}%')
print(f'Recall on the test set: {100 * recall:.2f}%')
print(f'Precision on the test set: {100 * precision:.2f}%')
print(f'F1-score on the test set: {100 * f1_score:.2f}%')

epoch : 0
Accuracy set: 79.83%
Recall set: 40.32%
Precision set: 79.86%
F1-score set: 53.58%
epoch : 1
Accuracy set: 80.21%
Recall set: 53.12%
Precision set: 80.16%
F1-score set: 63.89%
last epoch : 2, Accuracy on the valid set: 81.44%
Accuracy on the test set: 81.44%
Recall on the test set: 49.12%
Precision on the test set: 81.62%
F1-score on the test set: 61.33%
----------
Accuracy on the test set: 81.44%
Recall on the test set: 49.12%
Precision on the test set: 81.62%
F1-score on the test set: 61.33%


In [None]:
# 神经网络给出的重要性特征的探索
X_data_headers = X_data.columns.values.tolist()
print(X_data_headers)
# 根据权重参数进行排序，我们认为绝对值越的对于最终的结果影响越大，基于这个认知，我们进行下述操作
# 将每个特征对应的权重值进行求绝对值运算，然后将这个权重值进行累加，用这个累加后的结果参与排序，作为重要性指标
# 获取特征层的权重参数
feature_weight = model.state_dict()['fc1.weight']
abs_feature_weight = torch.abs(feature_weight)
sum_abs_feature_weight = list(abs_feature_weight.sum(dim=1))
print(sum_abs_feature_weight)


for i in range(len(X_data_headers)):
    for j in range(0, len(X_data_headers) - i - 1):
        if sum_abs_feature_weight[j] < sum_abs_feature_weight[j + 1]:
            X_data_headers[j], X_data_headers[j + 1] = X_data_headers[j + 1], X_data_headers[j]
            sum_abs_feature_weight[j], sum_abs_feature_weight[j + 1] = sum_abs_feature_weight[j + 1], sum_abs_feature_weight[j]

print(sum_abs_feature_weight)
for i in range(len(X_data_headers)):
    print(X_data_headers[i], sum_abs_feature_weight[i])

['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'InternetService_DSL', 'InternetService_Fiber optic', 'InternetService_No', 'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check', 'tenure_group_Tenure_1', 'tenure_group_Tenure_2', 'tenure_group_Tenure_3', 'tenure_group_Tenure_4', 'tenure_group_Tenure_5', 'tenure_group_Tenure_over_5']
[tensor(2.5085), tensor(3.0021), tensor(3.6075), tensor(2.9942), tensor(3.1955), tensor(3.1550), tensor(2.9383), tensor(3.9614), tensor(2.8122), tensor(2.9193), tensor(3.3020), tensor(2.6238), tensor(2.9674), tensor(3.3294), tensor(2.4104), tensor(2.9522), tensor(2.6725), tensor(3.1514), tensor(3.6

# 总结实验结果

通过上述的实验，
1. 我们探究了数据变换和压缩对于这个案例中分类器性能的影响，发现了数据平衡后对于分类器性能提升是几乎不区分模型类型的效果显著，其中又数随机森林的提升最大
2. 同时通过多组对比实验，找到了最优的模型，**经过数据平衡采样后的随机森林模型**，使其性能auc面积达到了97.688%
3. 本实验中我还自行实现了多层感知机模型来完成这个任务，其性能能够与基准值模型中最好的贝叶斯相比
4. 我在实验中自行实现了集成决策树这个弱分类器的过程，发现了集成对于弱分类器性能的提升
5. 我在本实验中重构了代码，将各部分的处理用markdown组织了起来，模块化了各部分，数据处理，模型验证，模型定义过程，并且将所有的评估都纳入了k折交叉验证的框架中，方便了实验的复现和验证，也有很好的可扩展性，只要定义新的模型并调用我的报告函数即可实现计算对应的指标，对比模型性能