In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import math
# Load the bank customer table into a DataFrame. index_col=False asks pandas
# not to use the first CSV column as the index.
# NOTE(review): hard-coded absolute Windows path - the notebook only runs on a
# machine that has this exact drive layout.
bank=pd.read_csv("G:\\XX\\Data\\bank.csv", index_col=False)
# Preview the first five rows.
print(bank.head(5))

   age     sex      region   income married  children  car save_act  \
0   48  FEMALE  INNER_CITY  17546.0      NO         1   NO       NO   
1   40    MALE        TOWN  30085.1     YES         3  YES       NO   
2   51  FEMALE  INNER_CITY  16575.4     YES         0  YES      YES   
3   23  FEMALE        TOWN  20375.4     YES         3   NO       NO   
4   57  FEMALE       RURAL  50576.3     YES         0   NO      YES   

  current_act mortgage  pep  
0          NO       NO  YES  
1         YES      YES   NO  
2         YES       NO   NO  
3         YES       NO   NO  
4          NO       NO   NO  


In [2]:
# Convert the categorical (string) variables into integer category codes
for name in ("sex", "region", "married", "car", "save_act", "current_act", "mortgage", "pep"):
    bank[name] = pd.Categorical(bank[name]).codes
print(bank["car"].head(10))

0    0
1    1
2    1
3    0
4    0
5    0
6    0
7    1
8    1
9    1
Name: car, dtype: int8


In [None]:
def calc_entropy(column):
    """Return the Shannon entropy, in bits, of a column of class labels.

    Parameters
    ----------
    column : array-like of non-negative integers
        Integer class codes (for example the ``codes`` of a
        ``pd.Categorical``). ``np.bincount`` rejects negative values.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the labels that occur in ``column``;
        ``0.0`` when ``column`` is empty.
    """
    total = len(column)
    # An empty split carries no information; returning early also avoids
    # dividing by zero below.
    if total == 0:
        return 0.0
    # Number of occurrences of every integer label 0..max(label).
    counts = np.bincount(column)
    # Relative frequency of each label.
    probabilities = counts / total
    entropy = 0.0
    for prob in probabilities:
        # Labels that never occur contribute nothing, and log2(0) is undefined.
        if prob > 0:
            entropy -= prob * math.log2(prob)
    return entropy
def calc_information_gain(data, split_name, target_name):
    """Information gain obtained by splitting ``data`` at the median of ``split_name``.

    Rows whose ``split_name`` value is less than or equal to the median form
    one branch and rows above the median form the other. The result is the
    entropy of ``target_name`` over all rows minus the sum of the two branch
    entropies, each weighted by the fraction of rows in that branch.
    """
    values = data[split_name]
    threshold = values.median()
    # The two branches of the median split.
    branches = (data[values <= threshold], data[values > threshold])
    n_rows = data.shape[0]
    # Size-weighted entropy remaining after the split.
    weighted_entropy = sum(
        (branch.shape[0] / n_rows) * calc_entropy(branch[target_name])
        for branch in branches
    )
    return calc_entropy(data[target_name]) - weighted_entropy
# Candidate feature columns (every column of the table except the "pep" target).
columns = [
    "age",
    "sex",
    "region",
    "income",
    "married",
    "children",
    "car",
    "save_act",
    "current_act",
    "mortgage",
]
def find_best_column(data, target_name, columns):
    """Return the column whose median split yields the highest information gain.

    Parameters
    ----------
    data : DataFrame
        Table containing the candidate columns and the target column.
    target_name : str
        Name of the label column the gain is measured against.
    columns : sequence of str
        Candidate column names to evaluate.

    Returns
    -------
    str
        The first entry of ``columns`` that attains the maximum gain.

    Raises
    ------
    ValueError
        If ``columns`` is empty.
    """
    if len(columns) == 0:
        raise ValueError("columns must contain at least one column name")
    # Measure the gain against the caller-supplied target rather than a
    # hard-coded column name.
    information_gains = [
        calc_information_gain(data, col, target_name) for col in columns
    ]
    # list.index returns the first position of the maximum, so ties resolve
    # to the earliest column.
    highest_gain_index = information_gains.index(max(information_gains))
    return columns[highest_gain_index]

# Pick, among `columns`, the feature whose median split scores the highest
# information gain, and print its name.
bank_split = find_best_column(bank, "pep", columns)
print(bank_split)

In [3]:
# Shuffle the rows, then split the data set into training : test = 8 : 2
np.random.seed(1)
bank = bank.reindex(np.random.permutation(bank.index))
# Number of rows that go into the training set (80 %, rounded down).
train_max_row = math.floor(0.8 * len(bank))
train = bank.iloc[:train_max_row]
test = bank.iloc[train_max_row:]

In [4]:
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
# from matplotlib import gridspec
# import seaborn as sns
# Feature columns passed to the models in the following cells (every column
# of the table except the "pep" target).
# NOTE(review): Normalizer and StandardScaler are imported in this cell but
# never applied, so the models are trained on unscaled features.
columns = ["age", "sex", "region", "income", "married", "children", "car", "save_act", "current_act", "mortgage"]
# col=['age', 'income']
# Normalizer().fit_transform(bank[col])
# Preview the first five rows of the table at this point in the notebook.
print(bank.head(5))
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# selector = SelectKBest(chi2, k=5)
# optimized_columns = selector.fit_transform(bank[columns], bank['pep'])
# print(selector)
# SelectKBest(chi2, k=5).fit_transform(bank[columns], bank['pep'])

# plt.figure(figsize=(8,10*4))
# v_features = bank[columns].columns
# gs = gridspec.GridSpec(10, 1)
# for i, cn in enumerate(bank[v_features]):
#     ax = plt.subplot(gs[i])
#     sns.distplot(bank[bank['pep'] == 1][cn])
#     sns.distplot(bank[bank['pep'] == 0][cn])
#     ax.set_xlabel('')
#     ax.set_title('histogram of feature:' + str(cn))
# plt.show()

     age  sex  region   income  married  children  car  save_act  current_act  \
446   52    0       3  43719.5        1         0    0         1            1   
404   24    0       0  13864.6        1         3    0         1            1   
509   23    0       1  11215.3        1         2    1         1            1   
455   27    0       0  11866.4        1         0    1         1            1   
201   46    0       3  41627.1        1         0    0         1            1   

     mortgage  pep  
446         0    0  
404         0    1  
509         0    1  
455         0    0  
201         1    0  


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
# Hyper-parameter tuning for the decision tree (10-fold cross-validated grid search)
hyperparameters = dict(
    criterion=["entropy", "gini"],
    splitter=["best", "random"],
    max_depth=range(1, 20, 2),
    min_samples_split=range(3, 16),
)
clf = DecisionTreeClassifier()
# fit() returns the fitted search object itself, so it can be chained.
grid = GridSearchCV(clf, param_grid=hyperparameters, cv=10).fit(train[columns], train["pep"])
best_params, best_score = grid.best_params_, grid.best_score_
print(best_params)

{'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 13, 'splitter': 'best'}




In [8]:
from sklearn import metrics
# Imported here because this cell calls cross_val_score and no earlier cell
# imports it; without this line the cell raises NameError on a fresh kernel.
from sklearn.model_selection import cross_val_score

# Train the decision tree with the tuned hyper-parameters.
# NOTE(review): the grid-search cell printed min_samples_split=13 while 14 is
# used here - confirm which value is intended.
clf = DecisionTreeClassifier(criterion="gini", splitter="best", max_depth=7, min_samples_split=14)
clf.fit(train[columns], train["pep"])
prediction = clf.predict(test[columns])

# Evaluation of the decision tree on the held-out test set
print(metrics.confusion_matrix(test['pep'], prediction))
print(metrics.classification_report(test['pep'], prediction))
accuracy = metrics.accuracy_score(test["pep"], prediction)
print("决策树模型的测试集正确率为",accuracy)
# NOTE(review): roc_auc_score receives hard 0/1 predictions here, so this is a
# single-threshold AUC; pass clf.predict_proba(test[columns])[:, 1] instead if
# a ranking-based AUC is wanted.
auc = roc_auc_score(test["pep"], prediction)
print("决策树模型的测试集AUC值为", auc)

# Five-fold cross-validated metrics of the decision tree on the training set.
# The generator is consumed in order, so the five names receive the scores
# for the five scorers in the order listed.
accuracy, precision, recall, f1_score, auc = (
    cross_val_score(clf, train[columns], train["pep"], scoring=metric, cv=5)
    for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
)
print("准确率:",accuracy.mean())
print("精确率:",precision.mean())
print("召回率:",recall.mean())
print("F1_score:",f1_score.mean())
print("AUC:",auc.mean())

[[64  5]
 [ 6 45]]
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        69
           1       0.90      0.88      0.89        51

   micro avg       0.91      0.91      0.91       120
   macro avg       0.91      0.90      0.91       120
weighted avg       0.91      0.91      0.91       120

决策树模型的测试集正确率为 0.9083333333333333
决策树模型的测试集AUC值为 0.9049445865302643


In [None]:
from sklearn.tree import export_graphviz
from sklearn import tree
# Write the fitted decision tree to a Graphviz DOT file.
# NOTE(review): hard-coded absolute Windows path.
with open("G:\\XX\\Data\\bank.dot", 'w') as f:
    # The DOT text is written into the open handle. The call's return value is
    # deliberately not assigned: binding it to `f` would replace the file
    # handle inside its own `with` block.
    tree.export_graphviz(clf, feature_names=columns, class_names=['NO', 'YES'], filled=True, rounded=True, out_file=f)

In [None]:
# KNN分类模型
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
# Line chart used to pick K from the F1 score (the author settled on K = 27)
def plot_dict(dictionary):
    """Plot the values of ``dictionary`` against its keys as an F1-versus-K line chart.

    The x axis is fixed to 1..50 with a tick at every odd K, and the y axis
    is fixed to the range 0.4..0.6.
    """
    series = pd.Series(dictionary)
    ax = series.plot(figsize=(8,4), xlim=(1,50), ylim=(0.4,0.6), marker='o', markersize=3)
    ax.set_xlabel("K")
    ax.set_ylabel("F1 score")
    ax.set_xticks(range(1,50,2))
    plt.show()
    
# F1 score on the test set for every odd K from 1 to 49.
# NOTE(review): K is chosen from test-set scores, and the same test set is
# used for the final evaluation - consider selecting K by cross-validation
# on the training set instead.
knn_f1 = {}
for k in range(1, 50, 2):
    # fit() returns the fitted estimator, so construction and fitting chain.
    knn = KNeighborsClassifier(n_neighbors=k).fit(train[columns], train['pep'])
    prediction = knn.predict(test[columns])
    knn_f1[k] = metrics.f1_score(test['pep'], prediction)
plot_dict(knn_f1)

In [None]:
# from sklearn.model_selection import GridSearchCV
# hyperparameters = {
#     "n_neighbors": range(1,30,2),
#     "weights": ["distance", "uniform"],
#     "algorithm": ['brute'],
#     "p": [1,2]
# }
# knn=KNeighborsClassifier()
# grid=GridSearchCV(knn, param_grid=hyperparameters, cv=10)
# grid.fit(train[columns], train['pep'])
# best_params=grid.best_params_
# print(best_params)

# Final KNN model with K = 27
knn = KNeighborsClassifier(n_neighbors=27).fit(train[columns], train['pep'])
prediction = knn.predict(test[columns])

# Evaluation of the KNN model on the held-out test set
print(metrics.confusion_matrix(test['pep'], prediction))
print(metrics.classification_report(test['pep'], prediction))
accuracy = metrics.accuracy_score(test["pep"], prediction)
print("KNN模型的测试集正确率为",accuracy)
auc = roc_auc_score(test["pep"], prediction)
print("KNN模型的测试集AUC值为", auc)

# Five-fold cross-validated metrics of the KNN classifier on the training set.
# The generator is consumed in order, one cross_val_score call per scorer.
accuracy, precision, recall, f1_score, auc = (
    cross_val_score(knn, train[columns], train["pep"], scoring=metric, cv=5)
    for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
)
print("准确率:",accuracy.mean())
print("精确率:",precision.mean())
print("召回率:",recall.mean())
print("F1_score:",f1_score.mean())
print("AUC:",auc.mean())

In [11]:
from sklearn.svm import SVC
# svc = SVC(kernel='rbf', class_weight='balanced')
# train['pep1']=train['pep']
# test['pep1']=test['pep']
# train.loc[train.pep==0, 'pep1']=-1
# test.loc[test.pep==0, 'pep1']=-1

svc = SVC()
# Candidate values: C = 2^-5 .. 2^15 (11 points), gamma = 2^-9 .. 2^3 (13 points).
c_range = np.logspace(-5, 15, 11, base=2)
gamma_range = np.logspace(-9, 3, 13, base=2)
# Parameter ranges for the grid search, evaluated with 10-fold cross-validation
param_grid = [{'kernel': ['rbf'], 'C': c_range, 'gamma': gamma_range}]
grid = GridSearchCV(svc, param_grid, cv=10, n_jobs=-1)
# The fitted search stays in `grid`. It is deliberately not assigned to `clf`,
# which holds the fitted decision tree used by the Graphviz export cell.
grid.fit(train[columns], train["pep"])
best_params=grid.best_params_
best_score=grid.best_score_
print(best_params)

{'C': 2.0, 'gamma': 0.125, 'kernel': 'rbf'}




In [12]:
import warnings
# Final RBF-kernel SVM with the C / gamma printed by the grid search.
# NOTE(review): the grid search tuned a plain SVC(), whereas this model adds
# class_weight='balanced'; the features are also unscaled. Either may be
# related to the degenerate predictions - confirm.
svc = SVC(kernel='rbf', C=2.0, gamma= 0.125, class_weight='balanced')
svc.fit(train[columns], train['pep'])
prediction = svc.predict(test[columns])

# Evaluation of the SVM on the held-out test set
print(metrics.confusion_matrix(test['pep'], prediction))
print(metrics.classification_report(test['pep'], prediction))
accuracy = metrics.accuracy_score(test["pep"], prediction)
print("SVM模型的测试集正确率为",accuracy)
# Score against the real "pep" target: the "pep1" column is only created in
# commented-out code, so indexing it raises KeyError.
auc = roc_auc_score(test["pep"], prediction)
print("SVM模型的测试集AUC值为", auc)

# Five-fold cross-validated metrics of the SVM classifier on the training set.
# The generator is consumed in order, one cross_val_score call per scorer.
accuracy, precision, recall, f1_score, auc = (
    cross_val_score(svc, train[columns], train["pep"], scoring=metric, cv=5)
    for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
)
print("准确率:",accuracy.mean())
print("精确率:",precision.mean())
print("召回率:",recall.mean())
print("F1_score:",f1_score.mean())
print("AUC:",auc.mean())

[[69  0]
 [51  0]]
              precision    recall  f1-score   support

           0       0.57      1.00      0.73        69
           1       0.00      0.00      0.00        51

   micro avg       0.57      0.57      0.57       120
   macro avg       0.29      0.50      0.37       120
weighted avg       0.33      0.57      0.42       120

SVM模型的测试集正确率为 0.575
SVM模型的测试集AUC值为 0.5


  'precision', 'predicted', average, warn_for)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
import pandas as pd
import matplotlib.pyplot as plt
# Gaussian naive Bayes: fit on the training set, predict the test set.
gnb = GaussianNB().fit(train[columns], train['pep'])
prediction = gnb.predict(test[columns])

# Evaluation of the naive Bayes model on the held-out test set
y_true = test['pep']
print(metrics.confusion_matrix(y_true, prediction))
print(metrics.classification_report(y_true, prediction))
accuracy = metrics.accuracy_score(y_true, prediction)
print("NB模型的测试集正确率为",accuracy)
auc = roc_auc_score(y_true, prediction)
print("NB模型的测试集AUC值为", auc)

# NB分类算法的五折交叉验证的评测指标
# warnings.filterwarnings("ignore")
# accuracy = cross_val_score(gnb, train[columns], train["pep"], scoring='accuracy', cv=5)
# precision = cross_val_score(gnb, train[columns], train["pep"], scoring='precision', cv=5)
# recall = cross_val_score(gnb, train[columns], train["pep"], scoring='recall', cv=5)
# f1_score = cross_val_score(gnb, train[columns], train["pep"], scoring='f1', cv=5)
# auc = cross_val_score(gnb, train[columns], train["pep"], scoring='roc_auc', cv=5)
# print("准确率:",accuracy.mean())
# print("精确率:",precision.mean())
# print("召回率:",recall.mean())
# print("F1_score:",f1_score.mean())
# print("AUC:",auc.mean())

In [None]:
# Multinomial naive Bayes: fit on the training set, predict the test set.
mnb = MultinomialNB().fit(train[columns], train['pep'])
prediction = mnb.predict(test[columns])

# Five-fold cross-validated metrics of the naive Bayes classifier
warnings.filterwarnings("ignore")
# The generator is consumed in order, one cross_val_score call per scorer.
accuracy, precision, recall, f1_score, auc = (
    cross_val_score(mnb, train[columns], train["pep"], scoring=metric, cv=5)
    for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
)
print("准确率:",accuracy.mean())
print("精确率:",precision.mean())
print("召回率:",recall.mean())
print("F1_score:",f1_score.mean())
print("AUC:",auc.mean())

In [None]:
# Bernoulli naive Bayes: fit on the training set, predict the test set.
bnb = BernoulliNB().fit(train[columns], train['pep'])
prediction = bnb.predict(test[columns])

# Five-fold cross-validated metrics of the naive Bayes classifier
warnings.filterwarnings("ignore")
# The generator is consumed in order, one cross_val_score call per scorer.
accuracy, precision, recall, f1_score, auc = (
    cross_val_score(bnb, train[columns], train["pep"], scoring=metric, cv=5)
    for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
)
print("准确率:",accuracy.mean())
print("精确率:",precision.mean())
print("召回率:",recall.mean())
print("F1_score:",f1_score.mean())
print("AUC:",auc.mean())