In [20]:
from __future__ import division
import pandas as pd
import numpy as np
 
# Load the churn dataset and print the name of every column it contains.
churn_df = pd.read_csv('churn.csv')
col_names = list(churn_df.columns)
print('列名',col_names)

# Preview only the six leading and the six trailing columns ...
to_show = col_names[:6] + col_names[-6:]

# ... for the first six rows (the bare last expression is the cell's output).
churn_df.loc[:, to_show].iloc[:6]


列名 ['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']


Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,8.41,10.1,3,2.73,3,False.
5,AL,118,510,391-8027,yes,no,9.18,6.3,6,1.7,0,False.


In [21]:
# Encode the target: the 'Churn?' column holds the strings 'True.' / 'False.',
# which are mapped to 1 / 0 respectively.
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.',1,0)
print(len(y))

# Drop the columns that are not used as features (the target included).
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)

# These two columns hold 'yes' / 'no'; turn them into booleans so the whole
# frame can be cast to a numeric matrix below.
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

features = churn_feat_space.columns
# The `np.float` alias was deprecated in NumPy 1.20 and removed in 1.24; it was
# only ever an alias for the builtin `float`, so the resulting dtype (float64)
# is unchanged.
X = churn_feat_space.values.astype(float)

# Standardise every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on all rows before any cross-validation
# split, so held-out folds influence the scaling statistics - confirm this is
# acceptable for the analysis.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Shape of the feature matrix (3333 observations x 17 features in the recorded run).
print ("Feature space holds %d observations and %d features" % X.shape)
# The target only takes the values 0 and 1.
print ("Unique target labels:", np.unique(y))
# First standardised observation.
print(X[0])
# Number of observations with y == 0 (non-churners).
print (len(y[y == 0]))
print(y)


3333
Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
[ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
  1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
  0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]
2850
[0 0 0 ... 0 0 0]


In [22]:
try:
    # Legacy import path. Kept (guarded) so that code which still expects the
    # old-style KFold under this name keeps working on old scikit-learn installs.
    from sklearn.cross_validation import KFold
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20. run_cv below
    # does not need it: it imports the replacement class itself.
    pass


def run_cv(X,y,clf_class,**kwargs):
    """Return out-of-fold class predictions from 5-fold cross-validation.

    Parameters
    ----------
    X : array
        Feature matrix; must support indexing with integer index arrays.
    y : array
        Target labels, one per row of ``X``; must provide ``.copy()`` and
        support integer-array indexing.
    clf_class : callable
        Classifier factory (typically an estimator class). A fresh instance
        is built for every fold and must expose ``fit(X, y)`` and
        ``predict(X)``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    array
        A copy of ``y`` in which every entry has been overwritten by the
        prediction of the model that did not see that row during training.

    Notes
    -----
    The folds are shuffled without a fixed seed, so results can differ
    between runs.
    """
    # Imported locally under a private alias so this function works on current
    # scikit-learn regardless of what the module-level name `KFold` refers to.
    from sklearn.model_selection import KFold as _KFold

    # model_selection API: the fold count is `n_splits`, and the index pairs
    # come from `.split(X)` instead of iterating the KFold object itself.
    kf = _KFold(n_splits=5, shuffle=True)

    # Start from a copy so the caller's labels are never modified; each fold
    # fills in its own test positions.
    y_pred = y.copy()

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # A brand-new classifier per fold, so nothing learned leaks across folds.
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


In [23]:
#三个分类器
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
# Accuracy helper: compares true and predicted labels element by element and
# returns the fraction that agree.
def accuracy(y_true,y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``.

    Both arguments may be NumPy arrays or plain sequences (lists, tuples).
    They are converted with ``np.asarray`` first, because ``==`` on two Python
    lists yields a single bool rather than an element-wise comparison, which
    would make the result collapse to 0.0 or 1.0.
    """
    # NumPy treats True / False as 1. / 0., so the mean of the element-wise
    # comparison is the share of correct predictions.
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))
 
# High accuracy alone does not show that a model is good: the real goal is to
# catch the customers who churn, so recall is the metric that matters most.
# Recall is the share of actual positives that were predicted positive,
# i.e. TP / (TP + FN).
# Print the cross-validated accuracy of each classifier (support vector
# machine, random forest, k-nearest neighbours) under its display label.
for label, clf_class in (('支持向量机', SVC), ('随机森林', RF), ('K近邻算法', KNN)):
    print(label)
    print("%.3f" % accuracy(y, run_cv(X,y,clf_class)))

支持向量机
0.919
随机森林
0.942
K近邻算法
0.895


In [25]:
def run_prob_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class probabilities from 5-fold cross-validation.

    Parameters
    ----------
    X : array
        Feature matrix; must support indexing with integer index arrays.
    y : array
        Target labels, one per row of ``X``.
    clf_class : callable
        Classifier factory (typically an estimator class). A fresh instance
        is built for every fold and must expose ``fit(X, y)`` and
        ``predict_proba(X)``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(y), 2)``; each row holds the ``predict_proba``
        output of the model that did not see that row during training. The
        width is fixed at two columns, i.e. a two-class problem is assumed.

    Notes
    -----
    The folds are shuffled without a fixed seed, so results can differ
    between runs.
    """
    # sklearn.cross_validation (and its KFold(n, n_folds=...) signature) was
    # removed in scikit-learn 0.20. Import the replacement locally under a
    # private alias so this function does not depend on whichever KFold is
    # bound at module level.
    from sklearn.model_selection import KFold as _KFold

    # model_selection API: `n_splits` instead of `n_folds`, and the index
    # pairs come from `.split(X)`.
    kf = _KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y),2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # A brand-new classifier per fold, so nothing learned leaks across folds.
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob


In [26]:
import warnings
# Optional: silences every warning for the rest of the session. The analysis
# below does not depend on it.
warnings.filterwarnings('ignore')

# The forest uses 10 estimators, so the predicted probabilities come out as
# multiples of 0.1.
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# Column 1 is taken as the predicted probability of churn.
pred_churn = pred_prob[:,1]
is_churn = y == 1

# Number of observations that received each distinct predicted probability.
# (Series.value_counts replaces the top-level pd.value_counts helper, which is
# deprecated in pandas 2.1.)
counts = pd.Series(pred_churn).value_counts()

# Observed churn rate among the observations that share each predicted
# probability. The Series is built once, after all values are computed, and is
# indexed exactly like `counts` so the concat below lines the rows up.
true_prob = pd.Series(
    [np.mean(is_churn[pred_churn == prob]) for prob in counts.index],
    index=counts.index,
)

# Combine into one table: predicted probability, how often it was assigned,
# and the churn rate actually observed for it.
counts = pd.concat([counts,true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts

Unnamed: 0,pred_prob,count,true_prob
0,0.0,1838,0.026115
1,0.1,652,0.033742
2,0.2,236,0.050847
3,0.3,126,0.126984
4,0.8,84,0.940476
5,0.9,84,0.97619
6,0.4,68,0.294118
7,1.0,68,1.0
8,0.6,62,0.774194
9,0.7,58,0.87931
