In [1]:
import pandas as pd
import numpy as np

## 数据预处理

In [2]:
# Load the training set from train.csv and preview its first rows.
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Remove duplicated samples (the first occurrence of each row is kept) and
# report the dataset shape before and after the removal.
print('数据集尺寸：', train.shape)
train = train.drop_duplicates(keep='first')
print('去除重复样本后的数据集尺寸：', train.shape)

数据集尺寸： (891, 12)
去除重复样本后的数据集尺寸： (891, 12)


In [4]:
# Drop the features regarded as irrelevant or redundant for this model, then
# report the resulting shape and preview the remaining columns.
train = train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Embarked', 'Cabin', 'Fare'])
print('数据集尺寸：', train.shape)
train.head()

数据集尺寸： (891, 6)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,22.0,1,0
1,1,1,female,38.0,1,0
2,1,3,female,26.0,0,0
3,1,1,female,35.0,1,0
4,0,3,male,35.0,0,0


In [5]:
# Count the missing values of every feature.
num_of_nan = train.isna().sum()
num_of_nan

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
dtype: int64

In [6]:
# Bin Age into five categories: ≤20, 21~40, 41~60, >60 and 'missing'.
# Display option only: never truncate the rows of printed pandas objects.
pd.options.display.max_rows = None

# pd.cut uses right-closed intervals by default, i.e. (-inf, 20], (20, 40],
# (40, 60] and (60, inf), which reproduces the thresholds `<= 20`, `> 20`,
# `> 40` and `> 60`. Missing ages stay NaN after the cut; the categorical
# result is converted to plain objects so the extra 'missing' label can be
# filled in.
# The column is assigned back explicitly: calling fillna(inplace=True) on a
# Series obtained via attribute access mutates an intermediate object and is
# not propagated to the DataFrame when pandas Copy-on-Write is active.
train['Age'] = (
    pd.cut(
        train['Age'],
        bins=[-np.inf, 20, 40, 60, np.inf],
        labels=['≤20', '21~40', '41~60', '>60'],
    )
    .astype(object)
    .fillna('missing')
)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,21~40,1,0
1,1,1,female,21~40,1,0
2,1,3,female,21~40,0,0
3,1,1,female,21~40,1,0
4,0,3,male,21~40,0,0


In [7]:
# Convert the frame to an array: column 0 (Survived) becomes the list of
# labels, every remaining column becomes the feature matrix.
train = np.array(train)
train_labels = list(train[:, 0])
train_features = train[:, 1:]

In [8]:
# Preprocess the test-set features in the same way as the training set:
# drop the unused columns and bin Age into ≤20, 21~40, 41~60, >60 and 'missing'.
test = pd.read_csv("test.csv")
test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Embarked', 'Cabin', 'Fare'])

# pd.cut uses right-closed intervals by default, i.e. (-inf, 20], (20, 40],
# (40, 60] and (60, inf). Missing ages stay NaN after the cut and are then
# labelled 'missing'.
# The column is assigned back explicitly: calling fillna(inplace=True) on a
# Series obtained via attribute access mutates an intermediate object and is
# not propagated to the DataFrame when pandas Copy-on-Write is active.
test['Age'] = (
    pd.cut(
        test['Age'],
        bins=[-np.inf, 20, 40, 60, np.inf],
        labels=['≤20', '21~40', '41~60', '>60'],
    )
    .astype(object)
    .fillna('missing')
)
test_features = np.array(test)
test_features

array([[3, 'male', '21~40', 0, 0],
       [3, 'female', '41~60', 1, 0],
       [2, 'male', '>60', 0, 0],
       ...,
       [3, 'male', '21~40', 0, 0],
       [3, 'male', 'missing', 0, 0],
       [3, 'male', 'missing', 1, 1]], dtype=object)

## 朴素贝叶斯算法（极大似然估计）

In [9]:
def ProbabilityCalculation(train_features, train_labels):
    """Estimate naive Bayes probabilities by maximum likelihood.

    Every estimated probability is also printed to stdout.

    Parameters
    ----------
    train_features : numpy.ndarray
        2-D array with one row per sample and one column per categorical
        feature (it is indexed as ``train_features[:, j]``).
    train_labels : sequence
        Class label of each sample. Any sequence works (list, tuple,
        1-D array); it is copied into a list internally.

    Returns
    -------
    list
        ``[P_Y, P_X_Y]``. ``P_Y`` maps ``'Y=<label>'`` to the prior
        probability of the label. ``P_X_Y`` maps ``'X<j>=<value>|Y=<label>'``
        to the conditional probability that feature ``j`` takes ``value``
        given the label, for every value of feature ``j`` seen in training.

    Raises
    ------
    ValueError
        If the training set is empty or the number of feature rows differs
        from the number of labels.
    """
    from collections import Counter

    labels = list(train_labels)
    n_samples = len(labels)
    if n_samples == 0:
        raise ValueError('the training set is empty')
    if len(train_features) != n_samples:
        raise ValueError('train_features and train_labels must have the same length')
    n_features = len(train_features[0])

    # Prior probabilities P(Y).
    train_labels_set = set(labels)
    label_totals = Counter(labels)
    P_Y = {}
    for label in train_labels_set:
        P_Y['Y=' + str(label)] = label_totals[label] / float(n_samples)
    for name in P_Y:
        print('P(' + name + ') =', P_Y[name])

    # Distinct values taken by each feature.
    X_set = [set(train_features[:, num]) for num in range(n_features)]

    # Count every (label, feature index, value) combination in a single pass
    # instead of rescanning all samples for each combination.
    joint_counts = Counter()
    for row, label in zip(train_features, labels):
        for num, value in enumerate(row):
            joint_counts[(label, num, value)] += 1

    # Conditional probabilities P(X_j = value | Y = label): the joint count
    # divided by the number of samples carrying that label.
    P_X_Y = {}
    for label in train_labels_set:
        for num in range(n_features):
            for value in X_set[num]:
                key = 'X' + str(num) + '=' + str(value) + '|Y=' + str(label)
                P_X_Y[key] = joint_counts[(label, num, value)] / float(label_totals[label])

    for name in P_X_Y:
        print('P(' + name + ') =', P_X_Y[name])

    # Return the prior and the conditional probabilities.
    return [P_Y, P_X_Y]

In [10]:
# Estimate the prior and conditional probabilities from the training data
# (the function also prints every estimated probability).
P_Y, P_X_Y = ProbabilityCalculation(train_features, train_labels)

P(Y=0) = 0.6161616161616161
P(Y=1) = 0.3838383838383838
P(X0=1|Y=0) = 0.14571948998178508
P(X0=2|Y=0) = 0.1766848816029144
P(X0=3|Y=0) = 0.6775956284153005
P(X1=female|Y=0) = 0.14754098360655737
P(X1=male|Y=0) = 0.8524590163934426
P(X2=≤20|Y=0) = 0.1766848816029144
P(X2=21~40|Y=0) = 0.4225865209471767
P(X2=>60|Y=0) = 0.030965391621129327
P(X2=missing|Y=0) = 0.22768670309653916
P(X2=41~60|Y=0) = 0.14207650273224043
P(X3=0|Y=0) = 0.7249544626593807
P(X3=1|Y=0) = 0.1766848816029144
P(X3=2|Y=0) = 0.0273224043715847
P(X3=3|Y=0) = 0.02185792349726776
P(X3=4|Y=0) = 0.0273224043715847
P(X3=5|Y=0) = 0.009107468123861567
P(X3=8|Y=0) = 0.012750455373406194
P(X4=0|Y=0) = 0.8105646630236795
P(X4=1|Y=0) = 0.0965391621129326
P(X4=2|Y=0) = 0.07285974499089254
P(X4=3|Y=0) = 0.0036429872495446266
P(X4=4|Y=0) = 0.007285974499089253
P(X4=5|Y=0) = 0.007285974499089253
P(X4=6|Y=0) = 0.0018214936247723133
P(X0=1|Y=1) = 0.39766081871345027
P(X0=2|Y=1) = 0.2543859649122807
P(X0=3|Y=1) = 0.347953216374269
P(X1=

In [11]:
# Compute the unnormalised posterior score of each class for every test sample
# and predict the class with the larger score.
# The test set may contain feature values that never occur in the training
# set; such values have no entry in P_X_Y, the lookup raises KeyError and the
# score of that class is set to 0.
output = []
for sample in test_features:
    ans = []

    for label in (0, 1):
        try:
            # P(Y=label) * prod_j P(X_j = x_j | Y=label)
            score = P_Y['Y=' + str(label)]
            for num, value in enumerate(sample):
                score = score * P_X_Y['X' + str(num) + '=' + str(value) + '|Y=' + str(label)]
        except KeyError:
            # Only a missing table entry is expected here. Any other error
            # (for example an undefined variable) must surface instead of
            # silently turning every prediction into 0.
            score = 0
        ans.append(score)

    # Ties (including both scores being 0) are resolved in favour of class 0.
    if ans[0] >= ans[1]:
        output.append(0)
    else:
        output.append(1)


print('预测结果为：\n', output)

预测结果为：
 [0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1

## 朴素贝叶斯算法（贝叶斯估计）

In [12]:
def ProbabilityCalculation2(train_features, train_labels, K=None, λ=1):
    """Estimate naive Bayes probabilities by Bayesian estimation (additive smoothing).

    Every estimated probability is also printed to stdout.

    Parameters
    ----------
    train_features : numpy.ndarray
        2-D array with one row per sample and one column per categorical
        feature (it is indexed as ``train_features[:, j]``).
    train_labels : sequence
        Class label of each sample. Any sequence works (list, tuple,
        1-D array); it is copied into a list internally.
    K : int, optional
        Number of values Y can take. Defaults to the number of distinct
        labels found in ``train_labels``.
    λ : number, optional
        Positive constant added to the frequency of every value; ``λ = 1``
        is Laplace smoothing and ``λ = 0`` gives the maximum-likelihood
        estimate.

    Returns
    -------
    list
        ``[P_Y, P_X_Y]``. ``P_Y`` maps ``'Y=<label>'`` to the smoothed prior
        ``(count + λ) / (N + K * λ)``. ``P_X_Y`` maps
        ``'X<j>=<value>|Y=<label>'`` to the smoothed conditional probability
        ``(joint count + λ) / (label count + S_j * λ)``, where ``S_j`` is the
        number of distinct values of feature ``j`` in the training data.

    Raises
    ------
    ValueError
        If the training set is empty or the number of feature rows differs
        from the number of labels.
    """
    from collections import Counter

    labels = list(train_labels)
    n_samples = len(labels)
    if n_samples == 0:
        raise ValueError('the training set is empty')
    if len(train_features) != n_samples:
        raise ValueError('train_features and train_labels must have the same length')
    n_features = len(train_features[0])

    # Prior probabilities P(Y) with smoothing.
    train_labels_set = set(labels)
    if K is None:
        K = len(train_labels_set)
    label_totals = Counter(labels)
    P_Y = {}
    for label in train_labels_set:
        P_Y['Y=' + str(label)] = (label_totals[label] + λ) / (float(n_samples) + K * λ)
    for name in P_Y:
        print('P(' + name + ') =', P_Y[name])

    # Distinct values taken by each feature.
    X_set = [set(train_features[:, num]) for num in range(n_features)]

    # Count every (label, feature index, value) combination in a single pass
    # instead of rescanning all samples for each combination.
    joint_counts = Counter()
    for row, label in zip(train_features, labels):
        for num, value in enumerate(row):
            joint_counts[(label, num, value)] += 1

    # Smoothed conditional probabilities P(X_j = value | Y = label).
    P_X_Y = {}
    for label in train_labels_set:
        for num in range(n_features):
            denominator = float(label_totals[label]) + len(X_set[num]) * λ
            for value in X_set[num]:
                key = 'X' + str(num) + '=' + str(value) + '|Y=' + str(label)
                P_X_Y[key] = (joint_counts[(label, num, value)] + λ) / denominator

    for name in P_X_Y:
        print('P(' + name + ') =', P_X_Y[name])

    # Return the prior and the conditional probabilities.
    return [P_Y, P_X_Y]

In [13]:
# Estimate the prior and conditional probabilities with Bayesian estimation
# (the function also prints every estimated probability).
K = 2 # Y can take 2 possible values
λ = 1 # Laplace smoothing
P_Y2, P_X_Y2 = ProbabilityCalculation2(train_features, train_labels, K, λ)


P(Y=0) = 0.6159014557670772
P(Y=1) = 0.3840985442329227
P(X0=1|Y=0) = 0.14673913043478262
P(X0=2|Y=0) = 0.17753623188405798
P(X0=3|Y=0) = 0.6757246376811594
P(X1=female|Y=0) = 0.14882032667876588
P(X1=male|Y=0) = 0.8511796733212341
P(X2=≤20|Y=0) = 0.17689530685920576
P(X2=21~40|Y=0) = 0.42057761732851984
P(X2=>60|Y=0) = 0.032490974729241874
P(X2=missing|Y=0) = 0.22743682310469315
P(X2=41~60|Y=0) = 0.14259927797833935
P(X3=0|Y=0) = 0.7176258992805755
P(X3=1|Y=0) = 0.17625899280575538
P(X3=2|Y=0) = 0.02877697841726619
P(X3=3|Y=0) = 0.023381294964028777
P(X3=4|Y=0) = 0.02877697841726619
P(X3=5|Y=0) = 0.01079136690647482
P(X3=8|Y=0) = 0.014388489208633094
P(X4=0|Y=0) = 0.802158273381295
P(X4=1|Y=0) = 0.09712230215827339
P(X4=2|Y=0) = 0.0737410071942446
P(X4=3|Y=0) = 0.00539568345323741
P(X4=4|Y=0) = 0.008992805755395683
P(X4=5|Y=0) = 0.008992805755395683
P(X4=6|Y=0) = 0.0035971223021582736
P(X0=1|Y=1) = 0.39710144927536234
P(X0=2|Y=1) = 0.25507246376811593
P(X0=3|Y=1) = 0.34782608695652173

In [14]:
# Compute the unnormalised posterior score of each class for every test sample
# from the Bayesian estimates and predict the class with the larger score.
# The test set may contain feature values that never occur in the training
# set; such values have no entry in P_X_Y2, the lookup raises KeyError and the
# score of that class is set to 0.
output2 = []
for sample in test_features:
    ans = []

    for label in (0, 1):
        try:
            # P(Y=label) * prod_j P(X_j = x_j | Y=label)
            score = P_Y2['Y=' + str(label)]
            for num, value in enumerate(sample):
                score = score * P_X_Y2['X' + str(num) + '=' + str(value) + '|Y=' + str(label)]
        except KeyError:
            # Only a missing table entry is expected here. Any other error
            # (for example an undefined variable) must surface instead of
            # silently turning every prediction into 0.
            score = 0
        ans.append(score)

    # Ties (including both scores being 0) are resolved in favour of class 0.
    if ans[0] >= ans[1]:
        output2.append(0)
    else:
        output2.append(1)


print('预测结果为：\n', output2)

预测结果为：
 [0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1

## 验证

In [15]:
# Compare both prediction lists with the values stored in the second column of
# gender_submission.csv and report the fraction of matching predictions.
# NOTE(review): on Kaggle this file is usually the sample submission rather
# than the true test labels — confirm what it contains before reading these
# numbers as accuracy.
test_labels = pd.read_csv('gender_submission.csv')
ground_truth = list(np.array(test_labels)[:, 1])

# Number of samples on which each predictor agrees with the reference labels.
count = sum(1 for idx in range(len(ground_truth)) if output[idx] == ground_truth[idx])
count2 = sum(1 for idx in range(len(ground_truth)) if output2[idx] == ground_truth[idx])

print("用极大似然估计得到的准确率为：{}".format(count/len(ground_truth)))
print("用贝叶斯估计得到的准确率为：{}".format(count2/len(ground_truth)))

用极大似然估计得到的准确率为：0.8875598086124402
用贝叶斯估计得到的准确率为：0.8899521531100478
