In [27]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
# 数据加载
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')
# 数据探索
# 查看train_data信息
#pd.set_option('display.max_columns', None) #显示所有列
print('查看数据信息：列名、非空个数、类型等')
print(train_data.info())
print('-'*30)
print('查看数据摘要')
print(train_data.describe())
print('-'*30)
print('查看离散数据分布')
print(train_data.describe(include=['O']))
print('-'*30)
print('查看前5条数据')
print(train_data.head())
print('-'*30)
print('查看后5条数据')
print(train_data.tail())
# 使用平均年龄来填充年龄中的nan值
### inplace=True: 直接修改原对象
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)  
test_data['Age'].fillna(test_data['Age'].mean(),inplace=True)
# 使用票价的均值填充票价中的nan值
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(),inplace=True)

print(train_data['Embarked'].value_counts())
# 使用登录最多的港口来填充登录港口的nan值
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S',inplace=True)
# 特征选择
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features] ####读取多行
train_labels = train_data['Survived']
test_features = test_data[features]
print('特征值')
print(train_features)
dvec=DictVectorizer(sparse=False) ###sparse=False意思是不产生稀疏矩阵
## 转化为0，1向量
train_features=dvec.fit_transform(train_features.to_dict(orient='record'))
test_features=dvec.fit_transform(test_features.to_dict(orient='record'))
print(dvec.feature_names_)
print(train_features[1])


查看数据信息：列名、非空个数、类型等
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
------------------------------
查看数据摘要
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%

In [28]:
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, train_size=0.75, test_size=0.25)

In [29]:
# 高斯朴素贝叶斯 GaussianNB/MultinomialNB/BernoulliNB
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB 
# 创建贝叶斯分类器
model = GaussianNB()
model.fit(x_train,y_train)
y_predict=model.predict(x_test)
print('朴素贝叶斯准确率: %0.4lf' %accuracy_score(y_predict,y_test))

朴素贝叶斯准确率: 0.7758


In [31]:
Result=model.predict(test_features)
final = pd.DataFrame({'PassengerId': test_data['PassengerId'],'Sex': test_data['Sex'], 'Survived': Result})
print(final)

     PassengerId     Sex  Survived
0            892    male         0
1            893  female         1
2            894    male         0
3            895    male         0
4            896  female         1
..           ...     ...       ...
413         1305    male         0
414         1306  female         1
415         1307    male         0
416         1308    male         0
417         1309    male         0

[418 rows x 3 columns]


In [34]:
# 创建LR分类器
x1_train, x1_test, y1_train, y1_test = train_test_split(train_features, train_labels, train_size=0.75, test_size=0.25)
from sklearn.linear_model import LogisticRegression #逻辑回归
lr = LogisticRegression(solver='liblinear', multi_class='auto') #数据集比较小，使用liblinear，数据集大使用 sag或者saga
lr.fit(x1_train,y1_train)
y1_predict=lr.predict(x1_test)
print('LR准确率: %0.4lf' % accuracy_score(y1_predict,y1_test))


LR准确率: 0.7982


In [35]:
Result1=model.predict(test_features)
final_1 = pd.DataFrame({'PassengerId': test_data['PassengerId'],'Sex': test_data['Sex'], 'Survived': Result1})
print(final_1)

     PassengerId     Sex  Survived
0            892    male         0
1            893  female         1
2            894    male         0
3            895    male         0
4            896  female         1
..           ...     ...       ...
413         1305    male         0
414         1306  female         1
415         1307    male         0
416         1308    male         0
417         1309    male         0

[418 rows x 3 columns]


In [50]:
# 创建LDA分类器
x2_train, x2_test, y2_train, y2_test = train_test_split(train_features, train_labels, train_size=0.75, test_size=0.25)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
model2 = LinearDiscriminantAnalysis(n_components=2)
model2.fit(x2_train,y2_train)
y2_predict=model.predict(x2_test)
print('LDA准确率: %0.4lf' %accuracy_score(y2_predict,y2_test))
Result2=model.predict(test_features)
final_2 = pd.DataFrame({'PassengerId': test_data['PassengerId'],'Sex': test_data['Sex'], 'Survived': Result2})
print(final_2)

LDA准确率: 0.8251
     PassengerId     Sex  Survived
0            892    male         0
1            893  female         1
2            894    male         0
3            895    male         0
4            896  female         1
..           ...     ...       ...
413         1305    male         0
414         1306  female         1
415         1307    male         0
416         1308    male         0
417         1309    male         0

[418 rows x 3 columns]


