In [None]:
# 加载必要的库

import sys
import pandas as pd
import numpy as np
import sklearn
import random
import time


In [None]:
sys.version

In [None]:
from sklearn import ensemble
from sklearn.preprocessing import LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

### 第一步：加载原数据集

In [None]:
data_raw = pd.read_csv('train.csv')

data_val = pd.read_csv('test.csv')


In [None]:
# 显示部分数据

data_raw.head()

In [None]:
data_val.head()

In [None]:
data_raw.info()

In [None]:
data_val.info()

In [None]:
# 列名称全部转换为小写格式

data_raw.columns = data_raw.columns.str.lower()  
data_val.columns = data_val.columns.str.lower()

In [None]:
data_raw.columns

In [None]:
# 绘制图形

sns.countplot(data_raw['survived'])

In [None]:
# 合并两个数据集，进行统一的清洗

data_all = [data_raw,data_val]


### 数据清洗

In [None]:
data_raw.isnull().sum() # 统计训练集中哪些特征（列）里有NULL

In [None]:
data_val.isnull().sum()  # 统计验证集中哪些特征（列）里有NULL

In [None]:
# 对原数据进行描述

data_raw.describe(include='all')

In [None]:
# 对原始数据集（训练集 + 验证集）进行清理

for dataset in data_all:
    # 补足空缺值   inplace 方法已经弃用了，直接赋值即可
    # dataset['age'].fillna(dataset['age'].median(), inplace=True)
    # dataset['fare'].fillna(dataset['fare'].median(), inplace=True)
    # dataset['embarked'].fillna(dataset['embarked'].mode()[0], inplace=True)
    dataset['age'] = dataset['age'].fillna(dataset['age'].median())
    dataset['fare'] = dataset['fare'].fillna(dataset['fare'].median())
    dataset['embarked'] = dataset['embarked'].fillna(dataset['embarked'].mode()[0])


In [None]:
data_raw.isnull().sum()
data_val.isnull().sum()

In [None]:
# 删除一些字段

drop_columns = ['cabin', 'passengerid', 'ticket']

data_raw = data_raw.drop(drop_columns,axis=1,errors='ignore')

data_val = data_val.drop(drop_columns,axis=1,errors='ignore')

data_all = [data_raw] + [data_val]


In [None]:
data_raw.columns

In [None]:
data_raw.isnull().sum()

In [None]:
data_val.isnull().sum()

### 第三步：进行特征的构建


In [None]:
for dataset in data_all:
    # 构建新的字段：
    # (1) family_size 家庭规模：sibsp + parch

    dataset['family_size'] = dataset['sibsp'] + dataset['parch'] + 1

    # (2) single 单身：1：单身；0：非单身

    dataset['single'] = 1
    dataset.loc[dataset['family_size'] > 1, 'single'  ] = 0 

    # (3) title 身份 

    dataset['title'] = dataset['name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]   # expand 默认series返回列，true返回dataframe [1]表示第二个元素
    
    # (4) fare_bin 票价 (不分组太多类了)

    dataset['fare_bin'] = pd.qcut(dataset['fare'], 4) # 根据票价分成4组，每组的元素的个数一致

    # (5) age_bin 年龄

    dataset['age_bin'] = pd.cut(dataset['age'].astype(int), 5) # 根据年龄分组，分成5组(每组的元素不一致)
    
    




In [None]:
dataset.head()

In [None]:
# 根据title统计人数

data_raw['title'].value_counts()

In [None]:
title_names = (data_raw['title'].value_counts() < 10)
title_names

In [None]:
# title 将哪些称谓人数小于10的数据，全部归为一类：other
# 统计每个 title 的出现次数
title_counts = data_raw['title'].value_counts()

# 创建掩码：出现次数 < 10 的为 True
mask = title_counts < 10
print(mask)
# 把出现次数少的 title 映射为 'other'，其他的保持不变
data_raw['title'] = data_raw['title'].map(
    lambda x: 'other' if mask[x] else x
)

In [None]:
data_raw['title']

In [None]:
data_raw['title'].value_counts()

In [None]:
data_raw['survived'].groupby(data_raw['title']).mean()

### (二)构建新的字段，基于scikit-learn中的LabelEncoder()

In [None]:
data_raw.head()

In [None]:
label = LabelEncoder()

In [None]:
for dataset in data_all:
    # (1) 新字段：sex_code    为什么？因为计算机不能识别字符串

    dataset['sex_code'] = label.fit_transform(dataset['sex'])

    # (2) 新字段：embarked_code 

    dataset['embarked_code'] = label.fit_transform(dataset['embarked'])

    # (3) 新字段：title_code 

    dataset['title_code'] = label.fit_transform(dataset['title'])

    # (4) 新字段：age_bin_code

    dataset['age_bin_code'] = label.fit_transform(dataset['age_bin'])

    # (5) 新字段：fare_bin_code

    dataset['fare_bin_code'] = label.fit_transform(dataset['fare_bin'])


In [None]:
data_raw.head()

In [None]:
# 删除一些字段

# drop_columns_2 = ['age', 'fare', 'embarked', 'title', 'sex', 'fare_bin', 'age_bin']

# data_raw = data_raw.drop(drop_columns_2, axis=1, errors='ignore')

# data_val = data_val.drop(drop_columns_2, axis=1, errors='ignore')

# data_all = [data_raw] + [data_val]

In [None]:
data_raw.head()

In [None]:
# 查看列的特征名称  以上构建好了所有字段

data_raw.columns.to_list()

### 方式一： 特征选择

In [None]:
Target = ['survived'] # 标签


In [525]:
data_columns_one = ['pclass','sibsp','parch','sex_code','embarked_code','title_code','age','fare']
colunms_one = Target + data_columns_one
data_raw.head()


Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family_size,single,title,fare_bin,age_bin,sex_code,embarked_code,title_code,age_bin_code,fare_bin_code
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2,0,Mr,"(-0.001, 7.91]","(16.0, 32.0]",1,2,2,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,2,0,Mrs,"(31.0, 512.329]","(32.0, 48.0]",0,0,3,2,3
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,1,Miss,"(7.91, 14.454]","(16.0, 32.0]",0,2,1,1,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,0,Mrs,"(31.0, 512.329]","(32.0, 48.0]",0,2,3,2,3
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,1,Mr,"(7.91, 14.454]","(32.0, 48.0]",1,2,2,2,1


### 方式二：特征选择

In [None]:
data_columns_two = ['pclass', 'sibsp', 'parch', 'family_size', 'single',
                    'sex_code', 'embarked_code', 'title_code', 'age_bin_code', 'fare_bin_code']
colunms_two = Target + data_columns_two

### 通过Pandas中的get_dummies()进行编码

In [526]:
data_one_dummy = pd.get_dummies(data_raw[data_columns_one], columns= data_columns_one)
data_one_dummy_list = data_one_dummy.columns.to_list()
data_one_dummy_list # 通过编码转换后的特征类别 字符串有几个就有几类编码

['pclass_1',
 'pclass_2',
 'pclass_3',
 'sibsp_0',
 'sibsp_1',
 'sibsp_2',
 'sibsp_3',
 'sibsp_4',
 'sibsp_5',
 'sibsp_8',
 'parch_0',
 'parch_1',
 'parch_2',
 'parch_3',
 'parch_4',
 'parch_5',
 'parch_6',
 'sex_code_0',
 'sex_code_1',
 'embarked_code_0',
 'embarked_code_1',
 'embarked_code_2',
 'title_code_0',
 'title_code_1',
 'title_code_2',
 'title_code_3',
 'title_code_4',
 'age_0.42',
 'age_0.67',
 'age_0.75',
 'age_0.83',
 'age_0.92',
 'age_1.0',
 'age_2.0',
 'age_3.0',
 'age_4.0',
 'age_5.0',
 'age_6.0',
 'age_7.0',
 'age_8.0',
 'age_9.0',
 'age_10.0',
 'age_11.0',
 'age_12.0',
 'age_13.0',
 'age_14.0',
 'age_14.5',
 'age_15.0',
 'age_16.0',
 'age_17.0',
 'age_18.0',
 'age_19.0',
 'age_20.0',
 'age_20.5',
 'age_21.0',
 'age_22.0',
 'age_23.0',
 'age_23.5',
 'age_24.0',
 'age_24.5',
 'age_25.0',
 'age_26.0',
 'age_27.0',
 'age_28.0',
 'age_28.5',
 'age_29.0',
 'age_30.0',
 'age_30.5',
 'age_31.0',
 'age_32.0',
 'age_32.5',
 'age_33.0',
 'age_34.0',
 'age_34.5',
 'age_35.0',
 'a

### 获取训练集和测试集

### 方式一：训练集和测试集

In [527]:
X_train_one, X_test_one, y_train_one, y_test_one = model_selection.train_test_split(data_one_dummy[data_one_dummy_list]
                                                                    ,data_raw[Target]
                                                                    ,random_state = 0
                                                                    )
X_train_one.shape
X_test_one.shape

(223, 363)

### 方式二：训练集和测试集

In [528]:
X_train_two, X_test_two, y_train_two, y_test_two = model_selection.train_test_split(data_raw[data_columns_two]
                                                                                    , data_raw[Target]
                                                                                    , random_state=0
                                                                                    )
X_train_two.shape
X_test_two.shape

(223, 10)

### 随机森林算法实现

In [529]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [530]:
rf = RandomForestClassifier(max_features='sqrt'
                            ,random_state=1
                            ,n_jobs=-1
                            )

In [531]:
param_gird = {
    'criterion' : ['gini', 'entropy']
    ,'min_samples_leaf' : [1,5,10]
    ,'min_samples_split' : [2,4,10,12,16]
    ,'n_estimators' : [50,100,400,700,1000]
}

In [532]:
gs = GridSearchCV(estimator=rf
                  ,param_grid=param_gird
                  ,scoring='accuracy'
                  ,cv=3
                  ,n_jobs=-1,
                  )

### (1)对特征一进行训练


In [533]:

gs = gs.fit(X_train_one, y_train_one)

  return fit_method(estimator, *args, **kwargs)


In [536]:
print(gs.best_score_)

0.8218869093308555


In [537]:
print(gs.best_params_)

{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


In [539]:
# 创建一个对象

rf2 = RandomForestClassifier(criterion='entropy'
                             ,min_samples_leaf=5
                             ,min_samples_split=12
                             ,n_estimators=100
                             ,n_jobs=-1
                             ,random_state=1
                             )






In [540]:
rf2.fit(X_train_one, y_train_one)

  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,100
,criterion,'entropy'
,max_depth,
,min_samples_split,12
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [541]:
X_train_one.head()

Unnamed: 0,pclass_1,pclass_2,pclass_3,sibsp_0,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,sibsp_8,...,fare_153.4625,fare_164.8667,fare_211.3375,fare_211.5,fare_221.7792,fare_227.525,fare_247.5208,fare_262.375,fare_263.0,fare_512.3292
105,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
68,False,False,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
253,False,False,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
320,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
706,False,True,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [542]:
# 根据特征的重要性排序

pd.concat((pd.DataFrame(X_train_one.iloc[:,1:].columns,columns= ['Variable'])
           ,pd.DataFrame(rf2.feature_importances_, columns=['importance']))
           ,axis=1).sort_values(by='importance',ascending=False)

Unnamed: 0,Variable,importance
18,embarked_code_0,0.179899
24,title_code_3,0.176014
17,sex_code_1,0.147291
25,title_code_4,0.075237
2,sibsp_0,0.060997
...,...,...
64,age_29.0,0.000000
351,fare_151.55,0.000000
350,fare_146.5208,0.000000
349,fare_135.6333,0.000000


In [543]:
# 在test上进行预测

In [544]:
pred = rf2.predict(X_test_one)

In [545]:
pred = pd.DataFrame(pred,columns=['survived'])

In [546]:
pred

Unnamed: 0,survived
0,0
1,0
2,0
3,1
4,1
...,...
218,1
219,1
220,0
221,1
