In [51]:
import pandas as pd
import numpy as np
# 数据分隔
from sklearn.model_selection import train_test_split
# 字典特征提取
from sklearn.feature_extraction import DictVectorizer
# 交叉验证
from sklearn.model_selection import GridSearchCV
# lgbm
from lightgbm import LGBMClassifier, early_stopping

# 读取数据

In [52]:
train_data = pd.read_csv("../data/titanic/train.csv")
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [53]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [54]:
x = train_data[["Pclass", "Age", "Sex"]]
x

Unnamed: 0,Pclass,Age,Sex
0,3,22.0,male
1,1,38.0,female
2,3,26.0,female
3,1,35.0,female
4,3,35.0,male
...,...,...,...
886,2,27.0,male
887,1,19.0,female
888,3,,female
889,1,26.0,male


In [55]:
y = train_data[["Survived"]]
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


# 缺失值处理

In [56]:
np.any(x["Pclass"].isnull())

False

In [57]:
np.any(x["Age"].isnull())

True

In [58]:
x["Age"].fillna(x["Age"].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["Age"].fillna(x["Age"].mean(), inplace=True)


In [59]:
np.any(x["Age"].isnull())

False

In [60]:
np.any(x["Sex"].isnull())

False

# 数据集划分

In [61]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=23)

# 特征工程(字典特征抽取)

In [62]:
# x_train格式不对,
x_train

Unnamed: 0,Pclass,Age,Sex
151,1,22.000000,female
753,3,23.000000,male
746,3,16.000000,male
684,2,60.000000,male
887,1,19.000000,female
...,...,...,...
31,1,29.699118,female
488,3,30.000000,male
40,3,40.000000,female
742,1,21.000000,female


In [63]:
y_val

Unnamed: 0,Survived
727,1
668,0
210,0
654,0
228,0
...,...
629,0
430,1
432,1
431,1


In [64]:
# 转换成字典格式
x_train1 = x_train.to_dict(orient="records")
x_val1 = x_val.to_dict(orient="records")
x_train1

[{'Pclass': 1, 'Age': 22.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 23.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 16.0, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 60.0, 'Sex': 'male'},
 {'Pclass': 1, 'Age': 19.0, 'Sex': 'female'},
 {'Pclass': 3, 'Age': 34.5, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 25.0, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 23.0, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 34.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 1, 'Age': 24.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 18.0, 'Sex': 'female'},
 {'Pclass': 1, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 4.0, 'Sex': 'female'},
 {'Pclass': 1, 'Age': 64.0, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 57.0, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 20.0, 'Sex': 'male'},
 {'Pclass': 2, 'Age': 32.5, 'Sex': 'male'},
 {'Pclass': 3, 'Age': 29.69911764705882, 'Sex': 'male'},
 {'P

In [65]:
# sparse=False 返回非sparse矩阵
transfer = DictVectorizer(sparse=False)

transfer.fit(x_train1)
x_train2 = transfer.transform(x_train1)
x_val2 = transfer.transform(x_val1)
x_train2, x_val2

(array([[22.,  1.,  1.,  0.],
        [23.,  3.,  0.,  1.],
        [16.,  3.,  0.,  1.],
        ...,
        [40.,  3.,  1.,  0.],
        [21.,  1.,  1.,  0.],
        [36.,  3.,  0.,  1.]]),
 array([[29.69911765,  3.        ,  1.        ,  0.        ],
        [43.        ,  3.        ,  0.        ,  1.        ],
        [24.        ,  3.        ,  0.        ,  1.        ],
        [18.        ,  3.        ,  1.        ,  0.        ],
        [18.        ,  2.        ,  0.        ,  1.        ],
        [29.69911765,  3.        ,  0.        ,  1.        ],
        [ 4.        ,  3.        ,  0.        ,  1.        ],
        [21.        ,  3.        ,  0.        ,  1.        ],
        [29.69911765,  3.        ,  1.        ,  0.        ],
        [36.        ,  3.        ,  0.        ,  1.        ],
        [29.69911765,  2.        ,  0.        ,  1.        ],
        [29.69911765,  3.        ,  0.        ,  1.        ],
        [57.        ,  2.        ,  1.        ,  0.        ],

# LGBM

In [76]:
lgbmc = LGBMClassifier()

## 定义超参数的选择列表

In [77]:
param_grid = {
    "learning_rate": [0.01, 0.1, 1],
    "n_estimators": range(10, 101, 10),
    "num_leaves": np.arange(10, 50 , 1),
}

## 使用GridSearchCV进行网格搜索

In [68]:
gs = GridSearchCV(lgbmc, param_grid=param_grid, cv=4, n_jobs=4)

In [69]:
gs.fit(x_train2, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


## 模型评估

In [78]:
gs.score(x_val2, y_val)

0.8100558659217877

In [79]:
gs.predict(x_val2)

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1], dtype=int64)

In [80]:
gs.best_params_

{'learning_rate': 0.1, 'n_estimators': 60, 'num_leaves': 19}

# 得到最好模型

In [81]:
best_lgbmc = LGBMClassifier(**gs.best_params_)

In [82]:
best_lgbmc.fit(x_train2, y_train, eval_set=[(x_val2, y_val)], eval_metric='logloss')

[1]	valid_0's binary_logloss: 0.619184
[2]	valid_0's binary_logloss: 0.59149
[3]	valid_0's binary_logloss: 0.569399
[4]	valid_0's binary_logloss: 0.551278
[5]	valid_0's binary_logloss: 0.535563
[6]	valid_0's binary_logloss: 0.523076
[7]	valid_0's binary_logloss: 0.512624
[8]	valid_0's binary_logloss: 0.504218
[9]	valid_0's binary_logloss: 0.497373
[10]	valid_0's binary_logloss: 0.491773
[11]	valid_0's binary_logloss: 0.486499
[12]	valid_0's binary_logloss: 0.481664
[13]	valid_0's binary_logloss: 0.477877
[14]	valid_0's binary_logloss: 0.474924
[15]	valid_0's binary_logloss: 0.472045
[16]	valid_0's binary_logloss: 0.469483
[17]	valid_0's binary_logloss: 0.469256
[18]	valid_0's binary_logloss: 0.467501
[19]	valid_0's binary_logloss: 0.465287
[20]	valid_0's binary_logloss: 0.464599
[21]	valid_0's binary_logloss: 0.463696
[22]	valid_0's binary_logloss: 0.463224
[23]	valid_0's binary_logloss: 0.463293
[24]	valid_0's binary_logloss: 0.463004
[25]	valid_0's binary_logloss: 0.461967
[26]	valid

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [83]:
best_lgbmc.score(x_val2, y_val)

0.8100558659217877

# Test

## 读取数据

In [84]:
x_test_data = pd.read_csv("../data/titanic/test.csv")
y_test_data = pd.read_csv("../data/titanic/gender_submission.csv")
x_test_data.head(), y_test_data.head()

(   PassengerId  Pclass                                          Name     Sex  \
 0          892       3                              Kelly, Mr. James    male   
 1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
 2          894       2                     Myles, Mr. Thomas Francis    male   
 3          895       3                              Wirz, Mr. Albert    male   
 4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   
 
     Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
 0  34.5      0      0   330911   7.8292   NaN        Q  
 1  47.0      1      0   363272   7.0000   NaN        S  
 2  62.0      0      0   240276   9.6875   NaN        Q  
 3  27.0      0      0   315154   8.6625   NaN        S  
 4  22.0      1      1  3101298  12.2875   NaN        S  ,
    PassengerId  Survived
 0          892         0
 1          893         1
 2          894         0
 3          895         0
 4          896         1)

In [86]:
x_test = x_test_data[["Pclass", "Age", "Sex"]]
y_test = y_test_data["Survived"]
x_test.head(), y_test.head()

(   Pclass   Age     Sex
 0       3  34.5    male
 1       3  47.0  female
 2       2  62.0    male
 3       3  27.0    male
 4       3  22.0  female,
 0    0
 1    1
 2    0
 3    0
 4    1
 Name: Survived, dtype: int64)

## 缺失值处理

In [87]:
np.any(x_test["Pclass"].isnull())

False

In [88]:
np.any(x_test["Age"].isnull())

True

In [89]:
x_test["Age"].fillna(x_test["Age"].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test["Age"].fillna(x_test["Age"].mean(), inplace=True)


In [90]:
np.any(x_test["Age"].isnull())

False

In [91]:
np.any(x_test["Sex"].isnull())

False

## 特征工程(字典特征抽取)

In [92]:
# 转换成字典格式
x_test1 = x_test.to_dict(orient="records")

In [93]:
x_test2 = transfer.fit_transform(x_test1)
x_test2

array([[34.5       ,  3.        ,  0.        ,  1.        ],
       [47.        ,  3.        ,  1.        ,  0.        ],
       [62.        ,  2.        ,  0.        ,  1.        ],
       ...,
       [38.5       ,  3.        ,  0.        ,  1.        ],
       [30.27259036,  3.        ,  0.        ,  1.        ],
       [30.27259036,  3.        ,  0.        ,  1.        ]])

## 评估

In [94]:
best_lgbmc.score(x_test2, y_test)

0.8564593301435407