In [6]:
###import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation, metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
### load data 
train = pd.read_csv("F:/postguaduate/Machine Learning/Titanic/train.csv")
test = pd.read_csv("F:/postguaduate/Machine Learning/Titanic/test.csv")
target='Survived' # Disbursed的值就是二元分类的输出
IDcol = 'PassengerId'
train.head() #查看数据集内容

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
###数据预处理

###抛弃无关数据
train = train.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)
test = test.drop(['Name','Ticket','Cabin'], axis=1)
train["Embarked"] = train["Embarked"].fillna("S")


###Fare
train['Fare'] = train['Fare'].astype(int)   #float 2 int
test.Fare.fillna(test.Fare.median(), inplace=True)   #插补缺失数据
test['Fare'] = test['Fare'].astype(int)


###Embarked
embark_dummies_train  = pd.get_dummies(train['Embarked'])     # pd.get_dummies()方法对离散数据重新编码，生成0-1矩阵
embark_dummies_test  = pd.get_dummies(test['Embarked'])
train = train.join(embark_dummies_train)   #合并矩阵
test = test.join(embark_dummies_test)
train.drop(['Embarked'], axis=1,inplace=True)   #删除原来的Embark列，inplace=Ture
test.drop(['Embarked'], axis=1,inplace=True)


###Age 主要是对缺失的值进行处理填补
average_age_titanic   = train["Age"].mean()
std_age_titanic       = train["Age"].std()
count_nan_age_titanic = train["Age"].isnull().sum()

average_age_test   = test["Age"].mean()
std_age_test       = test["Age"].std()
count_nan_age_test = test["Age"].isnull().sum()

rand_1 = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size = count_nan_age_titanic)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)
train["Age"][np.isnan(train["Age"])] = rand_1    
test["Age"][np.isnan(test["Age"])] = rand_2
train['Age'] = train['Age'].astype(int)
test['Age']  = test['Age'].astype(int)


###Sex  考虑到小孩及妇女优先对待，分三类
def get_person(passenger):
    age,sex = passenger
    return "child" if age < 16 else sex

train['Person'] = train[['Age','Sex']].apply(get_person,axis=1)
test['Person']  = test[['Age','Sex']].apply(get_person,axis=1)

train.drop(['Sex'],axis=1,inplace=True)
test.drop(['Sex'],axis=1,inplace=True)

person_dummies_titanic  = pd.get_dummies(train['Person'])
person_dummies_titanic.columns = ['Child','Female','Male']

person_dummies_test  = pd.get_dummies(test['Person'])
person_dummies_test.columns = ['Child','Female','Male']

train = train.join(person_dummies_titanic)
test = test.join(person_dummies_test)
train.drop(['Person'],axis=1,inplace=True)
test.drop(['Person'],axis=1,inplace=True)


###P class
pclass_dummies_titanic  = pd.get_dummies(train['Pclass'])
pclass_dummies_titanic.columns = ['Class_1','Class_2','Class_3']

pclass_dummies_test  = pd.get_dummies(test['Pclass'])
pclass_dummies_test.columns = ['Class_1','Class_2','Class_3']

train.drop(['Pclass'],axis=1,inplace=True)
test.drop(['Pclass'],axis=1,inplace=True)

train = train.join(pclass_dummies_titanic)
test= test.join(pclass_dummies_test)

train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Survived,Age,SibSp,Parch,Fare,C,Q,S,Child,Female,Male,Class_1,Class_2,Class_3
0,0,22,1,0,7,0,0,1,0,0,1,0,0,1
1,1,38,1,0,71,1,0,0,0,1,0,1,0,0
2,1,26,0,0,7,0,0,1,0,1,0,0,0,1
3,1,35,1,0,53,0,0,1,0,1,0,1,0,0
4,0,35,0,0,8,0,0,1,0,0,1,0,0,1


In [4]:
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
y = train['Survived']

In [7]:
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(X,y)
print (rf0.oob_score_)
y_predprob = rf0.predict_proba(X)[:,1]
print( "AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

0.7901234567901234
AUC Score (Train): 0.990773


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [15]:
##对n_estimate进行调优
param_test1 = {'n_estimators':range(10,71,10)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=2,
                                  min_samples_leaf=1,max_depth=8,max_features='sqrt' ,random_state=10), 
                       param_grid = param_test1, scoring='roc_auc',cv=5)
gsearch1.fit(X,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.85374, std: 0.03691, params: {'n_estimators': 10},
  mean: 0.86201, std: 0.03205, params: {'n_estimators': 20},
  mean: 0.86407, std: 0.03372, params: {'n_estimators': 30},
  mean: 0.86522, std: 0.03448, params: {'n_estimators': 40},
  mean: 0.86579, std: 0.03435, params: {'n_estimators': 50},
  mean: 0.86530, std: 0.03501, params: {'n_estimators': 60},
  mean: 0.86478, std: 0.03549, params: {'n_estimators': 70}],
 {'n_estimators': 50},
 0.8657887782110725)

In [21]:
##上一步得到了最佳迭代次数为50 对max_depth和min_sample_split进行调优
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(2,10,1)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 50, 
                                  min_samples_leaf=1,max_features='sqrt' ,oob_score=True, random_state=10),
   param_grid = param_test2, scoring='roc_auc',iid=False, cv=5)
gsearch2.fit(X,y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_



([mean: 0.85944, std: 0.01824, params: {'max_depth': 3, 'min_samples_split': 2},
  mean: 0.85944, std: 0.01824, params: {'max_depth': 3, 'min_samples_split': 3},
  mean: 0.85944, std: 0.01824, params: {'max_depth': 3, 'min_samples_split': 4},
  mean: 0.85944, std: 0.01824, params: {'max_depth': 3, 'min_samples_split': 5},
  mean: 0.85944, std: 0.01840, params: {'max_depth': 3, 'min_samples_split': 6},
  mean: 0.85947, std: 0.01839, params: {'max_depth': 3, 'min_samples_split': 7},
  mean: 0.85947, std: 0.01839, params: {'max_depth': 3, 'min_samples_split': 8},
  mean: 0.85947, std: 0.01839, params: {'max_depth': 3, 'min_samples_split': 9},
  mean: 0.86404, std: 0.02896, params: {'max_depth': 5, 'min_samples_split': 2},
  mean: 0.86513, std: 0.03018, params: {'max_depth': 5, 'min_samples_split': 3},
  mean: 0.86417, std: 0.03008, params: {'max_depth': 5, 'min_samples_split': 4},
  mean: 0.86362, std: 0.02962, params: {'max_depth': 5, 'min_samples_split': 5},
  mean: 0.86558, std: 0.0301

In [24]:
#现在看看模型的袋外分数
rf1 = RandomForestClassifier(n_estimators=50, max_depth=7, min_samples_split=5,min_samples_leaf=1,max_features='sqrt' ,oob_score=True,random_state=10)
rf1.fit(X,y)
print(rf1.oob_score_)

0.8159371492704826


In [27]:
Y_pred = rf1.predict(test.drop('PassengerId',axis=1))
my_submission =pd.DataFrame({'PassengerId': test['PassengerId'].as_matrix(),'Survived': Y_pred.astype(np.int32)})
my_submission.to_csv("F:/postguaduate/Machine Learning/Titanic/my_submission_rf.csv",index=False)