In [869]:
import math
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn import model_selection

In [870]:
train = pd.read_csv('D:/JISUNG/dacon/titanic/data/train.csv')
test = pd.read_csv('D:/JISUNG/dacon/titanic/data/test.csv')

In [871]:
y_train = train['Survived'].values
passId = test['PassengerId']
data = pd.concat((train, test))

인덱스 리셋

In [872]:
data = data.reset_index(drop=True)

Fare 결측치를 평균으로 치환

In [873]:
data['Fare'].fillna(data['Fare'].mean(), inplace=True)

이름에서 성만 추출

In [874]:
data['Last_Name'] = data['Name'].apply(lambda x: str.split(x, ",")[0])

성과 요금을 기준으로 가족 여부를 판단
- 가족이 없을 경우 생존확률 0.5
- 가족이 있을 경우 자기 자신을 제외한 나머지 가족들의 생존율을 자신의 생존확률로 채택

In [875]:
data['Probability'] = 0.5

In [876]:
for _, group_df in data.groupby(['Last_Name', 'Fare']):
    if len(group_df) != 1:
        for idx, row in group_df.iterrows():
            probability = group_df.drop(idx)['Survived'].mean()
            if math.isnan(probability):
                data.loc[data['PassengerId'] == row['PassengerId'], 'Probability'] = 0.5
            else:
                data.loc[data['PassengerId'] == row['PassengerId'], 'Probability'] = probability

티켓 번호를 기준으로 동승자 여부를 판단
- 동승자가 없을 경우 생존확률 0.5
- (가족이 없는 승객 중) 동승자가 있을 경우 자기 자신을 제외한 나머지 동승자들의 생존율을 자신의 생존확률로 채택

In [877]:
for _, group_df in data.groupby('Ticket'):
    if len(group_df) != 1:
        for idx, row in group_df.iterrows():
            if row['Probability']== 0.5:
                probability = group_df.drop(idx)['Survived'].mean()
                if math.isnan(probability):
                    data.loc[data['PassengerId'] == row['PassengerId'], 'Probability'] = 0.5
                else:
                    data.loc[data['PassengerId'] == row['PassengerId'], 'Probability'] = probability

요금 4개 범주화

In [878]:
data['Fare'] = pd.qcut(data['Fare'], 4)

In [879]:
label = LabelEncoder()
data['Fare'] = label.fit_transform(data['Fare'])

호칭 추출

In [880]:
Title_Dictionary = {
    'Capt':'ETC',
    'Col':'ETC',
    'Don':'Nobility',
    'Dr':'ETC',
    'Jonkheer':'Nobility',
    'Lady':'Nobility',
    'Major':'ETC',
    'Master':'Master',
    'Miss':'Miss',
    'Mlle':'Miss',
    'Mme':'Mrs',
    'Mr':'Mr',
    'Mrs':'Mrs',
    'Ms':'Miss',
    'Dona':'Mrs',
    'Rev':'ETC',
    'Sir':'Nobility',
    'the Countess':'Nobility'
}

In [881]:
data['Title'] = data['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())
data['Title'] = data.Title.map(Title_Dictionary)

나이의 결측치를 호칭의 중앙값으로 치환

In [882]:
data['Age'] = data.groupby('Title')['Age'].apply(lambda x: x.fillna(round(x.median(), 1)))

나이 4개 범주화

In [883]:
data['Age'] = pd.qcut(data['Age'], 4)

In [884]:
label = LabelEncoder()
data['Age'] = label.fit_transform(data['Age'])

호칭 숫자로 변환

In [885]:
data['Title'] = data['Title'].replace(['Mr', 'ETC', 'Nobility', 'Miss', 'Mrs', 'Master'], [0, 1, 2, 3, 4, 5])

성별 숫자로 변환

In [886]:
data['Sex'] = data['Sex'].replace(['male', 'female'], [0, 1])

항구 결측치 최빈값으로 치환

In [887]:
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

항구 숫자로 변환

In [888]:
data['Embarked'] = data['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2])

객실 결측치를 N으로 치환

In [889]:
data['Cabin'].fillna('N',inplace=True)

객실의 문자 부분 추출

In [890]:
data['Cabin'] = data['Cabin'].map(lambda x: x[0])

객실 결측치 처리
- Pclass 1 : 3
- Pclass 2 : 2
- Pclass 3 : 0

In [891]:
Cabin_Dictionary = {
    'A':'3',
    'B':'3',
    'C':'3',
    'D':'2',
    'E':'2',
    'F':'1',
    'G':'1',
    'T':'3',
    'N':'0'
}

In [892]:
data['Cabin'] = data.Cabin.map(Cabin_Dictionary)

In [893]:
data['Cabin'] = data.Cabin.astype(int)

가족 컬럼 생성

In [894]:
data['Family'] = data['SibSp'] + data['Parch'] + 1

불필요 컬럼 제거

In [895]:
data = data.drop(['Survived', 'Name', 'SibSp', 'Parch', 'Ticket', 'Last_Name', 'PassengerId'], axis=1)

train/test 데이터 분리

In [896]:
train = data[:891]
test = data[891:]

표준화

In [897]:
X_train = train
X_test = test

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

5개 모델 생성

In [898]:
rf = RandomForestClassifier(random_state=1)
knn = KNeighborsClassifier()
lr = LogisticRegression()
xgb = XGBClassifier()
svm = SVC(probability=True)
  
models = [rf, knn, lr, xgb, svm]
scores = []

for mod in models:
    mod.fit(X_train, y_train)
    acc = cross_val_score(mod, X_train, y_train, scoring='accuracy', cv=10)
    scores.append(acc.mean())

In [899]:
results = pd.DataFrame({
    'Model': [
        'Random Forest',
        'K Nearest Neighbour',
        'Logistic Regression',
        'XGBoost',
        'SVM'
    ],
    'First': scores})

acc_rank = results.sort_values(by='First', ascending=False).reset_index(drop=True)
acc_rank

Unnamed: 0,Model,First
0,SVM,0.849588
1,XGBoost,0.842896
2,Random Forest,0.842884
3,Logistic Regression,0.841785
4,K Nearest Neighbour,0.812534


변수 선택

In [900]:
xgb_imp = pd.DataFrame({'Feature':train.columns, 'xgb importance':xgb.feature_importances_})
rf_imp = pd.DataFrame({'Feature':train.columns, 'rf importance':rf.feature_importances_})

importances = xgb_imp.merge(rf_imp, on='Feature')
importances['Average'] = importances.mean(axis=1)
importances = importances.sort_values(by='Average', ascending=False).reset_index(drop=True)

importances

Unnamed: 0,Feature,xgb importance,rf importance,Average
0,Title,0.474218,0.26062,0.367419
1,Probability,0.154363,0.14549,0.149926
2,Pclass,0.143119,0.105489,0.124304
3,Sex,0.028464,0.144255,0.08636
4,Cabin,0.08567,0.061329,0.0735
5,Family,0.029807,0.093315,0.061561
6,Age,0.024772,0.078279,0.051525
7,Fare,0.034016,0.067181,0.050598
8,Embarked,0.025572,0.044042,0.034807


변수 제거

In [901]:
train = train.drop(['Embarked'], axis=1)
test = test.drop(['Embarked'], axis=1)

X_train = train
X_test = test

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

5개 모델 생성

In [902]:
rf = RandomForestClassifier(random_state=1)
knn = KNeighborsClassifier()
lr = LogisticRegression()
xgb = XGBClassifier(random_state=1)
svm = SVC(probability=True)
      
models = [rf, knn, lr, xgb, svm]
scores_v2 = []

for mod in models:
    mod.fit(X_train, y_train)
    acc = cross_val_score(mod, X_train, y_train, scoring='accuracy', cv=10)
    scores_v2.append(acc.mean())

In [903]:
results = pd.DataFrame({
    'Model': [
        'Random Forest',
        'K Nearest Neighbour',
        'Logistic Regression',
        'XGBoost',
        'SVM'
    ],
    'First': scores,
    'Delete feature': scores_v2})

acc_rank = results.sort_values(by='Delete feature', ascending=False).reset_index(drop=True)
acc_rank

Unnamed: 0,Model,First,Delete feature
0,XGBoost,0.842896,0.857516
1,SVM,0.849588,0.851823
2,Random Forest,0.842884,0.847416
3,K Nearest Neighbour,0.812534,0.844007
4,Logistic Regression,0.841785,0.840662


SVM

In [948]:
C = [0.001, 0.01, 0.1, 1, 5, 10, 15, 20, 50, 100]
gamma = [0.001, 0.01, 0.1, 1]

hyperparams = {'C': C, 'gamma' : gamma}

gscv = GridSearchCV(
    estimator=SVC(probability=True),
    param_grid=hyperparams,
    verbose=True,
    cv=5,
    scoring='accuracy'
)

gscv.fit(X_train, y_train)
print(gscv.best_score_)
print(gscv.best_estimator_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.8574414663235203
SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   11.5s finished


LR

In [956]:
penalty = ['l1', 'l2']
C = np.logspace(0, 5, 10)

hyperparams = {'penalty': penalty, 'C': C}

gscv = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=hyperparams,
    verbose=True,
    cv=5,
    scoring='accuracy'
)

gscv.fit(X_train, y_train)
print(gscv.best_score_)
print(gscv.best_estimator_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.8383780051471973
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished


XGB

In [906]:
learning_rate = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
n_estimators = [10, 25, 50, 75, 100, 250, 500, 750, 1000]

hyperparams = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators
}

gscv = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=hyperparams,
    verbose=True,
    cv=5,
    scoring='accuracy'
)

gscv.fit(X_train, y_train)
print(gscv.best_score_)
print(gscv.best_estimator_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:   47.7s finished


0.8462368966166594
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=750, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [907]:
max_depth = [3, 4, 5, 6, 7, 8, 9, 10]
min_child_weight = [1, 2, 3, 4, 5, 6]

hyperparams = {
    'max_depth': max_depth,
    'min_child_weight': min_child_weight
}

gscv = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.01,
        n_estimators=750
    ),
    param_grid=hyperparams,
    verbose=True,
    cv=5,
    scoring='accuracy'
)

gscv.fit(X_train, y_train)
print(gscv.best_score_)
print(gscv.best_estimator_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:   55.2s finished


0.8585650618291382
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=6, missing=nan, monotone_constraints='()',
              n_estimators=750, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [909]:
gamma = [i*0.1 for i in range(0,5)]

hyperparams = {'gamma': gamma}

gscv = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.01,
        n_estimators=750,
        max_depth=4,
        min_child_weight=6
    ),
    param_grid=hyperparams,
    verbose=True,
    cv=5,
    scoring='accuracy')

gscv.fit(X_train, y_train)
print(gscv.best_score_)
print(gscv.best_estimator_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    4.5s finished


0.8585650618291382
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=6, missing=nan, monotone_constraints='()',
              n_estimators=750, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [910]:
subsample = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
colsample_bytree = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    
hyperparams = {
    'subsample': subsample,
    'colsample_bytree': colsample_bytree
}

gscv = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.01,
        n_estimators=750,
        max_depth=4,
        min_child_weight=6,
        gamma=0
    ),
    param_grid=hyperparams,
    verbose=True,
    cv=5,
    scoring='accuracy'
)

gscv.fit(X_train, y_train)
print(gscv.best_score_)
print(gscv.best_estimator_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed:  1.4min finished


0.8596823802648924
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=6, missing=nan, monotone_constraints='()',
              n_estimators=750, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.65,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [911]:
reg_alpha = [1e-5, 1e-2, 0.1, 1, 100]
    
hyperparams = {'reg_alpha': reg_alpha}

gscv = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.01,
        n_estimators=750,
        max_depth=4,
        min_child_weight=6,
        gamma=0,
        subsample=0.65,
        colsample_bytree=1
    ),
    param_grid=hyperparams,
    verbose=True,
    cv=5,
    scoring='accuracy'
)

gscv.fit(X_train, y_train)
print(gscv.best_score_)
print(gscv.best_estimator_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    5.0s finished


0.8596823802648924
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=4,
              min_child_weight=6, missing=nan, monotone_constraints='()',
              n_estimators=750, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=1e-05,
              reg_lambda=1, scale_pos_weight=1, subsample=0.65,
              tree_method='exact', validate_parameters=1, verbosity=None)


KNN

In [934]:
n_neighbors = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 30, 35, 40, 50]
algorithm = ['auto']
weights = ['uniform', 'distance']
leaf_size = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30]
p = [1, 2, 3]
n_jobs = [-1, 1, 2, 3]

hyperparams = {
    'algorithm': algorithm,
    'weights': weights,
    'leaf_size': leaf_size,
    'n_neighbors': n_neighbors,
    'p': p,
    'n_jobs': n_jobs
}

gscv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=hyperparams,
    verbose=True,
    cv=5,
    scoring='accuracy'
)

gscv.fit(X_train, y_train)
print(gscv.best_score_)
print(gscv.best_estimator_)

Fitting 5 folds for each of 4560 candidates, totalling 22800 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 22800 out of 22800 | elapsed:  5.6min finished


0.8506936162199483
KNeighborsClassifier(algorithm='auto', leaf_size=3, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=50, p=1,
                     weights='uniform')


RF

In [919]:
n_estimators = [10, 25, 50, 75, 100]
max_depth = [None, 1, 2, 3]
max_features = [1, 3, 5, 7]
min_samples_split = [2, 4, 6, 8, 10]
min_samples_leaf = [2, 4, 6, 8, 10]

hyperparams = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

gscv = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=hyperparams,
    verbose=True,
    cv=5,
    scoring='accuracy'
)

gscv.fit(X_train, y_train)
print(gscv.best_score_)
print(gscv.best_estimator_)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.86305944385161
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=7,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed: 13.3min finished


In [957]:
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    max_features=7,
    min_samples_leaf=8,
    min_samples_split=4,
)

knn = KNeighborsClassifier(
    algorithm='auto',
    leaf_size=3,
    n_neighbors=50,
    weights='uniform',
    p=1,
    n_jobs=-1
)

# lr = LogisticRegression(
#     C=1.0,
#     penalty='l2',
# )

lr = LogisticRegression(gscv.best_estimator_)

xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=750,
    max_depth=4,
    min_child_weight=6,
    gamma=0,
    subsample=0.65,
    colsample_bytree=1,
    reg_alpha=1e-05,
)

svm = SVC(
    C=100,
    gamma=0.01,
    probability=True,
)
     
models = [rf, knn, lr, xgb, svm]
scores_v3 = []

for mod in models:
    mod.fit(X_train, y_train)
    acc = cross_val_score(mod, X_train, y_train, scoring='accuracy', cv=10)
    scores_v3.append(acc.mean())

ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False).

In [928]:
results = pd.DataFrame({
    'Model': [
        'Random Forest',
        'K Nearest Neighbour',
        'Logistic Regression',
        'XGBoost',
        'SVM'
    ],
    'First': scores,
    'Delete feature': scores_v2,
    'Hyper parameters': scores_v3})

acc_rank = results.sort_values(by='Hyper parameters', ascending=False).reset_index(drop=True)
acc_rank

Unnamed: 0,Model,First,Delete feature,Hyper parameters
0,XGBoost,0.842896,0.857516,0.85633
1,SVM,0.849588,0.851823,0.855194
2,Random Forest,0.842884,0.847416,0.850724
3,K Nearest Neighbour,0.812534,0.844007,0.849588
4,Logistic Regression,0.841785,0.840662,0.840662


In [929]:
grid_hard = VotingClassifier(
    estimators = [
        ('Random Forest', rf),
        ('Logistic Regression', lr),
        ('XGBoost', xgb),
        ('SVM', svm),
        ('K Nearest Neighbour', knn)
    ],
    voting = 'hard'
)

grid_hard_cv = model_selection.cross_validate(grid_hard, X_train, y_train, cv=10)
grid_hard.fit(X_train, y_train)

grid_hard_cv['test_score'].mean()

0.8552059925093634

In [930]:
grid_soft = VotingClassifier(
    estimators = [
        ('Random Forest', rf),
        ('Logistic Regression', lr),
        ('XGBoost', xgb),
        ('SVM', svm),
        ('K Nearest Neighbour', knn)
    ],
    voting = 'soft'
)

grid_soft_cv = model_selection.cross_validate(grid_soft, X_train, y_train, cv=10)
grid_soft.fit(X_train, y_train)

grid_soft_cv['test_score'].mean()

0.8540823970037452

In [931]:
predictions = grid_hard.predict(X_test)

submission = pd.concat([pd.DataFrame(passId), pd.DataFrame(predictions)], axis = 'columns')

submission.columns = ['PassengerId', 'Survived']
submission.to_csv('D:/JISUNG/dacon/titanic/submission.csv', header = True, index = False)