In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_df = pd.read_csv('titanic.csv')
data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
y_data = data_df[['Survived']]

In [4]:
del data_df['Survived']
x_data = data_df.copy()
x_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


---
> <span style='font-size:20pt;'> Feature engineering & Feature selection </span>
---

- Age 결측치 채우기

In [5]:
# 각 pclass 의 나이 중간값 구하기
P1_age = float(x_data[(x_data['Pclass']==1)]['Age'].quantile(.50))
P2_age = float(x_data[(x_data['Pclass']==2)]['Age'].quantile(.50))
P3_age = float(x_data[(x_data['Pclass']==3)]['Age'].quantile(.50))

print(P1_age)
print(P2_age)
print(P3_age)

37.0
29.0
24.0


In [6]:
# 각 Pcalss의 중간값을 같은 Pclass를 가진 행의 Age null에 넣기
for num in range(len(x_data['PassengerId'])):
    if pd.isnull(x_data['Age'][num]):
        if x_data['Pclass'][num] == 1:
            x_data['Age'][num] = P1_age
        elif x_data['Pclass'][num] == 2:
              x_data['Age'][num] = P2_age
        else:
            x_data['Age'][num] = P3_age

- Embarked 결측치 채우기

In [7]:
# Embarked 결측치 채우기
# Embarked : 가장 많이 나온 S 로 결측치 보완

x_data = x_data.fillna({'Embarked' : 'S' })
x_data['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

- 불필요한 열, 예측에 방해가 된다고 생각하는 열 지우기
 : Passengerid, name, ticket, carbin 

In [8]:
del x_data['PassengerId']
del x_data['Name']
del x_data['Ticket']
del x_data['Cabin']

- Sex, Embarked 를 '문자'를 '숫자'로 바꾸기

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
x_data['Sex'] = le.fit_transform(x_data['Sex'])
x_data['Embarked'] = le.fit_transform(x_data['Embarked'])

In [11]:
x_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


---
> <span style='font-size:20pt; color:#BF5AD8'> Train - Test split </span>
---

In [14]:
x_data = np.array(x_data)
y_data = np.array(y_data)

In [15]:
# split
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(x_data, y_data, test_size=0.3, random_state=0)

- 그리드서치를 사용해서 xgboost의 최적의 파라미터 값 찾기

In [16]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

model_xgb_grid = xgb.XGBClassifier()
param_grid = {'max_depth':[4,5,6],
                 'min_child_weight':[5, 7], 
                 'gamma':[0],
                 'nthread':[-1],
                 'colsample_bytree':[0.7, 0.8, 0.9],
                 'colsample_bylevel':[0.3, 0.7, 0.9],
                 'n_estimators':[100],
                 'objective':['binary:logistic'],
                 'random_state':[0]}

cv=KFold(n_splits=2, random_state=0)

gcv = GridSearchCV(model_xgb_grid, param_grid=param_grid, cv=cv, n_jobs=-1, scoring='roc_auc') # acc , 

gcv.fit(x_train, y_train)
print()
print('final params', gcv.best_params_)
print('best score', gcv.best_score_) 


final params {'colsample_bylevel': 0.9, 'colsample_bytree': 0.9, 'gamma': 0, 'max_depth': 4, 'min_child_weight': 7, 'n_estimators': 100, 'nthread': -1, 'objective': 'binary:logistic', 'random_state': 0}
best score 0.857613947822236


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


- XGB 했을 때

In [77]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

model_xgb = xgb.XGBClassifier(max_depth=8,
                  min_child_weight=5,
                  gamma=2,
                  nthread=-1,
                  colsample_bytree=0.8,
                  colsample_bylevel=0.3, 
                  n_estimators=1300,
                  objective='binary:logistic',
                  random_state=0)
model_xgb.fit(x_train, y_train)

pred_proba = model_xgb.predict_proba(x_test)
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=pred_proba[:, 1])
roc_auc = auc(fpr, tpr)

print("Acc : ", accuracy_score(y_test, model_xgb.predict(x_test)))
print('Roc_auc :', roc_auc)
print()
print('Classification_report : ')
print(classification_report(y_test, model_xgb.predict(x_test), target_names=['Survived', 'not_survived']))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Acc :  0.8432835820895522
Roc_auc : 0.8840476190476191

Classification_report : 
              precision    recall  f1-score   support

    Survived       0.85      0.92      0.88       168
not_survived       0.84      0.72      0.77       100

   micro avg       0.84      0.84      0.84       268
   macro avg       0.84      0.82      0.83       268
weighted avg       0.84      0.84      0.84       268



---
> <span style='font-size:20pt; color:#FF8200;'> Robust scale 사용한 데이터 만들기 </span>
--- 

In [44]:
pd.DataFrame(x_train).head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,1.0,51.0,0.0,0.0,26.55,2.0
1,1.0,0.0,49.0,1.0,0.0,76.7292,0.0
2,3.0,1.0,1.0,5.0,2.0,46.9,2.0
3,1.0,1.0,54.0,0.0,1.0,77.2875,2.0
4,3.0,0.0,24.0,1.0,0.0,14.4583,0.0


In [45]:
x_train_robust = x_train.copy()
x_test_robust = x_test.copy()

In [46]:
x_train_robust[:,5].reshape(-1, 1)

array([[ 26.55  ],
       [ 76.7292],
       [ 46.9   ],
       [ 77.2875],
       [ 14.4583],
       [ 36.75  ],
       [247.5208],
       [  8.05  ],
       [  8.05  ],
       [  7.1417],
       [  7.8292],
       [  7.55  ],
       [  7.25  ],
       [ 15.5   ],
       [ 35.    ],
       [ 25.9292],
       [  7.775 ],
       [  7.8958],
       [  7.2292],
       [ 25.4667],
       [ 13.5   ],
       [ 52.    ],
       [ 13.    ],
       [ 21.    ],
       [ 22.3583],
       [ 32.5   ],
       [ 13.    ],
       [ 10.5   ],
       [ 13.    ],
       [ 33.5   ],
       [ 26.55  ],
       [ 13.    ],
       [ 51.8625],
       [  8.4333],
       [ 19.9667],
       [ 29.    ],
       [ 30.    ],
       [  7.75  ],
       [ 30.5   ],
       [ 31.275 ],
       [ 10.5   ],
       [  7.8958],
       [ 13.    ],
       [ 29.125 ],
       [ 24.15  ],
       [  7.25  ],
       [  7.125 ],
       [ 26.2875],
       [ 52.    ],
       [  9.5   ],
       [ 69.55  ],
       [  7.925 ],
       [  7.

In [47]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(x_train_robust[:,5].reshape(-1, 1))
x_train_fare = scaler.transform(x_train_robust[:,5].reshape(-1, 1))
x_test_fare = scaler.transform(x_test_robust[:,5].reshape(-1, 1)) 

In [48]:
# x_train_robust, x_test_robust 데이터의 Fare열인 5번 열을 x_train_fare, x_test_fare로 바꾸기
pd.DataFrame(x_train_robust)[5] = pd.DataFrame(x_train_fare)
pd.DataFrame(x_test_robust)[5] = pd.DataFrame(x_test_fare)

In [49]:
# 적용 잘 됐는지 확인
pd.DataFrame(x_train_robust).head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,1.0,51.0,0.0,0.0,0.492275,2.0
1,1.0,0.0,49.0,1.0,0.0,2.630973,0.0
2,3.0,1.0,1.0,5.0,2.0,1.359616,2.0
3,1.0,1.0,54.0,0.0,1.0,2.654768,2.0
4,3.0,0.0,24.0,1.0,0.0,-0.023088,0.0


- 그리드서치를 사용해서 SVC의 최적의 파라미터 값 찾기

In [76]:
# 원본 데이터 사용(x_tarin, x_test)
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

model_svc = SVC()
param_grid = {'C':[1, 2, 3],
               'gamma':[0.1, 0.2, 0.3],
               'probability':[False, True]}
cv=KFold(n_splits=4, random_state=0)


model_svc.fit(x_train, y_train)
svc = GridSearchCV(model_svc, param_grid=param_grid, cv=cv, n_jobs=-1, scoring='roc_auc')
svc.fit(x_train, y_train)

print('final params', svc.best_params_)
print('best score', svc.best_score_) 

  y = column_or_1d(y, warn=True)


final params {'C': 3, 'gamma': 0.1, 'probability': False}
best score 0.7582190888326571


  y = column_or_1d(y, warn=True)


- 원본 vs 로버스트

In [70]:
# 원본 데이터 사용(x_tarin, x_test)

model_svc = SVC(C=2, gamma=0.1, probability=True)
model_svc.fit(x_train, y_train)
y_pred = model_svc.predict(x_test)

print("Acc : ", accuracy_score(y_test, y_pred))

# roc_auc 확인
pred_proba = model_svc.predict_proba(x_test)
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=pred_proba[:, 1])
roc_auc = auc(fpr, tpr)
print("Roc_auc :" , roc_auc)
print()
print('Classification_report : ')
print(classification_report(y_test, y_pred, target_names=['not survived', 'survived']))

Acc :  0.7164179104477612
Roc_auc : 0.7868452380952381

Classification_report : 
              precision    recall  f1-score   support

not survived       0.76      0.81      0.78       168
    survived       0.64      0.56      0.60       100

   micro avg       0.72      0.72      0.72       268
   macro avg       0.70      0.68      0.69       268
weighted avg       0.71      0.72      0.71       268



  y = column_or_1d(y, warn=True)


In [73]:
# 로버스트 데이터 사용(x_tarin, x_test)

model_svc = SVC(C=2, gamma=0.1, probability=True)
model_svc.fit(x_train_robust, y_train)
y_pred = model_svc.predict(x_test_robust)

print("Acc : ", accuracy_score(y_test, y_pred))

pred_proba = model_svc.predict_proba(x_test_robust)
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=pred_proba[:, 1])
roc_auc = auc(fpr, tpr)
print("Roc_auc :" , roc_auc)
print()
print('Classification_report : ')
print(classification_report(y_test, y_pred, target_names=['not survived', 'survived']))

Acc :  0.8059701492537313
Roc_auc : 0.8172619047619047

Classification_report : 
              precision    recall  f1-score   support

not survived       0.83      0.86      0.85       168
    survived       0.76      0.71      0.73       100

   micro avg       0.81      0.81      0.81       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.80      0.81      0.80       268



  y = column_or_1d(y, warn=True)
