## 제2유형_연습하기_타이타닉 생존자 분류

## 데이터 분석 순서
1. 라이브러리 및 데이터 확인
2. 데이터 탐색(EDA)
3. 데이터 전처리 및 분리
4. 모델링 및 성능평가
5. 예측값 제출

## 1. 라이브러리 및 데이터 확인

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the seaborn Titanic dataset and separate the features from the label.
df = sns.load_dataset('titanic')
x = df.drop('survived', axis=1)
y = df['survived']

# Hold out 20% of the rows as the test set, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=2023)

# Re-wrap every part as a DataFrame (the y parts come back as Series).
x_test = pd.DataFrame(x_test)
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

# Row labels from the source dataset are kept as-is. DataFrame.reset_index()
# returns a new frame instead of modifying in place, so a bare call whose
# result is not assigned would have no effect.
y_train.columns = ['target']
y_test.columns = ['target']

## 타이타닉 생존자 예측 문제
## - 데이터의 결측치, 중복 변수값에 대해 처리하고
## - 분류모델을 사용하여 Accuracy, F1 score, AUC 값을 산출하시오.

### 데이터 설명
- survived	:	0	=	No,	1	=	Yes
- pclass	:	객실	등급(1,2,3)
- sex	:	성별
- age	:	나이
- sibsp	:	타이타닉호에	탑승한	형제/배우자의	수
- parch	:	타이타닉호에	탑승한	부모/자녀의	수
- fare	:	요금
- embarked	:	탑승지	이름(C,	Q,	S)	Cherbourg	/	Queenstown	/	Southampton
- (중복)	class	:	객실	등급(First,	Second,	Third)
- who	:	man,	woman,	child
- adult_male	:	성인남자인지	여부(True=성인남자,	False	그외)
- deck	:	선실번호	첫	알파벳(A,B,C,D,E,F,G)
- (중복)	embark_town	:	탑승지	이름(Cherbourg,	Queenstown,	Southampton)
- (중복)	alive	:	생존여부(no:사망,	yes:생존)
- alone	:	혼자	탑승했는지	여부(True=혼자,	False=가족과	함께)

## 2. 데이터 탐색(EDA)

In [2]:
# Row/column counts of each split
for frame in (x_train, x_test, y_train):
    print(frame.shape)

(712, 14)
(179, 14)
(712, 1)


In [3]:
# Peek at the first three rows of each split
for frame in (x_train, x_test, y_train):
    print(frame.head(3))

     pclass     sex   age  sibsp  parch   fare embarked   class    who  \
3         1  female  35.0      1      0  53.10        S   First  woman   
517       3    male   NaN      0      0  24.15        Q   Third    man   
861       2    male  21.0      1      0  11.50        S  Second    man   

     adult_male deck  embark_town alive  alone  
3         False    C  Southampton   yes  False  
517        True  NaN   Queenstown    no   True  
861        True  NaN  Southampton    no  False  
     pclass     sex   age  sibsp  parch   fare embarked   class    who  \
800       2    male  34.0      0      0   13.0        S  Second    man   
341       1  female  24.0      3      2  263.0        S   First  woman   
413       2    male   NaN      0      0    0.0        S  Second    man   

     adult_male deck  embark_town alive  alone  
800        True  NaN  Southampton    no   True  
341       False    C  Southampton   yes  False  
413        True  NaN  Southampton    no   True  
     target
3 

In [4]:
# Check that each column's dtype matches its meaning and look for missing values.
# info() writes its report itself and returns None, which print() then echoes.
for frame in (x_train, x_test, y_train):
    print(frame.info())

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 3 to 608
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   pclass       712 non-null    int64   
 1   sex          712 non-null    object  
 2   age          579 non-null    float64 
 3   sibsp        712 non-null    int64   
 4   parch        712 non-null    int64   
 5   fare         712 non-null    float64 
 6   embarked     710 non-null    object  
 7   class        712 non-null    category
 8   who          712 non-null    object  
 9   adult_male   712 non-null    bool    
 10  deck         164 non-null    category
 11  embark_town  710 non-null    object  
 12  alive        712 non-null    object  
 13  alone        712 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(3), object(5)
memory usage: 64.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 800 to 410
Data columns (total 14 columns):
 #   Column       No

In [5]:
# Compare the summary statistics of x_train and x_test (and the label)
for frame in (x_train, x_test, y_train):
    print(frame.describe().transpose())

        count       mean        std   min      25%      50%     75%       max
pclass  712.0   2.307584   0.834926  1.00   2.0000   3.0000   3.000    3.0000
age     579.0  29.479568  14.355304  0.42  20.0000  28.0000  38.000   74.0000
sibsp   712.0   0.518258   1.094522  0.00   0.0000   0.0000   1.000    8.0000
parch   712.0   0.372191   0.792341  0.00   0.0000   0.0000   0.000    6.0000
fare    712.0  31.741836  45.403910  0.00   7.8958  14.4542  31.275  512.3292
        count       mean        std  min     25%   50%      75%       max
pclass  179.0   2.312849   0.842950  1.0   2.000   3.0   3.0000    3.0000
age     135.0  30.640741  15.258427  1.0  22.000  29.0  39.0000   80.0000
sibsp   179.0   0.541899   1.137797  0.0   0.000   0.0   1.0000    8.0000
parch   179.0   0.418994   0.859760  0.0   0.000   0.0   0.0000    5.0000
fare    179.0  34.043364  64.097184  0.0   7.925  14.5  30.2854  512.3292
        count      mean       std  min  25%  50%  75%  max
target  712.0  0.383427  0.48

In [6]:
# Summaries for the object columns first, then the category columns
for dtype_name in ('object', 'category'):
    for frame in (x_train, x_test):
        print(frame.describe(include=dtype_name).T)

            count unique          top freq
sex           712      2         male  469
embarked      710      3            S  518
who           712      3          man  432
embark_town   710      3  Southampton  518
alive         712      2           no  439
            count unique          top freq
sex           179      2         male  108
embarked      179      3            S  126
who           179      3          man  105
embark_town   179      3  Southampton  126
alive         179      2           no  110
      count unique    top freq
class   712      3  Third  391
deck    164      7      C   47
      count unique    top freq
class   179      3  Third  100
deck     39      7      C   12


In [7]:
# Look at the label data as well (first five rows)
print(y_train.head(n=5))

     target
3         1
517       0
861       0
487       0
58        1


In [8]:
# Count how many rows fall into each label value
print(y_train.value_counts())

target
0         439
1         273
Name: count, dtype: int64


## 3. 데이터 전처리 및 분리

### 1)결측치, 2)이상치, 3)변수 처리하기

In [9]:
# Count missing values per column in each split
for frame in (x_train, x_test, y_train):
    print(frame.isna().sum())

pclass           0
sex              0
age            133
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           548
embark_town      2
alive            0
alone            0
dtype: int64
pclass           0
sex              0
age             44
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           140
embark_town      0
alive            0
alone            0
dtype: int64
target    0
dtype: int64


In [10]:
# Dropping rows with missing values (alternative approach, left disabled)
#df = df.dropna()
#df.dropna().shape

In [11]:
# Missing-value plan
# x_train(712, 14) = age(133) embarked(2) deck(548) embark_town(2) 
# x_test(179, 14) = age(44) deck(140)
# Columns to drop (redundant)
# class
# embark_town
# alive
# deck (mostly missing)

In [12]:
# Drop the redundant columns and the mostly-empty deck column
x_train = x_train.drop(columns=['class', 'embark_town', 'alive', 'deck'])
x_test = x_test.drop(columns=['class', 'embark_town', 'alive', 'deck'])

In [13]:
# Confirm the columns were removed
for frame in (x_train, x_test):
    print(frame.info())

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 3 to 608
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      712 non-null    int64  
 1   sex         712 non-null    object 
 2   age         579 non-null    float64
 3   sibsp       712 non-null    int64  
 4   parch       712 non-null    int64  
 5   fare        712 non-null    float64
 6   embarked    710 non-null    object 
 7   who         712 non-null    object 
 8   adult_male  712 non-null    bool   
 9   alone       712 non-null    bool   
dtypes: bool(2), float64(2), int64(3), object(3)
memory usage: 51.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 800 to 410
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      179 non-null    int64  
 1   sex         179 non-null    object 
 2   age         135 non-null    float64
 3   sibsp       179 non

In [14]:
# Missing-value imputation
# x_train: age (133), embarked (2)
# x_test : age (44)

# age: fill with the median computed on the training data only
med_age = x_train['age'].median()
x_train['age'] = x_train['age'].fillna(med_age)
x_test['age'] = x_test['age'].fillna(med_age)  # reuse the train median for the test set

# embarked: fill with the most frequent training value
mode_et = x_train['embarked'].mode()
x_train['embarked'] = x_train['embarked'].fillna(mode_et[0])  # mode() returns a Series, hence [0]
# Apply the same fill to the test set so both splits are treated alike;
# this changes nothing when x_test has no missing embarked values.
x_test['embarked'] = x_test['embarked'].fillna(mode_et[0])

In [15]:
# Verify that no missing values remain
for frame in (x_train, x_test):
    print(frame.isna().sum())

pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
who           0
adult_male    0
alone         0
dtype: int64
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
who           0
adult_male    0
alone         0
dtype: int64


In [16]:
# One-hot encode the remaining text columns
x_train = pd.get_dummies(x_train, dtype=np.uint8)
x_test = pd.get_dummies(x_test, dtype=np.uint8)
# The two frames are encoded independently, so make the test columns match the
# training columns exactly: a dummy column missing from x_test is added as all
# zeros, and one unseen in training is dropped. When the columns already agree
# this leaves x_test unchanged.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)
print(x_train.info())
print(x_test.info())

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 3 to 608
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      712 non-null    int64  
 1   age         712 non-null    float64
 2   sibsp       712 non-null    int64  
 3   parch       712 non-null    int64  
 4   fare        712 non-null    float64
 5   adult_male  712 non-null    bool   
 6   alone       712 non-null    bool   
 7   sex_female  712 non-null    uint8  
 8   sex_male    712 non-null    uint8  
 9   embarked_C  712 non-null    uint8  
 10  embarked_Q  712 non-null    uint8  
 11  embarked_S  712 non-null    uint8  
 12  who_child   712 non-null    uint8  
 13  who_man     712 non-null    uint8  
 14  who_woman   712 non-null    uint8  
dtypes: bool(2), float64(2), int64(3), uint8(8)
memory usage: 40.3 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 800 to 410
Data columns (total 15 columns):
 #   Column      Non-Null 

In [17]:
# Keep copies of the full (not yet train/val-split) data for the advanced section
x_train_ad, x_test_ad, y_train_ad = x_train.copy(), x_test.copy(), y_train.copy()

In [18]:
# (Reference) When one-hot encoding yields a different number of columns
# => e.g. x_test ends up with more columns than x_train (or the other way round)
# Case: the feature counts differ after one-hot encoding
# x_train = pd.get_dummies(x_train)
# x_test = pd.get_dummies(x_test)
# print(x_train.info())
# print(x_test.info())

# Fix (code for the case where x_test has more columns)
# x_train = x_train.reindex(columns=x_test.columns, fill_value=0)
# x_train.info()

## 데이터 분리

In [19]:
from sklearn.model_selection import train_test_split

# Carve a validation set (20%) out of the training data, stratified on the label.
# After this cell y_train and y_val are 1-D Series, not DataFrames.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train['target'],
    test_size=0.2,
    stratify=y_train['target'],
    random_state=2023,
)
for part in (x_train, x_val, y_train, y_val):
    print(part.shape)

(569, 15)
(143, 15)
(569,)
(143,)


## 4. 모델링 및 성능평가

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier (the regression counterpart is RandomForestRegressor)
model = RandomForestClassifier(random_state=2023)
model.fit(X=x_train, y=y_train)

In [21]:
# Predict on the validation split with the fitted model
y_pred = model.predict(x_val)

In [22]:
# Validation metrics: accuracy, ROC AUC, recall, precision and F1
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    recall_score,
    precision_score,
    f1_score,
)

acc = accuracy_score(y_val, y_pred)
# NOTE(review): roc_auc_score is fed hard 0/1 predictions here; passing
# model.predict_proba(x_val)[:, 1] would score the ranking instead - confirm
# which one the task expects.
auc = roc_auc_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

In [23]:
# Print the validation scores in order: accuracy, AUC, recall, precision, F1
for score in (acc, auc, recall, precision, f1):
    print(score)

0.8531468531468531
0.8465909090909092
0.8181818181818182
0.8035714285714286
0.8108108108108109


In [24]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the validation split
cm = confusion_matrix(y_true=y_val, y_pred=y_pred)
print(cm)

[[77 11]
 [10 45]]


## 실제 test 셋으로 성능평가를 한다면?

In [25]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Score the baseline model on the held-out test set
y_pred_f = model.predict(x_test)
acc_f = accuracy_score(y_test, y_pred_f)
f1_f = f1_score(y_test, y_pred_f)
# NOTE(review): AUC is computed from hard 0/1 predictions, not probabilities - confirm intent.
auc_f = roc_auc_score(y_test, y_pred_f)

In [26]:
# Print the test scores in order: accuracy, F1, AUC
for score in (acc_f, f1_f, auc_f):
    print(score)

0.7821229050279329
0.7153284671532847
0.7687088274044797


## Advanced 버전

### (주의) 전체 코드 실행시간이 1분으로 제한되어 있기 때문에, 가능하면 30초 미만으로 할 것!

In [27]:
# Hyper-parameter tuning with GridSearchCV
# - grid search: try every combination in the parameter grid
# - CV: cross-validation
# No separate train/validation split is needed here; cross-validation does the splitting.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# n_estimators     : number of trees in the forest
# max_depth        : maximum depth of each tree
# min_samples_leaf : minimum number of samples that must end up in a leaf node
rf_params = {
    'n_estimators': [30, 70, 100],
    'max_depth': [6, 8, 10],
    'min_samples_leaf': [1, 2, 3],
}

# Build the classifier, then run a 10-fold grid search over rf_params
rf = RandomForestClassifier(random_state=2023)
grid_rf = GridSearchCV(rf, param_grid=rf_params, cv=10)
# The label is passed as a 1-D column, not as a DataFrame
grid_rf.fit(x_train_ad, y_train_ad['target'])

print('최적 하이퍼파라미터: ',grid_rf.best_params_)
print('Best 예측정확도: ',grid_rf.best_score_)

최적 하이퍼파라미터:  {'max_depth': 10, 'min_samples_leaf': 3, 'n_estimators': 70}
Best 예측정확도:  0.8427034428794992


In [29]:
from sklearn.ensemble import RandomForestClassifier

# Build a random forest with the best hyper-parameters reported by the grid search
model_h = RandomForestClassifier(
    random_state=2023,
    n_estimators=70,
    max_depth=10,
    min_samples_leaf=3,
)
# The label is passed as a 1-D column
model_h.fit(X=x_train_ad, y=y_train_ad['target'])

In [30]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Evaluate the tuned model on the test set
y_pred_h = model_h.predict(x_test)

acc_h = accuracy_score(y_test, y_pred_h)
f1_h = f1_score(y_test, y_pred_h)
# NOTE(review): AUC is computed from hard 0/1 predictions, not probabilities - confirm intent.
auc_h = roc_auc_score(y_test, y_pred_h)

print('HP 최적화 Acc : ', acc_h)
print('HP 최적화 f1 : ',f1_h)
print('HP 최적화 AUC : ',auc_h)

HP 최적화 Acc :  0.7988826815642458
HP 최적화 f1 :  0.7272727272727273
HP 최적화 AUC :  0.7796442687747035


In [34]:
## Grid-search practice run with a slightly different parameter grid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rmc = RandomForestClassifier(random_state=2023)

grid_rmc_params = {
    'n_estimators': [30, 70, 100],
    'max_depth': [5, 8, 10],
    'min_samples_leaf': [1, 2, 5],
}

# 10-fold cross-validated search over every parameter combination
grid_rmc = GridSearchCV(rmc, param_grid=grid_rmc_params, cv=10)
grid_rmc.fit(x_train_ad, y_train_ad['target'])

print('HP test 최적:', grid_rmc.best_params_)
print('HP test 정확:', grid_rmc.best_score_)

HP test 최적: {'max_depth': 8, 'min_samples_leaf': 5, 'n_estimators': 30}
HP test 정확: 0.8399256651017215


In [35]:
from sklearn.ensemble import RandomForestClassifier

# Train with the best hyper-parameters from the practice grid search
model_t = RandomForestClassifier(
    random_state=2023,
    n_estimators=30,
    max_depth=8,
    min_samples_leaf=5,
)
model_t.fit(X=x_train_ad, y=y_train_ad['target'])

# Predict the test set; the bare name on the last line lets the notebook display the array
y_pred_t = model_t.predict(x_test)

y_pred_t

array([0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0], dtype=int64)

In [36]:
# Test-set scores of the practice model
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

acc_t = accuracy_score(y_test, y_pred_t)
f1_t = f1_score(y_test, y_pred_t)
# NOTE(review): AUC is computed from hard 0/1 predictions, not probabilities - confirm intent.
auc_t = roc_auc_score(y_test, y_pred_t)

for score in (acc_t, f1_t, auc_t):
    print(score)

0.7988826815642458
0.7272727272727273
0.7796442687747035


## 5. 예측값 제출

### (주의) x_test셋을 모델에 넣어 나온 예측값을 제출해야 함

In [37]:
# When the task asks for the predicted class
y_result = model_t.predict(x_test)
print(y_result[:5])

# When the task asks for class probabilities (one column per class)
y_result_prob = model_t.predict_proba(x_test)
print(y_result_prob[:5])

# For understanding: put the predicted class next to the first probability column
result_prob = pd.DataFrame(
    {
        'result': y_result,
        'prob0': y_result_prob[:, 0],
    }
)
print(result_prob.iloc[:5])

[0 1 0 0 0]
[[0.88628377 0.11371623]
 [0.32582125 0.67417875]
 [0.92343871 0.07656129]
 [0.92478507 0.07521493]
 [0.91572401 0.08427599]]
   result     prob0
0       0  0.886284
1       1  0.325821
2       0  0.923439
3       0  0.924785
4       0  0.915724


In [38]:
# Write the predicted classes to the submission file, without the index column
submission = pd.DataFrame({'result': y_result})
submission.to_csv('0012.csv', index=False)

In [40]:
# Read the submission file back and show its first five rows
df2 = pd.read_csv('0012.csv')
print(df2.head(5))

   result
0       0
1       1
2       0
3       0
4       0


In [41]:
# (Reference) Use help() to look up how a class or function is used
from sklearn.model_selection import GridSearchCV
help(GridSearchCV)

Help on class GridSearchCV in module sklearn.model_selection._search:

class GridSearchCV(BaseSearchCV)
 |  GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
 |
 |  Exhaustive search over specified parameter values for an estimator.
 |
 |  Important members are fit, predict.
 |
 |  GridSearchCV implements a "fit" and a "score" method.
 |  It also implements "score_samples", "predict", "predict_proba",
 |  "decision_function", "transform" and "inverse_transform" if they are
 |  implemented in the estimator used.
 |
 |  The parameters of the estimator used to apply these methods are optimized
 |  by cross-validated grid-search over a parameter grid.
 |
 |  Read more in the :ref:`User Guide <grid_search>`.
 |
 |  Parameters
 |  ----------
 |  estimator : estimator object
 |      This is assumed to implement the scikit-learn estimator interface.
 |      Either estimator needs

In [42]:
# Show the built-in documentation for RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
help(RandomForestClassifier)

Help on class RandomForestClassifier in module sklearn.ensemble._forest:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None, monotonic_cst=None)
 |
 |  A random forest classifier.
 |
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  Trees in the forest use the best split strategy, i.e. equivalent to passing
 |  `splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeRegressor`.
 |  The sub-sample size is controlled with the `max_samples` parameter if