In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [5]:
df = sns.load_dataset('titanic')
df.head(3)
# survived, alive / sex, who / pclass, class 등 중복되고 불필요한 자료 존재하는지 확인 
# 선형 대수는 독립적인 다수의 변수로 이루어져 있음!

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


## 1. 데이터 전처리
---
- Feature selection

In [9]:
# DataFrame에서 우선적으로 해야할 중요한 작업 -> selection/Filtering 

df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'deck' ]]
print(df.shape)
df.head(3)

(891, 9)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,


- 결측지(NaN) 처리

In [8]:
df.isna().sum()
# 전체 데이터 891 중 'age'데이터 177개 - 20% 정도 차지하는데 어떻게 할까?

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [10]:
# 'age' -> 평균으로 대체
df.age.fillna(df.age.mean(), inplace=True)
#df['age'].fillna(df['age'].mean(), inplace=True)

In [11]:
df.age.isna().sum()
#df.age.isnull().sum()

0

In [12]:
# 'embarked' -> 최빈값으로 대체
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [13]:
df.embarked.fillna('S', inplace=True)
df.embarked.isna().sum()

0

In [14]:
# 'deck' -> 열 삭제 (결측지 너무 많아서,,,,)
df.drop(columns=['deck'], inplace=True)

In [15]:
df.isna().sum()
# 모든 결측지 정리됨 확인 

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

- 카테고리 값(필드: sex, embarked)을 숫자로 변환 

In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [19]:
df.sex = le.fit_transform(df.sex) # 카테고리가 알아서 숫자로 바뀌어 들어감 !! (중요한 작업)
df.embarked =  le.fit_transform(df.embarked)
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2


## 2. Train / Test dataset으로 분리

In [20]:
# X와 y를 넘파이 배열로
X = df.iloc[:, 1:].values # 뒤에 .values:  DataFrame 구조 -> np array 배열로 변경 
y = df.survived.values
X.shape, y.shape

((891, 7), (891,))

In [21]:
# y값의 분포
#df.survived.value_counts()
np.unique(y, return_counts=True)

(array([0, 1]), array([549, 342]))

In [22]:
# train/test 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((712, 7), (179, 7), (712,), (179,))

In [23]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([435, 277]))

## 3. `RandomForest` 모델

In [36]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2022)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [38]:
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=2022)

## 4. 모델 예측 및 평가

In [39]:
rfc.score(X_test, y_test)

0.8044692737430168

## 5. 3,4번 과정 대신에 `GridSearchCV` 수행

In [40]:
params = {'max_depth':[2,4,6,8],
          'min_samples_split':[2,4,6]}

In [44]:
from sklearn.model_selection import GridSearchCV
grid_rf =GridSearchCV(rfc, params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2022),
             param_grid={'max_depth': [2, 4, 6, 8],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [45]:
grid_rf.best_params_

{'max_depth': 4, 'min_samples_split': 4}

In [49]:
params = {'max_depth':[3,4,5],
          'min_samples_split':[3,4,5]}
grid_rf =GridSearchCV(rfc, params, scoring='accuracy', cv=5)
%time grid_rf.fit(X_train, y_train)                              

CPU times: user 8.15 s, sys: 55.4 ms, total: 8.2 s
Wall time: 8.31 s


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2022),
             param_grid={'max_depth': [3, 4, 5],
                         'min_samples_split': [3, 4, 5]},
             scoring='accuracy')

In [50]:
grid_rf.best_params_

{'max_depth': 4, 'min_samples_split': 4}

In [52]:
best_rf = grid_rf.best_estimator_
best_rf.score(X_test, y_test)

0.8156424581005587

## 6. Test 데이터에 적용 

In [53]:
X_test[25], y_test[25]

(array([ 3.        ,  0.        , 29.69911765,  2.        ,  0.        ,
        23.25      ,  1.        ]), 1)

In [54]:
best_rf.predict(X_test[25].reshape(1,-1))[0]

1

## 7. 엉터리 분류기

In [55]:
# 여성의 상존률 
df.groupby(['sex', 'pclass'])['survived'].mean()

sex  pclass
0    1         0.968085
     2         0.921053
     3         0.500000
1    1         0.368852
     2         0.157407
     3         0.135447
Name: survived, dtype: float64

In [70]:
from sklearn.base import BaseEstimator


class MyClassifier(BaseEstimator): # MyClassifier는 (부모 매서드)인 BaseEstimator 매서드를 상속 받음
  # fit(), predict() method만 재정의(Overriding)
  def fit(self, x, y): # 무조건 앞에 self 붙여줘야 해~~~!!!
    pass
  def predict(self,X):
    pred = np.zeros(X.shape[0], int) # X 데이터의 행의 갯수만큼 0로 초기화한 배열을 생성 
    # X_test.shape -> (179, 7) => 179개의 0으로 데이터 초기화
    for i in range(X.shape[0]):
      if X[i, 1] ==0: # 여성이면
        pred[i] = 1
    return pred

In [71]:
my_clf = MyClassifier()
my_clf.fit(X_train, y_train) # fit method에서는 self는 생략, 두개만 적어주면 돼
pred_my = my_clf.predict(X_test)

In [72]:
X_test[8, 1], pred_my[6]

(1.0, 0)

In [73]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_my) # 여성이면 생존, 남성이면 사망

0.7932960893854749

In [76]:
pred_rf = best_rf.predict(X_test)
sdf = pd.DataFrame({'y_test': y_test, 'RF':pred_rf , 'MY': pred_my})
sdf.head()

Unnamed: 0,y_test,RF,MY
0,0,0,0
1,0,0,0
2,1,0,1
3,0,0,0
4,0,0,0


- 모델 성능 평가시 무조건적으로 정확도를 사용하는 것은 지양해야 함 !

- 오차 행렬(Confusion matrix)
 - TN
 - FP
 - FN
 - TP

In [77]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_rf) # y 테스트값, 예측값
# 결과값 순서 
# TN(True Negative) ,FP(False Positive)
# FN(False Negative), TP(True Positive)

array([[109,   5],
       [ 28,  37]])

In [78]:
from sklearn.metrics import precision_score, recall_score

In [81]:
# 정밀도 (Precision score = TP / (FP + TP))
precision_score(y_test, pred_rf), precision_score(y_test, pred_my)

(0.8809523809523809, 0.7333333333333333)

In [82]:
# 재현율 (Recall = TP / (FN + TP))
recall_score(y_test, pred_rf), recall_score(y_test, pred_my)

(0.5692307692307692, 0.676923076923077)

- F1 Score 
 - 정밀도와 재현율의 조화 평균
 - 정밀도와 재현율이 어느 한쪽으로 치우치지 않는 경우 -> 높은 수치 나타냄

In [86]:
# F1 Score (정밀도와 재현율의 조화 평균)
from sklearn.metrics import f1_score
f1_score(y_test, pred_rf)

0.6915887850467289

- `ROC` (Reciver Operation Characteristic) 곡선 
- `AUC` (Area Under Curve)

In [87]:
# AUC Score
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred_rf), roc_auc_score(y_test, pred_my)

(0.762685560053981, 0.7682860998650473)