In [3]:
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
titanic = sns.load_dataset('titanic')
df = pd.DataFrame(titanic)
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


* 데이터 전처리

In [5]:
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'deck']]
print(df.shape)
df.head(3)

(891, 9)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,


* 결측치 처리

In [6]:
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [7]:
# age - 평균값으로 대체
df.age.fillna(df.age.mean(), inplace=True)
df.age.isnull().sum()

0

In [8]:
# embaeked-최빈값
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [9]:
df.embarked.fillna('S', inplace=True)
df.embarked.isna().sum()

0

* 카테고리 값(sex, embarked)을 숫자로 변환

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [11]:
df.sex = le.fit_transform(df.sex)
df.embarked = le.fit_transform(df.embarked)
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,1,22.0,1,0,7.25,2,
1,1,1,0,38.0,1,0,71.2833,0,C


In [12]:
df.drop(columns=['deck'], inplace=True)
df.isna().sum().sum()

0

* Train/Test dataset 분리

In [13]:
# x와 y를 넘파이 배열로
X = df.iloc[:,1:].values
y = df.survived.values
X.shape, y.shape

((891, 7), (891,))

In [14]:
# np.unique(y, return_counts=True)
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2022
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 7), (179, 7), (712,), (179,))

In [16]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([439, 273]))

* RandoemForest 모델로 학습

In [17]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2022)

In [18]:
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=2022)

In [19]:
rfc.score(X_test, y_test)

0.8324022346368715

* GridSearchCV

In [20]:
params = {
    'max_depth' : [2, 4, 6, 8],
    'min_samples_split' : [2, 4, 6]
}

In [21]:
from sklearn.model_selection import GridSearchCV
grid_rf = GridSearchCV(rfc, params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2022),
             param_grid={'max_depth': [2, 4, 6, 8],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [22]:
grid_rf.best_params_

{'max_depth': 4, 'min_samples_split': 4}

In [23]:
params = {
    'max_depth' : [3, 4, 5],
    'min_samples_split' : [3, 4, 5]
}
from sklearn.model_selection import GridSearchCV
grid_rf = GridSearchCV(rfc, params, scoring='accuracy', cv=5)
%time grid_rf.fit(X_train, y_train)

CPU times: user 7.55 s, sys: 60.9 ms, total: 7.61 s
Wall time: 7.61 s


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2022),
             param_grid={'max_depth': [3, 4, 5],
                         'min_samples_split': [3, 4, 5]},
             scoring='accuracy')

In [24]:
grid_rf.best_params_

{'max_depth': 4, 'min_samples_split': 3}

In [25]:
best_rf = grid_rf.best_estimator_
best_rf.score(X_test, y_test)

0.8212290502793296

In [26]:
X_test[25],y_test[25]

(array([ 3.  ,  1.  , 45.  ,  0.  ,  0.  ,  8.05,  2.  ]), 1)

* 엉터리 분류기를 만들어보자

In [27]:
#여성의 생존률
df.groupby('sex')['survived'].mean()

sex
0    0.742038
1    0.188908
Name: survived, dtype: float64

In [28]:
df.groupby(['sex','pclass'])['survived'].mean()

sex  pclass
0    1         0.968085
     2         0.921053
     3         0.500000
1    1         0.368852
     2         0.157407
     3         0.135447
Name: survived, dtype: float64

In [29]:
from sklearn.base import BaseEstimator
class MyClassifier(BaseEstimator):
  #fit(),predict() method 만 재정의(Overriding)
  def fit(self,X,y):
    pass
  def predict(self,X):
    pred=np.zeros(X.shape[0],int)
    for i in range(X.shape[0]):
      if X[i,1] == 0: #여성이면 무조건 생존으로 처리
        pred[i] = 1 
    return pred

In [30]:
my_clf = MyClassifier()
my_clf.fit(X_train,y_train)
pred_my = my_clf.predict(X_test)

In [31]:
X_test[8,1],pred_my[6]

(0.0, 0)

In [32]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred_my)

0.7877094972067039

In [33]:
pred_rf = best_rf.predict(X_test)
sdf = pd.DataFrame({'y_test':y_test,'RF':pred_rf,'MY':pred_my})
sdf.head()

Unnamed: 0,y_test,RF,MY
0,1,0,0
1,0,0,0
2,1,0,1
3,0,0,0
4,0,0,0


* 모델의 성능을 평가할 때 무조건 정확도를 사용하는 것을 지양해야한다

In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred_rf)

array([[103,   7],
       [ 25,  44]])

In [35]:
confusion_matrix(y_test,pred_my)

array([[96, 14],
       [24, 45]])

In [36]:
from sklearn.metrics import precision_score,recall_score

In [37]:
#정밀도(precision_score TP/(FP+TP))
precision_score(y_test,pred_rf) , precision_score(y_test,pred_my)

(0.8627450980392157, 0.7627118644067796)

In [38]:
#재현율(recakk_score TP/(FN+TP))
recall_score(y_test,pred_rf) , recall_score(y_test,pred_my)

(0.6376811594202898, 0.6521739130434783)

In [39]:
#F1 Score(정밀도와 재현율의 조화 평균)
from sklearn.metrics import f1_score
f1_score(y_test,pred_rf), f1_score(y_test,pred_my)

(0.7333333333333333, 0.703125)

In [40]:
#AUC score
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,pred_rf), roc_auc_score(y_test,pred_my)

(0.787022397891963, 0.7624505928853755)