In [1]:
from sklearn.base import BaseEstimator
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
# Null-handling helper
def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    Age gaps receive the mean of the Age column, Cabin and Embarked gaps
    receive the placeholder 'N', and Fare gaps receive 0. The columns are
    reassigned on ``df`` itself and that same frame is returned.
    """
    # The Age mean is taken before any column is touched.
    replacements = {
        "Age": df["Age"].mean(),
        "Cabin": 'N',
        "Embarked": 'N',
        "Fare": 0,
    }
    for column, value in replacements.items():
        df[column] = df[column].fillna(value)
    return df
    

In [3]:
# Drop features the learning algorithms do not need
def drop_features(df):
    """Remove the PassengerId, Name and Ticket columns from ``df`` in place.

    The same (now narrower) frame is returned.
    """
    unused_columns = ["PassengerId", "Name", "Ticket"]
    df.drop(columns=unused_columns, inplace=True)
    return df

In [4]:
# Label-encode the categorical features
def format_features(df):
    """Shorten Cabin to its first character and label-encode three columns.

    Cabin, Sex and Embarked are each replaced by the integer codes produced
    by a fresh ``LabelEncoder`` fitted on that column. ``df`` is modified
    in place and returned.
    """
    df["Cabin"] = df["Cabin"].str[:1]
    for column in ("Cabin", "Sex", "Embarked"):
        # A new encoder per column, fitted and applied in one step.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

In [5]:
def transform_features(df):
    """Run the full preprocessing pipeline on ``df`` and return the result.

    Applies, in order: ``fillna`` -> ``drop_features`` -> ``format_features``.
    """
    return format_features(drop_features(fillna(df)))

In [6]:
class MyDummyClassifer(BaseEstimator):
    """Rule-based baseline that learns nothing and predicts from ``Sex`` only.

    Rows whose ``Sex`` value equals 1 are predicted as 0; every other row is
    predicted as 1. (The original author's comment labels ``Sex == 1`` as
    "male".)
    """

    def fit(self, X, y=None):
        """Do nothing; return ``self`` so calls can be chained.

        Returning the estimator follows the scikit-learn ``fit`` convention.
        """
        return self

    def predict(self, X):
        """Return an ``(n_samples, 1)`` float array of 0/1 predictions.

        ``X`` must support ``X['Sex']`` returning a pandas Series.
        """
        # Vectorized form of the per-row rule: Sex == 1 -> 0, otherwise -> 1.
        not_one = X['Sex'].to_numpy() != 1
        return not_one.astype(float).reshape(-1, 1)

In [7]:
# Display a 10x1 array of zeros: the same column-vector layout that
# MyDummyClassifer.predict allocates for its predictions.
np.zeros((10,1))

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
# Load the Titanic training CSV and separate the "Survived" column (target)
# from the remaining columns (features).
titanic_df = pd.read_csv("../Ch01/titanic_train.csv")
y_titanic_df = titanic_df["Survived"]
X_titanic_df = titanic_df.drop("Survived", axis=1)
# Fill nulls, drop unused columns and label-encode categoricals.
X_titanic_df = transform_features(X_titanic_df)
# Hold out 20% of the rows as the test set, with a fixed random_state of 0.
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df,test_size=0.2, random_state=0)


In [10]:
# Evaluate the rule-based baseline; its fit() performs no training.
myclf = MyDummyClassifer()
myclf.fit(X_train, y_train)

# predict() returns an (n_samples, 1) array, which is passed to accuracy_score as-is.
mypredictions = myclf.predict(X_test)
# NOTE(review): "Dimmy" in the message below looks like a typo for "Dummy" -- confirm before changing.
print(f"Dimmy Classifier의 정확도는:{accuracy_score(y_test, mypredictions)}")

Dimmy Classifier의 정확도는:0.7877094972067039


In [11]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [12]:
class MyFakeClassifier(BaseEstimator):
    """Baseline that ignores its input and predicts ``False`` for every sample."""

    def fit(self, X, y):
        """Do nothing; return ``self`` so calls can be chained.

        Returning the estimator follows the scikit-learn ``fit`` convention.
        """
        return self

    def predict(self, X):
        """Return an all-``False`` boolean array of shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=bool)

In [13]:
# Load scikit-learn's bundled digits dataset; later cells read digits.data and digits.target.
digits = load_digits()

In [14]:
# Turn the digit labels into a binary target: 1 where the digit is 7, else 0.
y = (digits.target == 7).astype(int)
# No test_size is passed here; random_state is fixed at 11.
# NOTE: this rebinds X_train/X_test/y_train/y_test from the earlier Titanic split.
X_train,X_test, y_train, y_test = train_test_split(digits.data, y, random_state=11)

In [15]:
# Show the test-label array shape and how many 0s and 1s it contains.
print(f"레이블 테스트 세트 크기:{y_test.shape}")
print("테스트 세트 레이블 0과 1의 분포도")
print(pd.Series(y_test).value_counts()) # convert to a pandas Series to use value_counts()

레이블 테스트 세트 크기:(450,)
테스트 세트 레이블 0과 1의 분포도
0    405
1     45
Name: count, dtype: int64


In [16]:
# Score the always-negative baseline on the binary "is it a 7?" target.
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train, y_train)
# predict() yields an all-False boolean array of shape (len(X_test), 1).
fakepred = fakeclf.predict(X_test)
print(f"모든 예측을 0으로 하여도 정확도는 : {accuracy_score(y_test, fakepred):.3f}")

모든 예측을 0으로 하여도 정확도는 : 0.900


In [17]:
from sklearn.metrics import accuracy_score, \
    precision_score, recall_score, confusion_matrix

In [18]:
def get_clf_eval(y_test, pred):
    """Print the confusion matrix, accuracy, precision and recall for ``pred``.

    Every metric is computed first and only then written to stdout.
    Nothing is returned.
    """
    matrix = confusion_matrix(y_test, pred)
    acc, prec, rec = [
        scorer(y_test, pred)
        for scorer in (accuracy_score, precision_score, recall_score)
    ]
    print("오차 행렬")
    print(matrix)
    print(f"정확도: {acc:.4f}, 정밀도: {prec:.4f}, 재현율: {rec:.4f}")

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [20]:
# Reload the Titanic CSV from disk and redo the preprocessing, so this
# section does not depend on frames mutated by the earlier cells.
titanic_df = pd.read_csv("../Ch01/titanic_train.csv")
y_titanic_df = titanic_df["Survived"]
X_titanic_df = titanic_df.drop("Survived", axis=1)
X_titanic_df = transform_features(X_titanic_df)

# Same 80/20 split as the earlier Titanic cell, but with random_state=11.
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

In [21]:
# Fit a logistic regression (liblinear solver) on the Titanic training split
# and report its metrics on the test split.
lr_clf = LogisticRegression(solver="liblinear")
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test, pred)

오차 행렬
[[108  10]
 [ 14  47]]
정확도: 0.8659, 정밀도: 0.8246, 재현율: 0.7705


In [22]:
# Compare per-class probabilities with the hard predictions.
# pred_proba has one column per class (the recorded output shows shape (179, 2)).
pred_proba = lr_clf.predict_proba(X_test)
pred = lr_clf.predict(X_test)
print(f"pred_proba()결과 shape: {pred_proba.shape}")
print(f"pred_proba array에서 앞 3개만 샘플로 추출 \n: {pred_proba[:3]}")

# Append the predicted label as a third column so each row reads
# [P(class 0), P(class 1), predicted class].
pred_proba_result = np.concatenate([pred_proba, pred.reshape(-1,1)],axis=1)
print(f"두 개의 class 중에서 더 큰 확률을 클래스 값으로 예측\n {pred_proba_result[:3]}")

pred_proba()결과 shape: (179, 2)
pred_proba array에서 앞 3개만 샘플로 추출 
: [[0.44935226 0.55064774]
 [0.86335512 0.13664488]
 [0.86429644 0.13570356]]
두 개의 class 중에서 더 큰 확률을 클래스 값으로 예측
 [[0.44935226 0.55064774 1.        ]
 [0.86335512 0.13664488 0.        ]
 [0.86429644 0.13570356 0.        ]]


In [23]:
from sklearn.preprocessing import Binarizer
# Small Binarizer demo on a hand-written 3x3 matrix.
# NOTE: X here is a plain nested list, unrelated to the feature frames above.
X = [[1, -1, 2],
     [2, 0, 0],
     [0, 1.1, 1.2]]

# In the recorded output only values above 1.1 become 1; the entry equal to 1.1 stays 0.
binarizer=Binarizer(threshold=1.1)
print(binarizer.fit_transform(X)) 

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [24]:
# Reproduce the classifier's decisions by thresholding the class-1 probability.
custom_threshold = 0.5

# Take the second column (class 1) and reshape it to a single-column 2-D array for Binarizer.
pred_proba_1 = pred_proba[:,1].reshape(-1, 1)

binarizer=Binarizer(threshold=custom_threshold).fit(pred_proba_1)
custom_predict = binarizer.transform(pred_proba_1)

# The recorded metrics match those printed for lr_clf.predict() earlier.
get_clf_eval(y_test, custom_predict)

오차 행렬
[[108  10]
 [ 14  47]]
정확도: 0.8659, 정밀도: 0.8246, 재현율: 0.7705


In [25]:
# Candidate decision thresholds to compare, from 0.4 to 0.6.
thresholds = [0.4, 0.45, 0.50, 0.55, 0.6]

def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    """Evaluate predictions obtained by binarizing ``pred_proba_c1`` at each threshold.

    For every value in ``thresholds`` the probabilities are binarized with a
    ``Binarizer``, the threshold is printed, and ``get_clf_eval`` reports the
    metrics of the resulting labels against ``y_test``.
    """
    for cutoff in thresholds:
        labels_at_cutoff = Binarizer(threshold=cutoff).fit_transform(pred_proba_c1)
        print("\n임계값 :", cutoff)
        get_clf_eval(y_test, labels_at_cutoff)

# Report metrics at every candidate threshold, using the class-1 probability column.
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds)


임계값 : 0.4
오차 행렬
[[97 21]
 [11 50]]
정확도: 0.8212, 정밀도: 0.7042, 재현율: 0.8197

임계값 : 0.45
오차 행렬
[[105  13]
 [ 13  48]]
정확도: 0.8547, 정밀도: 0.7869, 재현율: 0.7869

임계값 : 0.5
오차 행렬
[[108  10]
 [ 14  47]]
정확도: 0.8659, 정밀도: 0.8246, 재현율: 0.7705

임계값 : 0.55
오차 행렬
[[111   7]
 [ 16  45]]
정확도: 0.8715, 정밀도: 0.8654, 재현율: 0.7377

임계값 : 0.6
오차 행렬
[[113   5]
 [ 17  44]]
정확도: 0.8771, 정밀도: 0.8980, 재현율: 0.7213


In [26]:
from sklearn.metrics import precision_recall_curve
# Extract the predicted probability for label 1.

pred_proba_class1= lr_clf.predict_proba(X_test)[:,1]

# NOTE: this rebinds the global `thresholds` (previously a Python list of cut-offs)
# to the threshold array returned by precision_recall_curve.
precision, recalls, thresholds = precision_recall_curve(y_test, pred_proba_class1)
print("반환된 분류 결정 임계값 배열의 Shape : ", thresholds.shape)

# Sample every 17th threshold. With the 165 thresholds shown in the recorded
# output this yields 10 indices; the "10" in the messages is hard-coded text.
thr_index = np.arange(0, thresholds.shape[0],17)
print("샘플 추출을 위한 임계값 배열의 index 10개: ",thr_index)
print("샘플용 10개의 임계값: ",thresholds[thr_index])
print("샘플 임계값별 정밀도 : ", precision[thr_index])
# NOTE(review): the label below reads "sample recall precision"; it prints recall
# values, so the wording looks like a slip -- confirm before changing.
print("샘플 재현율 정밀도 :", recalls[thr_index])

반환된 분류 결정 임계값 배열의 Shape :  (165,)
샘플 추출을 위한 임계값 배열의 index 10개:  [  0  17  34  51  68  85 102 119 136 153]
샘플용 10개의 임계값:  [0.01974988 0.11399916 0.13196917 0.15029262 0.21146796 0.28879776
 0.43966455 0.61651658 0.74146854 0.89611669]
샘플 임계값별 정밀도 :  [0.34078212 0.37654321 0.43065693 0.46610169 0.54455446 0.65853659
 0.75384615 0.89583333 0.96666667 1.        ]
샘플 재현율 정밀도 : [1.         1.         0.96721311 0.90163934 0.90163934 0.8852459
 0.80327869 0.70491803 0.47540984 0.21311475]


In [27]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


def precision_recall_curve_plot(y_test, pred_proba_c1):
    """Plot precision and recall against the decision threshold.

    ``precision_recall_curve`` supplies the three arrays; the precision and
    recall arrays are cut to the number of thresholds before plotting so the
    x and y lengths agree. The figure is shown with ``plt.show()``.
    """
    precisions, recalls, cutoffs = precision_recall_curve(y_test, pred_proba_c1)
    n_cutoffs = cutoffs.shape[0]

    plt.figure(figsize=(5,4))
    plt.plot(cutoffs, precisions[:n_cutoffs], linestyle="--", label="precision")
    plt.plot(cutoffs, recalls[:n_cutoffs], label="recall")

    # Place x ticks every 0.1 across the automatically chosen x range.
    x_min, x_max = plt.xlim()
    tick_positions = np.round(np.arange(x_min, x_max, 0.1), 2)
    plt.xticks(tick_positions)

    plt.xlabel("Threshold value")
    plt.ylabel("Precision and Recall value")
    plt.legend()
    plt.grid()
    plt.show()

In [28]:
from sklearn.metrics import f1_score
# F1 score of the hard predictions held in `pred` (set from lr_clf.predict(X_test) earlier).
f1 = f1_score(y_test, pred)
print(f"F1 스코어: {f1:.4f}")

F1 스코어: 0.7966


In [29]:
def get_clf_eval(y_test, pred):
    """Print the confusion matrix plus accuracy, precision, recall and F1.

    This redefinition replaces the earlier ``get_clf_eval`` and adds the F1
    score to the printed summary. Nothing is returned.
    """
    confusion = confusion_matrix(y_test, pred)
    scores = {
        "accuracy": accuracy_score(y_test, pred),
        "precision": precision_score(y_test, pred),
        "recall": recall_score(y_test, pred),
        "f1": f1_score(y_test, pred),
    }

    print("오차 행렬")
    print(confusion)
    print(f"정확도 : {scores['accuracy']:.4f}, 정밀도 : {scores['precision']:.4f}, 재현율 : {scores['recall']:.4f}, F1: {scores['f1']:.4f}")

In [30]:
# Re-create the threshold list and the full predict_proba output (both names were
# rebound by intermediate cells), then rerun the per-threshold report.
thresholds = [0.4,0.45,0.50,0.55,0.60]
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1, 1), thresholds) 


임계값 : 0.4
오차 행렬
[[97 21]
 [11 50]]
정확도 : 0.8212, 정밀도 : 0.7042, 재현율 : 0.8197, F1: 0.7576

임계값 : 0.45
오차 행렬
[[105  13]
 [ 13  48]]
정확도 : 0.8547, 정밀도 : 0.7869, 재현율 : 0.7869, F1: 0.7869

임계값 : 0.5
오차 행렬
[[108  10]
 [ 14  47]]
정확도 : 0.8659, 정밀도 : 0.8246, 재현율 : 0.7705, F1: 0.7966

임계값 : 0.55
오차 행렬
[[111   7]
 [ 16  45]]
정확도 : 0.8715, 정밀도 : 0.8654, 재현율 : 0.7377, F1: 0.7965

임계값 : 0.6
오차 행렬
[[113   5]
 [ 17  44]]
정확도 : 0.8771, 정밀도 : 0.8980, 재현율 : 0.7213, F1: 0.8000


In [31]:
from sklearn.metrics import roc_curve

# Class-1 probabilities for the test rows.
pred_proba_class1=lr_clf.predict_proba(X_test)[:,1]

# NOTE: this rebinds the global `thresholds` to the array returned by roc_curve.
fprs, tprs, thresholds = roc_curve(y_test, pred_proba_class1)

# Sample every 5th threshold starting at index 1.
# NOTE(review): index 0 is presumably skipped because roc_curve's first
# threshold is a sentinel value -- confirm against the scikit-learn docs.
thr_index = np.arange(1, thresholds.shape[0], 5)
print("샘플 추출을 위한 임계값 배열의 index : ", thr_index)
print("샘플 index로 추출한 임계값 : ", np.round(thresholds[thr_index],2))

print("샘플 임계값별 FPR : ", np.round(fprs[thr_index], 3))
print("샘플 임계값별 TPR : ", np.round(tprs[thr_index], 3))

샘플 추출을 위한 임계값 배열의 index :  [ 1  6 11 16 21 26 31 36 41 46]
샘플 index로 추출한 임계값 :  [0.94 0.73 0.62 0.52 0.44 0.28 0.15 0.14 0.13 0.12]
샘플 임계값별 FPR :  [0.    0.008 0.025 0.076 0.127 0.254 0.576 0.61  0.746 0.847]
샘플 임계값별 TPR :  [0.016 0.492 0.705 0.738 0.803 0.885 0.902 0.951 0.967 1.   ]


In [32]:
def roc_curve_plot(y_test, pred_proba_c1):
    """Plot the ROC curve for the given labels and class-1 scores.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    pred_proba_c1 : array-like
        Scores or probabilities for the positive class, passed to ``roc_curve``.

    Draws the curve (FPR on x, TPR on y) together with a dashed diagonal
    reference line, then shows the figure. Nothing is returned.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fprs, tprs, _ = roc_curve(y_test, pred_proba_c1)

    plt.plot(fprs, tprs, label="ROC")
    # Dashed black diagonal from (0, 0) to (1, 1) as the reference line.
    plt.plot([0, 1], [0, 1], "k--", label="Random")

    # Place x ticks every 0.1 across the current x range, then clamp both axes to [0, 1].
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1), 2))
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel("FPR(1-Specificity)")
    plt.ylabel("TPR(Recall)")
    plt.legend()
    plt.show()


In [33]:
from sklearn.metrics import roc_auc_score

# NOTE: this rebinds `pred_proba` to the 1-D class-1 column; earlier cells bound
# the same name to the full predict_proba output and index it with [:,1], so
# re-running those cells after this one would fail.
pred_proba = lr_clf.predict_proba(X_test)[:,1]
roc_score = roc_auc_score(y_test, pred_proba)
print(f"ROC AUC값 : {roc_score:.4f}")

ROC AUC값 : 0.8987


In [34]:
from sklearn.metrics import accuracy_score, \
    precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

In [35]:
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and classification metrics for ``pred``.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    pred : array-like
        Predicted class labels. Required despite the ``None`` default.
    pred_proba : array-like, optional
        Positive-class scores or probabilities. When given, ROC AUC is
        computed from it and appended to the printed summary; when omitted,
        the AUC part is skipped so two-argument callers keep working.

    Raises
    ------
    ValueError
        If ``pred`` is ``None``.
    """
    if pred is None:
        raise ValueError("pred is required: pass the predicted class labels")

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # AUC needs scores rather than hard labels, so it is only computed when supplied.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else None

    print("오차행렬")
    print(confusion)

    # Built as one string so no stray indentation ends up inside the printed line.
    summary = (
        f"정확도 : {accuracy:.4f}, 정밀도 : {precision:.4f}, "
        f"재현율 : {recall:.4f}, F1 : {f1:.4f}"
    )
    if roc_auc is not None:
        summary += f", AUC : {roc_auc:.4f}"
    print(summary)