In [1]:
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix,f1_score,roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Binarizer
import pandas as pd
#임계값 변경해 가면서 출력
def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    for treshold in thresholds:
        binarizer = Binarizer(threshold=treshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print(f'현재 treshold 값: {treshold}')
        get_clf_eval(y_test,custom_predict)

#출력
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, pred)
    print('오차행렬')
    print(confusion)
    print(f'정확도: {round(accuracy,2)}, 정밀도: {round(precision,2)}, 재현율: {round(recall,2)}, F1: {round(f1,2)}, AUC:{round(roc_auc,2)}\n')
    
def get_category(age):
    cat = ''
    if age < 10:
        cat = 'Child'
    elif 10 <= age < 20:
        cat = 'Teenage'
    elif 20 <= age < 30:
        cat = 'Twenty'
    elif 30 <= age < 40:
        cat = 'Thirty'
    elif 40 <= age < 50:
        cat = 'Forty'
    elif 10 <= age < 60:
        cat = 'Fifty'
    else:
        cat = 'Elderiy'
    return cat

def drop_feature(df):
    df.drop(['SibSp','Parch','Ticket', 'Name', 'PassengerId','Cabin','Age', 'Title','Fare','Fare_bins'], axis=1,inplace=True)
    return df
def fillna(df):
    df['Age'] = df.groupby(['Pclass', 'Sex', 'Title'])['Age'].apply(lambda x: x.fillna(x.mean()))
    df['Embarked'].fillna('S', inplace=True)
    return df
def add_feature(df):
    df['Title'] = df['Name'].apply(lambda x: re.search(' ([A-Za-z]+)\.', x).group(0).strip())
    df['Age_cat'] = df['Age'].apply(lambda x: get_category(x))
    _, bins = np.histogram(df.Fare, bins=5)
    df['Fare_bins'] = pd.cut(x=df.Fare, bins=bins, labels=[0, 1, 2, 3, 4], include_lowest=True)
    df['Family'] = df['SibSp']+df['Parch']
    df.loc[df['Family']>1 ,'Family'] = 1
    return df
def pre_treatment(df):
    add_feature(df)
    fillna(df)
    drop_feature(df)
    return df
def run(df):
    df1 = df.copy()
    df1 = df.drop(['Survived'],axis=1)
    df1['Pclass'] = df1['Pclass'].astype('str')
    df1['Family'] = df1['Family'].astype('str')
    df1 = pd.get_dummies(df1)
    display(df1)
    X_train, X_test, y_train, y_test = train_test_split(df1, df.Survived, test_size=0.2,random_state=121)
    dtree = RandomForestClassifier(random_state=121)
    parameters = {'max_depth': [3,4,5],
                  'min_samples_split': [2, 3, 4]}
    grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=5, refit=True)
    grid_dtree.fit(X_train, y_train)
    estimator = grid_dtree.best_estimator_
    pred = estimator.predict(X_test)
    print('GridSearchCV 최적 파라미터 :', grid_dtree.best_params_)
    print(f'GridSearchCV 최고 정확도 : {grid_dtree.best_score_.round(3)}')
    print(f'데스트 정확도 : {accuracy_score(y_test, pred)}\n')
    thresholds = [0.45,0.46,0.47,0.48,0.49, 0.5,0.51,0.52,0.53,0.54, 0.55,0.56,0.57,0.58,0.59, 0.60]
    pred_proba = estimator.predict_proba(X_test)
    get_eval_by_threshold(y_test, pred_proba[:, 1].reshape(-1, 1), thresholds)

df = pd.read_csv('./dataset/train.csv')
pre_treatment(df)
run(df)

# 랜덤에 따라 테스트 정확도가 달라집니다. 재현율 정밀도 부분은 값이 잘 나오지 않는걸 보아 뭔가 많이 잘못한거 같습니다.

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Age_cat_Child,Age_cat_Elderiy,Age_cat_Fifty,Age_cat_Forty,Age_cat_Teenage,Age_cat_Thirty,Age_cat_Twenty,Family_0,Family_1
0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,1
1,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1
2,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,1,0
3,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1
4,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0
887,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0
888,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,1
889,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0


GridSearchCV 최적 파라미터 : {'max_depth': 5, 'min_samples_split': 3}
GridSearchCV 최고 정확도 : 0.816
데스트 정확도 : 0.8044692737430168

현재 treshold 값: 0.45
오차행렬
[[85 22]
 [22 50]]
정확도: 0.75, 정밀도: 0.69, 재현율: 0.69, F1: 0.69, AUC:0.74

현재 treshold 값: 0.46
오차행렬
[[86 21]
 [22 50]]
정확도: 0.76, 정밀도: 0.7, 재현율: 0.69, F1: 0.7, AUC:0.75

현재 treshold 값: 0.47
오차행렬
[[95 12]
 [23 49]]
정확도: 0.8, 정밀도: 0.8, 재현율: 0.68, F1: 0.74, AUC:0.78

현재 treshold 값: 0.48
오차행렬
[[98  9]
 [23 49]]
정확도: 0.82, 정밀도: 0.84, 재현율: 0.68, F1: 0.75, AUC:0.8

현재 treshold 값: 0.49
오차행렬
[[99  8]
 [26 46]]
정확도: 0.81, 정밀도: 0.85, 재현율: 0.64, F1: 0.73, AUC:0.78

현재 treshold 값: 0.5
오차행렬
[[99  8]
 [27 45]]
정확도: 0.8, 정밀도: 0.85, 재현율: 0.62, F1: 0.72, AUC:0.78

현재 treshold 값: 0.51
오차행렬
[[101   6]
 [ 29  43]]
정확도: 0.8, 정밀도: 0.88, 재현율: 0.6, F1: 0.71, AUC:0.77

현재 treshold 값: 0.52
오차행렬
[[102   5]
 [ 29  43]]
정확도: 0.81, 정밀도: 0.9, 재현율: 0.6, F1: 0.72, AUC:0.78

현재 treshold 값: 0.53
오차행렬
[[102   5]
 [ 29  43]]
정확도: 0.81, 정밀도: 0.9, 재현율: 0.6, F1: 0.72, AUC:0.78

현재 tre