# 3-1 Accuracy  (정확도)

In [3]:
import numpy as np
from sklearn.base import BaseEstimator


class MyDummyClassifier(BaseEstimator):
    # fit 메소드는 아무것도 학습하지 않음
    def fit(self, X, y=None):
        pass
    
    # predict() 메소드는 단순히 Sex feature가 1이면 0, 그렇지 않으면 1로 예측함
    def predict(self, X):
        pred = np.zeros((X.shape[0], 1))
        for i in range(X.shape[0]):
            if X['Sex'].iloc[i]==1:
                pred[i] = 0
            else:
                pred[i] = 1
        return pred

In [4]:
from sklearn.preprocessing import LabelEncoder

def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace = True)
    df['Embarked'].fillna('N', inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace = True)
    return df


def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Embarked', 'Sex']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

def transform_feature(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df
    

In [5]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

titanic_df = pd.read_csv('./train.csv')
y_t_df = titanic_df['Survived']
x_t_df = titanic_df.drop('Survived', axis = 1)
x_t_df = transform_feature(x_t_df)

X_train, X_test, y_train, y_test = train_test_split(x_t_df, y_t_df, \
                                                   test_size = 0.2, random_state = 11)

In [6]:
mclf = MyDummyClassifier()
mclf.fit(X_train, y_train)

mypredictions = mclf.predict(X_test)
print(accuracy_score(y_test, mypredictions))

0.8324022346368715


# 3-2 Confusion Matrix (오차 행렬)

- 정밀도:예측을 positive로 한 대상 중에 예측과 실제 값이 positive로 일치한 데이터의 비율
- 재현율:실제값이 positive인 대상 중에 예측과 실제 값이 positive로 일치한 데이터의 비율

In [7]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, mypredictions)

array([[103,  15],
       [ 15,  46]])

## 정밀도(precision)와 재현율(recall)

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print(precision_score(y_test, mypredictions))
print(recall_score(y_test, mypredictions))

0.7540983606557377
0.7540983606557377


- 오차행렬 정확도 정밀도 재현율을 한꺼번에 계산하는 함수 생성

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    print('정확도 : {0:.4f}, 정밀도 : {1:.4f}, 재현율 : {2:.4f}'.format(accuracy, precision, recall))

In [10]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

titanic_df = pd.read_csv('./train.csv')
y_t_df = titanic_df['Survived']
X_t_df = titanic_df.drop('Survived', axis = 1)
X_t_df = transform_feature(X_t_df)

X_train, X_test, y_train, y_test = train_test_split(X_t_df, y_t_df,
                                                   test_size = 0.2, random_state = 11)
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test, pred)

오차 행렬
[[104  14]
 [ 13  48]]
정확도 : 0.8492, 정밀도 : 0.7742, 재현율 : 0.7869


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Precicion/Recall Trade-off