In [1]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_digits

import numpy as np
import pandas as pd

In [2]:
class MyDummyClassifier(BaseEstimator):
#     fit() 메서드는 아무것도 학습하지 않음
    def fit(self, X, y=None):
        pass
    
# predict() 메서드는 단순히 Sex피처가 1이면 0,그렇지 않으면 1로 예측함
    def predict(self, X):
        pred = np.zeros(( X.shape[0],1))
        for i in range(X.shape[0]):
            if X['Sex'].iloc[i]==1:
                pred[i]=0
            else:
                pred[i]=1
                
        return pred

In [3]:
def fillna(df):
    df['Age'].fillna(df['Age'].mean(),inplace=True)
    df['Cabin'].fillna('N',inplace=True)
    df['Embarked'].fillna('N',inplace =True)
    df['Fare'].fillna(0,inplace=True)
    return df

def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'],axis=1,inplace=True)
    return df

def format_features(df):
    df['Cabin']=df['Cabin'].str[:1]
    features=['Cabin','Sex','Embarked']
    for feature in features:
        le=LabelEncoder()
        le=le.fit(df[feature])
        df[feature]=le.transform(df[feature])
    return df
    
    
def transform_features(df):
    df=fillna(df)
    df=drop_features(df)
    df=format_features(df)
    return df

In [4]:
# 원본 데이터를 재로딩, 데이터 가공, 학습 데이터/테스트 데이터 분할.
titanic_df = pd.read_csv('csv/titanic_train.csv')
y_titanic_df=titanic_df['Survived']
X_titanic_df=titanic_df.drop('Survived',axis=1)
X_titanic_df=transform_features(X_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=0)

In [5]:
myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)

mypredictions = myclf.predict(X_test)
print('Dummy Classifier의 정확도는: {0:.4f}'.format(accuracy_score(y_test, mypredictions)))

Dummy Classifier의 정확도는: 0.7877


## 데이터 분포도가 균일하지 않은 경우 

In [6]:
class MyFakeClassifier(BaseEstimator):
    def fit(self,X,y):
        pass
#     입력값으로 들어오는 x데이터 세트의 크기만큼 모두 0값으로 만들어서 변환
    def predict(self,X):
        return np.zeros((len(X),1),dtype=bool)

In [7]:
digits = load_digits()

In [8]:
y=(digits.target==7).astype(int)
X_train,X_test,y_train,y_test=train_test_split(digits.data,y,random_state=11)

In [9]:
print('레이블 테스트 세트크기: ', y_test.shape)
print('테스트 세트 레이블 0과 의 분포도')
print(pd.Series(y_test).value_counts())

레이블 테스트 세트크기:  (450,)
테스트 세트 레이블 0과 의 분포도
0    405
1     45
dtype: int64


##### Dummy Classifier로 학습/예측/정확도 평가

In [10]:
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train, y_train)
fakepred=fakeclf.predict(X_test)

In [11]:
print('모든 예측을 0으로 하여도 정확도는:{:.3f}'.format(accuracy_score(y_test,fakepred)))

모든 예측을 0으로 하여도 정확도는:0.900


### 오차 행렬

In [12]:
from sklearn.metrics import confusion_matrix

In [13]:
confusion_matrix(y_test, fakepred)

array([[405,   0],
       [ 45,   0]], dtype=int64)

# 정밀도와 재현율

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
def get_clf_eval(y_test,pred):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    print('오차 행렬')
    print(confusion)
    print('정확도:{0:.4f}, 정밀도: {1:.4f}, 재현율:{2:.4f}'.format(accuracy,precision,recall))

In [17]:
titanic_df = pd.read_csv('csv/titanic_train.csv')
y_titanic_df=titanic_df['Survived']
X_titanic_df=titanic_df.drop('Survived',axis=1)
X_titanic_df=transform_features(X_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.20, random_state=11)

In [18]:
lr_clf=LogisticRegression()

In [19]:
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
get_clf_eval(y_test,pred)

오차 행렬
[[104  14]
 [ 13  48]]
정확도:0.8492, 정밀도: 0.7742, 재현율:0.7869


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
