In [1]:
import sklearn
print(sklearn.__version__)

1.2.0


In [26]:
import numpy as np
from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):
  #fit( ) 메소드는 아무것도 학습하지 않음
  def fit(self, X, y=None):
    pass

#predict 함수는 Sex피처가 1이면 0 그렇지 않으면 1로 예측함.
  def predict (self, X):
    pred = np.zeros((X.shape[0],1))
    for i in range(X.shape[0]):
      if X['Sex'].iloc[i]== 1:
        pred[i] = 0
      else:
        pred[i] = 1
    return pred

In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

#null 처리 함수
def fillna(df):
  df['Age'].fillna(df['Age'].mean(), inplace=True)
  df['Cabin'].fillna('N', inplace=True)
  df['Embarked'].fillna('N',inplace=True)
  df['Fare'].fillna(0, inplace=True)
  return df
  
#머신러닝 알고리즘에 불필요한 속성 제거
def drop_features(df):
  df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
  return df

#레이블 인코딩 수행
def format_features(df):
  df['Cabin'] = df['Cabin'].str[:1]
  features = ['Cabin', 'Sex', 'Embarked']
  for feature in features:
    le = LabelEncoder()
    le = le.fit(df[feature])
    df[feature] = le.transform(df[feature])
  return df

#Data Preprocessing 호출
def transform_feature(df):
  df = fillna(df)
  df = drop_features(df)
  df = format_features(df)
  return df

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#원본 데이터 재 로딩, 데이터 가공, 학습데이터/테스트데이터 분할
titanic_df = pd.read_csv('./data_set/titanic_train.csv')
y_df = titanic_df['Survived']
x_df = titanic_df.drop('Survived', axis=1)
x_df = transform_feature(x_df)

X_train,X_test, y_train, y_test = train_test_split(x_df,y_df, test_size=0.2, random_state=44)

#더미 분류기를 이용하여 학습/예측/평가 수행
myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)

mypredictions = myclf.predict(X_test)
print(f'Dummy Classification의 정확도는 {accuracy_score(y_test, mypredictions):.4f}')

Dummy Classification의 정확도는 0.7263


In [29]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [30]:
class MyFakeClassifier (BaseEstimator):
  def fit(self, X, y):
    pass
  
  #입력값으로 들어오는 X데이터셋의 크기만큼 모두 0값으로 만들어서 반환
  def predict (self, X):
    return np.zeros((len(X), 1), dtype=bool)
  

digits = load_digits()

print(digits.data)
print('### digits data shape', digits.data.shape)
print(digits.target)
print('### digits target shape', digits.target.shape)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
### digits data shape (1797, 64)
[0 1 2 ... 8 9 8]
### digits target shape (1797,)


In [34]:
digits.target ==2

array([False, False,  True, ..., False, False, False])

In [35]:
#digit의 번호가 7이면 True이고 이를 1로 변환, 아니면 0
y=(digits.target==7).astype(int)

X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=44)

In [36]:
#불균형한 레이블 데이터 분포도 확인
print('레이블 테스트 크기:', y_test.shape)
print('테스트 세트 레이블 0과 1의 분포도')
print(pd.Series(y_test).value_counts())

#더미 분류기로 평가
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train, y_train)
fakepred = fakeclf.predict(X_test)

print(f'모든 예측을 0으로 해도 정확도는 {accuracy_score(y_test, fakepred):.4f}')

레이블 테스트 크기: (450,)
테스트 세트 레이블 0과 1의 분포도
0    411
1     39
dtype: int64
모든 예측을 0으로 해도 정확도는 0.9133


### 오차행렬 (Confusion Matrix)

In [None]:
from sklearn.metrics import confusion_matrix 

#실제와 가짜 예측결과의 confusion 매트릭스 출력
confusion_matrix(y_test, fakepred)

array([[411,   0],
       [ 39,   0]], dtype=int64)

### 정밀도와 재현율

In [38]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print(f'정밀도 : {precision_score(y_test, fakepred)}')
print(f'재현율 : {recall_score(y_test, fakepred)}')

정밀도 : 0.0
재현율 : 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def get_clf_eval(y_test, pred):
  confusion = confusion_matrix(y_test, pred)
  accuracy = accuracy_score(y_test, pred)
  precision = precision_score(y_test, pred)
  recall = recall_score(y_test, pred)

  print('오차행렬')
  print(confusion)
  print(f'정확도 : {accuracy:.4f}, 정밀도 : {precision:.4f}, 재현율 : {recall:.4f}')

In [40]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

#원본데이터 로딩, 데이터 분할
titanic_df = pd.read_csv('./data_set/titanic_train.csv')
y_df = titanic_df['Survived']
x_df = titanic_df.drop('Survived', axis=1)
x_df = transform_feature(x_df)


X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=44)

lr_clf = LogisticRegression()

lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

get_clf_eval(y_test, pred)

오차행렬
[[81 25]
 [22 51]]
정확도 : 0.7374, 정밀도 : 0.6711, 재현율 : 0.6986


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Precision / Recall Trade-Off