In [1]:
import numpy as np
from sklearn.base import BaseEstimator

class DummyClassifier(BaseEstimator):
    """Rule-based baseline that predicts from the 'Sex' column alone.

    Nothing is learned from the training data; ``fit`` exists only so the
    object exposes the usual ``fit``/``predict`` pair.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Returning the estimator instead of ``None`` follows the scikit-learn
        estimator convention and makes ``clf.fit(X, y).predict(X)`` work.
        """
        return self

    def predict(self, X):
        """Predict 0 where ``X['Sex'] == 1`` and 1 for every other row.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a 'Sex' column.
            NOTE(review): presumably 1 encodes 'male' after label encoding --
            confirm against the preprocessing step.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(X.shape[0], 1)`` holding 0.0 or 1.0.
        """
        # A single vectorised comparison replaces the per-row ``.iloc`` loop.
        sex_is_one = np.asarray(X['Sex'] == 1)
        # reshape keeps the (n, 1) column layout the original returned.
        return np.where(sex_is_one, 0.0, 1.0).reshape(-1, 1)

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def fillna(df):
    """Fill missing values in the 'Age', 'Cabin', 'Embarked' and 'Fare' columns.

    'Age' is filled with the column mean, 'Cabin' and 'Embarked' with the
    placeholder 'N', and 'Fare' with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.

    Raises
    ------
    KeyError
        If any of the four columns is missing.
    """
    fill_values = {
        'Age': df['Age'].mean(),
        'Cabin': 'N',
        'Embarked': 'N',
        'Fare': 0,
    }
    for column, value in fill_values.items():
        # Assign the result back instead of calling
        # ``df[column].fillna(..., inplace=True)``: the chained in-place form
        # operates on a temporary Series and does not update ``df`` when
        # pandas Copy-on-Write is active.
        df[column] = df[column].fillna(value)

    return df


def drop_features(df):
    """Drop the 'PassengerId', 'Name' and 'Ticket' columns from ``df``.

    The frame is modified in place and the same object is returned.
    A ``KeyError`` is raised by pandas if any of the columns is absent.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df


def format_features(df):
    """Shorten 'Cabin' to its first character and label-encode three columns.

    'Cabin', 'Sex' and 'Embarked' are each replaced by the integer codes
    produced by a ``LabelEncoder`` fitted on that same column. ``df`` is
    modified in place and returned.
    """
    # Keep only the leading character of each cabin string.
    df['Cabin'] = df['Cabin'].str.slice(stop=1)

    for column in ('Cabin', 'Sex', 'Embarked'):
        # A fresh encoder per column, fitted and applied on the same data.
        df[column] = LabelEncoder().fit_transform(df[column])

    return df


def transform_features(df):
    """Apply ``fillna``, ``drop_features`` and ``format_features`` in that order.

    Returns whatever the last step returns.
    """
    for step in (fillna, drop_features, format_features):
        df = step(df)
    return df

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the Titanic training CSV and separate the 'Survived' label from the features.
titanic_df = pd.read_csv('../data/titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))

# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=0
)

In [4]:
# Score the 'Sex'-only rule on the held-out rows.
clf = DummyClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print(np.round(accuracy_score(y_true=y_test, y_pred=pred), decimals=4))

0.7877


In [5]:
class FakeClassifier(BaseEstimator):
    """Baseline that predicts ``False`` for every sample, ignoring the input values."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        Returning the estimator (not ``None``) follows the scikit-learn
        convention; ``y`` defaults to ``None`` to match ``DummyClassifier.fit``.
        """
        return self

    def predict(self, X):
        """Return an all-``False`` boolean array of shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=bool)

In [6]:
from sklearn.datasets import load_digits

digits = load_digits()

# Print each array followed by its shape: features first, then labels.
for array in (digits.data, digits.target):
    print(array)
    print(array.shape)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
(1797, 64)
[0 1 2 ... 8 9 8]
(1797,)


In [7]:
# Binary target: 1 where the digit label equals 7, otherwise 0.
y = np.equal(digits.target, 7).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, y, random_state=11
)

In [8]:
# Show the test-label shape, then how many samples fall in each class.
print(y_test.shape, pd.Series(y_test).value_counts(), sep='\n')

(450,)
0    405
1     45
dtype: int64


In [9]:
# Accuracy of a classifier that always predicts the negative class.
fclf = FakeClassifier()
fclf.fit(X_train, y_train)
fpred = fclf.predict(X_test)

print(accuracy_score(y_true=y_test, y_pred=fpred))

0.9


In [10]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=fpred)

array([[405,   0],
       [ 45,   0]], dtype=int64)

In [11]:
from sklearn.metrics import precision_score, recall_score

# fpred contains no positive predictions, so precision = TP / (TP + FP) is 0 / 0.
# zero_division=0 states the intended value explicitly and avoids the
# undefined-metric warning that the bare call emits; the printed value is unchanged.
print(f'precision : {precision_score(y_test, fpred, zero_division=0)}')
print(f'recall : {recall_score(y_test, fpred)}')

precision : 0.0
recall : 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
def get_clf_eval(y_test, pred):
    """Print the confusion matrix, accuracy, precision and recall for ``pred``.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels.

    Nothing is returned; the report is written to stdout with the three
    scalar metrics formatted to four decimal places.
    """
    # The four metrics are independent of one another; scalars first, matrix last.
    accuracy = accuracy_score(y_true=y_test, y_pred=pred)
    precision = precision_score(y_true=y_test, y_pred=pred)
    recall = recall_score(y_true=y_test, y_pred=pred)
    confusion = confusion_matrix(y_true=y_test, y_pred=pred)

    print(f'confusion matrix :\n{confusion}')
    print(f'accuracy : {accuracy:.4f}\nprecision : {precision:.4f}\nrecall : {recall:.4f}')

In [13]:
from sklearn.linear_model import LogisticRegression

# Reload the Titanic CSV so preprocessing starts from the untouched file contents.
titanic_df = pd.read_csv('../data/titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))

# Same 80/20 split proportions as before, with random_state=11.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=11
)

In [14]:
# Fit logistic regression (iteration cap set to 150) and report the metrics.
lr_clf = LogisticRegression(max_iter=150).fit(X_train, y_train)
pred = lr_clf.predict(X_test)

get_clf_eval(y_test, pred)

confusion matrix :
[[104  14]
 [ 13  48]]
accuracy : 0.8492
precision : 0.7742
recall : 0.7869
