In [1]:
import numpy as np
with open('data40k.npy', 'rb') as fl:
    train_data = np.load(fl)
    train_labels = np.load(fl)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

In [3]:
def f_05_u_score(true_y, pred_y, pos_label=1, threshold=0.5):
    """
    Return F0.5u score of prediction.
    :param true_y: true labels
    :param pred_y: predicted labels
    :param threshold: indication for non-decisions (default = 0.5)
    :param pos_label: positive class label (default = 1)
    :return: F0.5u score
    """

    n_tp = 0
    n_fn = 0
    n_fp = 0
    n_u = 0

    for i, pred in enumerate(pred_y):
        if pred == threshold:
            n_u += 1
        elif pred == pos_label and pred == true_y[i]:
            n_tp += 1
        elif pred == pos_label and pred != true_y[i]:
            n_fp += 1
        elif true_y[i] == pos_label and pred != true_y[i]:
            n_fn += 1

    return (1.25 * n_tp) / (1.25 * n_tp + 0.25 * (n_fn + n_u) + n_fp)

In [4]:
train_data = num_pipeline.fit_transform(train_data)

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

In [6]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [8]:
y_pred = lin_reg.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
y_test = np.argmax(y_test, axis=1)

In [9]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.58      0.68     16390
           1       0.82      0.93      0.87     32440

    accuracy                           0.81     48830
   macro avg       0.81      0.76      0.77     48830
weighted avg       0.81      0.81      0.80     48830

[[ 9582  6808]
 [ 2358 30082]]


In [10]:
print('AUC ' + str(roc_auc_score(y_test, y_pred)))
print('f05 ' + str(f_05_u_score(y_test, y_pred, pos_label=0)))

AUC 0.7559683658722461
f05 0.7468433359314107


In [11]:
from sklearn.tree import DecisionTreeClassifier
trees = DecisionTreeClassifier()
trees.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [12]:
y_pred = trees.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)

In [13]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.56      0.55     16390
           1       0.77      0.77      0.77     32440

    accuracy                           0.70     48830
   macro avg       0.66      0.66      0.66     48830
weighted avg       0.70      0.70      0.70     48830

[[ 9171  7219]
 [ 7577 24863]]


In [14]:
print('AUC ' + str(roc_auc_score(y_test, y_pred)))
print('f05 ' + str(f_05_u_score(y_test, y_pred, pos_label=0)))

AUC 0.6629894190542036
f05 0.5499388357199395


In [15]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier()
forest.fit(x_train, y_train)

  warn("The default value of n_estimators will change from "


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
y_pred = forest.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)

In [17]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.58      0.62     16390
           1       0.80      0.85      0.83     32440

    accuracy                           0.76     48830
   macro avg       0.73      0.72      0.72     48830
weighted avg       0.76      0.76      0.76     48830

[[ 9462  6928]
 [ 4715 27725]]


In [18]:
print('AUC ' + str(roc_auc_score(y_test, y_pred)))
print('f05 ' + str(f_05_u_score(y_test, y_pred, pos_label=0)))

AUC 0.71597899045236
f05 0.6472133300500698
