In [1]:
from catboost import CatBoostClassifier, Pool, metrics, cv
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
class FocalLossObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        # approxes, targets, weights are indexed containers of floats
        # (containers with only __len__ and __getitem__ defined).
        # weights parameter can be None.
        # Returns list of pairs (der1, der2)
        gamma = 2.
        # alpha = 1.
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)
        
        exponents = []
        for index in xrange(len(approxes)):
            exponents.append(math.exp(approxes[index]))

        result = []
        for index in xrange(len(targets)):
            p = exponents[index] / (1 + exponents[index])

            if targets[index] > 0.0:
                der1 = -((1-p)**(gamma-1))*(gamma * math.log(p) * p + p - 1)/p
                der2 = gamma*((1-p)**gamma)*((gamma*p-1)*math.log(p)+2*(p-1))
            else:
                der1 = (p**(gamma-1)) * (gamma * math.log(1 - p) - p)/(1 - p)
                der2 = p**(gamma-2)*((p*(2*gamma*(p-1)-p))/(p-1)**2 + (gamma-1)*gamma*math.log(1 - p))

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))

        return result


In [13]:
X_train = np.load('X_train_mfcc.npy')
X_test = np.load('X_test.npy')
X_val = np.load('X_val.npy')
y_train = np.load('y_train_mfcc.npy')
y_test = np.load('y_test.npy')
y_val = np.load('y_val.npy')

In [46]:
indx0 = np.where(y_train == 0)[0]
np.random.shuffle(indx0)
indx1 = np.where(y_train == 1)[0]

In [47]:
new_indx = np.hstack((indx0[:len(indx1)], indx1))
X_train = X_train[new_indx]
y_train = y_train[new_indx]

In [12]:
X_train.shape

(3772800, 16)

In [35]:
model = CatBoostClassifier(
    class_weights=[1, 7],
    # scale_pos_weight=5,
    random_seed=42,
    logging_level='Silent',
    # od_type='Iter',
    # od_wait=10,
    # use_best_model=True,
    loss_function=metrics.Logloss(),
    # eval_metric=metrics.Precision(),
    iterations=200,
)

In [36]:
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2c74bc5ae50>

In [37]:
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

In [38]:
print(classification_report(y_train_pred, y_train))
confusion_matrix(y_train_pred, y_train)

              precision    recall  f1-score   support

         0.0       0.92      0.93      0.93   3453638
         1.0       0.17      0.15      0.16    322603

    accuracy                           0.86   3776241
   macro avg       0.54      0.54      0.54   3776241
weighted avg       0.86      0.86      0.86   3776241



array([[3211586,  242052],
       [ 274196,   48407]], dtype=int64)

In [39]:
print(classification_report(y_val_pred, y_val))
confusion_matrix(y_val_pred, y_val)

              precision    recall  f1-score   support

         0.0       0.89      0.93      0.91    721363
         1.0       0.20      0.13      0.16     95512

    accuracy                           0.84    816875
   macro avg       0.54      0.53      0.53    816875
weighted avg       0.81      0.84      0.82    816875



array([[669710,  51653],
       [ 82720,  12792]], dtype=int64)

In [43]:
np.where(y_train_pred == 1)

(array([    976,     977,     978, ..., 3775369, 3775452, 3775453],
       dtype=int64),)

In [44]:
np.where(y_train == 1)

(array([    502,     509,     513, ..., 3775470, 3775479, 3775487],
       dtype=int64),)

In [45]:
clf = RandomForestClassifier(random_state=42, verbose=1, n_jobs=6)
clf.fit(X_train, y_train)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  7.5min
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed: 18.2min finished


RandomForestClassifier(n_jobs=6, random_state=42, verbose=1)

In [46]:
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   39.4s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:  1.6min finished
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    8.5s
[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:   21.1s finished


In [47]:
print(classification_report(y_train_pred, y_train))
confusion_matrix(y_train_pred, y_train)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   3499225
         1.0       0.95      0.99      0.97    277016

    accuracy                           1.00   3776241
   macro avg       0.97      0.99      0.98   3776241
weighted avg       1.00      1.00      1.00   3776241



array([[3483707,   15518],
       [   2075,  274941]], dtype=int64)

In [48]:
print(classification_report(y_val_pred, y_val))
confusion_matrix(y_val_pred, y_val)

              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92    751323
         1.0       0.10      0.09      0.10     65552

    accuracy                           0.86    816875
   macro avg       0.51      0.51      0.51    816875
weighted avg       0.85      0.86      0.86    816875



array([[693057,  58266],
       [ 59373,   6179]], dtype=int64)