In [4]:
#Begin by loading the training and testing data, importing a decision tree, as well
#as some libraries we will be using to score performance:

from sklearn import tree
from sklearn.metrics import balanced_accuracy_score
import numpy as np
import scipy.sparse
import collections


X_train = scipy.sparse.load_npz("assets/resources/training_data.npz")
y_train = np.load("assets/resources/training_labels.npy")
X_test = scipy.sparse.load_npz("assets/resources/test_data.npz")
y_test = np.load("assets/resources/test_labels.npy")

In [8]:
#Train and test a simple Decision Tree classifier:

dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print(collections.Counter(dt_pred))
print(balanced_accuracy_score(y_test, dt_pred))

Counter({np.int32(0): 120, np.int32(1): 11})
0.8290229885057472


Next, we test several techniques to improve performance.
Weighting: We set the class weights of our classifier to "balanced" and train
and test this new classifier:


In [9]:
dt_weighted = tree.DecisionTreeClassifier(class_weight="balanced")
dt_weighted.fit(X_train, y_train)
dt_weighted_pred = dt_weighted.predict(X_test)
print(collections.Counter(dt_weighted_pred))
print(balanced_accuracy_score(y_test, dt_weighted_pred))

Counter({np.int32(0): 114, np.int32(1): 17})
0.9913793103448276


In [10]:
#Upsampling the minor class: We extract all test samples from class 0 and class 1:
from sklearn.utils import resample

X_train_np = X_train.toarray()
class_0_indices = [i for i, x in enumerate(y_train == 0) if x]
class_1_indices = [i for i, x in enumerate(y_train == 1) if x]
size_class_0 = sum(y_train == 0)
X_train_class_0 = X_train_np[class_0_indices, :]
y_train_class_0 = [0] * size_class_0
X_train_class_1 = X_train_np[class_1_indices, :]

In [11]:
#We upsample the elements of class 1 with replacements until the number of samples of class 1 and class 0 are equal:
X_train_class_1_resampled = resample(
 X_train_class_1, replace=True, n_samples=size_class_0
)
y_train_class_1_resampled = [1] * size_class_0

In [12]:
#We combine the newly upsampled samples into a single training set:
X_train_resampled = np.concatenate([X_train_class_0,
X_train_class_1_resampled])
y_train_resampled = y_train_class_0 + y_train_class_1_resampled

In [13]:
#We train and test a Random Forest classifier on our upsampled training set:

from scipy import sparse
X_train_resampled = sparse.csr_matrix(X_train_resampled)
dt_resampled = tree.DecisionTreeClassifier()
dt_resampled.fit(X_train_resampled, y_train_resampled)
dt_resampled_pred = dt_resampled.predict(X_test)
print(collections.Counter(dt_resampled_pred))
print(balanced_accuracy_score(y_test, dt_resampled_pred))

Counter({np.int64(0): 115, np.int64(1): 16})
0.9580459770114942


Downsampling the major class: We perform similar steps to the preceding
upsampling, except this time we down-sample the major class until it is of the
same size as the minor class:

In [14]:
X_train_np = X_train.toarray()
class_0_indices = [i for i, x in enumerate(y_train == 0) if x]
class_1_indices = [i for i, x in enumerate(y_train == 1) if x]
size_class_1 = sum(y_train == 1)
X_train_class_1 = X_train_np[class_1_indices, :]
y_train_class_1 = [1] * size_class_1
X_train_class_0 = X_train_np[class_0_indices, :]
X_train_class_0_downsampled = resample(
 X_train_class_0, replace=False, n_samples=size_class_1
)
y_train_class_0_downsampled = [0] * size_class_1

In [15]:
#We create a new training set from the downsampled data:
X_train_downsampled = np.concatenate([X_train_class_1,
X_train_class_0_downsampled])
y_train_downsampled = y_train_class_1 + y_train_class_0_downsampled

In [17]:
#We train a Random Forest classifier on this dataset:
X_train_downsampled = sparse.csr_matrix(X_train_downsampled)
dt_downsampled = tree.DecisionTreeClassifier()
dt_downsampled.fit(X_train_downsampled, y_train_downsampled)
dt_downsampled_pred = dt_downsampled.predict(X_test)
print(collections.Counter(dt_downsampled_pred))
print(balanced_accuracy_score(y_test, dt_downsampled_pred))

Counter({np.int64(0): 108, np.int64(1): 23})
0.9655172413793103


Classifier including inner balancing samplers: We utilize the imbalanced-learn
package classifiers that resample subsets of data before the training estimators:

In [21]:
from imblearn.ensemble import BalancedBaggingClassifier
balanced_clf = BalancedBaggingClassifier(
 estimator=tree.DecisionTreeClassifier(),
 sampling_strategy="auto",
 replacement=True,
)
balanced_clf.fit(X_train, y_train)
balanced_clf_pred = balanced_clf.predict(X_test)
print(collections.Counter(balanced_clf_pred))
print(balanced_accuracy_score(y_test, balanced_clf_pred))

Counter({np.int32(0): 111, np.int32(1): 20})
0.978448275862069
