# This section focuses on cost-sensitive learning on imbalanced datasets

## Logistic Regression Model

In [1]:
# fit a logistic regression model on an imbalanced classification dataset
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
# synthetic binary dataset: 10,000 samples, 2 features (none redundant),
# with weights=[0.99] requesting a 99:1 class skew
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=2,
)
# unweighted baseline classifier
model = LogisticRegression(solver='lbfgs')
# 10-fold stratified cross-validation, repeated 3 times (30 scores in total)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# score every fold by ROC AUC, running the folds in parallel
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance as the average over all folds
mean_auc = mean(scores)
print('Mean ROC AUC: %.3f' % mean_auc)

Mean ROC AUC: 0.985


## Weighted Logistic Regression with Scikit-Learn

#### The LogisticRegression class provides the `class_weight` argument, which can be specified as a model hyperparameter. The class weight is a dictionary that maps each class label (e.g. 0 and 1) to the weighting applied in the calculation of the negative log likelihood when fitting the model. For example, a 1 to 1 weighting for classes 0 and 1 can be defined as `{0:1.0, 1:1.0}`; the cell below instead uses a 1:100 weighting, `{0:0.01, 1:1.0}`, so that errors on the minority class 1 count 100 times more than errors on the majority class 0.

In [2]:
# weighted logistic regression model on an imbalanced classification dataset
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
# synthetic binary dataset: 10,000 samples, 2 features (none redundant),
# with weights=[0.99] requesting a 99:1 class skew
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=2,
)
# manual class weighting: class 0 counts 0.01, class 1 counts 1.0 (a 1:100 ratio)
weights = {0:0.01, 1:1.0}
model = LogisticRegression(class_weight=weights, solver='lbfgs')
# 10-fold stratified cross-validation, repeated 3 times (30 scores in total)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# score every fold by ROC AUC, running the folds in parallel
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance as the average over all folds
mean_auc = mean(scores)
print('Mean ROC AUC: %.3f' % mean_auc)

Mean ROC AUC: 0.989


#### The scikit-learn library provides an implementation of the best-practice heuristic for the class weighting. It is implemented via the `compute_class_weight()` function and is calculated as:

#### n_samples / (n_classes * n_samples_with_class)



In [6]:
import numpy as np
from sklearn.utils import class_weight
from sklearn.datasets import make_classification

# Build the imbalanced two-class dataset (weights=[0.99] requests a 99:1 skew)
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=2,
)

# Discover the class labels from the target vector instead of hard-coding them
classes = np.unique(y)

# 'balanced' heuristic: one weight per entry of `classes`, in the same order
weight = class_weight.compute_class_weight('balanced', classes=classes, y=y)

# Pair every label with its weight so the result can be read as label -> weight
class_weights = {label: label_weight for label, label_weight in zip(classes, weight)}

print("Class Weights:", class_weights)

Class Weights: {0: 0.5050505050505051, 1: 50.0}


In [7]:

# weighted logistic regression for class imbalance with heuristic weights
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
# synthetic binary dataset: 10,000 samples, 2 features (none redundant),
# with weights=[0.99] requesting a 99:1 class skew
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=2,
)
# let scikit-learn derive the class weights with its 'balanced' heuristic
model = LogisticRegression(class_weight='balanced', solver='lbfgs')
# 10-fold stratified cross-validation, repeated 3 times (30 scores in total)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# score every fold by ROC AUC, running the folds in parallel
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance as the average over all folds
mean_auc = mean(scores)
print('Mean ROC AUC: %.3f' % mean_auc)


Mean ROC AUC: 0.989


## Grid Search Weighted Logistic Regression

In [9]:
# grid search class weights with logistic regression for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
# generate dataset: 10,000 samples, 2 features, weights=[0.99] requests a 99:1 class skew
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
    n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=2)
# define model
model = LogisticRegression(solver='lbfgs')
# define grid: candidate weightings from favouring class 0 (100:1) to favouring class 1 (1:100)
balance = [{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure: 10-fold stratified CV repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid search, ranking candidates by mean ROC AUC
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv,
    scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is deliberately not named `mean`: this cell imports
# numpy's `mean` function, and reusing the name would rebind it to a float
# for the rest of the kernel session.
for mean_score, stdev, param in zip(means, stds, params):
  print('%f (%f) with: %r' % (mean_score, stdev, param))

Best: 0.988943 using {'class_weight': {0: 1, 1: 100}}
0.982148 (0.017020) with: {'class_weight': {0: 100, 1: 1}}
0.983465 (0.015555) with: {'class_weight': {0: 10, 1: 1}}
0.985242 (0.013456) with: {'class_weight': {0: 1, 1: 1}}
0.987973 (0.009846) with: {'class_weight': {0: 1, 1: 10}}
0.988943 (0.006354) with: {'class_weight': {0: 1, 1: 100}}


## Cost-Sensitive Decision Trees

In [10]:
# grid search class weights with decision tree for imbalance classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
# generate dataset: 10,000 samples, 2 features, weights=[0.99] requests a 99:1 class skew
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
    n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=3)
# define model
# NOTE(review): no random_state is passed to the tree, so the scores may
# differ slightly between runs - confirm whether that is intended.
model = DecisionTreeClassifier()
# define grid: candidate weightings from favouring class 0 (100:1) to favouring class 1 (1:100)
balance = [{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure: 10-fold stratified CV repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid search, ranking candidates by mean ROC AUC
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is deliberately not named `mean`: this cell imports
# numpy's `mean` function, and reusing the name would rebind it to a float
# for the rest of the kernel session.
for mean_score, stdev, param in zip(means, stds, params):
  print('%f (%f) with: %r' % (mean_score, stdev, param))

Best: 0.755892 using {'class_weight': {0: 1, 1: 10}}
0.740488 (0.079039) with: {'class_weight': {0: 100, 1: 1}}
0.735690 (0.072483) with: {'class_weight': {0: 10, 1: 1}}
0.735623 (0.070122) with: {'class_weight': {0: 1, 1: 1}}
0.755892 (0.075222) with: {'class_weight': {0: 1, 1: 10}}
0.747609 (0.070646) with: {'class_weight': {0: 1, 1: 100}}


## Cost-Sensitive Support Vector Machines

### Testing SVM model on an imbalanced dataset

In [11]:

# fit a svm on an imbalanced classification dataset
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
# synthetic binary dataset: 10,000 samples, 2 features (none redundant),
# with weights=[0.99] requesting a 99:1 class skew
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=4,
)
# unweighted support vector classifier used as the baseline
model = SVC(gamma='scale')
# 10-fold stratified cross-validation, repeated 3 times (30 scores in total)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# score every fold by ROC AUC, running the folds in parallel
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance as the average over all folds
mean_auc = mean(scores)
print('Mean ROC AUC: %.3f' % mean_auc)

Mean ROC AUC: 0.808


## SVM with Class Weight

In [12]:
# svm with class weight on an imbalanced classification dataset
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
# synthetic binary dataset: 10,000 samples, 2 features (none redundant),
# with weights=[0.99] requesting a 99:1 class skew
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=4,
)
# support vector classifier with scikit-learn's 'balanced' class weighting
model = SVC(class_weight='balanced', gamma='scale')
# 10-fold stratified cross-validation, repeated 3 times (30 scores in total)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# score every fold by ROC AUC, running the folds in parallel
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance as the average over all folds
mean_auc = mean(scores)
print('Mean ROC AUC: %.3f' % mean_auc)


Mean ROC AUC: 0.967


## Grid Search Class Weight SVM

In [13]:

# grid search class weights with svm for imbalance classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
# generate dataset: 10,000 samples, 2 features, weights=[0.99] requests a 99:1 class skew
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
    n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)
# define model
model = SVC(gamma='scale')
# define grid: candidate weightings from favouring class 0 (100:1) to favouring class 1 (1:100)
balance = [{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure: 10-fold stratified CV repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid search, ranking candidates by mean ROC AUC
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is deliberately not named `mean`: this cell imports
# numpy's `mean` function, and reusing the name would rebind it to a float
# for the rest of the kernel session.
for mean_score, stdev, param in zip(means, stds, params):
  print('%f (%f) with: %r' % (mean_score, stdev, param))


Best: 0.967747 using {'class_weight': {0: 1, 1: 100}}
0.742936 (0.149898) with: {'class_weight': {0: 100, 1: 1}}
0.748636 (0.149871) with: {'class_weight': {0: 10, 1: 1}}
0.807963 (0.126343) with: {'class_weight': {0: 1, 1: 1}}
0.934495 (0.065221) with: {'class_weight': {0: 1, 1: 10}}
0.967747 (0.038636) with: {'class_weight': {0: 1, 1: 100}}


## Cost-Sensitive Deep Learning in Keras

In [15]:
# standard neural network on an imbalanced classification dataset
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from keras.layers import Dense
from keras.models import Sequential
# prepare train and test dataset
def prepare_data():
  """Generate the imbalanced 2-feature dataset and split it by row position.

  Builds 10,000 samples with make_classification (weights=[0.99],
  random_state=4), then uses the first 5,000 rows for training and the
  remaining rows for testing.

  Returns:
    Tuple (trainX, trainy, testX, testy).
  """
  n_train = 5000
  features, labels = make_classification(
      n_samples=10000,
      n_features=2,
      n_redundant=0,
      n_clusters_per_class=2,
      weights=[0.99],
      flip_y=0,
      random_state=4,
  )
  # positional split: rows [0, n_train) train, rows [n_train, end) test
  head, tail = slice(None, n_train), slice(n_train, None)
  return features[head], labels[head], features[tail], labels[tail]
# define the neural network model
def define_model(n_input):
  """Build and compile a small binary classifier with one hidden layer.

  Args:
    n_input: number of input features; passed as input_dim to the first
      Dense layer.

  Returns:
    A compiled Sequential model: Dense(10, relu, he_uniform) followed by
    Dense(1, sigmoid), with binary cross-entropy loss and the 'sgd'
    optimizer.
  """
  # hidden layer; input_dim also declares the width of the visible layer
  hidden_layer = Dense(10, input_dim=n_input, activation='relu',
                       kernel_initializer='he_uniform')
  # single sigmoid unit as the output layer
  output_layer = Dense(1, activation='sigmoid')
  network = Sequential([hidden_layer, output_layer])
  network.compile(loss='binary_crossentropy', optimizer='sgd')
  return network
# build the train/test split
trainX, trainy, testX, testy = prepare_data()
# size the network input from the number of feature columns
n_input = trainX.shape[1]
model = define_model(n_input)
# train for 100 epochs with progress output switched off
model.fit(trainX, trainy, epochs=100, verbose=0)
# sigmoid outputs for the held-out rows
yhat = model.predict(testX)
# rank-based evaluation of those outputs against the true labels
score = roc_auc_score(testy, yhat)
print('ROC AUC: %.3f' % score)

2024-03-28 17:34:29.197654: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


ROC AUC: 0.931


## Class Weighting with Neural Networks

In [16]:

# class weighted neural network on an imbalanced classification dataset
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from keras.layers import Dense
from keras.models import Sequential
# prepare train and test dataset
def prepare_data():
  """Generate the imbalanced 2-feature dataset and split it by row position.

  Builds 10,000 samples with make_classification (weights=[0.99],
  random_state=4), then uses the first 5,000 rows for training and the
  remaining rows for testing.

  Returns:
    Tuple (trainX, trainy, testX, testy).
  """
  n_train = 5000
  features, labels = make_classification(
      n_samples=10000,
      n_features=2,
      n_redundant=0,
      n_clusters_per_class=2,
      weights=[0.99],
      flip_y=0,
      random_state=4,
  )
  # positional split: rows [0, n_train) train, rows [n_train, end) test
  head, tail = slice(None, n_train), slice(n_train, None)
  return features[head], labels[head], features[tail], labels[tail]

# define the neural network model
def define_model(n_input):
  """Build and compile a small binary classifier with one hidden layer.

  Args:
    n_input: number of input features; passed as input_dim to the first
      Dense layer.

  Returns:
    A compiled Sequential model: Dense(10, relu, he_uniform) followed by
    Dense(1, sigmoid), with binary cross-entropy loss and the 'sgd'
    optimizer.
  """
  # hidden layer; input_dim also declares the width of the visible layer
  hidden_layer = Dense(10, input_dim=n_input, activation='relu',
                       kernel_initializer='he_uniform')
  # single sigmoid unit as the output layer
  output_layer = Dense(1, activation='sigmoid')
  network = Sequential([hidden_layer, output_layer])
  network.compile(loss='binary_crossentropy', optimizer='sgd')
  return network

# build the train/test split
trainX, trainy, testX, testy = prepare_data()

# size the network input from the number of feature columns
n_input = trainX.shape[1]
model = define_model(n_input)
# class weighting handed to fit(): class 0 -> 1, class 1 -> 100
weights = {0:1, 1:100}
# train for 100 epochs with progress output switched off
history = model.fit(trainX, trainy, class_weight=weights, epochs=100, verbose=0)
# sigmoid outputs for the held-out rows
yhat = model.predict(testX)
# rank-based evaluation of those outputs against the true labels
score = roc_auc_score(testy, yhat)
print('ROC AUC: %.3f' % score)

ROC AUC: 0.971
