In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the balance-scale data. Passing `names=` supplies the column labels,
# so the first line of the file is read as data rather than as a header.
df = pd.read_csv(
    'balance-scale.data',
    names=['balance', 'var1', 'var2', 'var3', 'var4'],
)

# Preview the first five rows
df.head()

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [3]:
# How many observations carry each raw label?
df.balance.value_counts()

L    288
R    288
B     49
Name: balance, dtype: int64

### We're going to label each observation as 1 (positive class) if the scale is balanced or 0 (negative class) if the scale is not balanced:

In [4]:
# Encode the target as binary: 1 = balanced ('B'), 0 = any other label.
# The numeric-dtype guard keeps this cell idempotent: re-running it after the
# column has already been encoded would otherwise compare integers to 'B' and
# silently reset every label to 0.
if not pd.api.types.is_numeric_dtype(df['balance']):
    df['balance'] = (df['balance'] == 'B').astype(int)

# Class counts after encoding
df['balance'].value_counts()

0    576
1     49
Name: balance, dtype: int64

## The Danger of Imbalanced Classes
### Now that we have a dataset, we can really show the dangers of imbalanced classes.

In [8]:
from sklearn.linear_model import LogisticRegression

# Target vector is the binary 'balance' label; the remaining columns are features
y = df['balance']
X = df.drop(columns='balance')

# Fit a default logistic regression on the full (imbalanced) data
clf_0 = LogisticRegression()
clf_0.fit(X, y)

# In-sample predictions: the model is scored on the same rows it was fit on
pred_y_0 = clf_0.predict(X)

In [10]:
from sklearn.metrics import accuracy_score

# How's the accuracy?
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that
# order, matching every other accuracy_score call in this notebook.
print( accuracy_score(y, pred_y_0) )
# 0.9216
# Should we be excited? Check which classes the model actually predicts.
print( np.unique( pred_y_0 ) )
# [0]

0.9216
[0]


### As you can see, this model is only predicting 0, which means it's completely ignoring the minority class in favor of the majority class.

# Up-sample Minority Class


In [11]:
from sklearn.utils import resample

Next, we'll create a new DataFrame with an up-sampled minority class. Here are the steps:

1. First, we'll separate observations from each class into different DataFrames.
2. Next, we'll resample the minority class with replacement, setting the number of samples to match that of the majority class.
3. Finally, we'll combine the up-sampled minority class DataFrame with the original majority class DataFrame.


In [12]:
# Separate majority and minority classes
df_majority = df[df.balance==0]
df_minority = df[df.balance==1]

# Upsample minority class.
# The target size is derived from the majority frame instead of being
# hard-coded, so the classes stay balanced if the input data changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # to match majority class (576 here)
                                 random_state=123)            # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled.balance.value_counts()
# 1    576
# 0    576
# Name: balance, dtype: int64

1    576
0    576
Name: balance, dtype: int64

In [13]:
# Features / target taken from the up-sampled frame
y = df_upsampled['balance']
X = df_upsampled.drop(columns='balance')

# Fit a fresh logistic regression on the balanced data
clf_1 = LogisticRegression()
clf_1.fit(X, y)

# Score on the same rows the model was trained on
pred_y_1 = clf_1.predict(X)

# Does the model now emit both classes?
print(np.unique(pred_y_1))
# [0 1]

# Training-set accuracy
print(accuracy_score(y, pred_y_1))
# 0.513888888889

[0 1]
0.5138888888888888


# Down-sample Majority Class
Down-sampling involves randomly removing observations from the majority class to prevent its signal from dominating the learning algorithm.

The most common heuristic for doing so is resampling without replacement.

The process is similar to that of up-sampling. Here are the steps:

1. First, we'll separate observations from each class into different DataFrames.
2. Next, we'll resample the majority class without replacement, setting the number of samples to match that of the minority class.
3. Finally, we'll combine the down-sampled majority class DataFrame with the original minority class DataFrame.


In [14]:
# Separate majority and minority classes
df_majority = df[df.balance==0]
df_minority = df[df.balance==1]

# Downsample majority class.
# The target size is derived from the minority frame instead of being
# hard-coded, so the classes stay balanced if the input data changes.
df_majority_downsampled = resample(df_majority,
                                   replace=False,               # sample without replacement
                                   n_samples=len(df_minority),  # to match minority class (49 here)
                                   random_state=123)            # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled.balance.value_counts()
# 1    49
# 0    49
# Name: balance, dtype: int64

1    49
0    49
Name: balance, dtype: int64

In [15]:
# Features / target taken from the down-sampled frame
y = df_downsampled['balance']
X = df_downsampled.drop(columns='balance')

# Fit a fresh logistic regression on the balanced (smaller) data
clf_2 = LogisticRegression()
clf_2.fit(X, y)

# Score on the same rows the model was trained on
pred_y_2 = clf_2.predict(X)

# Does the model emit both classes?
print(np.unique(pred_y_2))
# [0 1]

# Training-set accuracy
print(accuracy_score(y, pred_y_2))
# 0.581632653061

[0 1]
0.5816326530612245


In [16]:

from sklearn.metrics import roc_auc_score

# predict_proba returns one column per class; take column 1 (the positive
# class) and store it as a plain list of per-row probabilities.
prob_y_2 = list(clf_2.predict_proba(X)[:, 1])

prob_y_2[:5]  # first five probabilities, as an example
# [0.45419197226479618,
#  0.48205962213283882,
#  0.46862327066392456,
#  0.47868378832689096,
#  0.58143856820159667]

[0.4541919722647967,
 0.4820596221328393,
 0.4686232706639249,
 0.4786837883268909,
 0.5814385682015961]

In [17]:
# AUROC of the down-sampled model, from its positive-class probabilities
print(roc_auc_score(y_true=y, y_score=prob_y_2))
# 0.568096626406

0.5680966264056644


### Imbalanced Dataset Model AUROC score

In [18]:
# clf_0 is the model trained on the original imbalanced data.
# Rebuild X / y explicitly instead of relying on whatever an earlier cell left
# in those names.
# NOTE(review): the recorded output was produced with the down-sampled X / y
# left over from the clf_2 cells, so that evaluation set is kept here - confirm
# that this (rather than the full dataset) is the intended comparison.
y = df_downsampled.balance
X = df_downsampled.drop('balance', axis=1)

# Keep only the positive-class probability for each row
prob_y_0 = clf_0.predict_proba(X)
prob_y_0 = [p[1] for p in prob_y_0]

print( roc_auc_score(y, prob_y_0) )
# 0.477301124531  (below 0.5, i.e. worse than a random ranking on this sample)

0.4773011245314452


# Penalize Algorithms (Cost-Sensitive Training)
The next tactic is to use penalized learning algorithms that increase the cost of classification mistakes on the minority class.

 A popular algorithm for this technique is Penalized-SVM:

During training, we can use the argument `class_weight='balanced'` to penalize mistakes on the minority class by an amount proportional to how under-represented it is.

We also want to include the argument `probability=True` if we want to enable probability estimates for SVM algorithms.

### Let's train a model using Penalized-SVM on the original imbalanced dataset:

In [19]:
from sklearn.svm import SVC

In [20]:
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis=1)

# Train model.
# probability=True makes SVC fit an extra calibration step that shuffles the
# data internally; random_state pins that shuffle so the probabilities (and
# therefore the AUROC below) are reproducible across runs.
clf_3 = SVC(kernel='linear',
            class_weight='balanced', # penalize
            probability=True,
            random_state=123)        # reproducible probability estimates

clf_3.fit(X, y)

# Predict on training set
pred_y_3 = clf_3.predict(X)

# Is our model still predicting just one class?
print( np.unique( pred_y_3 ) )
# [0 1]

# How's our accuracy?
print( accuracy_score(y, pred_y_3) )
# 0.688

# What about AUROC? (computed from the positive-class probabilities)
prob_y_3 = clf_3.predict_proba(X)
prob_y_3 = [p[1] for p in prob_y_3]
print( roc_auc_score(y, prob_y_3) )
# See the cell output for the value: the figure previously written here
# (0.5305) did not match the recorded output (0.4695).

[0 1]
0.688
0.46947633219954643


# Use Tree-Based Algorithms
The final tactic we'll consider is using tree-based algorithms. Decision trees often perform well on imbalanced datasets because their hierarchical structure allows them to learn signals from both classes.

In modern applied machine learning, tree ensembles (Random Forests, Gradient Boosted Trees, etc.) almost always outperform single decision trees, so we'll jump right into those:

In [23]:

from sklearn.ensemble import RandomForestClassifier
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis=1)

# Train model.
# A random forest is stochastic (bootstrap samples, random feature subsets);
# without a seed the scores below change from run to run, which is why the
# figures previously written in the comments disagreed with the recorded output.
clf_4 = RandomForestClassifier(random_state=123)  # reproducible results
clf_4.fit(X, y)

# Predict on training set
pred_y_4 = clf_4.predict(X)

# Is our model still predicting just one class?
print( np.unique( pred_y_4 ) )
# [0 1]

# How's our accuracy? (training-set accuracy; see the cell output)
print( accuracy_score(y, pred_y_4) )

# What about AUROC? (computed from the positive-class probabilities)
prob_y_4 = clf_4.predict_proba(X)
prob_y_4 = [p[1] for p in prob_y_4]
print( roc_auc_score(y, prob_y_4) )

[0 1]
0.9808
0.9996456916099774
