# The problem of class imbalance in supervised classification

## Highlight the issue of working with imbalanced classes

Let's open the "clean" version of the dataset.

In [None]:
import pandas as pd

In [None]:
# Load the cleaned dataset; the first CSV column is used as the row index.
df = pd.read_csv("cleaner_dataframe.csv", index_col=0)

In [None]:
df.head()

We can analyze the target which we will be using to train a model.

In [None]:
df['Refusal_Flag']

In [None]:
from collections import Counter

In [None]:
# Count how many samples carry each value of the target column, then
# display the counts.
counter = Counter(df['Refusal_Flag'])
counter

By looking at the counts, we can observe that there are two classes: `'Yes'` and `'No'`.
In addition, we can see that the class frequencies differ. We can quantify this by computing the balancing ratio, i.e. the number of `'Yes'` samples divided by the number of `'No'` samples.

In [None]:
# Balancing ratio: number of 'Yes' samples per 'No' sample.
counter['Yes'] / counter['No']

We can check what it would imply if we are not careful when evaluating our model. We will train a `DummyClassifier`, which does not learn anything from the data but always predicts the most frequent class in the dataset.

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
# Baseline that always answers with the most frequent class. The whole
# dataframe (target column included) is handed over as the features; this
# baseline is not meant to learn anything from them.
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X=df, y=df['Refusal_Flag'])

Let's check that the classifier predicts the most frequent class.

In [None]:
dummy_clf.predict([[0]])

In [None]:
y_pred = dummy_clf.predict(df)

Now, let's see what this implies for accuracy, the default metric of scikit-learn classifiers.

In [None]:
# Report the classifier's own `score` on the data it was fitted on.
print(
    "The accuracy of the default model is {:.3f}".format(
        dummy_clf.score(df, df['Refusal_Flag'])
    )
)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Same figure, computed explicitly from the stored predictions.
print(
    "The accuracy of the default model is "
    + format(accuracy_score(df['Refusal_Flag'], y_pred), ".3f")
)

## Step 1: Use one or several informative metrics to detect the issue

Instead of using the `accuracy_score`, one could use the `balanced_accuracy_score`. Slight issue: not everyone agrees on the definition of the metric.

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Score the stored dummy predictions with balanced accuracy instead of
# plain accuracy.
print(
    "The balanced accuracy of the default model is {:.3f}".format(
        balanced_accuracy_score(df['Refusal_Flag'], y_pred)
    )
)

Then, we can have a look at the confusion matrix to get a real idea of what is happening.

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true target against the dummy predictions.
confusion_matrix(y_true=df['Refusal_Flag'], y_pred=y_pred)

In [None]:
%matplotlib inline
import seaborn as sns

In [None]:
# Labelled confusion matrix: rows are the true classes, columns are the
# predicted classes. `labels` is passed explicitly so that the matrix
# ordering is guaranteed to match the row/column names taken from
# `dummy_clf.classes_`, instead of relying on both being sorted the same way.
# Naming the two indexes lets the heatmap label its axes.
cm_df = pd.DataFrame(
    confusion_matrix(df['Refusal_Flag'], y_pred, labels=dummy_clf.classes_),
    columns=pd.Index(dummy_clf.classes_, name='Predicted label'),
    index=pd.Index(dummy_clf.classes_, name='True label'),
)
sns.heatmap(cm_df, annot=True, fmt='g')

From this confusion matrix, we could compute several metrics: https://en.wikipedia.org/wiki/Confusion_matrix

* precision and recall
* sensitivity and specificity
* area under the roc curve (ROC-AUC)

NB: sensitivity = recall

In [None]:
from sklearn.metrics import precision_score, recall_score

# Score the dummy predictions, treating 'Yes' as the positive class.
precision = precision_score(df['Refusal_Flag'], y_pred, pos_label='Yes')
recall = recall_score(df['Refusal_Flag'], y_pred, pos_label='Yes')

print(f"The precision of the dummy model is {precision}")
print(f"The recall of the dummy model is {recall}")

In [None]:
from imblearn.metrics import sensitivity_score, specificity_score

# Same predictions, scored with the imbalanced-learn metrics; 'Yes' is again
# the positive class.
sensitivity = sensitivity_score(df['Refusal_Flag'], y_pred, pos_label='Yes')
specificity = specificity_score(df['Refusal_Flag'], y_pred, pos_label='Yes')

print(f"The sensitivity of the dummy model is {sensitivity}")
print(f"The specificity of the dummy model is {specificity}")

## Step 2: How to solve the issue during `fit`

In [None]:
# Switch to the adult census dataset, downloaded as CSV from OpenML.
# Note that this rebinds `df`, replacing the dataframe used so far.
df = pd.read_csv("https://www.openml.org/data/get_csv/1595261/adult-census.csv")

In [None]:
# Separate the features from the label column; "fnlwgt" is removed from the
# features as well.
target_name = "class"
data = df.drop(columns=[target_name, "fnlwgt"])
target = df[target_name].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set; the seed is fixed so the split is the same on
# every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(data, target, random_state=0)

### 2.1 Baseline classifier

In [None]:
# Baseline: always predict the most frequent class seen during training,
# then show its score on the held-out split.
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X=X_train, y=y_train)
dummy_clf.score(X=X_test, y=y_test)

### 2.2 Make use of the `class_weight` parameter

#### In linear model

In [None]:
# Column groups, one per kind of preprocessing applied for the linear model.
binary_encoding_columns = ['sex']
one_hot_encoding_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'native-country',
]
scaling_columns = [
    'age',
    'education-num',
    'hours-per-week',
    'capital-gain',
    'capital-loss',
]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Per-column preprocessing for the linear model: ordinal-encode the binary
# column, one-hot encode the other categorical columns (categories unseen at
# fit time are ignored), and standardize the numerical columns.
preprocessor_lr = ColumnTransformer(transformers=[
    ('binary-encoder', OrdinalEncoder(), binary_encoding_columns),
    (
        'one-hot-encoder',
        OneHotEncoder(handle_unknown='ignore'),
        one_hot_encoding_columns,
    ),
    ('standard-scaler', StandardScaler(), scaling_columns),
])

# Full model: preprocessing followed by a logistic regression.
model_lr = make_pipeline(
    preprocessor_lr,
    LogisticRegression(solver='lbfgs', max_iter=1000),
)

In [None]:
# Fit on the training split, keep the predictions for the held-out split,
# and display the pipeline's score on that same held-out split.
y_pred = model_lr.fit(X_train, y_train).predict(X_test)
model_lr.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

Setting `class_weight='balanced'` uses the values of `y` to automatically adjust weights inversely proportional to class frequencies in the input data.

In [None]:
# Turn on balanced class weights for the logistic regression step, refit,
# and evaluate on the held-out split.
model_lr.set_params(
    logisticregression__class_weight='balanced'
).fit(X_train, y_train)
y_pred = model_lr.predict(X_test)
model_lr.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

#### In tree-based model

In [None]:
# Column groups for the tree-based model: every categorical column in one
# list, the numerical columns in the other.
ordinal_encoding_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'sex',
    'race',
    'native-country',
]
scaling_columns = [
    'age',
    'education-num',
    'hours-per-week',
    'capital-gain',
    'capital-loss',
]

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier

# Preprocessing for the tree-based models: the categorical columns are
# ordinal-encoded, and the numerical columns go through a FunctionTransformer
# created without a function (per the scikit-learn docs this forwards the
# values unchanged), i.e. no scaling is applied. The step names describe what
# each step really does; the previous names ('binary-encoder',
# 'standard-scaler') were carried over from the linear-model preprocessor.
preprocessor_rf = ColumnTransformer([
    ('ordinal-encoder', OrdinalEncoder(), ordinal_encoding_columns),
    ('numerical-passthrough', FunctionTransformer(validate=False),
     scaling_columns)
])

# Full model: preprocessing followed by a seeded random forest.
model_rf = make_pipeline(
    preprocessor_rf,
    RandomForestClassifier(n_estimators=100, random_state=42)
)

In [None]:
# Fit the random-forest pipeline, keep the held-out predictions, and display
# its score on the held-out split.
y_pred = model_rf.fit(X_train, y_train).predict(X_test)
model_rf.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Refit the forest with `class_weight='balanced'` and evaluate again.
model_rf.set_params(
    randomforestclassifier__class_weight='balanced'
).fit(X_train, y_train)
y_pred = model_rf.predict(X_test)
model_rf.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Refit the forest with `class_weight='balanced_subsample'` and evaluate
# again.
model_rf.set_params(
    randomforestclassifier__class_weight='balanced_subsample'
).fit(X_train, y_train)
y_pred = model_rf.predict(X_test)
model_rf.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

### 2.3 Resample the training set to have balanced classes

#### Random under-sampling during training

In [None]:
from imblearn.pipeline import make_pipeline as make_pipeline_imblearn
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Logistic-regression pipeline with a random under-sampling step inserted
# between preprocessing and the classifier. The sampler is seeded so that the
# resampled training set, and therefore the reported scores, are the same on
# every run, like the other seeded estimators in this notebook.
model_lr_balanced = make_pipeline_imblearn(
    preprocessor_lr,
    RandomUnderSampler(random_state=42),
    LogisticRegression(solver='lbfgs', max_iter=1000)
)

In [None]:
# Fit the resampling pipeline, keep the held-out predictions, and display
# its score on the held-out split.
y_pred = model_lr_balanced.fit(X_train, y_train).predict(X_test)
model_lr_balanced.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

#### Random over-sampling during training

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Logistic-regression pipeline with a random over-sampling step inserted
# between preprocessing and the classifier. The sampler is seeded so that the
# resampled training set, and therefore the reported scores, are the same on
# every run, like the other seeded estimators in this notebook.
model_lr_balanced = make_pipeline_imblearn(
    preprocessor_lr,
    RandomOverSampler(random_state=42),
    LogisticRegression(solver='lbfgs', max_iter=1000)
)

In [None]:
# Fit the resampling pipeline, keep the held-out predictions, and display
# its score on the held-out split.
y_pred = model_lr_balanced.fit(X_train, y_train).predict(X_test)
model_lr_balanced.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

#### More fancy methods

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Logistic-regression pipeline with a SMOTE over-sampling step inserted
# between preprocessing and the classifier. The sampler is seeded so that the
# generated samples, and therefore the reported scores, are the same on
# every run, like the other seeded estimators in this notebook.
model_lr_balanced = make_pipeline_imblearn(
    preprocessor_lr,
    SMOTE(random_state=42),
    LogisticRegression(solver='lbfgs', max_iter=1000)
)

In [None]:
# Fit the resampling pipeline, keep the held-out predictions, and display
# its score on the held-out split.
y_pred = model_lr_balanced.fit(X_train, y_train).predict(X_test)
model_lr_balanced.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

### 2.4 Use balanced algorithms: `BalancedBaggingClassifier` and `BalancedRandomForestClassifier`

#### Example of `BalancedBaggingClassifier`

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier

In [None]:
# Reference bagging ensemble (no class balancing) of gradient-boosting
# models. The estimator to bag is passed positionally rather than through the
# `base_estimator=` keyword: that keyword was renamed to `estimator=` in
# later scikit-learn releases, while the first positional slot is the bagged
# estimator under both names, so this form runs on old and new versions.
model_bagging = make_pipeline(
    preprocessor_rf,
    BaggingClassifier(HistGradientBoostingClassifier(),
                      n_estimators=10, random_state=42)
)
# Fit, keep the held-out predictions, and display the held-out score.
model_bagging.fit(X_train, y_train)
y_pred = model_bagging.predict(X_test)
model_bagging.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Balanced counterpart of the bagging ensemble, from imbalanced-learn. The
# estimator to bag is passed positionally rather than through the
# `base_estimator=` keyword: that keyword was renamed to `estimator=` in
# later imbalanced-learn releases, while the first positional slot is the
# bagged estimator under both names, so this form runs on old and new
# versions.
model_bagging = make_pipeline(
    preprocessor_rf,
    BalancedBaggingClassifier(HistGradientBoostingClassifier(),
                              n_estimators=10, random_state=42)
)
# Fit, keep the held-out predictions, and display the held-out score.
model_bagging.fit(X_train, y_train)
y_pred = model_bagging.predict(X_test)
model_bagging.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)

#### Example of `BalancedRandomForestClassifier`

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Same preprocessing as the plain random forest, with the classifier swapped
# for imbalanced-learn's balanced variant (same number of trees, same seed).
model_rf = make_pipeline(
    preprocessor_rf,
    BalancedRandomForestClassifier(n_estimators=100, random_state=42),
)
# Fit, keep the held-out predictions, and display the held-out score.
y_pred = model_rf.fit(X_train, y_train).predict(X_test)
model_rf.score(X_test, y_test)

In [None]:
balanced_accuracy_score(y_test, y_pred)