<a href="https://colab.research.google.com/github/Nekokan1500/Machine-Learning/blob/main/Imbalanced_Learning/Example_Imbalanced_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U imbalanced-learn

In [17]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score
from imblearn.metrics import geometric_mean_score

In [4]:
# Download OpenML dataset 1220 and assemble one DataFrame: every feature is
# cast to float and the label is appended as an integer 'target' column.
data = fetch_openml(data_id=1220)

df = (
    pd.DataFrame(data['data'], columns=data['feature_names'])
    .astype(float)
    .assign(target=pd.Series(data['target'].astype(int)))
)

# Peek at five reproducibly sampled rows.
df.sample(n=5, random_state=42)

Unnamed: 0,impression,ad_id,advertiser_id,depth,position,keyword_id,title_id,description_id,user_id,target
12831,1.0,20884487.0,22234.0,1.0,1.0,19303.0,83270.0,74248.0,70.0,0
34147,29.0,10593104.0,1268.0,2.0,1.0,3275.0,2136.0,128.0,0.0,1
7656,1.0,21319021.0,20551.0,2.0,2.0,4766.0,15510.0,15125.0,11569401.0,0
8659,1.0,3831882.0,27486.0,2.0,1.0,1007.0,2869.0,387.0,12581245.0,0
17460,1.0,21442048.0,37039.0,3.0,1.0,35349.0,399482.0,328000.0,0.0,0


In [11]:
# Mean of the integer target column, i.e. the positive-class rate if the
# labels are 0/1 -- this quantifies how imbalanced the data is.
print('{:.3f}'.format(df.loc[:, 'target'].mean()))

0.168


In [18]:
# Report the number of distinct values observed for each feature
for feature in data['feature_names']:
  print('Cardinality of {}: {:,}'.format(feature, df[feature].nunique()))

# Separate features from the label, then hold out 25% of the rows for testing
# (fixed seed so the split is reproducible)
x = df[data['feature_names']]
y = df['target']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

Cardinality of impression: 99
Cardinality of ad_id: 19,228
Cardinality of advertiser_id: 6,064
Cardinality of depth: 3
Cardinality of position: 3
Cardinality of keyword_id: 19,803
Cardinality of title_id: 25,321
Cardinality of description_id: 22,381
Cardinality of user_id: 30,114


In [19]:
def predict_and_evaluate(x_train, y_train, x_test, y_test, sample_weight=None, title='Unweighted'):
  """Fit the embedding + logistic-regression pipeline and print test metrics.

  The pipeline chains three steps: a RandomTreesEmbedding ('Embedder'), a
  MaxAbsScaler ('Scaler') and a saga-solver LogisticRegression ('Classifier').

  Args:
    x_train, y_train: data the pipeline is fitted on.
    x_test, y_test: data the predictions are scored against.
    sample_weight: forwarded unchanged to the 'Classifier' step's fit
      (defaults to None).
    title: label appended to the printed metrics line.

  Returns:
    The fitted Pipeline.
  """
  steps = [
      ('Embedder', RandomTreesEmbedding(n_estimators=10, max_leaf_nodes=20, random_state=42)),
      ('Scaler', MaxAbsScaler()),
      ('Classifier', LogisticRegression(solver='saga', max_iter=1000, random_state=42)),
  ]
  model = Pipeline(steps)
  # The 'Classifier__' prefix routes the weights to the final step only.
  model.fit(x_train, y_train, Classifier__sample_weight=sample_weight)

  predictions = model.predict(x_test)
  precision = precision_score(y_test, predictions)
  recall = recall_score(y_test, predictions)
  g_mean = geometric_mean_score(y_test, predictions)

  report = 'Precision: {:.02%}, Recall: {:.02%}; G-mean: {:.02%} @ {}'
  print(report.format(precision, recall, g_mean, title))
  return model

In [20]:
# Baseline fit: no sample weights, default title
clf = predict_and_evaluate(
    x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test
)

Precision: 62.50%, Recall: 0.30%; G-mean: 5.45% @ Unweighted


In [22]:
# Apply sample weights. The majority class has roughly five times as many
# samples as the minority class, so each minority (positive) sample gets a
# weight of 5 while majority samples keep a weight of 1.
sample_weight = (y_train == 0) * 1 + (y_train == 1) * 5
clf = predict_and_evaluate(
    x_train, y_train, x_test, y_test,
    sample_weight=sample_weight,
    title="Weighted",
)

Precision: 24.78%, Recall: 13.38%; G-mean: 35.04% @ Weighted




In [32]:
def calculate_feature_coeff(clf, n_tail=10):
  """Tabulate the coefficients and intercept of the last step of `clf`.

  Args:
    clf: an indexable object (e.g. a fitted Pipeline) whose last element
      exposes `coef_` and `intercept_`. Only the first row of `coef_` and the
      first entry of `intercept_` are used.
    n_tail: how many trailing rows of the table to return. Defaults to 10,
      which matches the previous hard-coded behaviour; pass None to get the
      full table.

  Returns:
    A DataFrame indexed by 'Features' with a single 'Coeff' column. Rows are
    named 'EmbFeature0', 'EmbFeature1', ... in coefficient order, followed by
    a final 'Intercept' row.
  """
  classifier = clf[-1]
  weights = list(classifier.coef_[0])
  labels = [f'EmbFeature{i}' for i in range(len(weights))] + ['Intercept']
  values = weights + [classifier.intercept_[0]]

  coef_table = pd.DataFrame(
      {'Features': labels, 'Coeff': values}
  ).set_index('Features')

  # None means "no truncation"; otherwise keep only the last n_tail rows
  # (the intercept is always the very last row).
  if n_tail is None:
    return coef_table
  return coef_table.tail(n_tail)

In [33]:
# Display the coefficient table for the most recently fitted pipeline
calculate_feature_coeff(clf=clf)

Unnamed: 0_level_0,Coeff
Features,Unnamed: 1_level_1
EmbFeature187,0.442264
EmbFeature188,-1.540624
EmbFeature189,-0.427968
EmbFeature190,0.596631
EmbFeature191,0.51436
EmbFeature192,0.86612
EmbFeature193,-0.925437
EmbFeature194,-0.150263
EmbFeature195,-0.294957
Intercept,0.69207


In [34]:
# Compare three weighting strategies: refit the pipeline with the positive
# class weighted 1x, 2x and 5x, and collect each fit's coefficient table.
weight_options = [1, 2, 5]
df_coef_list = []

for w in weight_options:
  print(f'\nMinority Class (Positive Class) Weight = Weight x {w}')
  # Majority samples keep weight 1; minority samples get weight w.
  sample_weight = (y_train == 0) * 1 + (y_train == 1) * w
  # NOTE: every run, including the weighted ones, is printed with the
  # title 'Unbalanced'.
  clf = predict_and_evaluate(
      x_train, y_train, x_test, y_test,
      sample_weight=sample_weight, title='Unbalanced',
  )
  # Tag the coefficient column with the weight so the tables can sit side by side.
  df_coef = calculate_feature_coeff(clf).rename(columns={'Coeff': f'Coeff [w={w}]'})
  df_coef_list.append(df_coef)

# One column per weight, rounded to 2 decimals, with in-cell bars drawn from zero.
pd.concat(df_coef_list, axis='columns').round(2).style.bar(
    subset=[f'Coeff [w={w}]' for w in weight_options],
    color='#999',
    align='zero',
)


Minority Class (Positive Class) Weight = Weight x 1
Precision: 62.50%, Recall: 0.30%; G-mean: 5.45% @ Unbalanced

Minority Class (Positive Class) Weight = Weight x 2
Precision: 36.36%, Recall: 2.14%; G-mean: 14.57% @ Unbalanced

Minority Class (Positive Class) Weight = Weight x 5
Precision: 24.78%, Recall: 13.38%; G-mean: 35.04% @ Unbalanced




Unnamed: 0_level_0,Coeff [w=1],Coeff [w=2],Coeff [w=5]
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EmbFeature187,0.09,0.17,0.44
EmbFeature188,-1.18,-1.37,-1.54
EmbFeature189,-0.17,-0.26,-0.43
EmbFeature190,0.37,0.45,0.6
EmbFeature191,0.37,0.44,0.51
EmbFeature192,0.6,0.76,0.87
EmbFeature193,-0.92,-0.88,-0.93
EmbFeature194,-0.24,-0.2,-0.15
EmbFeature195,-0.13,-0.18,-0.29
Intercept,-0.15,0.1,0.69
