In [1]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import clone
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification, make_multilabel_classification
from sklearn.metrics import classification_report,accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import numpy as np
from multi_label import MultilabelPredictor

#### Make an imbalanced binary classification problem (roughly 90% / 10%)

In [2]:
def show_ratio(y):
    """Print the class counts of ``y`` and its minority-to-majority ratio.

    The ratio is the count of the rarest class divided by the count of the
    most frequent class. For a two-class target this is the usual
    minor / major ratio; for more classes it compares the two extremes, and
    for a single class it is ``1.0``.

    Parameters
    ----------
    y : iterable of hashable labels
        Target labels, one per sample.

    Raises
    ------
    ValueError
        If ``y`` contains no labels.
    """
    # Count once and reuse the result for both the ratio and the printout.
    counts = Counter(y)
    if not counts:
        raise ValueError("show_ratio() needs at least one label, got an empty input")

    # min/max over the counts avoids unpacking into exactly two names, which
    # would fail with an opaque error whenever y is not strictly binary.
    resampled_ratio = min(counts.values()) / max(counts.values())
    print(counts, resampled_ratio)

In [3]:
# Synthetic two-class data set: 1000 samples with class weights 0.9 / 0.1.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.9, 0.1],
)
# Last expression of the cell: display the per-class sample counts.
Counter(y)

Counter({0: 896, 1: 104})

#### A multilabel predictor wraps a base predictor

In [4]:
# Build a predictor, setting only the two minority-to-majority sampling ratios;
# every other constructor argument keeps its default.
predictor = MultilabelPredictor(
    under_sample_minor_to_major=0.2,
    over_sample_minor_to_major='auto',
)

In [5]:
# Echo the estimator so the notebook displays its repr (configured parameters).
predictor

MultilabelPredictor(base_predictor=LogisticRegression(),
                    over_sampler=RandomOverSampler(),
                    under_sample_minor_to_major=0.2,
                    under_sampler=RandomUnderSampler())

#### This predictor is able to perform sampling tasks

In [6]:
# Run only the predictor's under-sampling step on the binary data, then print
# the resulting class counts and minority-to-majority ratio.
x_resampled, y_resampled = predictor.under_sampling(X,y)
show_ratio(y_resampled)

Counter({0: 520, 1: 104}) 0.2


In [7]:
# Feed the current x_resampled / y_resampled through the over-sampling step
# and print the new class counts and ratio.
# NOTE(review): this cell overwrites its own inputs, so re-running it passes
# already over-sampled data back into over_sampling rather than the
# under-sampled data it was first run on.
x_resampled, y_resampled = predictor.over_sampling(x_resampled, y_resampled)
show_ratio(y_resampled)

Counter({0: 520, 1: 520}) 1.0


In [8]:
# Resample starting again from the original X, y with a single call, then
# print the class counts and ratio.
# presumably combined_resampling chains the under- and over-sampling steps —
# TODO confirm against the MultilabelPredictor implementation.
x_resampled, y_resampled = predictor.combined_resampling(X,y)
show_ratio(y_resampled)

Counter({0: 520, 1: 520}) 1.0


#### Multilabel predictions

In [32]:
# Synthetic multilabel data (5000 samples, n_labels=2, n_classes=10).
# This rebinds X and y, replacing the binary data set used above.
X, y = make_multilabel_classification(
    n_samples=5000,
    n_labels=2,
    n_classes=10,
)

# Hold out 30% of the samples for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [36]:
from sklearn.tree import DecisionTreeClassifier

# Rebuild the predictor with a decision tree as the base estimator, an explicit
# random over-sampler, and an under-sampling ratio of 0.1.
predictor = MultilabelPredictor(
    DecisionTreeClassifier(),
    under_sample_minor_to_major=0.1,
    over_sample_minor_to_major='auto',
    over_sampler=RandomOverSampler(),
)

# Fit on the training split; as the last expression of the cell, the value
# returned by fit() is displayed.
predictor.fit(X_train, y_train)

fitting target: 0 >>>>>> Done.
fitting target: 1 >>>>>> Done.
fitting target: 2 >>>>>> Done.
fitting target: 3 >>>>>> Done.
fitting target: 4 >>>>>> Done.
fitting target: 5 >>>>>> Done.
fitting target: 6 >>>>>> Done.
fitting target: 7 >>>>>> Done.
fitting target: 8 >>>>>> Done.
fitting target: 9 >>>>>> Done.


MultilabelPredictor(base_predictor=DecisionTreeClassifier(),
                    over_sampler=RandomOverSampler(),
                    under_sample_minor_to_major=0.1,
                    under_sampler=RandomUnderSampler(sampling_strategy=0.4905414551607445))

In [37]:
from multi_label import MultiLabelEvaluator

# The evaluator receives the fitted predictor, the held-out features, and the
# held-out targets wrapped in a DataFrame.
eva = MultiLabelEvaluator(
    predictor,
    X_test,
    pd.DataFrame(y_test),
)

# evaluate() returns three results, unpacked here in the order it yields them.
cmats, aucs, accs = eva.evaluate()

In [38]:
# Display the second result returned by eva.evaluate().
# presumably one ROC AUC score per target index — TODO confirm against
# MultiLabelEvaluator.
aucs

{0: 0.7061578308967705,
 1: 0.6574088521350185,
 2: 0.5814286634045841,
 3: 0.6592885945132845,
 4: 0.6555865187444135,
 5: 0.6105801059251903,
 6: 0.6686741335049231,
 7: 0.5827050424022866,
 8: 0.6697909933204051,
 9: 0.7182860318681789}