In [1261]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_blobs, make_moons, make_circles
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [1597]:
# Synthetic, imbalanced two-class problem: 2000 samples and 80 features, of which
# 5 are informative and 3 redundant; class weights are 0.15 / 0.85 (class 0 is rare).
# random_state pins the draw so the notebook reproduces on a fresh kernel.
data = make_classification(
    n_features=80,
    n_samples=2000,
    n_informative=5,
    n_redundant=3,
    class_sep=0.4,
    weights=[0.15, 0.85],
    random_state=42,
)

In [1598]:
# Hold out 30% of the samples for evaluation. random_state fixes the shuffle so
# every rerun produces the same train/test partition.
xtrain, xtest, ytrain, ytest = train_test_split(
    data[0], data[1], test_size=0.3, random_state=42
)

In [1599]:
# Resample only the training split (the test split is left untouched), using the
# 'majority' strategy. random_state makes the random row selection repeatable.
xtrain_res, ytrain_res = RandomUnderSampler(
    sampling_strategy='majority', random_state=42
).fit_resample(xtrain, ytrain)

In [1600]:
# Baseline: logistic regression trained on the original (imbalanced) training split,
# evaluated on the held-out split.
confusion_matrix(
    y_true=ytest,
    y_pred=LogisticRegression(max_iter=1000).fit(xtrain, ytrain).predict(xtest),
)

array([[  7,  86],
       [  1, 506]], dtype=int64)

In [1601]:
# Same logistic regression, but trained on the undersampled training split;
# evaluation still uses the untouched held-out split.
confusion_matrix(
    y_true=ytest,
    y_pred=LogisticRegression(max_iter=1000).fit(xtrain_res, ytrain_res).predict(xtest),
)

array([[ 67,  26],
       [276, 231]], dtype=int64)

In [1555]:
# Gaussian naive Bayes trained on the original (imbalanced) training split.
# NOTE(review): the stored output of this cell has true-class totals 90/510, while
# the logistic-regression cells show 93/507, and its execution count (1555) predates
# the data cell (1597) — the output appears to come from an earlier data draw; re-run.
confusion_matrix(
    y_true=ytest,
    y_pred=GaussianNB().fit(xtrain, ytrain).predict(xtest),
)

array([[ 29,  61],
       [ 52, 458]], dtype=int64)

In [1567]:
# Gaussian naive Bayes trained on the undersampled training split.
# NOTE(review): the stored output of this cell has true-class totals 90/510, while
# the logistic-regression cells show 93/507, and its execution count (1567) predates
# the data cell (1597) — the output appears to come from an earlier data draw; re-run.
confusion_matrix(
    y_true=ytest,
    y_pred=GaussianNB().fit(xtrain_res, ytrain_res).predict(xtest),
)

array([[ 56,  34],
       [175, 335]], dtype=int64)

In [1202]:
from scipy.stats import norm, entropy, gaussian_kde

In [1602]:
class WeightedEntropyClassifier:
    """Two-class classifier built from entropy-weighted per-feature densities.

    ``fit`` splits the training rows into those labelled 0 (stored in
    ``minority_class``) and those labelled 1 (stored in ``majority_class``).
    For every feature it estimates one density per class, measures how far
    apart the two densities are with the relative entropy, and normalises
    those distances into ``proportions`` (one weight per feature, summing
    to 1).

    ``predict`` scores a row for each class as the weighted sum of that
    class's per-feature densities evaluated at the row's values, adds the
    optional biases, and returns the index of the larger score.

    Densities are Gaussian kernel density estimates by default, or a single
    normal distribution (sample mean, population standard deviation) per
    class and feature when ``use_normal`` is truthy.
    """

    # Training rows with target == 0 / target == 1 (DataFrames; None until fit).
    minority_class = None
    majority_class = None
    # Density model switch: False -> gaussian_kde, True -> fitted normal.
    use_normal = False
    # Normalised per-feature entropy weights; replaced by fit().
    proportions = []

    def __init__(self, use_normal = False):
        """Create an unfitted classifier.

        Parameters
        ----------
        use_normal : bool, default False
            Use a fitted normal distribution instead of a Gaussian KDE for
            the per-feature class densities.
        """
        self.use_normal = use_normal
        # Per-instance list so no state is ever shared through the class attribute.
        self.proportions = []

    def _density(self, values):
        """Return a callable mapping points to the estimated density of ``values``."""
        if self.use_normal:
            mu = np.mean(values)
            sigma = np.std(values)
            return lambda pts: norm.pdf(pts, loc=mu, scale=sigma)
        return gaussian_kde(values).evaluate

    # get entropy weights (e.g, "fitting" the model)
    def fit(self, xtrain, ytrain, use_normal = False):
        """Compute the per-feature entropy weights from labelled training data.

        Parameters
        ----------
        xtrain : array-like of shape (n_samples, n_features)
            Training features.
        ytrain : array-like of shape (n_samples,)
            Labels; rows equal to 0 form ``minority_class`` and rows equal
            to 1 form ``majority_class``.
        use_normal : bool, default False
            Accepted for backward compatibility and not read; the density
            model is chosen by the ``use_normal`` given to the constructor.

        Raises
        ------
        ValueError
            If the training data contains no row labelled 0 or no row
            labelled 1.
        """
        xtrain = np.asarray(xtrain)
        ytrain = np.asarray(ytrain)

        full_train = pd.DataFrame(np.concatenate((xtrain, ytrain.reshape(-1, 1)), axis=1))
        full_train.columns = [*full_train.columns[:-1], 'target']
        self.minority_class = full_train[full_train.target == 0]
        self.majority_class = full_train[full_train.target == 1]
        if len(self.minority_class) == 0 or len(self.majority_class) == 0:
            raise ValueError("fit needs at least one row labelled 0 and one row labelled 1")

        feature_max_entropy = []
        for feature in range(xtrain.shape[1]):
            min_values = self.minority_class.iloc[:, feature].to_numpy()
            maj_values = self.majority_class.iloc[:, feature].to_numpy()

            # Common evaluation grid spanning the range of both classes.
            points = np.linspace(
                min(min_values.min(), maj_values.min()),
                max(max_values for max_values in (min_values.max(), maj_values.max())),
                50,
            )
            min_pdf = self._density(min_values)(points)
            maj_pdf = self._density(maj_values)(points)

            # Relative entropy is not commutative, so keep the larger direction.
            # This runs for both density models; otherwise the normal model
            # would end up with no weights at all.
            feature_max_entropy.append(
                max(entropy(min_pdf, maj_pdf), entropy(maj_pdf, min_pdf))
            )

        # Rebuild the weights from scratch so refitting replaces earlier results.
        total = sum(feature_max_entropy)
        if np.isfinite(total) and total > 0:
            self.proportions = [fme / total for fme in feature_max_entropy]
        else:
            # Zero or non-finite total would give NaN weights; fall back to
            # equal weights so predict() still compares meaningful scores.
            n_features = len(feature_max_entropy)
            self.proportions = [1.0 / n_features] * n_features

    # compare new data points with weighted entropies and likelihood
    def predict(self, xtest, min_bias = 0, maj_bias = 0):
        """Predict a class index (0 or 1) for every row of ``xtest``.

        Parameters
        ----------
        xtest : array-like of shape (n_samples, n_features)
            Rows to classify; columns must match those used in ``fit``.
        min_bias, maj_bias : float, default 0
            Constants added to the class-0 and class-1 scores before the
            comparison.

        Returns
        -------
        list
            One prediction per row: 0 when the biased class-0 score is at
            least the biased class-1 score, otherwise 1.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.minority_class is None or self.majority_class is None:
            raise ValueError("predict called before fit")

        xtest = np.asarray(xtest)
        n_points = xtest.shape[0]
        if n_points == 0:
            return []

        min_score = np.zeros(n_points)
        maj_score = np.zeros(n_points)
        for feature, weight in enumerate(self.proportions):
            column = xtest[:, feature]
            # Build each class density once per feature and evaluate the whole
            # column, instead of re-estimating it for every single test point.
            min_density = self._density(self.minority_class.iloc[:, feature].to_numpy())
            maj_density = self._density(self.majority_class.iloc[:, feature].to_numpy())
            min_score += weight * min_density(column)
            maj_score += weight * maj_density(column)

        # argmax over the class axis: ties resolve to class 0, as with a per-row argmax.
        totals = np.vstack((min_score + min_bias, maj_score + maj_bias))
        return list(np.argmax(totals, axis=0))

In [1611]:
# NOTE(review): `selected_feats` is only assigned in a cell that appears further down
# in this notebook (execution count 1610, versus 1611 here), so a top-to-bottom run on
# a fresh kernel would raise NameError at this point unless it is defined elsewhere —
# TODO: confirm and reorder the cells.
# Fit the entropy-weighted classifier on the undersampled training data, restricted
# to the selected feature columns.
wec = WeightedEntropyClassifier()
wec.fit(xtrain_res[:,selected_feats], ytrain_res)

In [1634]:
# Classify the held-out rows using only the selected feature columns. The additive
# biases (0.02 for class 0, 0.01 for class 1) are added to the class scores before
# the larger one is picked.
mypreds = wec.predict(
    xtest[:, selected_feats],
    min_bias=0.02,
    maj_bias=0.01,
)

In [1635]:
# Confusion matrix of the entropy-weighted classifier's predictions on the held-out split.
confusion_matrix(y_true=ytest, y_pred=mypreds)

array([[ 49,  44],
       [165, 342]], dtype=int64)

In [1609]:
# Inspect the per-feature weights computed by fit(): each entry is that feature's
# maximum relative entropy divided by the sum over all features.
wec.proportions

[0.010063729692778226,
 0.003093895323275523,
 0.005803595718118471,
 0.011733614642745553,
 0.0037222767352874902,
 0.006104648574081191,
 0.006860177961012832,
 0.00700967765397867,
 0.015379389456804058,
 0.013955950988463316,
 0.01134431069732189,
 0.015933815476529563,
 0.006102526162443828,
 0.031898354689956054,
 0.006567671170839126,
 0.015283740269908787,
 0.04199528560315403,
 0.006495814281874749,
 0.0029049848171185918,
 0.004569179443700789,
 0.006147697412063598,
 0.003489406713163303,
 0.004043037505978401,
 0.009724445915576396,
 0.004017918125390716,
 0.004348075862064463,
 0.003684463492318325,
 0.0018484966472288205,
 0.005303526354525885,
 0.0038049187479611214,
 0.0056048833416901265,
 0.021606592174664468,
 0.01941441107853151,
 0.0057848195823186005,
 0.011030493347722567,
 0.003629044976086319,
 0.004751460960514179,
 0.0039933389532311354,
 0.006882224237559627,
 0.0039582750729574845,
 0.0027941181174918244,
 0.008731751658328119,
 0.020430465497193965,
 0.012

In [1610]:
# Keep the positions of the features whose normalised entropy weight is at least 0.05.
selected_feats = [
    idx for idx, weight in enumerate(wec.proportions) if weight >= 0.05
]

print("Total selected features: ", len(selected_feats))

Total selected features:  2
