In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_blobs, make_moons, make_circles
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [83]:
data = make_classification(n_features = 100, n_samples = 2000, n_informative = 2, n_redundant = 2, class_sep = 0.3, weights=[0.15, 0.85])

In [114]:
xtrain, xtest, ytrain, ytest = train_test_split(data[0], data[1], test_size = 0.2)
xtrain, xval, ytrain, yval = train_test_split(xtrain, ytrain, test_size = 0.25)

In [116]:
xtrain_res, ytrain_res = SMOTE(sampling_strategy = 'minority').fit_resample(xtrain, ytrain)

In [117]:
confusion_matrix(ytest, LogisticRegression(max_iter=1000).fit(xtrain, ytrain).predict(xtest))

array([[  1,  47],
       [ 13, 339]], dtype=int64)

In [149]:
confusion_matrix(ytest, LogisticRegression(max_iter=1000).fit(xtrain_res, ytrain_res).predict(xtest))

array([[ 20,  28],
       [117, 235]], dtype=int64)

In [119]:
confusion_matrix(ytest, GaussianNB().fit(xtrain, ytrain).predict(xtest))

array([[ 18,  30],
       [ 55, 297]], dtype=int64)

In [180]:
confusion_matrix(ytest, GaussianNB().fit(xtrain_res, ytrain_res).predict(xtest))

array([[ 14,  34],
       [ 54, 298]], dtype=int64)

In [11]:
from scipy.stats import norm, entropy, gaussian_kde

In [121]:
class WeightedEntropyClassifier:
    minority_class = None
    majority_class = None
    use_normal = False
    proportions = []
    
    def __init__(self, use_normal = False):
        self.use_normal = use_normal
        
    # get entropy weights (e.g, "fitting" the model)
    def fit(self, xtrain, ytrain, use_normal = False):
        full_train = pd.DataFrame(np.concatenate((xtrain, ytrain.reshape(-1,1)), axis=1))
        full_train.columns = [*full_train.columns[:-1], 'target']
        self.minority_class = full_train[full_train.target == 0]
        self.majority_class = full_train[full_train.target == 1]

        feature_max_entropy = []
        # clear proportions vector to "refit" model (if already fitted)
        self.proportions = []

        for i in range(xtrain.shape[1]):
            current_feature = i
            points = np.linspace(min(min(self.minority_class.iloc[:,current_feature]), min(self.majority_class.iloc[:,current_feature])), max(max(self.minority_class.iloc[:,current_feature]), max(self.majority_class.iloc[:,current_feature])), 50)
            if self.use_normal == True:
                min_pdf = norm.pdf(points, loc = np.mean(self.minority_class.iloc[:,current_feature]), scale = np.std(self.minority_class.iloc[:,current_feature]))
                maj_pdf = norm.pdf(points, loc = np.mean(self.majority_class.iloc[:,current_feature]), scale = np.std(self.majority_class.iloc[:,current_feature]))
            else:
                min_pdf = gaussian_kde(self.minority_class.iloc[:,current_feature]).evaluate(points)
                maj_pdf = gaussian_kde(self.majority_class.iloc[:,current_feature]).evaluate(points)
                # relative entropy is not commutative
                entrp =  max(entropy(min_pdf, maj_pdf), entropy(maj_pdf, min_pdf))
                feature_max_entropy.append(entrp)

        for fme in feature_max_entropy:
                self.proportions.append(fme / sum(feature_max_entropy))

    # compare new data points with weighted entropies and likelihood
    def predict(self, xtest, min_bias = 0, maj_bias = 0):
        preds = []
        for i in range(xtest.shape[0]):
            min_score = 0
            maj_score = 0
            for j in range(len(self.proportions)):
                data_point = xtest[i][j]
                if self.use_normal == True:
                    min_score += self.proportions[j]*norm.pdf(data_point, loc = np.mean(self.minority_class.iloc[:,j]), scale = np.std(self.minority_class.iloc[:,j]))
                    maj_score += self.proportions[j]*norm.pdf(data_point, loc = np.mean(self.majority_class.iloc[:,j]), scale = np.std(self.majority_class.iloc[:,j]))
                else:
                    min_score += self.proportions[j]*gaussian_kde(self.minority_class.iloc[:,j]).evaluate(data_point)
                    maj_score += self.proportions[j]*gaussian_kde(self.majority_class.iloc[:,j]).evaluate(data_point)
            preds.append(np.argmax([min_score + min_bias, maj_score + maj_bias]))

        return preds

In [157]:
wec = WeightedEntropyClassifier()
wec.fit(xtrain[:,selected_feats], ytrain)

In [178]:
mypreds = wec.predict(xtest[:,selected_feats], min_bias = 0.35, maj_bias = 0.4)

In [179]:
confusion_matrix(ytest, mypreds)

array([[ 33,  15],
       [170, 182]], dtype=int64)

In [128]:
selected_feats = []
for i in range(len(wec.proportions)):
    if wec.proportions[i] >= 0.003:
        selected_feats.append(i)
        
print("Total selected features: ", len(selected_feats))

Total selected features:  6
