In [1363]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_blobs, make_moons, make_circles
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, auc, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [1364]:
data = make_classification(n_features = 5, n_samples = 1000, class_sep = 0.5, weights=[0.3, 0.7])

In [1365]:
xtrain, xtest, ytrain, ytest = train_test_split(data[0], data[1], test_size = 0.2)
#xtrain, xval, ytrain, yval = train_test_split(xtrain, ytrain, test_size = 0.25)

In [868]:
xtrain_res, ytrain_res = SMOTE().fit_resample(xtrain, ytrain)

In [797]:
round(roc_auc_score(ytest, LogisticRegression(max_iter=1000).fit(xtrain, ytrain).predict(xtest)),3)

0.813

In [798]:
round(roc_auc_score(ytest, LogisticRegression(max_iter=1000).fit(xtrain_res, ytrain_res).predict(xtest)),3)

0.813

In [799]:
round(roc_auc_score(ytest, RandomForestClassifier().fit(xtrain, ytrain).predict(xtest)),3)

0.886

In [800]:
round(roc_auc_score(ytest, RandomForestClassifier().fit(xtrain_res, ytrain_res).predict(xtest)),3)

0.981

In [801]:
round(roc_auc_score(ytest, KNeighborsClassifier().fit(xtrain, ytrain).predict(xtest)),3)

0.844

In [802]:
round(roc_auc_score(ytest, KNeighborsClassifier().fit(xtrain_res, ytrain_res).predict(xtest)),3)

0.901

In [803]:
round(roc_auc_score(ytest, GaussianNB().fit(xtrain, ytrain).predict(xtest)),3)

0.802

In [804]:
round(roc_auc_score(ytest, GaussianNB().fit(xtrain_res, ytrain_res).predict(xtest)),3)

0.802

In [619]:
from scipy.stats import norm, entropy, gaussian_kde

In [620]:
class WeightedEntropyClassifier:
    minority_class = None
    majority_class = None
    use_normal = False
    proportions = []
    
    def __init__(self, use_normal = False):
        self.use_normal = use_normal
        
    # get entropy weights (e.g, "fitting" the model)
    def fit(self, xtrain, ytrain, use_normal = False):
        full_train = pd.DataFrame(np.concatenate((xtrain, ytrain.reshape(-1,1)), axis=1))
        full_train.columns = [*full_train.columns[:-1], 'target']
        self.minority_class = full_train[full_train.target == 0]
        self.majority_class = full_train[full_train.target == 1]

        feature_max_entropy = []
        # clear proportions vector to "refit" model (if already fitted)
        self.proportions = []

        for i in range(xtrain.shape[1]):
            current_feature = i
            points = np.linspace(min(min(self.minority_class.iloc[:,current_feature]), min(self.majority_class.iloc[:,current_feature])), max(max(self.minority_class.iloc[:,current_feature]), max(self.majority_class.iloc[:,current_feature])), 50)
            if self.use_normal == True:
                min_pdf = norm.pdf(points, loc = np.mean(self.minority_class.iloc[:,current_feature]), scale = np.std(self.minority_class.iloc[:,current_feature]))
                maj_pdf = norm.pdf(points, loc = np.mean(self.majority_class.iloc[:,current_feature]), scale = np.std(self.majority_class.iloc[:,current_feature]))
            else:
                min_pdf = gaussian_kde(self.minority_class.iloc[:,current_feature]).evaluate(points)
                maj_pdf = gaussian_kde(self.majority_class.iloc[:,current_feature]).evaluate(points)
                # relative entropy is not commutative
                entrp =  max(entropy(min_pdf, maj_pdf), entropy(maj_pdf, min_pdf))
                feature_max_entropy.append(entrp)

        for fme in feature_max_entropy:
                self.proportions.append(fme / sum(feature_max_entropy))

    # compare new data points with weighted entropies and likelihood
    def predict(self, xtest, min_bias = 0, maj_bias = 0):
        preds = []
        for i in range(xtest.shape[0]):
            min_score = 0
            maj_score = 0
            for j in range(len(self.proportions)):
                data_point = xtest[i][j]
                if self.use_normal == True:
                    min_score += self.proportions[j]*norm.pdf(data_point, loc = np.mean(self.minority_class.iloc[:,j]), scale = np.std(self.minority_class.iloc[:,j]))
                    maj_score += self.proportions[j]*norm.pdf(data_point, loc = np.mean(self.majority_class.iloc[:,j]), scale = np.std(self.majority_class.iloc[:,j]))
                else:
                    min_score += self.proportions[j]*gaussian_kde(self.minority_class.iloc[:,j]).evaluate(data_point)
                    maj_score += self.proportions[j]*gaussian_kde(self.majority_class.iloc[:,j]).evaluate(data_point)
            preds.append(np.argmax([min_score + min_bias, maj_score + maj_bias]))

        return preds

In [1324]:
skf = StratifiedKFold(n_splits=3) # min_bias = 0.05 : 0.621
aucs = []
for train_index, test_index in skf.split(xtrain, ytrain):
    X_train, X_test = xtrain[train_index], xtrain[test_index]
    y_train, y_test = ytrain[train_index], ytrain[test_index]
    wec.fit(X_train, y_train)
    p = wec.predict(X_test, min_bias = 0.08)
    aucs.append(roc_auc_score(y_test, p))
print(round(np.mean(aucs),3))

0.609


In [1385]:
wec = WeightedEntropyClassifier()
wec.fit(xtrain[:,[1,2]], ytrain)

In [1386]:
mypreds = wec.predict(xtest[:,[1,2]])

In [1387]:
round(roc_auc_score(ytest, mypreds),3)

0.607

In [1369]:
wec.proportions

[0.16254059373377344,
 0.02634486944576904,
 0.0029665955296301534,
 0.5580116132550033,
 0.25013632803582414]

In [389]:
selected_feats = []
for i in range(len(wec.proportions)):
    if wec.proportions[i] >= 0.05:
        selected_feats.append(i)
        
print("Total selected features: ", len(selected_feats))

Total selected features:  4
