In [1]:
# from MEM import MEM
from datasets import get_dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize
import warnings

warnings.filterwarnings('ignore')

## Adults dataset

In [2]:
X_train, y_train, X_test, y_test = get_dataset('adults', discrete=True)
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((32561, 101), (32561, 1), (16281, 101), (16281, 1))

### TAN

In [6]:
from pytan import CLGBayesNetClassifier, DiscreteBayesNetClassifier

In [None]:
clf = DiscreteBayesNetClassifier()

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

### Feature selection

$l1$-regularized logistic regression maximizes the worst-case mutual information over
$Γ(Q)$, which seems superior to the methods maximizing a heusristic instead of the mutual information

In [5]:
logreg = LogisticRegression(penalty='l1', C=10)
logreg.fit(X_train, y_train)
logreg.score(X_test,y_test)

0.8352681039248203

In [5]:
threshold = 0.4
selected_columns = [i for i, score in enumerate(logreg.coef_[0]) if abs(score) >threshold]

In [6]:
X_train_selected = X_train[:,selected_columns]
X_test_selected = X_test[:,selected_columns]
X_train_selected.shape, X_test_selected.shape

((32561, 55), (16281, 55))

### Vanilla SVM

In [18]:
svc=SVC(kernel='linear')
svc.fit(X_train, y_train)
svc.score(X_test,y_test)

0.8527731711811314

### Minimize based implementation

In [8]:
lmbd = 0.001
y = y_train
X = X_train_selected
def MEM(alpha):
    w = (y*X)@alpha
    return np.mean(np.maximum(0,(1-w)*0.5, 1 -w))+lmbd*np.linalg.norm(alpha,ord=2)**2

In [9]:
MEM(np.ones(X.shape[1]))

1.4659275500334625

In [10]:
res = minimize(MEM, np.ones(X.shape[1]))

In [11]:
# test score
accuracy_score(y_test, np.sign(X_test_selected@res.x))

0.8448498249493275

### SGD-based implementation

In [7]:
from MEM import MEM

In [8]:
X_train_selected.shape

(32561, 55)

In [16]:
clf = MEM()
clf.fit(X_train_selected, y_train.flatten())

In [17]:
# test score
accuracy_score(y_test, clf.predict(X_test_selected))

0.8406731773232602