In [1]:
# from MEM import MEM
from datasets import get_dataset, one_hot_df, get_adults_dataframes
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize
from pytan import CLGBayesNetClassifier, DiscreteBayesNetClassifier
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Names of the benchmark datasets to load, in evaluation order.
datasets = ['credits', 'promoters', 'hepatitis', 'adult', 'kr-vs-kp', 'votes']

# dataset name -> (X_train, y_train, X_test, y_test)
data = {}

for dataset in datasets:
    print(dataset)
    X_train, y_train, X_test, y_test = get_dataset(dataset, discrete=True)
    train_frame = pd.DataFrame(X_train)
    test_frame = pd.DataFrame(X_test)
    # Columns with a single distinct value in the training split are removed
    # from both splits.
    constant_columns = [
        column for column in train_frame.columns
        if train_frame[column].nunique() == 1
    ]
    X_train = train_frame.drop(columns=constant_columns).values
    X_test = test_frame.drop(columns=constant_columns).values
    data[dataset] = (X_train, y_train, X_test, y_test)

credits
promoters
hepatitis
adult
kr-vs-kp
votes


In [6]:
# Show which label values occur in the train and test targets of each dataset.
for dataset in datasets:
    train_labels, test_labels = data[dataset][1], data[dataset][-1]
    print(np.unique(train_labels), np.unique(test_labels))

[-1  1] [-1  1]
[-1  1] [-1  1]
[-1  1] [-1  1]
[-1  1] [-1  1]
[-1  1] [-1  1]
[-1  1] [-1  1]


In [7]:
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm_notebook as tqdm

In [8]:
def process_MEM(num_iters, data=data, lmbd=0.01):
    """Fit the minimize-based MEM linear classifier on every dataset and print test errors.

    For each of ``num_iters`` rounds and each entry of ``data``, a weight
    vector is found by minimising a regularised hinge-type objective with
    ``scipy.optimize.minimize`` (started from the all-ones vector, ``tol=1e-3``).
    The test error ``1 - accuracy`` of ``sign(X_test @ weights)`` is recorded,
    and the per-dataset mean and variance over the rounds are printed.

    Parameters
    ----------
    num_iters : int
        Number of times the sweep over all datasets is repeated.
    data : dict
        Maps a dataset name to ``(X_train, y_train, X_test, y_test)``.
        Assumes ``y_train`` broadcasts row-wise against ``X_train``
        (e.g. shape ``(n, 1)``) -- TODO confirm against ``get_dataset``.
    lmbd : float, optional
        Weight of the squared L2 penalty on the weight vector. Defaults to the
        previously hard-coded value ``0.01``.

    Returns
    -------
    None
        Results are only printed.
    """

    def objective(alpha, X, y):
        # Per-sample margin y_i * <x_i, alpha>.
        w = (y * X) @ alpha
        # The original call was np.maximum(0, (1 - w) * 0.5, 1 - w). The third
        # positional argument of a NumPy ufunc is `out`, so `1 - w` was only an
        # output buffer and the value was always max(0, (1 - w) / 2). That is
        # kept here so earlier results stay reproducible.
        # NOTE(review): if a three-term maximum was intended, nest the calls,
        # e.g. np.maximum(0, np.maximum(a, b)). With `1 - w` as the third term
        # that would reduce to the plain hinge max(0, 1 - w); presumably a
        # different term was meant -- confirm against the MEM formulation.
        loss = np.maximum(0, (1 - w) * 0.5)
        return np.mean(loss) + lmbd * np.linalg.norm(alpha, ord=2) ** 2

    results = [[] for _ in range(len(data))]
    for _ in tqdm(range(num_iters)):
        for j, (dataset, sample) in enumerate(data.items()):
            print(dataset)
            X_train, y_train, X_test, y_test = sample
            # Pass the data explicitly instead of closing over variables that
            # are rebound on every loop iteration.
            res = minimize(
                objective,
                np.ones(X_train.shape[1]),
                args=(X_train, y_train),
                tol=1e-3,
            )
            predictions = np.sign(X_test @ res.x).astype(int)
            results[j].append(1 - accuracy_score(y_test, predictions))
            print(results[j])
    results = np.array(results)
    mean, var = results.mean(axis=1), results.var(axis=1)
    for name, vmean, vvar in zip(data.keys(), mean, var):
        print(f'{name:<15} mean: {vmean:<5},\t var: {vvar}')

In [11]:
def process_clf(clf, num_iters, data=data):
    """Repeatedly fit ``clf`` on every dataset and print its test-error statistics.

    ``clf`` must provide ``fit(X, y)`` and ``predict(X)``. ``data`` maps a
    dataset name to ``(X_train, y_train, X_test, y_test)``. For each of the
    ``num_iters`` rounds the classifier is refitted on each training split and
    ``1 - accuracy`` on the matching test split is stored. The per-dataset
    mean and variance over rounds are printed; nothing is returned.
    """
    names = list(data)
    errors = [[] for _ in names]
    for _ in tqdm(range(num_iters)):
        for per_dataset, dataset in zip(errors, names):
            print(dataset)
            X_train, y_train, X_test, y_test = data[dataset]
            clf.fit(X_train, y_train)
            predictions = clf.predict(X_test)
            per_dataset.append(1 - accuracy_score(y_test, predictions))
    error_matrix = np.array(errors)
    mean = error_matrix.mean(axis=1)
    var = error_matrix.var(axis=1)
    for name, vmean, vvar in zip(names, mean, var):
        print(f'{name:<15} mean: {vmean:<5},\t var: {vvar}')

In [13]:
# Evaluate the pytan discrete Bayes-net classifier (alpha=1) once on every dataset.
clf = DiscreteBayesNetClassifier(alpha=1)
process_clf(clf, 1, data=data)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

credits
promoters
hepatitis
adult
kr-vs-kp
votes

credits         mean: 0.138728323699422,	 var: 0.0
promoters       mean: 0.18518518518518523,	 var: 0.0
hepatitis       mean: 0.1282051282051282,	 var: 0.0
adult           mean: 0.18536944905104114,	 var: 0.0
kr-vs-kp        mean: 0.07259073842302877,	 var: 0.0
votes           mean: 0.07339449541284404,	 var: 0.0


In [27]:
# Run the minimize-based MEM once over every dataset in `data`.
# NOTE(review): the recorded output below lists only three datasets, whereas
# `datasets` names six -- presumably `data` held fewer entries when this cell
# last ran; re-run from a fresh kernel to confirm.
process_MEM(num_iters=1, data=data)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

credits
[0.10982658959537572]
promoters
[0.11111111111111116]
hepatitis
[0.17948717948717952]

credits         mean: 0.10982658959537572,	 var: 0.0
promoters       mean: 0.11111111111111116,	 var: 0.0
hepatitis       mean: 0.17948717948717952,	 var: 0.0


## Adult dataset

In [2]:
# Load the discretised adult train/test split and display the four array shapes.
X_train, y_train, X_test, y_test = get_dataset('adult', discrete=True)
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((32561, 101), (32561, 1), (16281, 101), (16281, 1))

### TAN

In [15]:
# Same classifier as in the multi-dataset run, but with alpha=100 instead of
# alpha=1, fitted on the adult training split and scored on its test split.
clf = DiscreteBayesNetClassifier(alpha=100)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8244579571279406

### Feature selection

$\ell_1$-regularized logistic regression maximizes the worst-case mutual information over
$\Gamma(Q)$, which seems superior to methods that maximize a heuristic instead of the mutual information.

In [5]:
# L1-penalised logistic regression, used below to rank and select features.
# The solver is pinned because scikit-learn >= 0.22 defaults to 'lbfgs', which
# rejects penalty='l1'. 'liblinear' supports it and was the previous default,
# so results on older versions are unchanged.
logreg = LogisticRegression(penalty='l1', C=10, solver='liblinear')
logreg.fit(X_train, y_train)
logreg.score(X_test,y_test)

0.8352681039248203

In [5]:
# Keep the indices of features whose logistic-regression coefficient exceeds
# the threshold in absolute value.
threshold = 0.4
selected_columns = []
for column, coefficient in enumerate(logreg.coef_[0]):
    if abs(coefficient) > threshold:
        selected_columns.append(column)

In [6]:
# Restrict both splits to the selected feature columns and show the new shapes.
X_train_selected, X_test_selected = (
    split[:, selected_columns] for split in (X_train, X_test)
)
X_train_selected.shape, X_test_selected.shape

((32561, 55), (16281, 55))

### Vanilla SVM

In [18]:
# Linear-kernel SVM baseline trained on the full (unselected) feature matrix.
svc = SVC(kernel='linear').fit(X_train, y_train)
svc.score(X_test, y_test)

0.8527731711811314

### Minimize based implementation

In [8]:
# Module-level inputs read by the MEM() objective defined in this cell.
# Weight of the squared L2 penalty (process_MEM above uses 0.01 instead).
lmbd = 0.001
# Training labels.
y = y_train
# Training features, restricted to the selected columns.
X = X_train_selected
def MEM(alpha):
    """Regularised hinge-type objective evaluated at the weight vector ``alpha``.

    Reads the module-level ``X`` (features), ``y`` (labels) and ``lmbd``
    (penalty weight) and returns
    ``mean(max(0, (1 - w) / 2)) + lmbd * ||alpha||_2 ** 2``
    with ``w = (y * X) @ alpha``.
    """
    w = (y * X) @ alpha
    # The original call was np.maximum(0, (1 - w) * 0.5, 1 - w). The third
    # positional argument of a NumPy ufunc is `out`, so `1 - w` was only an
    # output buffer and never took part in the comparison. The computed value
    # was always max(0, (1 - w) / 2) and is kept so recorded results hold.
    # NOTE(review): if a three-term maximum was intended, nest the calls, e.g.
    # np.maximum(0, np.maximum(a, b)) -- confirm the intended third term.
    loss = np.maximum(0, (1 - w) * 0.5)
    return np.mean(loss) + lmbd * np.linalg.norm(alpha, ord=2) ** 2

In [9]:
# Objective value at the all-ones vector, the starting point passed to minimize below.
MEM(np.ones(X.shape[1]))

1.4659275500334625

In [10]:
# Minimise the objective from the all-ones starting point. No method or
# gradient is passed, so SciPy picks its default solver for an unconstrained
# problem and approximates derivatives numerically.
res = minimize(MEM, np.ones(X.shape[1]))

In [11]:
# test score: accuracy of sign(X_test_selected @ res.x) against y_test
accuracy_score(y_test, np.sign(X_test_selected@res.x))

0.8448498249493275

### SGD-based implementation

In [7]:
# NOTE(review): this rebinds the name `MEM`, replacing the objective function
# defined with `def MEM(alpha)` earlier in this notebook; cells that call
# `MEM(alpha)` or `minimize(MEM, ...)` must run before this import.
from MEM import MEM

In [8]:
# Sanity check: shape of the selected training matrix that is fitted below.
X_train_selected.shape

(32561, 55)

In [16]:
# `MEM` here is the object imported from the MEM module, not the objective
# function defined earlier; y_train is flattened to 1-D before fitting.
clf = MEM()
clf.fit(X_train_selected, y_train.flatten())

In [17]:
# test score: accuracy of the fitted MEM estimator on the selected test features
accuracy_score(y_test, clf.predict(X_test_selected))

0.8406731773232602