In [1]:
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
# Dataset
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

mnist = input_data.read_data_sets("MNIST_data/")

mnist_images = mnist.train.images
mnist_labels = mnist.train.labels

n_three, n_five = sum(mnist_labels==3), sum(mnist_labels==5)

X_all = np.vstack([
    mnist_images[mnist_labels==3,:],
    mnist_images[mnist_labels==5,:]
])

y_all = np.array([1]*n_three + [0]*n_five)

# make it more sparse
X_all = X_all * (np.random.uniform(0, 1, X_all.shape) > 0.8)

# make one-hot
X_all = (X_all > 0).astype(np.int64)

print('Dataset shape: {}'.format(X_all.shape))
print('Non-zeros rate: {:.05f}'.format(np.mean(X_all != 0)))
print('Classes balance: {:.03f} / {:.03f}'.format(np.mean(y_all==0), np.mean(y_all==1)))

X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=42, test_size=0.3)

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Dataset shape: (10625, 784)
Non-zeros rate: 0.04032
Classes balance: 0.469 / 0.531


In [3]:
# Column indexed data
# convert data to svmlight format
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split
fname = './dump_svmlight.txt'
feature_num=X_all.shape[1]
dump_svmlight_file(X_all, y_all, fname)

import utils
X_cid_all, y_cid_all = utils.read_zipped_column_indexed_data_from_svmlight_file(fname)
X_cid_tr, X_cid_te, y_cid_tr, y_cid_te = train_test_split(X_cid_all, y_cid_all, random_state=42, test_size=0.3)

X_ind_tr, X_val_tr, y_cid_tr = utils.convert_to_column_indexed_data(X_cid_tr, y_cid_tr)
X_ind_te, X_val_te, y_cid_te = utils.convert_to_column_indexed_data(X_cid_te, y_cid_te)

X_ind_tr, X_val_tr, y_cid_tr = utils.convert_to_fully_column_indexed_data(
    X_ind_tr, X_val_tr, y_cid_tr, feature_num=feature_num)
X_ind_te, X_val_te, y_cid_te = utils.convert_to_fully_column_indexed_data(
    X_ind_te, X_val_te, y_cid_te, feature_num=feature_num)

In [4]:
# Baseline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
for model in [
    LogisticRegression(), 
    #SVC(kernel='linear'),
    RandomForestClassifier(n_jobs=-1, n_estimators=200)]:
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    acc = accuracy_score(y_te, predictions)
    f1 = f1_score(y_te, predictions)
    print('model: {}'.format(model.__str__()))
    print('accuracy: {}'.format(acc))
    print('f1 score: {}'.format(f1))

model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
accuracy: 0.8676286072772899
f1 score: 0.8771112405358182
model: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
accuracy: 0.8604140526976161
f1 score: 0.8718687014108839


In [5]:
# tf lr standard
import imp
import tflrclassifier
tflrclassifier = imp.reload(tflrclassifier)

for model in [
        tflrclassifier.TFLRClassifier(l2_weight=0.01, learning_rate=1e-2, \
            batch_size=64, epoch_num=10, print_step=1000, input_type='dense')
    ]:
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    acc = accuracy_score(y_te, predictions)
    print('model: {}'.format(model.__str__()))
    print('train accuracy: {}'.format(accuracy_score(y_tr, model.predict(X_tr))))
    print('test accuracy: {}'.format(acc))

tensorboard --logdir=/tmp/tflog --port 8080
epoch: 8, global_step: 1000, loss: 0.5132330060005188
model: <tflrclassifier.TFLRClassifier object at 0x7fd50c1131d0>
train accuracy: 0.826408498050289
test accuracy: 0.8193224592220828


In [6]:
# tf lr column indexed
import tflrclassifier_column_indexed
import imp
tflrclassifier_column_indexed = imp.reload(tflrclassifier_column_indexed)
for model in [tflrclassifier_column_indexed.TFLRClassifier(
    feature_num=X_all.shape[1], # feature num must set
    l2_weight=0.01, learning_rate=1e-2,
    batch_size=64, epoch_num=10, print_step=1000, random_seed=42)]:
    model.fit(np.array(X_ind_tr), np.array(X_val_tr), np.array(y_cid_tr))
    predictions = model.predict(np.array(X_ind_te), np.array(X_val_te))
    acc = accuracy_score(np.array(y_cid_te), predictions)
    print('model: {}'.format(model.__str__()))
    print('train accuracy: {}'.format(accuracy_score(np.array(y_cid_tr), model.predict(np.array(X_ind_tr), np.array(X_val_tr)))))
    print('test accuracy: {}'.format(acc))

tensorboard --logdir=/tmp/tflog --port 8080
epoch: 8, global_step: 1000, loss: 0.4947774112224579
model: <tflrclassifier_column_indexed.TFLRClassifier object at 0x7fd50c0ed358>
train accuracy: 0.836493209627538
test accuracy: 0.8318695106649937


In [13]:
# tf fm standard
import imp
import tffmclassifier
tffmclassifier = imp.reload(tffmclassifier)

for model in [
        tffmclassifier.TFFMClassifier(l2_weight=0.01, factor_num=10, learning_rate=1e-2, \
            batch_size=64, epoch_num=10, print_step=1000, input_type='dense', random_seed=42)
    ]:
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    acc = accuracy_score(y_te, predictions)
    print('model: {}'.format(model.__str__()))
    print('train accuracy: {}'.format(accuracy_score(y_tr, model.predict(X_tr))))
    print('test accuracy: {}'.format(acc))

tensorboard --logdir=/tmp/tflog --port 8080
epoch: 8, global_step: 1000, loss: 0.4319288432598114
model: <tffmclassifier.TFFMClassifier object at 0x7fd4e02734a8>
train accuracy: 0.8939088342073417
test accuracy: 0.8616687578419071


In [16]:
# tf fm column indexed
import tffmclassifier_column_indexed
import imp
tffmclassifier_column_indexed = imp.reload(tffmclassifier_column_indexed)
for model in [tffmclassifier_column_indexed.TFFMClassifier(
    feature_num=X_all.shape[1], # feature num must set
    factor_num=10,
    l2_weight=0.01, learning_rate=1e-2,
    batch_size=64, epoch_num=10, print_step=1000, random_seed=42)]:
    model.fit(np.array(X_ind_tr), np.array(X_val_tr), np.array(y_cid_tr))
    predictions = model.predict(np.array(X_ind_te), np.array(X_val_te))
    acc = accuracy_score(np.array(y_cid_te), predictions)
    print('model: {}'.format(model.__str__()))
    print('train accuracy: {}'.format(accuracy_score(np.array(y_cid_tr), model.predict(np.array(X_ind_tr), np.array(X_val_tr)))))
    print('test accuracy: {}'.format(acc))

tensorboard --logdir=/tmp/tflog --port 8080
epoch: 8, global_step: 1000, loss: 0.4941784739494324
model: <tffmclassifier_column_indexed.TFFMClassifier object at 0x7fd4e04e55c0>
train accuracy: 0.836762135269598
test accuracy: 0.8334378920953576


In [32]:
# tf deep fm column indexed
import tfdeepfmclassifier_column_indexed
import imp
tfdeepfmclassifier_column_indexed = imp.reload(tfdeepfmclassifier_column_indexed)
for model in [tfdeepfmclassifier_column_indexed.TFDeepFMClassifier(
    feature_num=X_all.shape[1], # feature num must set
    field_num=X_all.shape[1], # field num must set
    factor_num=3,
    l2_weight=0.01, learning_rate=1e-2,
    batch_size=10, epoch_num=10, print_step=1000, random_seed=42)]:
    model.fit(np.array(X_ind_tr), np.array(X_val_tr), np.array(y_cid_tr))
    predictions = model.predict(np.array(X_ind_te), np.array(X_val_te))
    acc = accuracy_score(np.array(y_cid_te), predictions)
    print('model: {}'.format(model.__str__()))
    print('train accuracy: {}'.format(accuracy_score(np.array(y_cid_tr), model.predict(np.array(X_ind_tr), np.array(X_val_tr)))))
    print('test accuracy: {}'.format(acc))

tensorboard --logdir=/tmp/tflog --port 8080
epoch: 1, global_step: 1000, loss: 0.30500078201293945
epoch: 2, global_step: 2000, loss: 0.7178893089294434
epoch: 4, global_step: 3000, loss: 0.9371939301490784
epoch: 5, global_step: 4000, loss: 0.44909366965293884
epoch: 6, global_step: 5000, loss: 0.6699583530426025
epoch: 8, global_step: 6000, loss: 0.3177235722541809
epoch: 9, global_step: 7000, loss: 0.46628567576408386
model: <tfdeepfmclassifier_column_indexed.TFDeepFMClassifier object at 0x7fd50c106160>
train accuracy: 0.872932634126664
test accuracy: 0.849435382685069


In [33]:
# tf DCN column indexed
import tfdcnclassifier_column_indexed
import imp
tfdcnclassifier_column_indexed = imp.reload(tfdcnclassifier_column_indexed)
for model in [tfdcnclassifier_column_indexed.TFDCNClassifier(
    feature_num=X_all.shape[1], # feature num must set
    field_num=X_all.shape[1], # field num must set
    factor_num=3,
    deep_layer_nodes=[32, 32],
    cross_layer_num=1,
    l2_weight=0.01, learning_rate=1e-3,
    batch_size=10, epoch_num=10, print_step=1000, random_seed=42)]:
    model.fit(np.array(X_ind_tr), np.array(X_val_tr), np.array(y_cid_tr))
    predictions = model.predict(np.array(X_ind_te), np.array(X_val_te))
    acc = accuracy_score(np.array(y_cid_te), predictions)
    print('model: {}'.format(model.__str__()))
    print('train accuracy: {}'.format(accuracy_score(np.array(y_cid_tr), model.predict(np.array(X_ind_tr), np.array(X_val_tr)))))
    print('test accuracy: {}'.format(acc))

tensorboard --logdir=/tmp/tflog --port 8080
epoch: 1, global_step: 1000, loss: 0.7164245247840881
epoch: 2, global_step: 2000, loss: 0.7275373935699463
epoch: 4, global_step: 3000, loss: 0.695078432559967
epoch: 5, global_step: 4000, loss: 0.8130803108215332
epoch: 6, global_step: 5000, loss: 0.7130793929100037
epoch: 8, global_step: 6000, loss: 0.6983160972595215
epoch: 9, global_step: 7000, loss: 0.7133716344833374
model: <tfdcnclassifier_column_indexed.TFDCNClassifier object at 0x7fd4e05a45c0>
train accuracy: 0.527497646900632
test accuracy: 0.5379548306148055
