In [1]:
# Dataset
import numpy as np

from tensorflow.examples.tutorials.mnist import input_data
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

mnist = input_data.read_data_sets("MNIST_data/")

mnist_images = mnist.train.images
mnist_labels = mnist.train.labels

n_three, n_five = sum(mnist_labels==3), sum(mnist_labels==5)

X_all = np.vstack([
    mnist_images[mnist_labels==3,:],
    mnist_images[mnist_labels==5,:]
])

y_all = np.array([1]*n_three + [0]*n_five)
# make it more sparse
X_all = X_all * (np.random.uniform(0, 1, X_all.shape) > 0.8)

print('Dataset shape: {}'.format(X_all.shape))
print('Non-zeros rate: {:.05f}'.format(np.mean(X_all != 0)))
print('Classes balance: {:.03f} / {:.03f}'.format(np.mean(y_all==0), np.mean(y_all==1)))

X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=42, test_size=0.3)

  from ._conv import register_converters as _register_converters


Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Dataset shape: (10625, 784)
Non-zeros rate: 0.04035
Classes balance: 0.469 / 0.531


In [2]:
# Baseline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
for model in [
    LogisticRegression(), 
    RandomForestClassifier(n_jobs=-1, n_estimators=200)]:
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    acc = accuracy_score(y_te, predictions)
    print('model: {}'.format(model.__str__()))
    print('accuracy: {}'.format(acc))
    print()

model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
accuracy: 0.8745294855708908

model: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
accuracy: 0.882685069008783



In [3]:
# TFLR sklearn-API
import tflrclassifier
import imp
tflrclassifier = imp.reload(tflrclassifier)
for model in [tflrclassifier.TFLRClassifier(l2_weight=0.01, learning_rate=1e-2,
    batch_size=10, epoch_num=10, print_step=1000, input_type='dense', random_seed=42)]:
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    acc = accuracy_score(y_te, predictions)
    print('model: {}'.format(model.__str__()))
    print('train accuracy: {}'.format(accuracy_score(y_tr, model.predict(X_tr))))
    print('test accuracy: {}'.format(acc))
    print()

tensorboard --logdir=/tmp/tflog --port 8080
epoch: 1, global_step: 1000, loss: 0.6029930710792542
epoch: 2, global_step: 2000, loss: 0.5037059783935547
epoch: 4, global_step: 3000, loss: 0.6486674547195435
epoch: 5, global_step: 4000, loss: 0.42504045367240906
epoch: 6, global_step: 5000, loss: 0.5727302432060242
epoch: 8, global_step: 6000, loss: 0.41611286997795105
epoch: 9, global_step: 7000, loss: 0.5183256268501282
model: <tflrclassifier.TFLRClassifier object at 0x7fe0a80217f0>
train accuracy: 0.863116848191475
test accuracy: 0.855081555834379



In [7]:
# TFLR svmlight-API
# convert data to svmlight format
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split
fname = './dump_svmlight.txt'
feature_num=X_all.shape[1]
dump_svmlight_file(X_all, y_all, fname)

import utils
X_cid_all, y_cid_all = utils.read_zipped_column_indexed_data_from_svmlight_file(fname)
X_cid_tr, X_cid_te, y_cid_tr, y_cid_te = train_test_split(X_cid_all, y_cid_all, random_state=42, test_size=0.3)

X_ind_tr, X_val_tr, y_cid_tr = convert_to_column_indexed_data(X_cid_tr, y_cid_tr)
X_ind_te, X_val_te, y_cid_te = convert_to_column_indexed_data(X_cid_te, y_cid_te)

X_ind_tr, X_val_tr, y_cid_tr = utils.convert_to_fully_column_indexed_data(
    X_ind_tr, X_val_tr, y_cid_tr, feature_num=feature_num)
X_ind_te, X_val_te, y_cid_te = utils.convert_to_fully_column_indexed_data(
    X_ind_te, X_val_te, y_cid_te, feature_num=feature_num)

In [8]:
# TFLR svmlight-API
import tflrclassifier_column_indexed
import imp
tflrclassifier_column_indexed = imp.reload(tflrclassifier_column_indexed)
for model in [tflrclassifier_column_indexed.TFLRClassifier(
    feature_num=X_all.shape[1], # feature num must set
    l2_weight=0.01, learning_rate=1e-2,
    batch_size=10, epoch_num=10, print_step=1000, random_seed=42)]:
    model.fit(np.array(X_ind_tr), np.array(X_val_tr), np.array(y_cid_tr))
    predictions = model.predict(np.array(X_ind_te), np.array(X_val_te))
    acc = accuracy_score(np.array(y_cid_te), predictions)
    print('model: {}'.format(model.__str__()))
    print('train accuracy: {}'.format(accuracy_score(np.array(y_cid_tr), model.predict(np.array(X_ind_tr), np.array(X_val_tr)))))
    print('test accuracy: {}'.format(acc))

tensorboard --logdir=/tmp/tflog --port 8080
epoch: 1, global_step: 1000, loss: 0.5667834877967834
epoch: 2, global_step: 2000, loss: 0.5114297270774841
epoch: 4, global_step: 3000, loss: 0.6157576441764832
epoch: 5, global_step: 4000, loss: 0.435439795255661
epoch: 6, global_step: 5000, loss: 0.5564510822296143
epoch: 8, global_step: 6000, loss: 0.41495373845100403
epoch: 9, global_step: 7000, loss: 0.5230531096458435
model: <tflrclassifier_column_indexed.TFLRClassifier object at 0x7fe0a316ff28>
train accuracy: 0.8621756084442651
test accuracy: 0.8579046424090339
