In [None]:
import logging
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import StratifiedKFold

In [None]:
# Root logger: timestamped INFO-level records for the notebook's own
# progress messages.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)
# Only let ERROR-and-above records through the 'tensorflow' logger.
logging.getLogger('tensorflow').setLevel(logging.ERROR)

In [None]:
# Column naming: predictors are named '<variable_name>_<i>', the label
# column is named target_name.
target_name = 'y'
variable_name = 'x'

# Problem size: number of rows generated, number of candidate predictor
# columns generated, and how many of those are kept by feature selection.
num_samples = 50
num_variables = 5000
num_features = 100

# Input pipeline schedule: rows per batch and passes over the data.
num_epochs = 100
batch_size = 10

# Data

In [None]:
# Seeded generator so the synthetic table is the same on every run.
random = np.random.RandomState(42)

# Predictor columns first, label column last.
column_names = [
    '{}_{}'.format(variable_name, index)
    for index in range(num_variables)
]
column_names.append(target_name)

# One uniform draw per cell; the label column is drawn the same way as the
# predictors and is therefore unrelated to them by construction.
data = pd.DataFrame(
    random.uniform(size=(num_samples, len(column_names))),
    columns=column_names,
)

# Binarise the label: True where the uniform draw is below 0.5.
data[target_name] = data[target_name].lt(0.5)

# Feature Selection

In [None]:
# Rank every predictor column by the absolute value of its correlation with
# the label, strongest first.
# NOTE(review): the ranking uses every row, including rows that are later
# held out by cross-validation -- presumably intentional for this
# experiment; confirm.
correlation = (
    data.filter(like=variable_name)
        .corrwith(data[target_name])
        .abs()
        .sort_values(ascending=False)
)

# Keep the top num_features predictors (names in sorted order) plus the label.
feature_names = sorted(correlation.head(num_features).index)
data = data.loc[:, feature_names + [target_name]]

# Modeling

In [None]:
def create_input_fn(data):
    """Build an Estimator ``input_fn`` that feeds the rows of ``data``.

    Args:
        data: DataFrame containing the ``feature_names`` columns and the
            ``target_name`` column.

    Returns:
        A zero-argument callable. When invoked it returns a
        ``tf.data.Dataset`` whose elements are ``(features, labels)`` pairs:
        ``features`` is a dict mapping each feature name to a batch of that
        column's values, ``labels`` is the matching batch of target values.
        Rows are grouped into batches of ``batch_size`` and the whole
        sequence of batches is repeated ``num_epochs`` times.
    """

    def _input_fn():
        # Slicing a dict of per-column arrays yields the feature dictionary
        # directly, so no positional indexing into a stacked tensor is
        # needed to recover the individual features.
        features = {
            feature_name: data[feature_name].values
            for feature_name in feature_names
        }
        labels = data[target_name].values

        # Hand the Dataset itself to the Estimator instead of pulling
        # tensors out through the deprecated make_one_shot_iterator();
        # the Estimator creates the iterator on its own.
        return (
            tf.data.Dataset
                .from_tensor_slices((features, labels))
                .batch(batch_size)
                .repeat(num_epochs)
        )

    return _input_fn

def create_model_fn():
    """Return a factory that builds a new ``LinearClassifier`` on each call.

    The feature columns are created once, when this function is called, and
    shared by every estimator the returned factory produces.

    Returns:
        A zero-argument callable returning a ``tf.estimator.LinearClassifier``
        configured with one numeric column per entry of ``feature_names`` and
        the ``'Adam'`` optimizer.
    """

    # Key the columns by feature_names -- the same list the input function
    # uses for its feature dictionary -- rather than by the position of the
    # target column inside the global frame.
    feature_columns = [
        tf.feature_column.numeric_column(feature_name)
        for feature_name in feature_names
    ]

    def _model_fn():
        return tf.estimator.LinearClassifier(
            feature_columns=feature_columns,
            optimizer='Adam',
        )

    return _model_fn

# Cross-Validation

In [None]:
model_fn = create_model_fn()

# Stratified 10-fold split over the selected features and the label.
splits = StratifiedKFold(n_splits=10).split(
    data[feature_names],
    data[target_name],
)

# One evaluation result per fold, collected in fold order.
results = []
for i, (train_index, eval_index) in enumerate(splits):
    logging.info('Evaluating split {}...'.format(i))

    train_data = data.iloc[train_index]
    eval_data = data.iloc[eval_index]
    train_spec = tf.estimator.TrainSpec(input_fn=create_input_fn(train_data))
    eval_spec = tf.estimator.EvalSpec(input_fn=create_input_fn(eval_data))

    # A new estimator per fold; only the first element of the returned pair
    # (the evaluation result) is kept.
    result, _ = tf.estimator.train_and_evaluate(
        estimator=model_fn(),
        train_spec=train_spec,
        eval_spec=eval_spec,
    )
    results.append(result)

# Tabulate the per-fold results and display them as the cell output.
results = pd.DataFrame(results)
results