In [1]:
import import_data
import tensorflow as tf
import tempfile
import pandas as pdj
from sklearn import model_selection
from sklearn import preprocessing


In [2]:
# Categorical base columns:
date_ = tf.contrib.layers.sparse_column_with_hash_bucket("date_", hash_bucket_size=356)
store = tf.contrib.layers.sparse_column_with_hash_bucket("store", hash_bucket_size=1000)
department = tf.contrib.layers.sparse_column_with_hash_bucket("department", hash_bucket_size=1000)
item = tf.contrib.layers.sparse_column_with_hash_bucket("item", hash_bucket_size=10000)
on_promotion = tf.contrib.layers.sparse_column_with_keys(column_name="on_promotion", keys=['0', '1'])
promotion_type = tf.contrib.layers.sparse_column_with_hash_bucket("promotion_type", hash_bucket_size=100)

# Continuous base colunms
unit_price = tf.contrib.layers.real_valued_column("unit_price")

# Wide and Deep columns:
wide_columns = [
    date_, store, department, item, on_promotion, promotion_type,
    tf.contrib.layers.crossed_column([store, department], hash_bucket_size=int(1e4)),
    tf.contrib.layers.crossed_column([department, item], hash_bucket_size=int(1e6))
]
deep_columns = [
    tf.contrib.layers.embedding_column(date_, dimension=8),
    tf.contrib.layers.embedding_column(store, dimension=8),
    tf.contrib.layers.embedding_column(department, dimension=8),
    tf.contrib.layers.embedding_column(item, dimension=8),
    tf.contrib.layers.embedding_column(on_promotion, dimension=8),
    tf.contrib.layers.embedding_column(promotion_type, dimension=8),
    unit_price
]



In [3]:
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.DNNLinearCombinedClassifier(
    model_dir=model_dir,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50],
    fix_global_step_increment_bug=True)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_master': '', '_session_config': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fefcd26aa90>, '_num_worker_replicas': 0, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_log_step_count_steps': 100, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_is_chief': True, '_task_id': 0, '_environment': 'local', '_model_dir': '/tmp/tmp_x2bg_z4', '_evaluation_master': '', '_num_ps_replicas': 0}


In [4]:
# Column names:
COLUMNS = ["date_", "store", "department", "item",
           "unit_price", "on_promotion", "promotion_type"]
LABEL_COLUMN = 'quantity'
CATEGORICAL_COLUMNS = ["date_", "store", "department",
                       "item", "on_promotion", "promotion_type"]
CONTINUOUS_COLUMNS = ["unit_price"]

# CATEGORICAL_COLUMNS = ["item"]
# CONTINUOUS_COLUMNS = ["unit_price"]

# Get data and split it into training and test:
#data = pd.read_csv("./hackathon_data/test_data.csv")

data = import_data.import_data(file_regex="./hackathon_data/*20*.dat")

data[CATEGORICAL_COLUMNS] = data[CATEGORICAL_COLUMNS].astype(str)
data.info()
data.describe()

df_train, df_test = model_selection.train_test_split(
    data, test_size=0.2, random_state=42)
# df_train[CATEGORICAL_COLUMNS] = df_train[CATEGORICAL_COLUMNS].astype(str)
# df_test[CATEGORICAL_COLUMNS] = df_test[CATEGORICAL_COLUMNS].astype(str)
# x = data[COLUMNS]
# y = data[LABEL_COLUMN]

# x_train, x_test, y_train, y_test = model_selection.train_test_split(
#     x, y, test_size=0.2, random_state=42)


# Scale training data to 0 mean and unit sd
#scaler = preprocessing.StandardScaler()
#x_train = scaler.fit_transform(x_train)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1352325 entries, 0 to 451254
Data columns (total 8 columns):
date_             1352325 non-null object
store             1352325 non-null object
department        1352325 non-null object
item              1352325 non-null object
unit_price        1352325 non-null float64
quantity          1352325 non-null float64
on_promotion      1352325 non-null object
promotion_type    1352325 non-null object
dtypes: float64(2), object(6)
memory usage: 92.9+ MB


In [None]:
# Create dictionary mappings between columns and their corresponding values:
def input_fn(df):
#     continuous_cols = {k: tf.constant(df[k].values)
#                       for k in CONTINUOUS_COLUMNS}
#     categorical_cols = {k: tf.SparseTensor(
#       indices=[[i, 0] for i in range(df[k].size)],
#       values=df[k].values,
#       dense_shape=[df[k].size, 1])
#                       for k in CATEGORICAL_COLUMNS}
    
    continuous_cols = {k: tf.constant(df[k].values,
                                     shape=[df[k].size, 1])
                      for k in CONTINUOUS_COLUMNS}
    categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
      dense_shape=[df[k].size, 1])
                      for k in CATEGORICAL_COLUMNS}
    
    # Merge the dictionaries:
    feature_cols = dict(continuous_cols.items() | categorical_cols.items())
    label = tf.constant(df[LABEL_COLUMN].values) # label column into a const Tensor
    return feature_cols, label

def train_input_fn():
    return input_fn(df_train)

def eval_input_fn():
    return input_fn(df_test)


In [None]:
m.fit(input_fn=train_input_fn, steps=200)
result = m.evaluate(input_fn=eval_input_fn, steps=1)


Instructions for updating:
The default behavior of sparse_feature_cross is changing, the default
value for hash_key will change to SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY.
From that point on sparse_feature_cross will always use FingerprintCat64
to concatenate the feature fingerprints. And the underlying
_sparse_feature_cross_op.sparse_feature_cross operation will be marked
as deprecated.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp_x2bg_z4/model.ckpt.
INFO:tensorflow:loss = -3.8317518, step = 1
