In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import logging
import shutil
import sys

import numpy as np
import pandas as pd
import tensorflow as tf

Define the CSV schema: the name of every column and the default value used when that column is empty in a record. Then add a command-line argument parser for the program's settings.

In [2]:
# Dataset phase; it is interpolated into the train/valid/test CSV file names.
phase = 5

# CSV schema as (column name, default value) pairs, listed in file order.
# The Python type of each default matters downstream: columns with a str
# default are treated as categorical features, all others as numeric features,
# and the same defaults are handed to the CSV decoder as record defaults.
# The first column has an empty name and `target` is the label column.
column_default = [
    ('', 0),
    ('msno','missing_msno'),
    ('song_id','missing_song_id'),
    ('source_system_tab','missing_tab'),
    ('source_screen_name','missing_screen'),
    ('source_type','missing_source_type'),
    ('target',''),
    ('song_length',241812.0),
    ('composer','empty'),
    ('lyricist','empty'),
    ('language',0.0),
    ('short_song',0.0),
    ('mean_length_distance',0.0),
    ('city',-1.0),
    ('bd',29.0),
    ('gender','empty'),
    ('registered_via',0),
    ('membership_days', 0),
    ('registration_timestamp',0.0),
    ('expiration_timestamp',0.0),
    ('country_code','empty'),
    ('issuer','empty'),
    ('issue_year',0.0),
    ('unique_id','empty'),
    ('genre_ids_count',0),
    ('lyricists_count',0),
    ('composer_count',0),
    ('artist_count',0),
    ('is_featured',0),
    ('feat_artist','empty'),
    ('main_artist','empty'),
    ('feat_song_name','empty'),
    ('main_song_name','empty'),
    ('is_remix',0),
    ('is_live',0),
    ('is_acoustic',0),
    ('is_instrumental',0),
    ('artist_is_composer',0),
    ('artist_is_composer_is_lyricist',0),
    ('song_lang_magic',0),
    ('count_song_played',425),
    ('count_artist_played',406),
    ('count_composer_played',171),
    ('count_lyricist_played',226),
    ('count_member_action',848),
    ('member_action_per_day',0.0),
    ('artist_main','empty'),
    ('song_main','empty'),
    ('artist_in_parenthesis','empty'),
    ('artist_in_titlemark','empty'),
    ('song_pre_parenthesis','empty'),
    ('song_in_parenthesis','empty'),
    ('song_pre_titlemark','empty'),
    ('song_in_titlemark','empty'),
    ('genre_pca_1',0.0),
    ('genre_pca_2',0.0),
    ('genre_pca_3',0.0),
    ('genre_pca_4',0.0),
    ('genre_pca_5',0.0),
    ('genre_pca_6',0.0),
    ('genre_pca_7',0.0),
    ('genre_pca_8',0.0),
    ('genre_pca_9',0.0),
    ('genre_pca_10',0.0),
    ('genre_pca_11',0.0),
    ('genre_pca_12',0.0),
    ('keyword_cluster_pca_0', 0.0),
    ('keyword_cluster_pca_1', 0.0),
]

# Parallel lists derived from the schema: the column names, and each default
# wrapped in a one-element list (the shape used for CSV record defaults).
_CSV_COLUMNS = [name for name, _ in column_default]
_CSV_COLUMN_DEFAULTS = [[default] for _, default in column_default]

#print(_CSV_COLUMNS)
# Command-line options for the run. Every option carries a default, so the
# parser also works when no arguments are supplied.
parser = argparse.ArgumentParser()

parser.add_argument(
    '--model_dir',
    type=str,
    default='/tmp/census_model',
    help='Base directory for the model.',
)

parser.add_argument(
    '--model_type',
    type=str,
    default='wide_deep',
    help="Valid model types: {'wide', 'deep', 'wide_deep'}.",
)

parser.add_argument(
    '--train_epochs',
    type=int,
    default=40,
    help='Number of training epochs.',
)

parser.add_argument(
    '--epochs_per_eval',
    type=int,
    default=2,
    help='The number of training epochs to run between evaluations.',
)

parser.add_argument(
    '--batch_size',
    type=int,
    default=256,
    help='Number of examples per batch.',
)

# The default data paths are derived from the notebook-level `phase`.
parser.add_argument(
    '--train_data',
    type=str,
    default='../../data/train_phase' + str(phase) + '.csv',
    help='Path to the training data.',
)

parser.add_argument(
    '--test_data',
    type=str,
    default='../../data/valid_phase' + str(phase) + '.csv',
    help='Path to the test data.',
)

_StoreAction(option_strings=['--test_data'], dest='test_data', nargs=None, const=None, default='../../data/valid_phase5.csv', type=<class 'str'>, choices=None, help='Path to the test data.', metavar=None)

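Record the number of examples in the training and validation sets. The validation count is currently set to 0 (the 300,000-example hold-out is commented out), so all examples are counted as training data.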

In [3]:
# Number of examples per split. The 'train' entry is written as a sum of three
# counts; 'validation' is 0, i.e. no examples are counted as held out.
_NUM_EXAMPLES = dict(
    train=5901935 + 1474484 + 1000,
    validation=0,
)


Hash each categorical feature into a fixed number of buckets and feed the resulting columns into the wide (linear) part of the model; crossed features over several categorical features are supported by this architecture but are currently disabled in the code. Embed the categorical features and feed them, together with the numerical features, into the deep neural network.

In [4]:
def build_model_columns(hash_bucket_size=3200000, embedding_dimension=24):
  """Builds the wide and deep feature columns from `column_default`.

  A column whose default value is a `str` is treated as categorical (hashed
  into `hash_bucket_size` buckets); every other column is treated as numeric.
  The `target` column is the label and is skipped. `gender` uses a fixed
  two-word vocabulary instead of hashing.

  Args:
    hash_bucket_size: number of hash buckets for each hashed categorical
      column.
    embedding_dimension: size of the embedding built for each categorical
      column on the deep side.

  Returns:
    A `(wide_columns, deep_columns)` tuple of lists. The wide columns are the
    categorical columns; the deep columns are the numeric columns followed by
    one embedding per categorical column.
  """
  # Standardisation constants for song_length. The mean equals the default
  # used for missing song_length values in `column_default`.
  song_length_mean = 241812.0
  song_length_scale = 67351

  numerical_features = {}
  categorical_features = {}
  for name, default in column_default:
    if not isinstance(default, str):
      if name == 'song_length':
        # song_length is a numeric column, so its normaliser has to be
        # attached here; checking for it among the categorical columns (as
        # this function previously did) can never match.
        column = tf.feature_column.numeric_column(
            name,
            normalizer_fn=lambda x: (x - song_length_mean) / song_length_scale)
      else:
        column = tf.feature_column.numeric_column(name)
      numerical_features[name] = column
    elif name != 'target':
      categorical_features[name] = (
          tf.feature_column.categorical_column_with_hash_bucket(
              name, hash_bucket_size=hash_bucket_size))

  # Replace the hashed gender column with an explicit vocabulary.
  categorical_features['gender'] = (
      tf.feature_column.categorical_column_with_vocabulary_list(
          'gender', ['male', 'female']))

  # Wide side: the sparse categorical columns themselves (no crossed columns).
  wide_columns = list(categorical_features.values())

  # Deep side: dense numeric inputs plus a learned embedding per categorical.
  deep_columns = list(numerical_features.values())
  deep_columns.extend(
      tf.feature_column.embedding_column(column, embedding_dimension)
      for column in categorical_features.values())

  return wide_columns, deep_columns

Specify number of hidden layers and hidden units in the deep neural network. Return a wide network estimator, a deep network estimator or a wide and deep network estimator according to the model type provided in the parser.

In [5]:
def build_estimator(model_dir, model_type):
  """Create the estimator that corresponds to `model_type`.

  Args:
    model_dir: directory passed to the estimator as its `model_dir`.
    model_type: 'wide' selects a LinearClassifier, 'deep' a DNNClassifier;
      any other value selects a DNNLinearCombinedClassifier.

  Returns:
    The constructed `tf.estimator` classifier.
  """
  wide_columns, deep_columns = build_model_columns()
  hidden_units = [128, 128, 64, 64, 32, 32]

  # device_count={'GPU': 0} keeps the session on CPU, which (per the original
  # note in this notebook) trains faster than GPU for this model. At most two
  # checkpoints are retained.
  cpu_session = tf.ConfigProto(device_count={'GPU': 0})
  run_config = tf.estimator.RunConfig().replace(
      keep_checkpoint_max=2,
      session_config=cpu_session)

  if model_type == 'wide':
    return tf.estimator.LinearClassifier(
        model_dir=model_dir,
        feature_columns=wide_columns,
        config=run_config)

  if model_type == 'deep':
    return tf.estimator.DNNClassifier(
        model_dir=model_dir,
        feature_columns=deep_columns,
        hidden_units=hidden_units,
        dropout=0.05,
        config=run_config)

  # Fallback for every other model_type value: combined wide & deep model.
  return tf.estimator.DNNLinearCombinedClassifier(
      model_dir=model_dir,
      linear_feature_columns=wide_columns,
      dnn_feature_columns=deep_columns,
      dnn_hidden_units=hidden_units,
      dnn_dropout=0.05,
      config=run_config)

Generate a function that produces input for the tensorflow graph. The function will parse the csv file, do the shuffling, extract the label, and feed the features to the estimator.

In [6]:
def input_fn(data_file, num_epochs, shuffle, batch_size):
  """Build the (features, labels) input tensors for the Estimator.

  Args:
    data_file: path of the CSV file to read; its first line is skipped.
    num_epochs: how many times the dataset is repeated.
    shuffle: when true, lines are shuffled with a buffer of
      `_NUM_EXAMPLES['train']` elements before parsing.
    batch_size: number of examples per batch.

  Returns:
    A `(features, labels)` pair: `features` maps each column name except
    `target` to its tensor, and `labels` is true where `target` equals '1'.
  """
  assert tf.gfile.Exists(data_file), (
      '%s not found. Please make sure you have either run data_download.py or '
      'set both arguments --train_data and --test_data.' % data_file)

  def parse_csv(value):
    print('Parsing', data_file)
    fields = tf.decode_csv(
        value, record_defaults=_CSV_COLUMN_DEFAULTS, field_delim=',')
    named_fields = {name: field for name, field in zip(_CSV_COLUMNS, fields)}
    target = named_fields.pop('target')
    return named_fields, tf.equal(target, '1')

  # Read the file line by line and drop the header row.
  lines = tf.data.TextLineDataset(data_file).skip(1)
  if shuffle:
    lines = lines.shuffle(buffer_size=_NUM_EXAMPLES['train'])

  # Repeat comes after shuffle so that separate epochs do not blend together.
  batches = (
      lines.map(parse_csv, num_parallel_calls=5)
      .repeat(num_epochs)
      .batch(batch_size))

  features, labels = batches.make_one_shot_iterator().get_next()
  return features, labels

A logger that helps to keep track of the training process.

In [7]:
# Root logger that appends every record (DEBUG and above) to mylog5.log.
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# The root logger outlives this cell, so attaching a fresh FileHandler on every
# run would write each record once per run. Tag the handler with a name and
# reuse it if it is already attached.
_LOG_HANDLER_NAME = 'mylog5-file'
fhandler = next(
    (h for h in logger.handlers if h.get_name() == _LOG_HANDLER_NAME), None)
if fhandler is None:
  fhandler = logging.FileHandler(filename='mylog5.log', mode='a')
  fhandler.set_name(_LOG_HANDLER_NAME)
  logger.addHandler(fhandler)
fhandler.setFormatter(formatter)
logger.setLevel(logging.DEBUG)

Train the wide and deep neural network for epochs specified in the parser, and then predict the labels for the test set.

In [8]:
def predict(model, epoch, phase):
  """Score the test set of `phase` and write the prediction files.

  Reads '../../data/test_phase<phase>.csv', runs `model.predict` on it and
  writes two files into the working directory:

    * './pred<phase>_<epoch + 1>.csv': the raw prediction records, one row
      per test example, without a header.
    * 'output_float_phase<phase>_epoch_<epoch + 1>.csv': the submission with
      columns `id` (zero-based row number) and `target` (the predicted
      score taken from the `logistic` prediction key).

  Args:
    model: estimator whose `predict(input_fn=...)` yields one dict per
      example containing a `logistic` entry.
    epoch: zero-based counter; the file names use `epoch + 1`.
    phase: dataset phase used in the input and output file names.
  """
  fea_str = ",id,msno,song_id,source_system_tab,source_screen_name,source_type,song_length,composer,lyricist,language,short_song,mean_length_distance,city,bd,gender,registered_via,membership_days,registration_timestamp,expiration_timestamp,country_code,issuer,issue_year,unique_id,genre_ids_count,lyricists_count,composer_count,artist_count,is_featured,feat_artist,main_artist,feat_song_name,main_song_name,is_remix,is_live,is_acoustic,is_instrumental,artist_is_composer,artist_is_composer_is_lyricist,song_lang_magic,count_song_played,count_artist_played,count_composer_played,count_lyricist_played,count_member_action,member_action_per_day,artist_main,song_main,artist_in_parenthesis,artist_in_titlemark,song_pre_parenthesis,song_in_parenthesis,song_pre_titlemark,song_in_titlemark,genre_pca_1,genre_pca_2,genre_pca_3,genre_pca_4,genre_pca_5,genre_pca_6,genre_pca_7,genre_pca_8,genre_pca_9,genre_pca_10,genre_pca_11,genre_pca_12,keyword_cluster_pca_0,keyword_cluster_pca_1"
  column_names2 = fea_str.split(",")

  def _float_or(fallback):
    """Return a converter that parses a field as float, else `fallback`."""
    def convert(val):
      try:
        # Built-in float replaces the removed `np.float` alias.
        return float(val)
      except (TypeError, ValueError):
        return np.float64(fallback)
    return convert

  # NOTE(review): these fallbacks for unparsable fields (0 for song_length,
  # -1.0 for the others) differ from the training-time defaults listed in
  # `column_default` -- confirm that the mismatch is intended.
  conv = _float_or(0)
  conv1 = _float_or(-1.0)

  df = pd.read_csv('../../data/test_phase' + str(phase) + '.csv', index_col=False, names=column_names2,
                   skip_blank_lines=True, keep_default_na=False, skiprows=1,
                   converters={'language': conv1, 'mean_length_distance': conv1,
                               'issue_year': conv1, 'song_length': conv, 'short_song': conv1})
  predictions = list(model.predict(input_fn=tf.estimator.inputs.pandas_input_fn(
      x=df, num_epochs=1, shuffle=False)))

  # Keep the raw prediction dump, as before.
  raw_path = './pred' + str(phase) + '_' + str(epoch + 1) + '.csv'
  pd.DataFrame(predictions).to_csv(raw_path, sep=',', header=False)

  # Take the score straight from the `logistic` entry of every prediction
  # instead of re-reading the dump and relying on its column order.
  scores = [float(np.ravel(p['logistic'])[0]) for p in predictions]
  header = ['id', 'target']
  census_pred = pd.DataFrame(
      {'id': list(range(len(scores))), 'target': scores}, columns=header)
  census_pred.to_csv('output_float_phase' + str(phase) + '_epoch_' + str(epoch + 1) + '.csv', index=False, columns=header)

In [None]:
# Scratch slots kept for ad-hoc debugging; nothing in this cell reads them.
temp = 0
temp2 = 0
# Holds the estimator built by main() so that later cells can inspect it.
model = None


def main(unused_argv):
  """Train the estimator and periodically write test-set predictions.

  The model directory is wiped first, so every run starts from scratch.
  Training proceeds in rounds of `FLAGS.epochs_per_eval` epochs until
  `FLAGS.train_epochs` epochs have been requested; predictions are written
  after the first round and after every fifth round.

  Args:
    unused_argv: argument list supplied by `tf.app.run`; not used.
  """
  global model
  # Clean up the model directory if present.
  shutil.rmtree(FLAGS.model_dir, ignore_errors=True)
  model = build_estimator(FLAGS.model_dir, FLAGS.model_type)

  # Number of training rounds, derived from the flags (20 with the defaults).
  num_rounds = FLAGS.train_epochs // FLAGS.epochs_per_eval
  for n in range(num_rounds):
    model.train(input_fn=lambda: input_fn(
        FLAGS.train_data, FLAGS.epochs_per_eval, True, FLAGS.batch_size))

    # Evaluation on FLAGS.test_data is disabled; it used to be:
    #   results = model.evaluate(input_fn=lambda: input_fn(
    #       FLAGS.test_data, 1, False, FLAGS.batch_size))
    #   for key in sorted(results):
    #       print('%s: %s' % (key, results[key]))

    if n == 0 or (n + 1) % 5 == 0:
      logger.info('Results at epoch %d', (n + 1) * FLAGS.epochs_per_eval)
      print('-' * 60)
      predict(model, n, phase)


if __name__ == '__main__':
  tf.logging.set_verbosity(tf.logging.INFO)
  FLAGS, unparsed = parser.parse_known_args()
  # argv must contain strings only: the program name plus whatever arguments
  # this notebook's parser did not recognise.
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_task_id': 0, '_num_ps_replicas': 0, '_master': '', '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': None, '_log_step_count_steps': 100, '_save_checkpoints_secs': 600, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd8224ebe80>, '_task_type': 'worker', '_num_worker_replicas': 1, '_save_checkpoints_steps': None, '_session_config': device_count {
  key: "GPU"
}
, '_keep_checkpoint_max': 2, '_service': None, '_model_dir': '/tmp/census_model'}
Parsing ../../data/train_phase5.csv
INFO:tensorflow:Create CheckpointSaverHook.


In [None]:
# Display the module-level `model`, which main() assigns to the estimator it builds.
model

Extract the labels from the prediction result to generate submission file output.csv