In [2]:
%matplotlib inline
import os, sys, numpy as np, pandas as pd, tensorflow as tf, re, codecs, seaborn as sns, json, time, csv, datetime as dt
import pickle, collections, random, math, numbers, scipy.sparse as sp, matplotlib.pyplot as plt, scipy.sparse as sp

from pprint import pprint
from tensorflow.contrib.training.python.training.hparam import HParams

def reload(mName):
    """Import ``mName`` from scratch and return the fresh module object.

    Any cached entry for ``mName`` in ``sys.modules`` is discarded first, so
    the module source is executed again by the import.
    """
    import importlib
    # Dropping the cache entry (if any) forces import_module to re-execute the module.
    sys.modules.pop(mName, None)
    return importlib.import_module(mName)


from collections import deque, defaultdict, OrderedDict
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, minmax_scale
from matplotlib import pyplot as plt
plt.style.use('ggplot')

# classpath: put the parent directory (normalised to forward slashes) on sys.path
# so that the `trainer` package can be imported from this notebook.
ctx = os.path.abspath('..').replace('\\', '/')
cps = [ctx]
for cp in cps:
    # Insert only once so re-running the cell does not grow sys.path.
    if cp not in sys.path:
        sys.path.insert(0, cp)

# data path
datapath = '/'.join([ctx, 'data'])

# Single seed for numpy here; later cells also pass it to tf.data shuffles.
seed = 88
utils = reload('trainer.utils.utils')
np.set_printoptions(precision=4, suppress=True, linewidth=100)
np.random.seed(seed)

  from ._conv import register_converters as _register_converters


## Simple Data Preprocessing

In [55]:
import datetime as dt

# Ratings: keep the original score in `ori_rating` and binarise `rating`
# (1 when the score is >= 4, else 0).
ratings = pd.read_csv("{}/ml-latest-small/ratings.csv".format(datapath))
# NOTE: datetime.fromtimestamp converts to the machine's local time zone.
ratings['timestamp'] = ratings.timestamp.map(dt.datetime.fromtimestamp).map(str)
ratings['ori_rating'] = ratings['rating']
ratings['rating'] = (ratings.rating >= 4).astype(int)
tr, te = utils.split_by_ratio(ratings)

# Movies: attach the mean original rating per movie and the release year.
movies = pd.read_csv("{}/ml-latest-small/movies.csv".format(datapath))
avg_rt = ratings.groupby("movieId", as_index=False).ori_rating.mean().rename(index=str, columns={'ori_rating': 'avg_rating'})
movies = movies.merge(avg_rt, how='left', on='movieId')
# movies.avg_rating.fillna(ratings.rating.mean())
# Year = last parenthesised run of digits in the title; None when the title has none.
# Raw string: "\(" / "\s" / "\d" are invalid escapes in a normal string literal.
movies["year"] = movies.title.str.findall(r"\(\s*(\d+)\s*\)").map(lambda lst: int(lst[-1]) if len(lst) else None)
# movies["year"] = minmax_scale(movies.year.fillna(movies.year.median()))

In [None]:
def preprocess(data, movie_trans, train_hist=None, is_train=True):
    """Build one model-input row per (user, rated movie) pair.

    Args:
        data: ratings frame with ``userId``, ``movieId``, ``rating`` and
            ``timestamp`` columns.
        movie_trans: movie frame left-joined on ``movieId``; must provide
            ``genres``, ``avg_rating`` and ``year``.
        train_hist: ratings frame used as the users' history when
            ``is_train`` is False (needs ``userId`` and ``movieId``).
        is_train: when True, the query history of a row is every other movie
            the same user has in ``data``; when False, it is the user's movies
            in ``train_hist`` minus the candidate movie.

    Returns:
        DataFrame with columns ``user_id, query_movie_ids, genres, avg_rating,
        year, candidate_movie_id, timestamp, rating``; ``query_movie_ids`` is
        a comma-joined string of movie ids.

    Raises:
        ValueError: if ``is_train`` is False and ``train_hist`` is not given.
    """
    if not is_train and train_hist is None:
        raise ValueError("train_hist is required when is_train is False")

    columns = ["user_id", "query_movie_ids",
               "genres", "avg_rating", "year", "candidate_movie_id",
               "timestamp",
               "rating"]
    data = data.merge(movie_trans, how="left", on="movieId")

    def list2str(lst):
        return ','.join(map(str, lst))

    hist_by_user = {}
    if not is_train:
        # One pass over the training history instead of one query per user.
        hist_by_user = {uid: set(ids.tolist())
                        for uid, ids in train_hist.groupby("userId").movieId}

    queue = []
    for u, df in data.groupby("userId"):
        df = df.sort_values("rating", ascending=False)
        # Loop-invariant per user: hoisted out of the per-row loop.
        movie_ids = df.movieId.tolist()
        tr_hist = hist_by_user.get(u, set())
        for i, (_, r) in enumerate(df.iterrows()):
            if is_train:
                # Leave-one-out: all of the user's movies except the candidate at position i.
                query_hist = movie_ids[:i] + movie_ids[i + 1:]
            else:
                query_hist = list(tr_hist - {int(r.movieId)})
            queue.append([int(r.userId), list2str(query_hist), r.genres, r.avg_rating,
                          r.year, int(r.movieId), r.timestamp, r.rating])
    return pd.DataFrame(queue, columns=columns)
    
# Build the train rows (leave-one-out history) and the test rows (history taken from `tr`).
tr_merged = preprocess(tr, movies)
te_merged = preprocess(te, movies, tr, is_train=False)
# Merge both splits into a single file.
merged = pd.concat([tr_merged, te_merged], ignore_index=True)

# Every frame is written without index and without a header row.
for frame, path in ((tr_merged, './tr.raw.movielens.csv'),
                    (te_merged, './te.raw.movielens.csv'),
                    (merged, './merged_movielens.csv')):
    frame.to_csv(path, index=False, header=None)
merged.head()

<br/>
<br/>
<br/>
## Submit Training from the Command Line

In [None]:
# Submit the `trainer` package as a Cloud ML Engine training job; arguments after
# the bare `--` are forwarded to trainer.ctrl.
# NOTE(review): the working directory is a hardcoded absolute Windows path and the
# job name is fixed -- presumably both must be edited per machine / per run.
!cd D:/Python/notebook/recomm_prod && \
gcloud ml-engine jobs submit training recomm_movielens_15 \
    --job-dir gs://recomm-job/foo/model \
    --runtime-version 1.4 \
    --module-name trainer.ctrl \
    --package-path trainer \
    --region asia-east1 \
    --config config.yaml \
    -- \
    --method train \
    --conf-path gs://recomm-job/foo/data/user_supplied/movielens.yaml

In [None]:
# Show the status of the job named recomm_movielens_15.
!gcloud ml-engine jobs describe recomm_movielens_15

In [None]:
# Run `setup.py build` from the parent directory of the notebook.
!cd .. && python setup.py build

## Python Client API Transform Data

In [None]:
# Re-import the project modules so on-disk edits are picked up, then call
# ctrl.gen_data with the GCS-hosted config and is_local=False.
# NOTE(review): this cell passes an HParams object, while the "Local Transform"
# cell passes a plain dict to the same method -- confirm both are accepted.
from tensorflow.contrib.training.python.training.hparam import HParams

utils = reload('trainer.utils.utils')
env = reload('trainer.env')
reload('trainer.utils.flex')
reload('trainer.service')

# trainer.ctrl is reloaded last, after the modules reloaded above.
ctrl = reload('trainer.ctrl').Ctrl.instance
hparam = HParams(conf_path='gs://movielens-foo/user_supplied/movielens.yaml')
hparam.add_hparam('is_local', False)
ctrl.gen_data(hparam)

## View Schema

In [4]:
# Reload the project modules, load the schema for the GCS-hosted config and
# display the attributes of the loaded schema object.
from tensorflow.contrib.training.python.training.hparam import HParams

utils = reload('trainer.utils.utils')
env = reload('trainer.env')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml'}
# `loader` is reused by later cells (e.g. loader.schema.raw_cols).
loader = ctrl.load_schema(params)

vars(loader.schema)

2018-03-06 15:53:38,551 - Loader - INFO [line:363] - try to unserialize from gs://recomm-job/foo-bar/movielens_recommendation/data/parsed.yaml


{'col_states_': OrderedDict([('query_movie_ids',
               CatgMapper(allow_null=True, default=None, is_multi=True,
                     name='query_movie_ids', sep=',', vocabs=None, vocabs_path=None)),
              ('genres',
               CatgMapper(allow_null=True, default=None, is_multi=True, name='genres',
                     sep='|', vocabs=None, vocabs_path=None)),
              ('avg_rating', NumericMapper(default=None, name='avg_rating')),
              ('year', NumericMapper(default=None, name='year')),
              ('candidate_movie_id',
               CatgMapper(allow_null=True, default=None, is_multi=False,
                     name='candidate_movie_id', sep=None, vocabs=None, vocabs_path=None)),
              ('rating',
               CatgMapper(allow_null=False, default=None, is_multi=False, name='rating',
                     sep=None, vocabs=None, vocabs_path=None))]),
 'conf_': {'columns': [{'id': 'user_id', 'm_dtype': 'catg'},
   {'id': 'query_movie_ids',
  

<br/>
<br/>
<br/>
## Python Client API Submit Training

In [40]:
# Reload the project modules and submit a training job through ctrl.train_submit.
utils = reload('trainer.utils.utils')
reload('trainer.env')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml',
          'runtime_version': '1.4'}
ret = ctrl.train_submit(params)
# The returned mapping is read for 'job_id' and 'response'.
job_id = ret.get('job_id')
print( ret.get('response') )

2018-03-06 16:54:48,147 - Ctrl - INFO [line:142] - foo-bar: gen_data take time 0:00:05.214712
jobId: movielens_recommendation_20180306165443177431
state: QUEUED
  for chunk in iter(lambda: fp.read(4096), ''):
Job [movielens_recommendation_20180306165443177431] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe movielens_recommendation_20180306165443177431

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs movielens_recommendation_20180306165443177431



<br/>
<br/>
<br/>
## Describe Job States

In [49]:
# Reload the project modules and display the 'response' entry returned by ctrl.describe.
# NOTE(review): GoogleCredentials and discovery are imported but not referenced
# in this cell -- presumably only needed by the reloaded modules; confirm.
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery

env = reload('trainer.env')
utils = reload('trainer.utils.utils')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml'}
ret = ctrl.describe(params)
ret['response']

2018-03-06 17:02:50,455 - googleapiclient.discovery - INFO [line:868] - URL being requested: GET https://ml.googleapis.com/v1/projects/training-recommendation-engine/jobs/movielens_recommendation_20180306165443177431?alt=json
2018-03-06 17:02:50,457 - oauth2client.transport - INFO [line:151] - Attempting refresh to obtain initial access_token
2018-03-06 17:02:50,489 - oauth2client.client - INFO [line:795] - Refreshing access_token
2018-03-06 17:02:51,741 - Ctrl - INFO [line:160] - foo-bar: describe take time 0:00:01.405866


{'createTime': '2018-03-06T08:54:45Z',
 'endTime': '2018-03-06T09:02:39Z',
 'jobId': 'movielens_recommendation_20180306165443177431',
 'startTime': '2018-03-06T08:55:24Z',
 'state': 'SUCCEEDED',
 'trainingInput': {'args': ['--train-steps',
   '1000',
   '--method',
   'train',
   '--conf-path',
   'gs://movielens-foo/user_supplied/movielens.yaml',
   '--job-id',
   'movielens_recommendation_20180306165443177431'],
  'jobDir': 'gs://recomm-job/foo-bar/movielens_recommendation/model',
  'packageUris': ['gs://recomm-job/foo-bar/movielens_recommendation/model/packages/3bbce4b391e6421266fbedaacbc4690883ca34ba045a61fc0078565b2978d84e/trainer-0.1.tar.gz'],
  'pythonModule': 'trainer.ctrl',
  'pythonVersion': '3.5',
  'region': 'asia-east1',
  'runtimeVersion': '1.4'},
 'trainingOutput': {'consumedMLUnits': 0.1}}

## Deploy

In [50]:
# Reload the project modules and call ctrl.deploy for the GCS-hosted config;
# the returned value is displayed as the cell output.
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery

utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml'}
ret = ctrl.deploy(params)
ret

2018-03-06 17:03:58,768 - Service - INFO [line:92] - try to create model [foo_bar_movielens_recommendation] ...
2018-03-06 17:03:58,776 - googleapiclient.discovery - INFO [line:868] - URL being requested: POST https://ml.googleapis.com/v1/projects/training-recommendation-engine/models?alt=json
2018-03-06 17:03:58,778 - oauth2client.transport - INFO [line:151] - Attempting refresh to obtain initial access_token
2018-03-06 17:03:58,811 - oauth2client.client - INFO [line:795] - Refreshing access_token
2018-03-06 17:04:00,089 - Service - INFO [line:95] - try to clean old version ...
2018-03-06 17:04:00,100 - googleapiclient.discovery - INFO [line:868] - URL being requested: GET https://ml.googleapis.com/v1/projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions?alt=json
2018-03-06 17:04:00,359 - Service - INFO [line:140] - delete model version [projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions/v20180306165500482317]
20

{'err_cde': '00',
 'response': {'metadata': {'@type': 'type.googleapis.com/google.cloud.ml.v1.OperationMetadata',
   'createTime': '2018-03-06T09:04:00Z',
   'modelName': 'projects/training-recommendation-engine/models/foo_bar_movielens_recommendation',
   'operationType': 'CREATE_VERSION',
   'version': {'createTime': '2018-03-06T09:03:59Z',
    'deploymentUri': 'gs://recomm-job/foo-bar/movielens_recommendation/model/export/export_foo-bar/1520326728',
    'description': '[foo-bar] recommendation model',
    'name': 'projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions/v20180306170401467879',
    'runtimeVersion': '1.4'}},
  'name': 'projects/training-recommendation-engine/operations/create_foo_bar_movielens_recommendation_v20180306170401467879-1520327039058'}}

## Get Information From Deployed Model

In [51]:
# Reload the project modules and call ctrl.model_info for the GCS-hosted config;
# the returned value is displayed as the cell output.
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery

utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml'}
ret = ctrl.model_info(params)
ret

2018-03-06 17:04:14,426 - googleapiclient.discovery - INFO [line:868] - URL being requested: GET https://ml.googleapis.com/v1/projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions/v20180306170401467879?alt=json
2018-03-06 17:04:14,427 - oauth2client.transport - INFO [line:151] - Attempting refresh to obtain initial access_token
2018-03-06 17:04:14,460 - oauth2client.client - INFO [line:795] - Refreshing access_token


{'err_cde': '00',
 'response': {'createTime': '2018-03-06T09:03:59Z',
  'deploymentUri': 'gs://recomm-job/foo-bar/movielens_recommendation/model/export/export_foo-bar/1520326728',
  'description': '[foo-bar] recommendation model',
  'name': 'projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions/v20180306170401467879',
  'runtimeVersion': '1.4',
  'state': 'CREATING'}}

## RESTful Predict

In [64]:


# Rebuild `ratings` and `movies` exactly as in the preprocessing cell (without the
# train/test split) so this section can be run on its own.
ratings = pd.read_csv("{}/ml-latest-small/ratings.csv".format(datapath))
ratings['timestamp'] = ratings.timestamp.map(dt.datetime.fromtimestamp).map(str)
ratings['ori_rating'] = ratings['rating']
ratings['rating'] = (ratings.rating >= 4).astype(int)

movies = pd.read_csv("{}/ml-latest-small/movies.csv".format(datapath))
avg_rt = ratings.groupby("movieId", as_index=False).ori_rating.mean().rename(index=str, columns={'ori_rating': 'avg_rating'})
movies = movies.merge(avg_rt, how='left', on='movieId')
# Raw string: "\(" / "\s" / "\d" are invalid escapes in a normal string literal.
movies["year"] = movies.title.str.findall(r"\(\s*(\d+)\s*\)").map(lambda lst: int(lst[-1]) if len(lst) else None)

# `ctrl` and `params` are taken from a previously executed cell.
loader = ctrl.load_schema(params)
# The merged CSV has no header row, so column names come from the schema.
merged = pd.read_csv('../data/foo/user_supplied/raws/merged_movielens.csv', names=loader.schema.raw_cols)
merged.head()

2018-03-06 17:13:48,520 - Loader - INFO [line:363] - try to unserialize from gs://recomm-job/foo-bar/movielens_recommendation/data/parsed.yaml


Unnamed: 0,user_id,query_movie_ids,genres,avg_rating,year,candidate_movie_id,timestamp,rating
0,1,"1953,2105,31,1029,1061,1129,1263,1287,1293,133...",Drama,4.26087,1989.0,1172,2009-12-14 10:53:25,1
1,1,"1172,2105,31,1029,1061,1129,1263,1287,1293,133...",Action|Crime|Thriller,4.021739,1971.0,1953,2009-12-14 10:53:11,1
2,1,"1172,1953,31,1029,1061,1129,1263,1287,1293,133...",Action|Adventure|Sci-Fi,3.478723,1982.0,2105,2009-12-14 10:52:19,1
3,1,"1172,1953,2105,1029,1061,1129,1263,1287,1293,1...",Drama,3.178571,1995.0,31,2009-12-14 10:52:24,0
4,1,"1172,1953,2105,31,1061,1129,1263,1287,1293,133...",Animation|Children|Drama|Musical,3.702381,1941.0,1029,2009-12-14 10:52:59,0


In [118]:
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
reload('trainer.service')

user_id = 22
# The first merged row of this user supplies the user-side fields of the request
# (evaluated once instead of once per field).
first_row = merged.query('user_id == {}'.format(user_id)).iloc[[0]]
data = {
    'user_id': first_row.user_id.tolist(),
    'query_movie_ids': first_row.query_movie_ids.tolist(),
}
# Every movie is sent as a candidate. `columns=` replaces the positional axis
# argument of drop(), which newer pandas releases no longer accept.
items = movies.rename(index=str, columns={"movieId": "candidate_movie_id"}).drop(columns='title')
# Plain column assignment replaces the column, so the str dtype is kept
# regardless of how `.loc[:, col] = ...` handles dtype changes.
items['candidate_movie_id'] = items.candidate_movie_id.astype(str)
items = items.to_dict('list')
data.update(items)

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml',
          'json_data': data}
ret = ctrl.predict(params)

2018-03-06 17:54:59,573 - Loader - INFO [line:363] - try to unserialize from gs://recomm-job/foo-bar/movielens_recommendation/data/parsed.yaml
2018-03-06 17:55:01,630 - Loader - INFO [line:451] - try to restful transform ... 
<class 'dict'>
2018-03-06 17:55:01,715 - Ctrl - INFO [line:316] - foo-bar: predict take time 0:00:02.344979


In [128]:
# Show the 'query_movie_ids' entry of the predict response.
# The next line presumably records the keys seen in ret['response']:
# dict_keys(['avg_rating', 'query_movie_ids', 'genres', 'year', 'candidate_movie_id'])
ret['response'].get('query_movie_ids')

[[954,
  1456,
  1254,
  1225,
  1126,
  1118,
  1117,
  1116,
  1113,
  1101,
  1084,
  1046,
  1025,
  1010,
  995,
  972,
  967,
  958,
  1729,
  1811,
  1814,
  2477,
  3214,
  3093,
  3034,
  2824,
  2819,
  2683,
  2584,
  2437,
  1967,
  2399,
  2375,
  2289,
  2213,
  2170,
  2063,
  2039,
  956,
  1404,
  407,
  562,
  499,
  428,
  495,
  523,
  486,
  525,
  526,
  226,
  233,
  143,
  267,
  696,
  881,
  2310,
  208,
  2402,
  2373,
  204,
  2454,
  185,
  2397,
  917,
  2214,
  2532,
  240,
  2181,
  2175,
  2174,
  2169,
  2162,
  2161,
  2148,
  2140,
  2478,
  2646,
  2554,
  3001,
  3217,
  46,
  3203,
  3200,
  3199,
  3183,
  47,
  3060,
  3046,
  66,
  2952,
  153,
  2894,
  2861,
  133,
  138,
  2744,
  2720,
  2682,
  2095,
  2637,
  2610,
  2130,
  322,
  2094,
  522,
  1346,
  1337,
  1310,
  1306,
  1295,
  1280,
  1235,
  1130,
  496,
  520,
  1115,
  2085,
  1112,
  1066,
  649,
  1018,
  650,
  872,
  971,
  965,
  959,
  889,
  1360,
  1361,
  1388,
  43,


<br/>
<br/>
<br/>
## Local Transform

In [None]:
# Reload the project modules and call ctrl.gen_data with the local config
# file and is_local=True.
from tensorflow.contrib.training.python.training.hparam import HParams

utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
service = reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 'is_local': True}
ctrl.gen_data(params)

<br/>
<br/>
<br/>
## Local View Schema

In [None]:
# Reload the project modules, unserialize the schema from the local parsed.yaml
# and display its attributes.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
service = reload('trainer.service')

with flex.io('../repo/foo-bar/movielens_recommendation/data/parsed.yaml').as_reader() as f:
    # `schema` is also referenced by the `trans` helper in a later cell.
    schema = flex.Schema.unserialize(f.stream)
vars(schema)

<br/>
<br/>
<br/>
## Local Training

In [None]:
# Reload the project modules (including the estimator module) and call
# ctrl.train with the local config and 1000 train steps.
from tensorflow.contrib.training.python.training.hparam import HParams

utils = reload('trainer.utils.utils')
env = reload('trainer.env')
reload('trainer.reco_mf_dnn_est')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 
          'is_local': True,
          'runtime_version': '1.4',
          'train_steps': 1000}
ctrl.train(params)

## Test

In [45]:
# Reload the project modules and print the result of ctrl.test.
# NOTE(review): conf_path is the local YAML while raw_dir points at GCS --
# confirm this mix is intended.
from tensorflow.contrib.training.python.training.hparam import HParams

utils = reload('trainer.utils.utils')
env = reload('trainer.env')
reload('trainer.reco_mf_dnn_est')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 
          'raw_dir': 'gs://recomm-job/foo-bar'}
print(ctrl.test(params))

{'response': {'export_path': 'gs://recomm-job/foo-bar/movielens_recommendation/model/export/export_foo-bar/1520326380', 'model_name': 'foo_bar_movielens_recommendation', 'job_id': 'movielens_recommendation_20180306165116939740', 'version': 'v20180306165500482317'}, 'err_cde': '00'}


### Update movielens.yaml on GCS

In [None]:
# Copy the local movielens.yaml to its GCS location through flex.io.
# NOTE(review): shutil, Blob and BytesIO are not referenced by the active code
# in this cell (BytesIO appears only in the commented-out alternative below).
import shutil
from google.cloud.storage.blob import Blob
from io import BytesIO

utils = reload('trainer.utils.utils')
flex = reload('trainer.utils.flex')
env = reload('trainer.env')

# NOTE(review): the "Local View Schema" cell opens flex.io(...) via .as_reader();
# here the flex.io objects are used directly as context managers -- confirm that
# read()/write() are available on them in this form.
with flex.io('../data/foo/user_supplied/movielens.yaml') as r, \
    flex.io('gs://movielens-foo/user_supplied/movielens.yaml') as w:
    w.write(r.read())

# Alternative kept for reference: upload through the GCS blob API.
# stream = BytesIO(open('../data/foo/user_supplied/movielens.yaml', mode='rb').read())
# utils.gcs_blob('gs://movielens-foo/user_supplied/movielens.yaml').upload_from_file(stream)

<br/>
<br/>
<br/>
## Python API Credential

In [None]:
# Build an ML API client and a Cloud Storage client from the same
# service-account file, then fetch the 'recomm-job' bucket.
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery
from google.cloud import storage

# Path to the credential file, relative to the notebook directory.
authpath = '../auth.json'
project = 'training-recommendation-engine'
cred = GoogleCredentials.from_stream(authpath)
svc = discovery.build('ml', 'v1', credentials=cred)

st_client = storage.Client.from_service_account_json(authpath)
bucket = st_client.get_bucket('recomm-job')

In [None]:
# Download the merged CSV blob from `bucket` into an in-memory bytes buffer.
from io import StringIO, BytesIO

blob = bucket.get_blob('user_supplied/raws/merged_movielens.csv')
# Despite its name, `sio` is a BytesIO, not a StringIO.
sio = BytesIO()
blob.download_to_file(sio)

In [None]:
# Scratch experiment: feed the `data` dict through a tf.data pipeline.
# NOTE(review): `data` and `schema` are not defined in this cell -- presumably the
# predict payload dict and the schema loaded in "Local View Schema"; confirm.
cols = list(data.keys())
multi_cols = ('query_movie_ids', 'genres')

def trans(features):
    """Print the schema transform of 'query_movie_ids' and return `features` unchanged."""
    # features = OrderedDict(zip(cols, data))
    print( schema.col_states_['query_movie_ids'].transform( features['query_movie_ids'] ) )
    # for col in multi_cols:
    #     features[col] = tf.string_to_number(tf.string_split(features[col], ',').values, out_type=tf.int32)
    return features

def add_seq_cols(feat):
    """Add a '<col>_len' entry (tf.size of the column) for every name in `multi_cols`.

    Side effect: each new name is also appended to the module-level `cols` list.
    """
    for m_col in multi_cols:
        name = '{}_len'.format(m_col)
        feat[name] = tf.size(feat[m_col])
        cols.append(name)
    return feat

tf.reset_default_graph()
with tf.Graph().as_default():
    dataset = tf.data.Dataset.from_tensors(data)
    dataset = dataset.map(trans, num_parallel_calls=4)
    dataset = dataset.map(add_seq_cols, num_parallel_calls=4)
    print('cols', cols)
    dataset = dataset.repeat(1)
    # The padded shapes are zipped positionally with `cols`: eight hard-coded entries.
    dataset = dataset.padded_batch(5, OrderedDict(zip(cols, ([], [], [], [None], [], [None], [], []))))
    inputs = dataset.make_one_shot_iterator().get_next()
    with tf.train.MonitoredTrainingSession() as sess:
        # Pull batches until the session reports it should stop.
        while not sess.should_stop():
            _, = sess.run([inputs])
            # print( sess.run(inputs) )
            pass

<br/>
<br/>
<br/>
## Dataset

In [None]:
def make_datasets(fpath_ary, schema, n_batch=128, n_epoch=1):
    """Build a one-shot tf.data input pipeline over CSV files described by ``schema``.

    Args:
        fpath_ary: file path(s) handed to ``tf.data.TextLineDataset``.
        schema: project schema object; this function reads ``df_conf_``, ``cols``,
            ``label`` and the column-name constants ``TYPE``, ``M_DTYPE``,
            ``CATG`` and ``IS_MULTI``.
        n_batch: batch size.
        n_epoch: number of times the dataset is repeated.

    Returns:
        ``(features, label)`` where ``features`` is the mapping produced by the
        iterator with the entry named ``schema.label[0]`` popped out as ``label``.
    """
    df_conf = schema.df_conf_.query('{}.notnull()'.format(schema.TYPE))
    cols = schema.cols
    # Rows describing multivalent categorical columns (comma-separated ids in the CSV).
    has_multi = (df_conf[schema.M_DTYPE] == schema.CATG) & (df_conf[schema.IS_MULTI] == True)
    multi_cols = df_conf[has_multi].id.values

    # decode_csv defaults: '' for multivalent categorical, 0 for single categorical,
    # and [] (no default) for every other column.
    defaults = []
    for _, r in df_conf.iterrows():
        if r[schema.M_DTYPE] == schema.CATG:
            defaults.append([''] if r[schema.IS_MULTI] else [0])
        else:
            defaults.append([])

    def to_dense(sparse):
        """Turn the split string tokens into a flat int32 vector."""
        dense = tf.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values, '')
        return tf.reshape(tf.to_int32(tf.string_to_number(dense)), [-1])

    def parse_csv(value):
        """Decode one CSV line into an ordered mapping of column name -> tensor."""
        data = tf.decode_csv(value, record_defaults=defaults)
        features = OrderedDict(zip(cols, data))
        for col in multi_cols:
            features[col] = to_dense(tf.string_split([features[col]], ','))
            # features['{}_lens'.format(col)] = tf.size(features[col])
        return features

    dataset = tf.data.TextLineDataset(fpath_ary)
    dataset = dataset.map(parse_csv, num_parallel_calls=4)
    if has_multi.any():
        # Variable-length columns need padding to the longest row of each batch.
        padded_shapes = OrderedDict(zip(cols, tuple([None] if e else [] for e in has_multi)))
        dataset = dataset.padded_batch(n_batch, padded_shapes)
    else:
        dataset = dataset.batch(n_batch)
    # NOTE(review): shuffle runs after batching, so whole batches are shuffled,
    # not individual rows -- confirm this is intended.
    dataset = dataset.shuffle(n_batch * 10, seed=seed).repeat(n_epoch)
    features = dataset.make_one_shot_iterator().get_next()
    label = features.pop(schema.label[0])
    return features, label
                                
# tf.reset_default_graph()
# Smoke test: iterate the whole pipeline once inside a fresh graph.
# (No `ctx = []` here: that name holds the project root path from the setup cell.)
with tf.Graph().as_default():
    inputs = make_datasets(['./movielens.tr'], loader.schema, n_batch=30)
    with tf.train.MonitoredTrainingSession() as sess:
        # Pull batches until the session reports it should stop.
        while not sess.should_stop():
            sess.run([inputs])
            # print( sess.run(inputs) )

## Feature Columns with tf.feature_column.input_layer

In [None]:
# Draw 1000 standard-normal samples, rescale them with minmax_scale and plot
# their histogram with 50 bins.
a = pd.Series(
    minmax_scale(np.random.normal(loc=0, scale=1, size=1000))
)
a.hist(bins=50)

In [None]:
%%time
tf.reset_default_graph()
with tf.Graph().as_default():
    # Feature columns: hashed + embedded user id, and the raw average rating.
    user_id = tf.feature_column.categorical_column_with_hash_bucket('user_id', hash_bucket_size=1000, dtype=tf.int32)
    user_id = tf.feature_column.embedding_column(user_id, dimension=8)
    avg_rating = tf.feature_column.numeric_column('avg_rating')
    columns = [user_id, avg_rating]

    # NOTE(review): this definition rebinds the module-level `make_datasets` from
    # the "Dataset" section with a different signature -- re-run that cell before
    # using the schema-based version again.
    def make_datasets(fpath_ary):
        """Return the next-batch tensors of a CSV pipeline with a fixed 7-column layout."""
        cols = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
        defaults = [[0], [''], [''], [], [], [0], []]

        def parse_csv(value):
            """Decode one CSV line into an ordered mapping of column name -> tensor."""
            data = tf.decode_csv(value, record_defaults=defaults)
            features = OrderedDict(zip(cols, data))
            # print(features)
            return features

        dataset = tf.data.TextLineDataset(fpath_ary)
        dataset = (dataset.map(parse_csv, num_parallel_calls=4)
                          .batch(3)
                          # .padded_batch(3, OrderedDict(zip(cols, ([], [None], [None], [], [], [], []))))
                          .shuffle(10, seed=seed)
                          .repeat(1)
                  )
        return dataset.make_one_shot_iterator().get_next()

    inputs = make_datasets(['./te_processed.batch.csv'])
    # input_layer is built from the entries named by `columns`.
    inputs = tf.feature_column.input_layer(inputs, columns)
    # features = tf.parse_example(serialized_example, features=tf.feature_column.make_parse_example_spec(columns))
    # (No `ctx = []` here: that name holds the project root path from the setup cell.)
    with tf.train.MonitoredTrainingSession() as sess:
        while not sess.should_stop():
            print(sess.run(inputs))

### Make Example

In [None]:
%%time
# Column layout of the processed CSV; the lists below are positionally aligned with `cols`.
cols = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
is_multi = [False, True, True, False, False, False, False]
pd_dtypes = [int, str, str, float, float, int, float]
# Name of the tf.train.Feature field that stores each column.
types = ['int64_list', 'int64_list', 'int64_list', 'float_list', 'float_list', 'int64_list', 'float_list']
tf_types = [tf.int64, tf.int64, tf.int64, tf.float32, tf.float32, tf.int64, tf.float32]
def persist_example(fpath, tfpath):
    """Convert the header-less CSV at ``fpath`` into a TFRecord file at ``tfpath``.

    Each CSV row becomes one ``tf.train.Example``; the CSV is read in chunks of
    1000 rows. ``query_movie_ids`` and ``genres`` are parsed as comma-separated
    integers.
    """
    def to_int_list(raw):
        # Materialise a list: a lazy `map` object stored in the frame can be
        # consumed only once and would read as empty on any second access.
        return [int(v) for v in raw.split(',')]

    with tf.python_io.TFRecordWriter(tfpath) as w:
        for chunk in pd.read_csv(fpath, names=cols, dtype=dict(zip(cols, pd_dtypes)), chunksize=1000):
            chunk['query_movie_ids'] = chunk.query_movie_ids.map(to_int_list)
            chunk['genres'] = chunk.genres.map(to_int_list)

            for _, r in chunk.iterrows():
                ex = tf.train.Example()
                for multi, col, tpe in zip(is_multi, cols, types):
                    val = r[col]
                    # ex.features.feature[col].int64_list or float_list or bytes_list
                    feat_type = getattr(ex.features.feature[col], tpe)
                    # extend function for multivalent columns, otherwise append
                    append_or_extend = 'append' if not multi else 'extend'
                    getattr(feat_type.value, append_or_extend)(val)
                w.write(ex.SerializePartialToString())

persist_example('./te_processed.csv', './data.tfrecord')

In [None]:
def decode_example(ser_example):
    """Parse one serialized record into a tuple of tensors ordered like `cols`.

    Scalar columns are declared as context features and the two multivalent
    columns ('query_movie_ids', 'genres') as sequence features.

    NOTE(review): the records are written by persist_example as tf.train.Example,
    while this parses them with tf.parse_single_sequence_example -- confirm the
    two formats are compatible for the sequence features.
    """
    # queue = tf.train.string_input_producer([fpath], num_epochs=1)
    # _, ser_example = tf.TFRecordReader().read(queue)
    # ser_example = tf.train.batch([ser_example], batch_size=10)
    ctx_features = {col: tf.FixedLenFeature([], tf_tpe)
                    for col, tf_tpe in zip(cols, tf_types) if col not in ('query_movie_ids', 'genres')}
    seq_features = {col: tf.FixedLenSequenceFeature([], tf_tpe) 
                    for col, tf_tpe in [('query_movie_ids', tf.int64), ('genres', tf.int64)]}
    context_dict, sequence_dict = tf.parse_single_sequence_example(ser_example, 
                                                                   context_features=ctx_features, 
                                                                   sequence_features=seq_features)
    # for col, tpe in zip(cols, tf_types):
    #     val = feature_dict[col]
    #     feature_dict[col] = tf.sparse_to_dense(val.indices, val.dense_shape, val.values, name=col)
    feature_dict = {}
    feature_dict.update(context_dict)
    feature_dict.update(sequence_dict)
    # Re-order the parsed tensors to follow `cols`.
    ret = OrderedDict()
    for c in cols:
        ret[c] = feature_dict[c]
    return tuple(ret.values())

tf.reset_default_graph()
with tf.Graph().as_default():
    dataset = tf.data.TFRecordDataset(['./data.tfrecord'])
    # padded_shapes is positional and follows `cols`: the 2nd and 3rd entries are variable-length.
    dataset = dataset.map(decode_example).padded_batch(10, padded_shapes=([], [None], [None], [], [], [], []))
    # dataset = dataset.batch(3)
    iters = dataset.make_one_shot_iterator()
    r = iters.get_next()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        # Print the first batch only.
        print( sess.run(r) )

## Traditional parse_example
1. tf.train.Coordinator + tf.train.start_queue_runners

In [None]:
from tensorflow.python.framework import sparse_tensor
import re

def to_sparse(dense):
    """Return a SparseTensor holding the non-zero entries of `dense`."""
    idx = tf.where(tf.not_equal(dense, 0))
    return tf.SparseTensor(idx, tf.gather_nd(dense, idx), dense.get_shape())

def make_example(val):
    """Build a tf.train.Example whose 'query_movie_ids' and 'genres' both hold `val`."""
    example = tf.train.Example(features=tf.train.Features(
        feature = {
            'query_movie_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=val)),
            'genres': tf.train.Feature(int64_list=tf.train.Int64List(value=val))
        }
    ))
    return example

tf.reset_default_graph()
with tf.Graph().as_default():

    filename = "tmp.tfrecords"
    # The record file is only written when it does not exist yet.
    if not os.path.exists(filename):
        # os.remove(filename)
        writer = tf.python_io.TFRecordWriter(filename)
        with writer:
            # NOTE(review): `teProcessed` is not defined anywhere in this notebook
            # as shown -- presumably a processed test DataFrame from a removed cell.
            for idx, r in teProcessed.head().iterrows():
                for col in ('query_movie_ids', 'genres'):
                    # Raw string: "\s" is an invalid escape in a normal string literal.
                    val = list(map(int, re.split(r',\s*', r[col])))
                    ex = make_example(val)
                    writer.write(ex.SerializeToString())

    # Queue-based reading: a filename queue feeds a TFRecordReader.
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer(["tmp.tfrecords"], num_epochs=1)
    _, serialized_example = reader.read(filename_queue)

    batch = tf.train.batch(tensors=[serialized_example], batch_size=1)
    features = {
        'query_movie_ids': tf.VarLenFeature(tf.int64),
        'genres': tf.VarLenFeature(tf.int64)
    }
    data = tf.parse_example(batch, features)
    query_movie_ids = data['query_movie_ids']
    embbedding = tf.Variable(tf.glorot_uniform_initializer()([9125]), dtype=tf.float32)
    print(query_movie_ids.dense_shape)
    # r = tf.layers.dense(query_movie_ids, 10)
    # emb_query = tf.nn.embedding_lookup_sparse([embbedding], query_movie_ids, None, combiner='sqrtn')
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        try:
            print(sess.run(data))
            pass
        except tf.errors.OutOfRangeError as e:
            coord.request_stop(e)
        finally:
            # Always stop and join the queue-runner threads.
            coord.request_stop()
            coord.join(threads)
    

## Test

In [None]:
# Toy experiment: the prediction is the concatenation of a non-trainable 1x8 block
# of ones and a trainable 9x8 block; it is fitted to an all-ones 10x8 target with
# 1000 gradient-descent steps, printing the prediction before and after.
tf.reset_default_graph()
with tf.Graph().as_default():
    labels = tf.constant(np.ones([10, 8]))
    pred = tf.concat([tf.Variable(tf.ones(shape=[1, 8]), trainable=False), tf.Variable(tf.truncated_normal([9, 8]))], 0)
    loss = tf.losses.mean_squared_error(predictions=pred, labels=labels)
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        print(pred.eval())
        for i in range(1000):
            sess.run([train_op])
        print()
        print(pred.eval())

In [None]:
# Leftover scratch expression: a bare reference to tf.zeros (no call is made).
tf.zeros