In [1]:
%matplotlib inline
import os, sys, numpy as np, pandas as pd, tensorflow as tf, re, codecs, seaborn as sns, json, time, csv, datetime as dt
import pickle, collections, random, math, numbers, scipy.sparse as sp, matplotlib.pyplot as plt, scipy.sparse as sp

from pprint import pprint
from tensorflow.contrib.training.python.training.hparam import HParams

def reload(mName):
    """Force a fresh import of the module named ``mName`` and return it.

    The cached entry for ``mName`` (if any) is evicted from ``sys.modules``
    before importing, so the module's source is executed again. Only that
    one entry is evicted; other cached modules are left alone.
    """
    from importlib import import_module

    # Evict the cached module; a no-op when it has never been imported.
    sys.modules.pop(mName, None)
    return import_module(mName)


from collections import deque, defaultdict, OrderedDict
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, minmax_scale
from matplotlib import pyplot as plt
plt.style.use('ggplot')

# classpath
# Project root = parent of the current working directory, with backslashes
# normalised to forward slashes.
ctx = os.path.abspath('..').replace('\\', '/')
cps = [ctx]
# Put the project root at the front of sys.path so the local `trainer` package
# can be imported. A plain loop is used rather than a throw-away list
# comprehension bound to `_`, which would shadow IPython's last-output variable.
for cp in cps:
    if cp not in sys.path:
        sys.path.insert(0, cp)

# data path
datapath = '/'.join([ctx, 'data'])

seed = 88
utils = reload('trainer.utils.utils')
np.set_printoptions(precision=4, suppress=True, linewidth=100)
# Seeds NumPy's global RNG only; `seed` is also passed explicitly where needed.
np.random.seed(seed)

  from ._conv import register_converters as _register_converters


## Simple Data Preprocessing

In [55]:
import datetime as dt

# Load the raw ratings and render the POSIX timestamp as a readable string.
# NOTE(review): datetime.fromtimestamp converts to the machine's local time zone,
# so the resulting strings depend on where the notebook runs - confirm this is intended.
ratings = pd.read_csv("{}/ml-latest-small/ratings.csv".format(datapath))
ratings['timestamp'] = ratings.timestamp.map(dt.datetime.fromtimestamp).map(str)
# Keep the original rating, then binarise the label: 1 when rating >= 4, else 0.
ratings['ori_rating'] = ratings['rating']
ratings['rating'] = (ratings.rating >= 4).astype(int)
tr, te = utils.split_by_ratio(ratings)

# Attach each movie's mean original rating (NaN for movies without ratings: left join).
movies = pd.read_csv("{}/ml-latest-small/movies.csv".format(datapath))
avg_rt = ratings.groupby("movieId", as_index=False).ori_rating.mean().rename(index=str, columns={'ori_rating': 'avg_rating'})
movies = movies.merge(avg_rt, how='left', on='movieId')
# movies.avg_rating.fillna(ratings.rating.mean())
# Year = last parenthesised run of digits in the title; None when the title has none.
# The pattern is a raw string: "\(", "\s" and "\d" are invalid escape sequences in a
# regular string literal and trigger a warning on current Python versions.
movies["year"] = movies.title.str.findall(r"\(\s*(\d+)\s*\)").map(lambda lst: int(lst[-1]) if len(lst) else None)
# movies["year"] = minmax_scale(movies.year.fillna(movies.year.median()))

In [None]:
def preprocess(data, movie_trans, train_hist=None, is_train=True):
    """Expand a ratings frame into one model-ready row per (user, candidate movie).

    Every rating row is left-joined with the movie attributes in
    ``movie_trans`` and paired with a comma-separated "query" history:

    * ``is_train=True``: all *other* movies of the same user in ``data``,
      ordered by ``rating`` (highest first), i.e. leave-one-out.
    * ``is_train=False``: the user's movies in ``train_hist`` minus the
      candidate itself. The ids come from a set, so their order is unspecified.

    Args:
        data: DataFrame with ``userId``, ``movieId``, ``rating`` and
            ``timestamp`` columns.
        movie_trans: DataFrame with ``movieId`` plus ``genres``,
            ``avg_rating`` and ``year``.
        train_hist: DataFrame with ``userId`` and ``movieId``; required when
            ``is_train`` is False and ignored otherwise.
        is_train: selects how the query history is built (see above).

    Returns:
        DataFrame with columns ``user_id, query_movie_ids, genres,
        avg_rating, year, candidate_movie_id, timestamp, rating``.

    Raises:
        ValueError: if ``is_train`` is False and ``train_hist`` is None.
    """
    if not is_train and train_hist is None:
        raise ValueError("train_hist is required when is_train is False")

    columns = ["user_id", "query_movie_ids",
               "genres", "avg_rating", "year", "candidate_movie_id",
               "timestamp",
               "rating"]

    def list2str(lst):
        return ','.join(map(str, lst))

    data = data.merge(movie_trans, how="left", on="movieId")

    hist_by_user = {}
    if not is_train:
        # Single pass over the training ratings instead of a full-frame
        # query for every user in `data`.
        hist_by_user = {user: set(ids.tolist())
                        for user, ids in train_hist.groupby("userId").movieId}

    queue = []
    for u, df in data.groupby("userId"):
        df = df.sort_values("rating", ascending=False)
        # Plain list: cheap positional slicing, no per-row Series slicing.
        movie_ids = df.movieId.tolist()
        # Users absent from the training history get an empty history.
        tr_hist = hist_by_user.get(u, set())
        for i, (_, r) in enumerate(df.iterrows()):
            if is_train:
                query_hist = movie_ids[:i] + movie_ids[i + 1:]
            else:
                query_hist = list(tr_hist - {int(r.movieId)})
            queue.append([int(r.userId), list2str(query_hist), r.genres, r.avg_rating,
                          r.year, int(r.movieId), r.timestamp, r.rating])
    return pd.DataFrame(queue, columns=columns)
    
# Build the training rows and write them without index or header row.
tr_merged = preprocess(tr, movies)
tr_merged.to_csv('./tr.raw.movielens.csv', index=False, header=None)

# Build the test rows; each user's query history is taken from the training split `tr`.
te_merged = preprocess(te, movies, tr, is_train=False)
te_merged.to_csv('./te.raw.movielens.csv', index=False, header=None)
# Merge train and test rows into a single file
merged = pd.concat([tr_merged, te_merged], ignore_index=True)
merged.to_csv('./merged_movielens.csv', index=False, header=None)
merged.head()

<br/>
<br/>
<br/>
## Cmd Submit Training

In [None]:
!cd D:/Python/notebook/recomm_prod && \
gcloud ml-engine jobs submit training recomm_movielens_15 \
    --job-dir gs://recomm-job/foo/model \
    --runtime-version 1.4 \
    --module-name trainer.ctrl \
    --package-path trainer \
    --region asia-east1 \
    --config config.yaml \
    -- \
    --method train \
    --conf-path gs://recomm-job/foo/data/user_supplied/movielens.yaml

In [None]:
!gcloud ml-engine jobs describe recomm_movielens_15

In [None]:
!cd .. && python setup.py build

## Python Client API Transform Data

In [None]:
from tensorflow.contrib.training.python.training.hparam import HParams

# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
# Parameters are passed as an HParams object here; the other cells pass a plain dict.
hparam = HParams(conf_path='gs://movielens-foo/user_supplied/movielens.yaml')
hparam.add_hparam('is_local', False)
# Run the data transformation described by the YAML config (non-local mode).
ctrl.gen_data(hparam)

## View Schema

In [4]:
from tensorflow.contrib.training.python.training.hparam import HParams

# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml'}
# `loader` is reused by later cells (e.g. for `loader.schema`).
loader = ctrl.load_schema(params)

# Show the schema object's attributes as a dict.
vars(loader.schema)

2018-03-06 15:53:38,551 - Loader - INFO [line:363] - try to unserialize from gs://recomm-job/foo-bar/movielens_recommendation/data/parsed.yaml


{'col_states_': OrderedDict([('query_movie_ids',
               CatgMapper(allow_null=True, default=None, is_multi=True,
                     name='query_movie_ids', sep=',', vocabs=None, vocabs_path=None)),
              ('genres',
               CatgMapper(allow_null=True, default=None, is_multi=True, name='genres',
                     sep='|', vocabs=None, vocabs_path=None)),
              ('avg_rating', NumericMapper(default=None, name='avg_rating')),
              ('year', NumericMapper(default=None, name='year')),
              ('candidate_movie_id',
               CatgMapper(allow_null=True, default=None, is_multi=False,
                     name='candidate_movie_id', sep=None, vocabs=None, vocabs_path=None)),
              ('rating',
               CatgMapper(allow_null=False, default=None, is_multi=False, name='rating',
                     sep=None, vocabs=None, vocabs_path=None))]),
 'conf_': {'columns': [{'id': 'user_id', 'm_dtype': 'catg'},
   {'id': 'query_movie_ids',
  

In [99]:
# Small demo: right-pad ragged integer lists with zeros up to the longest list.
pad = tf.keras.preprocessing.sequence.pad_sequences
col = [[1, 2], [1, 2, 3], [1]]
maxlen = max(map(len, col))
pad(col, padding="post", maxlen=maxlen).tolist()


[[1, 2, 0], [1, 2, 3], [1, 0, 0]]

<br/>
<br/>
<br/>
## Python Client API Submit Training

In [154]:
# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
reload('trainer.env')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml',
          'runtime_version': '1.4'}
# Submit the training job; the returned dict carries 'job_id' and 'response'.
ret = ctrl.train_submit(params)
# `job_id` is read again by the "Describe Job States" cell.
job_id = ret.get('job_id')
print( ret.get('response') )

2018-03-08 15:22:27,661 - Ctrl - INFO [line:139] - foo-bar: gen_data take time 0:00:06.690555
jobId: foo_bar_movielens_recommendation_20180308152221665827
state: QUEUED
  for chunk in iter(lambda: fp.read(4096), ''):
Job [foo_bar_movielens_recommendation_20180308152221665827] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe foo_bar_movielens_recommendation_20180308152221665827

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs foo_bar_movielens_recommendation_20180308152221665827



<br/>
<br/>
<br/>
## Describe Job States

In [155]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery

# Re-import the project modules so that source edits made since the last run take effect.
env = reload('trainer.env')
utils = reload('trainer.utils.utils')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
# `job_id` comes from the training-submission cell, which must have been run first.
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml', 'job_id': job_id}
ret = ctrl.describe(params)
# Display the job description returned by the service.
ret.get('response')

2018-03-08 15:22:28,470 - googleapiclient.discovery - INFO [line:274] - URL being requested: GET https://www.googleapis.com/discovery/v1/apis/ml/v1/rest
2018-03-08 15:22:29,763 - googleapiclient.discovery - INFO [line:868] - URL being requested: GET https://ml.googleapis.com/v1/projects/training-recommendation-engine/jobs/foo_bar_movielens_recommendation_20180308152221665827?alt=json
2018-03-08 15:22:29,765 - oauth2client.transport - INFO [line:151] - Attempting refresh to obtain initial access_token
2018-03-08 15:22:29,798 - oauth2client.client - INFO [line:795] - Refreshing access_token
2018-03-08 15:22:31,101 - Ctrl - INFO [line:157] - foo-bar: describe take time 0:00:02.643986


{'createTime': '2018-03-08T07:22:26Z',
 'jobId': 'foo_bar_movielens_recommendation_20180308152221665827',
 'state': 'PREPARING',
 'trainingInput': {'args': ['--train-steps',
   '1000',
   '--method',
   'train',
   '--conf-path',
   'gs://movielens-foo/user_supplied/movielens.yaml',
   '--job-id',
   'foo_bar_movielens_recommendation_20180308152221665827'],
  'jobDir': 'gs://recomm-job/foo-bar/movielens_recommendation/model',
  'packageUris': ['gs://recomm-job/foo-bar/movielens_recommendation/model/packages/6a7f7097e2b3e0e19f97e673596db54eaeeeebba2e5e65a7c1bd959a20ed6dc7/trainer-0.1.tar.gz'],
  'pythonModule': 'trainer.ctrl',
  'pythonVersion': '3.5',
  'region': 'asia-east1',
  'runtimeVersion': '1.4'},
 'trainingOutput': {}}

## Deploy

In [161]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery

# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml'}
# Deploy the exported model; the recorded run logs model creation and removal of an old version.
ret = ctrl.deploy(params)
ret

2018-03-08 15:43:40,636 - Service - INFO [line:98] - try to create model [foo_bar_movielens_recommendation] ...
2018-03-08 15:43:40,643 - googleapiclient.discovery - INFO [line:868] - URL being requested: POST https://ml.googleapis.com/v1/projects/training-recommendation-engine/models?alt=json
2018-03-08 15:43:40,645 - oauth2client.transport - INFO [line:151] - Attempting refresh to obtain initial access_token
2018-03-08 15:43:40,682 - oauth2client.client - INFO [line:795] - Refreshing access_token
2018-03-08 15:43:41,969 - Service - INFO [line:101] - try to clean old version ...
2018-03-08 15:43:41,974 - googleapiclient.discovery - INFO [line:868] - URL being requested: GET https://ml.googleapis.com/v1/projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions?alt=json
2018-03-08 15:43:42,245 - Service - INFO [line:142] - delete model version [projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions/v20180308153622527903]
2

{'err_cde': '00',
 'response': {'metadata': {'@type': 'type.googleapis.com/google.cloud.ml.v1.OperationMetadata',
   'createTime': '2018-03-08T07:43:44Z',
   'modelName': 'projects/training-recommendation-engine/models/foo_bar_movielens_recommendation',
   'operationType': 'CREATE_VERSION',
   'version': {'createTime': '2018-03-08T07:43:43Z',
    'deploymentUri': 'gs://recomm-job/foo-bar/movielens_recommendation/model/export/export_foo-bar/1520494135',
    'description': '[foo-bar] recommendation model',
    'name': 'projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions/v20180308154343291994',
    'runtimeVersion': '1.4'}},
  'name': 'projects/training-recommendation-engine/operations/create_foo_bar_movielens_recommendation_v20180308154343291994-1520495023263'}}

## Get Information From Deployed Model

In [163]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery

# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml'}
# Fetch metadata of the deployed model version (state, deployment URI, runtime version, ...).
ret = ctrl.model_info(params)
ret

2018-03-08 15:45:11,616 - googleapiclient.discovery - INFO [line:868] - URL being requested: GET https://ml.googleapis.com/v1/projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions/v20180308154343291994?alt=json
2018-03-08 15:45:11,617 - oauth2client.transport - INFO [line:151] - Attempting refresh to obtain initial access_token
2018-03-08 15:45:11,651 - oauth2client.client - INFO [line:795] - Refreshing access_token


{'err_cde': '00',
 'response': {'createTime': '2018-03-08T07:43:43Z',
  'deploymentUri': 'gs://recomm-job/foo-bar/movielens_recommendation/model/export/export_foo-bar/1520494135',
  'description': '[foo-bar] recommendation model',
  'isDefault': True,
  'name': 'projects/training-recommendation-engine/models/foo_bar_movielens_recommendation/versions/v20180308154343291994',
  'runtimeVersion': '1.4',
  'state': 'READY'}}

## RESTful Predict

In [9]:
# NOTE(review): this repeats the loading logic of the "Simple Data Preprocess" cell so
# that this section can be run on its own; keep the two copies in sync.
ratings = pd.read_csv("{}/ml-latest-small/ratings.csv".format(datapath))
ratings['timestamp'] = ratings.timestamp.map(dt.datetime.fromtimestamp).map(str)
# Keep the original rating, then binarise the label: 1 when rating >= 4, else 0.
ratings['ori_rating'] = ratings['rating']
ratings['rating'] = (ratings.rating >= 4).astype(int)

movies = pd.read_csv("{}/ml-latest-small/movies.csv".format(datapath))
avg_rt = ratings.groupby("movieId", as_index=False).ori_rating.mean().rename(index=str, columns={'ori_rating': 'avg_rating'})
movies = movies.merge(avg_rt, how='left', on='movieId')
# Year = last parenthesised run of digits in the title; None when the title has none.
# Raw string: "\(", "\s" and "\d" are invalid escape sequences in a regular string literal.
movies["year"] = movies.title.str.findall(r"\(\s*(\d+)\s*\)").map(lambda lst: int(lst[-1]) if len(lst) else None)

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': '../data/foo/user_supplied/movielens.yaml'}
loader = ctrl.load_schema(params)
# The merged CSV was written without a header row, so column names come from the schema.
merged = pd.read_csv('../data/foo/user_supplied/raws/merged_movielens.csv', names=loader.schema.raw_cols)
merged.head()

2018-03-08 17:51:45,153 - Loader - INFO [line:363] - try to unserialize from gs://recomm-job/foo-bar/movielens_recommendation/data/parsed.yaml


Unnamed: 0,user_id,query_movie_ids,genres,avg_rating,year,candidate_movie_id,timestamp,rating
0,1,"1953,2105,31,1029,1061,1129,1263,1287,1293,133...",Drama,4.26087,1989.0,1172,2009-12-14 10:53:25,1
1,1,"1172,2105,31,1029,1061,1129,1263,1287,1293,133...",Action|Crime|Thriller,4.021739,1971.0,1953,2009-12-14 10:53:11,1
2,1,"1172,1953,31,1029,1061,1129,1263,1287,1293,133...",Action|Adventure|Sci-Fi,3.478723,1982.0,2105,2009-12-14 10:52:19,1
3,1,"1172,1953,2105,1029,1061,1129,1263,1287,1293,1...",Drama,3.178571,1995.0,31,2009-12-14 10:52:24,0
4,1,"1172,1953,2105,31,1061,1129,1263,1287,1293,133...",Animation|Children|Drama|Musical,3.702381,1941.0,1029,2009-12-14 10:52:59,0


In [24]:
def restful_data(user_ids, n_items=5):
    """Build the dict payload sent to the prediction endpoint.

    Reads the notebook-level frames ``merged`` (preprocessed ratings) and
    ``movies`` (movie attributes).

    Args:
        user_ids: tuple of user ids; it is formatted into the pandas query
            expression ``user_id in <user_ids>``.
        n_items: number of leading rows of ``movies`` to send as candidates.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        dict mapping column name to list. ``query_movie_ids`` holds one
        history string per matched user (the maximum of that user's
        ``query_movie_ids`` values). The remaining keys are the columns of
        ``movies`` except ``title``, with ``movieId`` renamed to
        ``candidate_movie_id`` and converted to ``str``.
    """
    data = {
        'query_movie_ids': merged.query('user_id in {}'.format(user_ids)).groupby('user_id').query_movie_ids.max().tolist(),
    }
    # `columns=` instead of the positional axis argument, which pandas 2.0 removed.
    items = movies.rename(index=str, columns={"movieId": "candidate_movie_id"}).drop(columns='title')
    # Whole-column assignment replaces the column; `.loc[:, col] = ...` would try to
    # write strings into the existing integer column.
    items['candidate_movie_id'] = items.candidate_movie_id.astype(str)
    # Only the first `n_items` candidates are sent.
    items = items[:n_items].to_dict('list')
    data.update(items)
    return data
    
restful_data((22,))

{'avg_rating': [3.8724696356275303,
  3.4018691588785046,
  3.1610169491525424,
  2.3846153846153846,
  3.267857142857143],
 'candidate_movie_id': ['1', '2', '3', '4', '5'],
 'genres': ['Adventure|Animation|Children|Comedy|Fantasy',
  'Adventure|Children|Fantasy',
  'Comedy|Romance',
  'Comedy|Drama|Romance',
  'Comedy'],
 'query_movie_ids': ['32,1884,1580,1527,1387,1377,1376,1375,1372,1356,1339,1291,1270,1255,1240,1215,1210,1200,2174,2288,2291,3081,4011,3868,3793,3535,3527,3355,3213,3033,2459,2987,2959,2858,2762,2712,2571,2542,1198,1799,457,648,555,480,551,589,541,592,593,253,260,163,296,858,1089,2881,235,2990,2953,231,3052,208,2985,1148,2763,3147,267,2723,2717,2716,2710,2701,2700,2683,2672,3082,3300,3176,3751,4015,47,3999,3996,3994,3977,48,3826,3809,70,3697,173,3623,3578,153,158,3438,3408,3354,2617,3285,3253,2657,356,2616,588,1693,1682,1645,1641,1625,1608,1544,1391,552,586,1374,2605,1371,1320,784,1263,785,1080,1214,1208,1201,1097,1721,1722,1769,44,315,355,2502,1101,2431,2402,2340,230

In [10]:
# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
# The request payload is built by `restful_data` for user 22.
params = {'conf_path': 'gs://movielens-foo/user_supplied/movielens.yaml',
          'json_data': restful_data((22,))}
ret = ctrl.predict(params)
ret.get('response')

In [107]:
"""
query_movie_ids
genres
avg_rating
year
candidate_movie_id
query_movie_ids_len
genres_len
"""
# path = 'D:/Python/notebook/recomm_prod/repo/foo-bar/movielens_recommendation/1520474347'
# NOTE(review): machine-specific absolute path to an exported SavedModel directory.
path = 'D:/Python/notebook/recomm_prod/repo/foo-bar/movielens_recommendation/model/export/export_foo-bar/1520485856'
tf.reset_default_graph()
with tf.Graph().as_default():
    with tf.Session() as sess:
        # Load the SavedModel tagged for serving into this session's graph.
        tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], path)
        # Example of running a prediction directly against the loaded graph (kept for reference):
        # graph = sess.graph
        # print(sess.run(graph.get_tensor_by_name('gmf/pred:0'), feed_dict={
        #     graph.get_tensor_by_name('query_movie_ids:0'): data_for_model.get('query_movie_ids'),
        #     graph.get_tensor_by_name('genres:0'): data_for_model.get('genres'),
        #     graph.get_tensor_by_name('avg_rating:0'): data_for_model.get('avg_rating'),
        #     graph.get_tensor_by_name('year:0'): data_for_model.get('year'),
        #     graph.get_tensor_by_name('candidate_movie_id:0'): data_for_model.get('candidate_movie_id'),
        #     graph.get_tensor_by_name('query_movie_ids_len:0'): data_for_model.get('query_movie_ids_len'),
        #     graph.get_tensor_by_name('genres_len:0'): data_for_model.get('genres_len')
        # }))
        # Print the name of every operation in the loaded graph.
        for node in sess.graph.get_operations():
            print(node.name)

INFO:tensorflow:Restoring parameters from b'D:/Python/notebook/recomm_prod/repo/foo-bar/movielens_recommendation/model/export/export_foo-bar/1520485856\\variables\\variables'
2018-03-08 13:12:33,378 - tensorflow - INFO [line:116] - Restoring parameters from b'D:/Python/notebook/recomm_prod/repo/foo-bar/movielens_recommendation/model/export/export_foo-bar/1520485856\\variables\\variables'
global_step/Initializer/zeros/shape_as_tensor
global_step/Initializer/zeros/Const
global_step/Initializer/zeros
global_step
global_step/Assign
global_step/read
query_movie_ids
genres
avg_rating
year
candidate_movie_id
query_movie_ids_len
genres_len
init/random_uniform/shape
init/random_uniform/min
init/random_uniform/max
init/random_uniform/RandomUniform
init/random_uniform/sub
init/random_uniform/mul
init/random_uniform
init/b_global
init/b_global/Assign
init/b_global/read
init/embedding/random_uniform/shape
init/embedding/random_uniform/min
init/embedding/random_uniform/max
init/embedding/random_unif

<br/>
<br/>
<br/>
## Local Transform

In [100]:
# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
service = reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
# Same transformation as the remote variant, but driven by the local YAML config.
params = {'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 'is_local': True}
ctrl.gen_data(params)

2018-03-08 13:07:29,937 - Loader - INFO [line:363] - try to unserialize from D:\Python\notebook\recomm_prod\repo/foo-bar/movielens_recommendation/data/parsed.yaml
2018-03-08 13:07:31,455 - Loader - INFO [line:381] - try to transform ['D:/Python/notebook/recomm_prod/data/foo/user_supplied/raws/merged_movielens.csv'] ... 
2018-03-08 13:08:21,425 - Loader - INFO [line:440] - [D:/Python/notebook/recomm_prod/data/foo/user_supplied/raws/merged_movielens.csv]: process take time 0:00:47.925426
2018-03-08 13:08:21,431 - Ctrl - INFO [line:87] - foo-bar: gen_data take time 0:00:51.494970


{'err_cde': '00'}

<br/>
<br/>
<br/>
## Local View Schema

In [3]:
# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
service = reload('trainer.service')

# Read the serialized schema from the local repo directory and rebuild the Schema object.
with flex.io('../repo/foo-bar/movielens_recommendation/data/parsed.yaml').as_reader() as f:
    schema = flex.Schema.unserialize(f.stream)
# Show the schema object's attributes as a dict.
vars(schema)

{'col_states_': OrderedDict([('query_movie_ids',
               CatgMapper(allow_null=True, default=None, is_multi=True,
                     name='query_movie_ids', sep=',', vocabs=None, vocabs_path=None)),
              ('genres',
               CatgMapper(allow_null=True, default=None, is_multi=True, name='genres',
                     sep='|', vocabs=None, vocabs_path=None)),
              ('avg_rating', NumericMapper(default=None, name='avg_rating')),
              ('year', NumericMapper(default=None, name='year')),
              ('candidate_movie_id',
               CatgMapper(allow_null=True, default=None, is_multi=False,
                     name='candidate_movie_id', sep=None, vocabs=None, vocabs_path=None)),
              ('rating',
               CatgMapper(allow_null=False, default=None, is_multi=False, name='rating',
                     sep=None, vocabs=None, vocabs_path=None))]),
 'conf_': {'columns': [{'id': 'user_id', 'm_dtype': 'catg'},
   {'id': 'query_movie_ids',
  

<br/>
<br/>
<br/>
## Local Training

In [8]:
from tensorflow.contrib.training.python.training.hparam import HParams

# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
reload('trainer.reco_mf_dnn_est')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 
          'is_local': True,
          'runtime_version': '1.4',
          'train_steps': 600}
# Train locally; `result` is read by later cells via result.get('response').
result = ctrl.train(params)

2018-03-08 17:12:59,158 - Ctrl - INFO [line:168] - received params: {'is_local': True, 'train_steps': 600, 'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 'runtime_version': '1.4'}
2018-03-08 17:12:59,160 - Ctrl - INFO [line:174] - do local training
2018-03-08 17:12:59,190 - Ctrl - INFO [line:194] - foo-bar: try to unserialize D:\Python\notebook\recomm_prod\repo/foo-bar/movielens_recommendation/data/parsed.yaml
2018-03-08 17:13:01,264 - Service - INFO [line:41] - received params: {'save_every_steps': None, 'train_steps': 600, 'n_batch': 128, 'model_id': 'movielens_recommendation', 'valid_file': 'D:\\Python\\notebook\\recomm_prod\\repo/foo-bar/movielens_recommendation/data/data.vl', 'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 'job_dir': 'D:\\Python\\notebook\\recomm_prod\\repo/foo-bar/movielens_recommendation/model', 'dim': 16, 'data_dir': 'D:\\Python\\notebook\\recomm_prod\\repo/foo-bar/movielens_recommendation/data', 'raw_dir': 'D:/Python/notebook/rec

2018-03-08 17:13:26,497 - tensorflow - INFO [line:116] - Evaluation [230/233]
INFO:tensorflow:Evaluation [233/233]
2018-03-08 17:13:26,523 - tensorflow - INFO [line:116] - Evaluation [233/233]
INFO:tensorflow:Finished evaluation at 2018-03-08-09:13:26
2018-03-08 17:13:26,552 - tensorflow - INFO [line:116] - Finished evaluation at 2018-03-08-09:13:26
INFO:tensorflow:Saving dict for global step 549: auc = 0.79645175, global_step = 549, loss = 0.550726
2018-03-08 17:13:26,554 - tensorflow - INFO [line:116] - Saving dict for global step 549: auc = 0.79645175, global_step = 549, loss = 0.550726
2018-03-08 17:13:26,570 - BestScoreExporter - INFO [line:235] - clean export_path: D:\Python\notebook\recomm_prod\repo/foo-bar/movielens_recommendation/model\export\export_foo-bar
2018-03-08 17:13:26,574 - BestScoreExporter - INFO [line:240] - nice eval loss: 0.5507259964942932, export to pb
INFO:tensorflow:Calling model_fn.
2018-03-08 17:13:26,598 - tensorflow - INFO [line:116] - Calling model_fn.
I

In [9]:
model = result.get('response')
vars(model)

{'ans': <tf.Tensor 'loss/strided_slice:0' shape=(?, 1) dtype=float32>,
 'auc': (<tf.Tensor 'metrics/auc/value:0' shape=() dtype=float32>,
  <tf.Tensor 'metrics/auc/update_op:0' shape=() dtype=float32>),
 'avg_rating': <tf.Tensor 'IteratorGetNext:0' shape=(?,) dtype=float32>,
 'b_candidate_movie_id': <tf.Variable 'init/embedding/b_candidate_movie_id:0' shape=(26,) dtype=float32_ref>,
 'b_global': <tf.Variable 'init/b_global:0' shape=() dtype=float32_ref>,
 'b_query_movie_ids': <tf.Variable 'init/embedding/b_query_movie_ids:0' shape=(16,) dtype=float32_ref>,
 'candidate_bias': <tf.Tensor 'item_encoding/MatMul:0' shape=(?, 1) dtype=float32>,
 'candidate_emb': <tf.Tensor 'item_encoding/embedding_lookup:0' shape=(?, 16) dtype=float32>,
 'candidate_movie_id': <tf.Tensor 'IteratorGetNext:1' shape=(?,) dtype=int32>,
 'emb_genres': <tf.Tensor 'item_encoding/Sum:0' shape=(?, 8) dtype=float32>,
 'emb_item': <tf.Tensor 'item_encoding/dense_2/Selu:0' shape=(?, 32) dtype=float32>,
 'emb_query': <tf.

## Local Get Model and Predict

In [21]:
# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
est = reload('trainer.reco_mf_dnn_est')
service = reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
# NOTE(review): `data` is not assigned in any visible cell of this notebook (hidden kernel
# state). Its displayed value matches the output of restful_data((22,)) - presumably
# `data = restful_data((22,))` was run in a cell that no longer exists; confirm and restore it.
model = ctrl.get_model({'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 'is_local': True, 'json_data': data})
estimator = model.create_est()

# Column names, CSV defaults and multi-valued columns; only the commented-out
# experiment below refers to them within this cell.
cols = ['query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
defaults = [[''], [''], [], [], [0], [0]]
multi_cols = ('query_movie_ids', 'genres')

# Abandoned experiment: add "<col>_len" features, pad-batch and predict with the estimator.
# def add_seq_cols(feat):
#     for m_col in multi_cols:
#         name = '{}_len'.format(m_col)
#         feat[name] = tf.size(feat[m_col])
#         cols.append(name)
#     return feat
# dataset = dataset.map(add_seq_cols, num_parallel_calls=4)
# dataset = dataset.repeat(1)
# dataset = dataset.padded_batch(5, OrderedDict(zip(cols, ([None], [None], [], [], [], [], [], [], []))))
# features = dataset.make_one_shot_iterator().get_next()
# for e in estimator.predict(lambda: dataset):
#     print(e)

INFO:tensorflow:Using config: {'_task_type': 'worker', '_save_summary_steps': 100, '_tf_random_seed': 88, '_is_chief': True, '_model_dir': 'D:\\Python\\notebook\\recomm_prod\\repo/foo-bar/movielens_recommendation/model', '_evaluation_master': '', '_keep_checkpoint_max': 5, '_num_worker_replicas': 1, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_master': '', '_service': None, '_save_checkpoints_secs': 600, '_task_id': 0, '_num_ps_replicas': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001DFB9519630>, '_global_id_in_cluster': 0, '_log_step_count_steps': 300}
2018-03-08 17:57:17,896 - tensorflow - INFO [line:116] - Using config: {'_task_type': 'worker', '_save_summary_steps': 100, '_tf_random_seed': 88, '_is_chief': True, '_model_dir': 'D:\\Python\\notebook\\recomm_prod\\repo/foo-bar/movielens_recommendation/model', '_evaluation_master': '', '_keep_checkpoint_max': 5, '_num_worker_replicas': 1,

<TensorDataset shapes: {candidate_movie_id: (5,), genres: (5,), year: (5,), avg_rating: (5,), query_movie_ids: (1,)}, types: {candidate_movie_id: tf.string, genres: tf.string, year: tf.float32, avg_rating: tf.float32, query_movie_ids: tf.string}>

In [14]:
data

{'avg_rating': [3.8724696356275303,
  3.4018691588785046,
  3.1610169491525424,
  2.3846153846153846,
  3.267857142857143],
 'candidate_movie_id': ['1', '2', '3', '4', '5'],
 'genres': ['Adventure|Animation|Children|Comedy|Fantasy',
  'Adventure|Children|Fantasy',
  'Comedy|Romance',
  'Comedy|Drama|Romance',
  'Comedy'],
 'query_movie_ids': ['32,1884,1580,1527,1387,1377,1376,1375,1372,1356,1339,1291,1270,1255,1240,1215,1210,1200,2174,2288,2291,3081,4011,3868,3793,3535,3527,3355,3213,3033,2459,2987,2959,2858,2762,2712,2571,2542,1198,1799,457,648,555,480,551,589,541,592,593,253,260,163,296,858,1089,2881,235,2990,2953,231,3052,208,2985,1148,2763,3147,267,2723,2717,2716,2710,2701,2700,2683,2672,3082,3300,3176,3751,4015,47,3999,3996,3994,3977,48,3826,3809,70,3697,173,3623,3578,153,158,3438,3408,3354,2617,3285,3253,2657,356,2616,588,1693,1682,1645,1641,1625,1608,1544,1391,552,586,1374,2605,1371,1320,784,1263,785,1080,1214,1208,1201,1097,1721,1722,1769,44,315,355,2502,1101,2431,2402,2340,230

## Local Deploy

In [170]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery

# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
flex = reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 'is_local': True}
# NOTE(review): the recorded run of this cell returned err_cde '99' with
# "create_model_rsc() takes 3 positional arguments but 4 were given" (raised in
# trainer/service.py), so the local deploy path was failing at the time of recording.
ret = ctrl.deploy(params)
ret

2018-03-08 16:35:23,760 - Service - INFO [line:98] - try to create model [foo_bar_movielens_recommendation] ...
2018-03-08 16:35:23,762 - Ctrl - ERROR [line:255] - create_model_rsc() takes 3 positional arguments but 4 were given
Traceback (most recent call last):
  File "D:/Python/notebook/recomm_prod\trainer\ctrl.py", line 249, in deploy
    res = self.service.deploy(p, deploy_conf[self.EXPORT_PATH])
  File "D:/Python/notebook/recomm_prod\trainer\service.py", line 99, in deploy
    self.create_model_rsc(ml, p, model_name)
TypeError: create_model_rsc() takes 3 positional arguments but 4 were given


{'err_cde': '99',
 'err_msg': 'create_model_rsc() takes 3 positional arguments but 4 were given'}

In [17]:
# `result` comes from the "Local Training" cell.
model = result.get('response')
flex = reload('trainer.utils.flex')
# with flex.io('./data.json').as_reader('r') as f:
#     data = json.load(f.stream)


# NOTE(review): `dataset` is not built in this cell; it relies on kernel state left by
# another cell. The recorded run failed with "ValueError: ... must be from the same graph",
# presumably because that dataset was created in a different tf.Graph than the one the
# estimator builds its model in - the dataset likely has to be constructed inside input_fn.
for e in model.estimator_.predict(input_fn=lambda: dataset):
    print(e)

INFO:tensorflow:Calling model_fn.
2018-03-08 17:21:54,874 - tensorflow - INFO [line:116] - Calling model_fn.


ValueError: Tensor("IteratorGetNext_2:3", shape=(?, 1), dtype=string) must be from the same graph as Tensor("init/embedding/w_query_movie_ids:0", shape=(9125, 16), dtype=float32_ref).

## Test

In [4]:
from tensorflow.contrib.training.python.training.hparam import HParams

# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
env = reload('trainer.env')
reload('trainer.reco_mf_dnn_est')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
params = {'conf_path': '../data/foo/user_supplied/movielens.local.yaml', 
          'raw_dir': 'gs://recomm-job/foo-bar'}
# The recorded response is a dict with export_path, model_name, version and job_id.
print(ctrl.test(params).get('response'))

{'export_path': 'gs://recomm-job/foo-bar/movielens_recommendation/model/export/export_foo-bar/1520326728', 'model_name': 'foo_bar_movielens_recommendation', 'version': 'v20180306181514380085', 'job_id': 'movielens_recommendation_20180306165443177431'}


### Update movielens.yaml on GCS (更改GCS movielens.yaml)

In [None]:
import shutil
from google.cloud.storage.blob import Blob
from io import BytesIO

# Re-import the project modules so that source edits made since the last run take effect.
utils = reload('trainer.utils.utils')
flex = reload('trainer.utils.flex')
env = reload('trainer.env')

# Copy the local YAML config over the copy stored in the GCS bucket.
# NOTE(review): other cells open files via flex.io(...).as_reader(); here flex.io(...) is
# used directly as a context manager - confirm it supports read()/write() in that form.
with flex.io('../data/foo/user_supplied/movielens.yaml') as r, \
    flex.io('gs://movielens-foo/user_supplied/movielens.yaml') as w:
    w.write(r.read())

# Alternative upload through the GCS blob API (kept for reference):
# stream = BytesIO(open('../data/foo/user_supplied/movielens.yaml', mode='rb').read())
# utils.gcs_blob('gs://movielens-foo/user_supplied/movielens.yaml').upload_from_file(stream)

In [None]:
cols = list(data.keys())
multi_cols = ('query_movie_ids', 'genres')

def trans(features):
    """Debug hook for ``Dataset.map``: print the mapped ``query_movie_ids`` and pass through.

    The column mapper is looked up in the notebook-level ``schema``; the
    same ``features`` object that was passed in is returned.
    """
    query_mapper = schema.col_states_['query_movie_ids']
    mapped = query_mapper.transform(features['query_movie_ids'])
    print(mapped)
    return features

def add_seq_cols(feat):
    """Add a ``<col>_len`` entry (``tf.size`` of the column) for every multi-valued column.

    Side effect: each new feature name is also appended to the notebook-level
    ``cols`` list. ``feat`` is modified in place and returned.
    """
    # Lazy generator, so each name is produced right before it is used.
    length_names = ('{}_len'.format(src_col) for src_col in multi_cols)
    for src_col, length_name in zip(multi_cols, length_names):
        feat[length_name] = tf.size(feat[src_col])
        cols.append(length_name)
    return feat

# Scratch experiment: push the request dict `data` through a tf.data pipeline.
tf.reset_default_graph()
with tf.Graph().as_default():
    # A single-element dataset holding the whole `data` dict.
    dataset = tf.data.Dataset.from_tensors(data)
    dataset = dataset.map(trans, num_parallel_calls=4)
    # add_seq_cols also appends the new "<col>_len" names to the global `cols`.
    dataset = dataset.map(add_seq_cols, num_parallel_calls=4)
    print('cols', cols)
    dataset = dataset.repeat(1)
    # NOTE(review): the padded shapes are paired with `cols` by position via zip, which
    # silently truncates to the shorter sequence - verify that both line up as intended.
    dataset = dataset.padded_batch(5, OrderedDict(zip(cols, ([], [], [], [None], [], [None], [], []))))
    inputs = dataset.make_one_shot_iterator().get_next()
    with tf.train.MonitoredTrainingSession() as sess:
        # Drain the iterator; the fetched values are discarded.
        while not sess.should_stop():
            _, = sess.run([inputs])
            # print( sess.run(inputs) )
            pass

<br/>
<br/>
<br/>
## Dataset

In [None]:
def make_datasets(fpath_ary, schema, n_batch=128, n_epoch=1):
    """Build a one-shot CSV input pipeline and return ``(features, label)`` tensors.

    Each text line is decoded with ``tf.decode_csv``. Columns flagged in the
    schema configuration as multivalent categoricals are split on ',' and
    converted to 1-D int32 tensors, then padded per batch.

    Records are shuffled *before* batching. Shuffling after ``batch`` would
    only reorder whole batches and would keep ``n_batch * 10`` complete batches
    in the shuffle buffer rather than that many records.

    Args:
        fpath_ary: list of CSV file paths handed to ``tf.data.TextLineDataset``.
        schema: project schema object. This function reads ``df_conf_``,
            ``cols`` and ``label`` from it, plus the column-name constants
            ``TYPE``, ``M_DTYPE``, ``CATG`` and ``IS_MULTI``.
        n_batch: number of records per batch.
        n_epoch: number of passes over the data.

    Returns:
        ``(features, label)``: ``features`` is an ``OrderedDict`` of batched
        tensors keyed by ``schema.cols`` with the label column removed, and
        ``label`` is the tensor stored under ``schema.label[0]``.

    Note:
        The shuffle seed is the notebook-level ``seed``.
    """
    def to_dense(sparse):
        # Densify the split tokens (filling gaps with '') and parse them as a flat int32 vector.
        dense = tf.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values, '')
        return tf.reshape(tf.to_int32(tf.string_to_number(dense)), [-1])

    def parse_csv(value):
        data = tf.decode_csv(value, record_defaults=defaults)
        features = OrderedDict(zip(cols, data))
        for col in multi_cols:
            features[col] = to_dense(tf.string_split([features[col]], ','))
        return features

    # Only configuration rows with a non-null TYPE take part in parsing.
    df_conf = schema.df_conf_.query('{}.notnull()'.format(schema.TYPE))
    cols = schema.cols

    # Multivalent categorical columns, computed once and shared by parsing and padding.
    has_multi = (df_conf[schema.M_DTYPE] == schema.CATG) & (df_conf[schema.IS_MULTI] == True)
    multi_cols = df_conf[has_multi].id.values

    # decode_csv defaults: '' for multivalent categoricals (kept as strings until
    # split), 0 for single-valued categoricals, and [] (a required field) otherwise.
    defaults = []
    for _, r in df_conf.iterrows():
        if r[schema.M_DTYPE] == schema.CATG:
            defaults.append([''] if r[schema.IS_MULTI] else [0])
        else:
            defaults.append([])

    dataset = tf.data.TextLineDataset(fpath_ary)
    dataset = dataset.map(parse_csv, num_parallel_calls=4)
    dataset = dataset.shuffle(n_batch * 10, seed=seed)
    if has_multi.any():
        # Variable-length columns are padded to the longest entry of each batch.
        padded_shapes = OrderedDict(zip(cols, tuple([None] if e else [] for e in has_multi)))
        dataset = dataset.padded_batch(n_batch, padded_shapes)
    else:
        dataset = dataset.batch(n_batch)
    dataset = dataset.repeat(n_epoch)

    features = dataset.make_one_shot_iterator().get_next()
    label = features.pop(schema.label[0])
    return features, label
                                
# tf.reset_default_graph()
# Smoke test: build the schema-driven pipeline over the training file and drain it.
with tf.Graph().as_default():
    # `inputs` is the (features, label) pair returned by make_datasets; `loader` comes
    # from an earlier cell that is not shown here.
    inputs = make_datasets(['./movielens.tr'], loader.schema, n_batch=30)
    # Not used further in this cell.
    query_lens = tf.sequence_mask([1, 2, 3])
    # NOTE(review): this rebinds the notebook-level `ctx`, which the setup cell uses as
    # the project-root path string -- confirm nothing executed later relies on it.
    ctx = []
    with tf.train.MonitoredTrainingSession() as sess:
        # Pull batches until the session reports it should stop; results are discarded.
        while not sess.should_stop():
            _, = sess.run([inputs])
            # print( sess.run(inputs) )
            pass

## Feature Columns with tf.feature_column.input_layer

In [None]:
# Draw 1000 standard-normal samples, rescale them with minmax_scale and plot a
# 50-bin histogram of the result.
a = pd.Series(minmax_scale(np.random.normal(0, 1, size=1000)))
a.hist(bins=50)

In [None]:
%%time
# Demo: feed CSV batches through tf.feature_column.input_layer and print the result.
tf.reset_default_graph()
with tf.Graph().as_default():
    # user_id: hashed into 1000 buckets, then embedded into 8 dimensions.
    user_id = tf.feature_column.categorical_column_with_hash_bucket('user_id', hash_bucket_size=1000, dtype=tf.int32)
    user_id = tf.feature_column.embedding_column(user_id, dimension=8)
    avg_rating = tf.feature_column.numeric_column('avg_rating')
    columns = [user_id, avg_rating]
    
    # NOTE(review): a `with` block is not a new scope, so this `def` rebinds the
    # notebook-level name `make_datasets` and replaces the schema-driven version
    # defined in the "Dataset" section -- re-running that section's call afterwards
    # would hit this one-argument variant.
    def make_datasets(fpath_ary):
        """Return the next-batch feature dict for the CSV files in `fpath_ary`."""
        cols = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
        # decode_csv defaults, one per column: [0] -> int, [''] -> string, [] -> required field.
        defaults = [[0], [''], [''], [], [], [0], []]

        def parse_csv(value):
            data = tf.decode_csv(value, record_defaults=defaults)
            features = OrderedDict(zip(cols, data))
            # print(features)
            return features
        
        dataset = tf.data.TextLineDataset(fpath_ary)
        # batch() runs before shuffle(), so shuffle reorders whole batches of 3 rows.
        dataset = (dataset.map(parse_csv, num_parallel_calls=4)
                          .batch(3)
                          # .padded_batch(3, OrderedDict(zip(cols, ([], [None], [None], [], [], [], []))))
                          .shuffle(10, seed=seed)
                          .repeat(1)
                  )
        return dataset.make_one_shot_iterator().get_next()
    
    inputs = make_datasets(['./te_processed.batch.csv'])
    # Only the features named in `columns` (user_id, avg_rating) are handed to the layer.
    inputs = tf.feature_column.input_layer(inputs, columns)
    # features = tf.parse_example(serialized_example, features=tf.feature_column.make_parse_example_spec(columns))
    # NOTE(review): rebinds the notebook-level `ctx` (the project-root path string from
    # the setup cell) and is not used afterwards in this cell.
    ctx = []
    # The embedding column creates variables; MonitoredTrainingSession takes care of
    # initialising them.
    with tf.train.MonitoredTrainingSession() as sess:
        while not sess.should_stop():
            print(sess.run(inputs))

### Make Example

In [None]:
%%time
# Parallel per-column metadata used by persist_example() and decode_example();
# all five lists are index-aligned with `cols`.
cols = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
# True for columns that hold a comma-separated list of ids.
is_multi = [False, True, True, False, False, False, False]
# dtypes passed to pd.read_csv; multivalent columns are read as raw strings.
pd_dtypes = [int, str, str, float, float, int, float]
# Name of the tf.train.Feature list field each column is written to.
types = ['int64_list', 'int64_list', 'int64_list', 'float_list', 'float_list', 'int64_list', 'float_list']
# TensorFlow dtypes used when the records are parsed back.
tf_types = [tf.int64, tf.int64, tf.int64, tf.float32, tf.float32, tf.int64, tf.float32]
def persist_example(fpath, tfpath):
    """Convert a CSV file into a TFRecord file of serialized ``tf.train.Example`` protos.

    The CSV is read in chunks of 1000 rows, using the notebook-level ``cols`` as
    column names and ``pd_dtypes`` as dtypes. For every row one ``Example`` is
    written. Each column goes to the feature-list field named in ``types``;
    columns flagged in ``is_multi`` contribute all of their ids, the others a
    single value.

    Args:
        fpath: path of the CSV file to read.
        tfpath: path of the TFRecord file to write.
    """
    def parse_id_list(text):
        # A missing cell is read back as NaN, and blank tokens appear with empty
        # cells or trailing commas; both simply contribute no ids.
        if pd.isnull(text):
            return []
        return [int(tok) for tok in text.split(',') if tok.strip()]

    with tf.python_io.TFRecordWriter(tfpath) as w:
        for chunk in pd.read_csv(fpath, names=cols, dtype=dict(zip(cols, pd_dtypes)), chunksize=1000):
            # Store real lists. Under Python 3 `map(int, ...)` is a lazy, single-use
            # iterator that would be empty on any second read of the cell.
            chunk['query_movie_ids'] = chunk.query_movie_ids.map(parse_id_list)
            chunk['genres'] = chunk.genres.map(parse_id_list)

            for _, r in chunk.iterrows():
                ex = tf.train.Example()
                for multi, col, tpe in zip(is_multi, cols, types):
                    # ex.features.feature[col].int64_list / float_list, chosen per column
                    values = getattr(ex.features.feature[col], tpe).value
                    if multi:
                        values.extend(r[col])
                    else:
                        values.append(r[col])
                w.write(ex.SerializeToString())

# Write ./te_processed.csv out as ./data.tfrecord (read back by the next cell).
persist_example('./te_processed.csv', './data.tfrecord')

In [None]:
def decode_example(ser_example):
    """Parse one serialized ``tf.train.Example`` into a tuple of tensors ordered like ``cols``.

    The records produced by ``persist_example`` are plain ``tf.train.Example``
    protos, so they are parsed with ``tf.parse_single_example``. A
    ``SequenceExample`` parser would look for 'query_movie_ids' and 'genres'
    among the feature lists, which an ``Example`` does not carry.

    Args:
        ser_example: scalar string tensor holding one serialized Example.

    Returns:
        A tuple with one tensor per entry of the notebook-level ``cols``, in
        that order. 'query_movie_ids' and 'genres' are 1-D int64 tensors of
        variable length (suitable for ``padded_batch`` with ``[None]``); every
        other column is a scalar of the dtype given in ``tf_types``.
    """
    multi_cols = ('query_movie_ids', 'genres')

    # Variable-length id lists are parsed as sparse features, everything else as a scalar.
    spec = {}
    for col, tf_tpe in zip(cols, tf_types):
        if col in multi_cols:
            spec[col] = tf.VarLenFeature(tf_tpe)
        else:
            spec[col] = tf.FixedLenFeature([], tf_tpe)

    parsed = tf.parse_single_example(ser_example, features=spec)

    # For a single example the sparse value is a fully populated 1-D vector, so
    # densifying it introduces no fill values.
    return tuple(
        tf.sparse_tensor_to_dense(parsed[col]) if col in multi_cols else parsed[col]
        for col in cols
    )

# Read ./data.tfrecord back, decode every record and print the first padded batch.
tf.reset_default_graph()
with tf.Graph().as_default():
    dataset = tf.data.TFRecordDataset(['./data.tfrecord'])
    # padded_shapes follow the column order returned by decode_example; the two [None]
    # entries are the variable-length query_movie_ids and genres columns.
    dataset = dataset.map(decode_example).padded_batch(10, padded_shapes=([], [None], [None], [], [], [], []))
    # dataset = dataset.batch(3)
    iters = dataset.make_one_shot_iterator()
    r = iters.get_next()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        # Only one batch (up to 10 records) is fetched and printed.
        print( sess.run(r) )

## Traditional parse_example
1. tf.train.Coordinator + tf.train.start_queue_runners

In [None]:
from tensorflow.python.framework import sparse_tensor
import re

def to_sparse(dense):
    """Build a `tf.SparseTensor` holding the non-zero entries of `dense`.

    The sparse tensor's dense shape is the static shape reported by
    `dense.get_shape()`.
    """
    nonzero_idx = tf.where(tf.not_equal(dense, 0))
    nonzero_vals = tf.gather_nd(dense, nonzero_idx)
    return tf.SparseTensor(indices=nonzero_idx,
                           values=nonzero_vals,
                           dense_shape=dense.get_shape())

def make_example(val):
    """Wrap `val` in a `tf.train.Example` with two int64 features.

    Both 'query_movie_ids' and 'genres' are populated from the same `val`.
    """
    def int64_feature():
        return tf.train.Feature(int64_list=tf.train.Int64List(value=val))

    feature_map = {
        'query_movie_ids': int64_feature(),
        'genres': int64_feature(),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

# Queue-runner style input: write a few Examples to tmp.tfrecords (if the file is not
# there yet), then read one record back through TFRecordReader + tf.parse_example.
tf.reset_default_graph()
with tf.Graph().as_default():
    
    filename = "tmp.tfrecords"
    # The file is only written when missing; an existing file is reused as-is.
    if not os.path.exists(filename):
        # os.remove(filename)
        writer = tf.python_io.TFRecordWriter(filename)
        with writer:
            # `teProcessed` comes from an earlier cell that is not shown here.
            for idx, r in teProcessed.head().iterrows():
                # NOTE(review): one record is written per (row, column) pair, and
                # make_example stores the same `val` under both feature names, so each
                # row yields two records whose two features are identical -- confirm
                # this is intended.
                for col in ('query_movie_ids', 'genres'):
                    val = list(map(int, re.split(',\s*', r[col])))
                    ex = make_example(val)
                    writer.write(ex.SerializeToString())

    reader = tf.TFRecordReader()
    # num_epochs=1 makes the producer keep an epoch counter in a local variable, which
    # is why local variables are initialised below.
    filename_queue = tf.train.string_input_producer(["tmp.tfrecords"], num_epochs=1)
    _, serialized_example = reader.read(filename_queue)

    batch = tf.train.batch(tensors=[serialized_example], batch_size=1)
    # Both features are variable-length, so they are parsed into SparseTensors.
    features = {
        'query_movie_ids': tf.VarLenFeature(tf.int64),
        'genres': tf.VarLenFeature(tf.int64)
    }
    data = tf.parse_example(batch, features)
    query_movie_ids = data['query_movie_ids']
    # Only referenced by the disabled embedding lookup further down.
    embbedding = tf.Variable(tf.glorot_uniform_initializer()([9125]), dtype=tf.float32)
    # Prints the symbolic dense_shape tensor, not its runtime value.
    print(query_movie_ids.dense_shape)
    # r = tf.layers.dense(query_movie_ids, 10)
    # emb_query = tf.nn.embedding_lookup_sparse([embbedding], query_movie_ids, None, combiner='sqrtn')
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        # Start the queue-runner threads that feed the filename and batch queues.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        try:
            # Fetch and print a single parsed batch.
            print(sess.run(data))
            pass
        except tf.errors.OutOfRangeError as e:
            coord.request_stop(e)
        finally:
            # Always stop and join the runner threads, whether or not the fetch succeeded.
            coord.request_stop()
            coord.join(threads)
    

## Test

In [168]:
# Scratch check of how a placeholder with a fully unknown rank-3 shape reports its dims.
tf.reset_default_graph()
with tf.Graph().as_default():
    a = tf.placeholder(shape=[None, None, None], dtype=tf.float32)
    # Prints [Dimension(None), Dimension(None), Dimension(None)] (see the recorded output).
    print( a.shape.dims )
    with tf.Session() as sess:
        # No variables are created in this graph.
        tf.global_variables_initializer().run()

[Dimension(None), Dimension(None), Dimension(None)]


In [None]:
# Bare expression: as the last line of the cell it only displays the repr of `tf.zeros`.
tf.zeros