In [22]:
%matplotlib inline
import os, sys, numpy as np, pandas as pd, tensorflow as tf, re, codecs, seaborn as sns, json, time, csv, datetime as dt
import pickle, collections, random, math, numbers, scipy.sparse as sp, matplotlib.pyplot as plt, scipy.sparse as sp
from pprint import pprint

def reload(mName):
    import importlib
    if mName in sys.modules:
        del sys.modules[mName]
    return importlib.import_module(mName)


from collections import deque, defaultdict, OrderedDict
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, minmax_scale
from matplotlib import pyplot as plt
plt.style.use('ggplot')

# classpath
ctx = os.path.abspath('..').replace('\\', '/')
cps = [ctx]
_ = [sys.path.insert(0, cp) for cp in cps if cp not in sys.path]

# data path
datapath = '/'.join([ctx, 'data'])

seed = 88
utils = reload('trainer.utils.utils')
np.set_printoptions(precision=4, suppress=True, linewidth=100)
np.random.seed(seed)

## Simple Data Preprocess

In [2]:
import datetime as dt

ratings = pd.read_csv("{}/ml-latest-small/ratings.csv".format(datapath))
ratings['timestamp'] = ratings.timestamp.map(dt.datetime.fromtimestamp).map(str)
ratings['ori_rating'] = ratings['rating']
ratings['rating'] = (ratings.rating >= 4).astype(int)
tr, te = utils.split_by_ratio(ratings)

movies = pd.read_csv("{}/ml-latest-small/movies.csv".format(datapath))
avg_rt = ratings.groupby("movieId", as_index=False).ori_rating.mean().rename(index=str, columns={'ori_rating': 'avg_rating'})
movies = movies.merge(avg_rt, how='left', on='movieId')
# movies.avg_rating.fillna(ratings.rating.mean())
movies["year"] = movies.title.str.findall("\(\s*(\d+)\s*\)").map(lambda lst: int(lst[-1]) if len(lst) else None)
# movies["year"] = minmax_scale(movies.year.fillna(movies.year.median()))

In [3]:
def preprocess(data, movie_trans, train_hist=None, is_train=True):
    queue = []
    data = data.merge(movie_trans, how="left", on="movieId")
    columns=["user_id", "query_movie_ids",
             "genres", "avg_rating", "year", "candidate_movie_id",
             "timestamp",
             "rating"]
    
    list2str = lambda lst: ','.join(map(str, lst))
    for u, df in data.groupby("userId"):
        df = df.sort_values("rating", ascending=False)
        if not is_train:
            user_movies_hist = train_hist.query("userId == {}".format(u)).movieId
        for i, (_, r) in enumerate(df.iterrows()):
            if is_train:
                query_hist = df.movieId[:i].tolist() + df.movieId[i + 1:].tolist()
                query_hist = list2str(query_hist)
                queue.append([int(r.userId), query_hist, r.genres, r.avg_rating, r.year, int(r.movieId), r.timestamp, r.rating])
            else:
                tr_hist = set(user_movies_hist.tolist())
                query_hist = list(tr_hist - set([int(r.movieId)]))
                query_hist = list2str(query_hist)
                queue.append([int(r.userId), query_hist, r.genres, r.avg_rating, r.year, int(r.movieId), r.timestamp, r.rating])
    return pd.DataFrame(queue, columns=columns)
    
tr_merged = preprocess(tr, movies)
tr_merged.to_csv('./tr.raw.movielens.csv', index=False, header=None)

te_merged = preprocess(te, movies, tr, is_train=False)
te_merged.to_csv('./te.raw.movielens.csv', index=False, header=None)
# 合併成一個檔案
merged = pd.concat([tr_merged, te_merged], ignore_index=True)
merged.to_csv('./merged_movielens.csv', index=False, header=None)
merged.head()

Unnamed: 0,user_id,query_movie_ids,genres,avg_rating,year,candidate_movie_id,timestamp,rating
0,1,"1953,2105,31,1029,1061,1129,1263,1287,1293,133...",Drama,4.26087,1989.0,1172,2009-12-14 10:53:25,1
1,1,"1172,2105,31,1029,1061,1129,1263,1287,1293,133...",Action|Crime|Thriller,4.021739,1971.0,1953,2009-12-14 10:53:11,1
2,1,"1172,1953,31,1029,1061,1129,1263,1287,1293,133...",Action|Adventure|Sci-Fi,3.478723,1982.0,2105,2009-12-14 10:52:19,1
3,1,"1172,1953,2105,1029,1061,1129,1263,1287,1293,1...",Drama,3.178571,1995.0,31,2009-12-14 10:52:24,0
4,1,"1172,1953,2105,31,1061,1129,1263,1287,1293,133...",Animation|Children|Drama|Musical,3.702381,1941.0,1029,2009-12-14 10:52:59,0


<br/>
<br/>
<br/>
## Transform Training Data

In [8]:
%%time
from tensorflow.contrib.training.python.training.hparam import HParams

reload('trainer.service')
ctrl = reload('trainer.ctrl').Ctrl.instance
hparam = HParams(conf_path='{}/foo/user_supplied/movielens.yaml'.format(datapath))
print(ctrl.gen_data(hparam))

try to parse D:/Python/notebook/recomm_prod/data/foo/user_supplied/movielens.yaml (user supplied) ...
try to transform ['D:/Python/notebook/recomm_prod/data/foo/user_supplied/raws\\merged_movielens.csv'] ... 
[D:/Python/notebook/recomm_prod/data/foo/user_supplied/raws\merged_movielens.csv]: process take time 0:00:59.478155
2018-02-13 11:02:02,449 - ctrl - INFO - foo: gen_data take time 0:01:03.828361
{'err_cde': 0}
Wall time: 1min 3s


<br/>
<br/>
<br/>
## Train

In [8]:
from tensorflow.contrib.training.python.training.hparam import HParams
utils = reload('trainer.utils.utils')
reload('trainer.env')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
hparam = HParams(conf_path='{}/foo/user_supplied/movielens.yaml'.format(datapath))

ctrl.train(hparam)

2018-02-14 12:04:00,541 - Ctrl - INFO - foo: try to generate training data...
2018-02-14 12:04:00,543 - Loader - INFO - try to parse D:/Python/notebook/recomm_prod/data/foo/user_supplied/movielens.yaml (user supplied) ...
2018-02-14 12:04:03,130 - Loader - INFO - try to transform ['D:/Python/notebook/recomm_prod/data/foo/user_supplied/raws\\merged_movielens.csv'] ... 
[D:/Python/notebook/recomm_prod/data/foo/user_supplied/raws\merged_movielens.csv]: process take time 0:01:02.248726
2018-02-14 12:05:06,191 - Service - INFO - hparam: {'conf_path': 'D:/Python/notebook/recomm_prod/data/foo/user_supplied/movielens.yaml',
 'data_dir': 'D:/Python/notebook/recomm_prod/repo\\foo\\data',
 'dim': 16,
 'eval_name': 'foo',
 'eval_steps': 233,
 'export_name': 'export_foo',
 'job_dir': 'D:/Python/notebook/recomm_prod/repo\\foo\\model',
 'n_batch': 128,
 'override': True,
 'parsed_conf_path': 'D:/Python/notebook/recomm_prod/repo\\foo\\data\\parsed.yaml',
 'pid': 'foo',
 'raw_dir': 'D:/Python/notebook/

INFO:tensorflow:Assets added to graph.
2018-02-14 12:05:39,079 - tensorflow - INFO - Assets added to graph.
INFO:tensorflow:No assets to write.
2018-02-14 12:05:39,081 - tensorflow - INFO - No assets to write.
INFO:tensorflow:SavedModel written to: b"D:/Python/notebook/recomm_prod/repo\\foo\\model_1518581106.1947258\\export\\export_foo\\temp-b'1518581138'\\saved_model.pb"
2018-02-14 12:05:39,729 - tensorflow - INFO - SavedModel written to: b"D:/Python/notebook/recomm_prod/repo\\foo\\model_1518581106.1947258\\export\\export_foo\\temp-b'1518581138'\\saved_model.pb"
INFO:tensorflow:Create CheckpointSaverHook.
2018-02-14 12:05:41,137 - tensorflow - INFO - Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from D:/Python/notebook/recomm_prod/repo\foo\model_1518581106.1947258\model.ckpt-549
2018-02-14 12:05:41,536 - tensorflow - INFO - Restoring parameters from D:/Python/notebook/recomm_prod/repo\foo\model_1518581106.1947258\model.ckpt-549
INFO:tensorflow:Saving checkpoints for

INFO:tensorflow:Evaluation [69/233]
2018-02-14 12:06:37,018 - tensorflow - INFO - Evaluation [69/233]
INFO:tensorflow:Evaluation [92/233]
2018-02-14 12:06:37,237 - tensorflow - INFO - Evaluation [92/233]
INFO:tensorflow:Evaluation [115/233]
2018-02-14 12:06:37,477 - tensorflow - INFO - Evaluation [115/233]
INFO:tensorflow:Evaluation [138/233]
2018-02-14 12:06:37,784 - tensorflow - INFO - Evaluation [138/233]
INFO:tensorflow:Evaluation [161/233]
2018-02-14 12:06:38,053 - tensorflow - INFO - Evaluation [161/233]
INFO:tensorflow:Evaluation [184/233]
2018-02-14 12:06:38,327 - tensorflow - INFO - Evaluation [184/233]
INFO:tensorflow:Evaluation [207/233]
2018-02-14 12:06:38,581 - tensorflow - INFO - Evaluation [207/233]
INFO:tensorflow:Evaluation [230/233]
2018-02-14 12:06:38,865 - tensorflow - INFO - Evaluation [230/233]
INFO:tensorflow:Evaluation [233/233]
2018-02-14 12:06:38,893 - tensorflow - INFO - Evaluation [233/233]
INFO:tensorflow:Finished evaluation at 2018-02-14-04:06:38
2018-02-1

<reco_mf_dnn.reco_mf_dnn_est.ModelMfDNN at 0x23ae3759860>

<br/>
<br/>
<br/>
## Cmd Submit Training

In [73]:
!cd D:/Python/notebook/recomm_prod && \
gcloud ml-engine jobs submit training recomm_movielens_15 \
    --job-dir gs://recomm-job/foo/model \
    --runtime-version 1.4 \
    --module-name trainer.ctrl \
    --package-path trainer \
    --region asia-east1 \
    --config config.yaml \
    -- \
    --method train \
    --conf-path gs://recomm-job/foo/data/user_supplied/movielens.yaml

jobId: recomm_movielens_15
state: QUEUED


  for chunk in iter(lambda: fp.read(4096), ''):
Job [recomm_movielens_15] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe recomm_movielens_15

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs recomm_movielens_15


In [113]:
!gcloud ml-engine jobs describe recomm_movielens_15

createTime: '2018-02-22T06:32:40Z'
endTime: '2018-02-22T06:33:44Z'
jobId: recomm_movielens_15
startTime: '2018-02-22T06:33:13Z'
state: SUCCEEDED
trainingInput:
  args:
  - --method
  - train
  - --conf-path
  - gs://recomm-job/foo/data/user_supplied/movielens.yaml
  jobDir: gs://recomm-job/foo/model
  packageUris:
  - gs://recomm-job/foo/model/packages/0a10348c08fa5c2fcbcd7017d1f01ba18be758689a6f860fc725b64bff1f2955/trainer-0.1.tar.gz
  pythonModule: trainer.ctrl
  pythonVersion: '3.5'
  region: asia-east1
  runtimeVersion: '1.4'
trainingOutput:
  consumedMLUnits: 0.1



View job in the Cloud Console at:
https://console.cloud.google.com/ml/jobs/recomm_movielens_15?project=training-recommendation-engine

View logs at:
https://console.cloud.google.com/logs?resource=ml.googleapis.com%2Fjob_id%2Frecomm_movielens_15&project=training-recommendation-engine


In [40]:
!cd .. && python setup.py build

running build
running build_py
creating build
creating build\lib
creating build\lib\trainer
copying trainer\ctrl.py -> build\lib\trainer
copying trainer\env.py -> build\lib\trainer
copying trainer\reco_mf_dnn.py -> build\lib\trainer
copying trainer\reco_mf_dnn_est.py -> build\lib\trainer
copying trainer\service.py -> build\lib\trainer
copying trainer\__init__.py -> build\lib\trainer
creating build\lib\trainer\utils
copying trainer\utils\flex.py -> build\lib\trainer\utils
copying trainer\utils\kkbox_data.py -> build\lib\trainer\utils
copying trainer\utils\neg_sampling_data.py -> build\lib\trainer\utils
copying trainer\utils\utils.py -> build\lib\trainer\utils
copying trainer\utils\__init__.py -> build\lib\trainer\utils
copying trainer\logging.yaml -> build\lib\trainer


<br/>
<br/>
<br/>
## Python Client API Submit Training

In [199]:
from tensorflow.contrib.training.python.training.hparam import HParams

utils = reload('trainer.utils.utils')
reload('trainer.env')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
hparam = HParams(conf_path='{}/foo/user_supplied/movielens.yaml'.format(datapath))
print( ctrl.train_submit(hparam) )

  for chunk in iter(lambda: fp.read(4096), ''):
ERROR: (gcloud.ml-engine.jobs.submit.training) Resource in project [training-recommendation-engine] is the subject of a conflict: Field: job.job_id Error: A job with this id already exists.
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: A job with this id already exists.
    field: job.job_id



In [41]:
from tensorflow.contrib.training.python.training.hparam import HParams

utils = reload('trainer.utils.utils')
env = reload('trainer.env')
reload('trainer.utils.flex')
reload('trainer.service')

ctrl = reload('trainer.ctrl').Ctrl.instance
hparam = HParams(conf_path='gs://movielens-foo/user_supplied/movielens.yaml'.format(datapath))
loader = ctrl.gen_data(hparam)

2018-02-25 20:03:52,366 - Loader - INFO - try to parse gs://movielens-foo/user_supplied/movielens.yaml (user supplied) ...
2018-02-25 20:04:10,800 - Ctrl - INFO - foo: gen_data take time 0:00:18.854078


In [60]:
# r = codecs.open('./1.txt', 'a', 'utf-8')
r.stream.close()

In [19]:
import yaml
from io import BytesIO

utils = reload('trainer.utils.utils')
blob = utils.gcs_blob('gs://movielens-foo/user_supplied/movielens.yaml')
yaml.load(blob.download_as_string())
# sio = BytesIO(blob.download_as_string())
# pd.read_csv(sio, header=None, encoding='utf-8')

{'columns': [{'id': 'user_id', 'm_dtype': 'catg'},
  {'id': 'query_movie_ids',
   'is_multi': True,
   'm_dtype': 'catg',
   'sep': ',',
   'vocabs_path': 'gs://movielens-foo/user_supplied/item.vocab'},
  {'id': 'genres',
   'is_multi': True,
   'm_dtype': 'catg',
   'sep': '|',
   'vocabs_path': 'gs://movielens-foo/user_supplied/genres.vocab'},
  {'id': 'avg_rating', 'm_dtype': 'cont'},
  {'id': 'year', 'm_dtype': 'cont'},
  {'id': 'candidate_movie_id',
   'm_dtype': 'catg',
   'vocabs_path': 'gs://movielens-foo/user_supplied/item.vocab'},
  {'date_format': '%Y-%m-%d %H:%M:%S',
   'id': 'timestamp',
   'm_dtype': 'datetime'},
  {'id': 'rating', 'm_dtype': 'catg'}],
 'item': ['genres', 'avg_rating', 'year', 'candidate_movie_id'],
 'label': ['rating'],
 'override': True,
 'project_id': 'foo',
 'raw_dir': 'gs://movielens-foo/user_supplied/raws',
 'user': ['query_movie_ids']}

### 更改GCS movielens.yaml

In [114]:
from google.cloud.storage.blob import Blob

env = reload('trainer.env')
bucket = env.bucket('movielens-foo')
blob = bucket.get_blob('user_supplied/movielens.yaml')
blob.upload_from_file(open('../data/foo/user_supplied/movielens.yaml', 'rb'))

<br/>
<br/>
<br/>
## Python API Credential

In [4]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery
from google.cloud import storage

authpath = '../auth.json'
project = 'training-recommendation-engine'
cred = GoogleCredentials.from_stream(authpath)
svc = discovery.build('ml', 'v1', credentials=cred)

st_client = storage.Client.from_service_account_json(authpath)
bucket = st_client.get_bucket('recomm-job')

In [21]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery
from google.cloud import storage

bucket = storage.Client().get_bucket('movielens-foo')

<Blob: movielens-foo, user_supplied/genres.vocab>

In [41]:
from io import StringIO, BytesIO

blob = bucket.get_blob('user_supplied/raws/merged_movielens.csv')
sio = BytesIO()
blob.download_to_file(sio)

In [48]:
sio.getvalue().decode('utf-8')[:1000]

'1,"1953,2105,31,1029,1061,1129,1263,1287,1293,1339,1343,1371,1405",Drama,4.260869565217392,1989.0,1172,2009-12-14 10:53:25,1\r\n1,"1172,2105,31,1029,1061,1129,1263,1287,1293,1339,1343,1371,1405",Action|Crime|Thriller,4.021739130434782,1971.0,1953,2009-12-14 10:53:11,1\r\n1,"1172,1953,31,1029,1061,1129,1263,1287,1293,1339,1343,1371,1405",Action|Adventure|Sci-Fi,3.478723404255319,1982.0,2105,2009-12-14 10:52:19,1\r\n1,"1172,1953,2105,1029,1061,1129,1263,1287,1293,1339,1343,1371,1405",Drama,3.1785714285714284,1995.0,31,2009-12-14 10:52:24,0\r\n1,"1172,1953,2105,31,1061,1129,1263,1287,1293,1339,1343,1371,1405",Animation|Children|Drama|Musical,3.7023809523809526,1941.0,1029,2009-12-14 10:52:59,0\r\n1,"1172,1953,2105,31,1029,1129,1263,1287,1293,1339,1343,1371,1405",Thriller,3.5454545454545454,1996.0,1061,2009-12-14 10:53:02,0\r\n1,"1172,1953,2105,31,1029,1061,1263,1287,1293,1339,1343,1371,1405",Action|Adventure|Sci-Fi|Thriller,3.3125,1981.0,1129,2009-12-14 10:53:05,0\r\n1,"1172,1953,2105,31

In [169]:
# b = a.get(name='projects/{}/jobs/recomm_movielens_13'.format(project))
a = svc.projects().jobs()
b = a.cancel(name='projects/{}/jobs/recomm_movielens_13'.format(project))
b.execute()

TypeError: Missing required parameter "body"

In [None]:
svc.projects().jobs().get

<br/>
<br/>
<br/>
## Restful predict

In [2]:
flex = reload('utils.flex')
with codecs.open('../repo/foo/data/parsed.yaml', 'r', 'utf-8') as r:
    schema = flex.Schema.unserialize(r)

merged = pd.read_csv('../data/foo/user_supplied/raws/merged_movielens.csv', names=schema.raw_cols)
merged.head()

Unnamed: 0,user_id,query_movie_ids,genres,avg_rating,year,candidate_movie_id,timestamp,rating
0,1,"1953,2105,31,1029,1061,1129,1263,1287,1293,133...",Drama,4.26087,1989.0,1172,2009-12-14 10:53:25,1
1,1,"1172,2105,31,1029,1061,1129,1263,1287,1293,133...",Action|Crime|Thriller,4.021739,1971.0,1953,2009-12-14 10:53:11,1
2,1,"1172,1953,31,1029,1061,1129,1263,1287,1293,133...",Action|Adventure|Sci-Fi,3.478723,1982.0,2105,2009-12-14 10:52:19,1
3,1,"1172,1953,2105,1029,1061,1129,1263,1287,1293,1...",Drama,3.178571,1995.0,31,2009-12-14 10:52:24,0
4,1,"1172,1953,2105,31,1061,1129,1263,1287,1293,133...",Animation|Children|Drama|Musical,3.702381,1941.0,1029,2009-12-14 10:52:59,0


In [5]:
data = {
    'user_id': merged.query('user_id == 22').iloc[[0]].user_id.tolist(),
    'query_movie_ids': merged.query('user_id == 22').iloc[[0]].query_movie_ids.tolist(),
}

items = movies.rename(index=str, columns={"movieId": "candidate_movie_id"})\
              .drop('title', 1) # .to_dict('list')
items.loc[:, 'candidate_movie_id'] = items.candidate_movie_id.astype(str)
items = items.to_dict('list')
data.update(items)
with codecs.open('predic.data.json', 'w', 'utf-8') as w:
    json.dump(data, w)
data.keys()

dict_keys(['query_movie_ids', 'year', 'candidate_movie_id', 'avg_rating', 'genres', 'user_id'])

In [None]:
from tensorflow.contrib.training.python.training.hparam import HParams
reload('utils')
reload('utils.flex')
reload('service')
ctrl = reload('ctrl').Ctrl.instance

hparam = HParams(conf_path='{}/foo/user_supplied/movielens.yaml'.format(datapath))
hparam.add_hparam('data', 'predic.data.json')
ret = ctrl.predict(hparam)

In [7]:
!gcloud ml-engine local predict \
--model-dir D:/Python/notebook/recomm_prod/repo/foo/model_1518581106.1947258/export/export_foo/1518581138 \
--json-instances tmp.20180221155312370200.json

ERROR: (gcloud.ml-engine.local.predict) Something has gone really wrong; we can't find a valid Python executable on your PATH.


In [3]:
%%bash
gcloud ml-engine local predict \
--model-dir D:/Python/notebook/recomm_prod/repo/foo/model_1518581106.1947258/export/export_foo/1518581138 \
--json-instances tmp.20180221155312370200.json

D:\Python\Anaconda3\envs\py3_5\python.exe: can't open file '/cygdrive/d/google-cloud-sdk/lib/gcloud.py': [Errno 2] No such file or directory


In [1]:
cols = list(data.keys())
multi_cols = ('query_movie_ids', 'genres')

def trans(features):
    # features = OrderedDict(zip(cols, data))
    print( schema.col_states_['query_movie_ids'].transform( features['query_movie_ids'] ) )
    # for col in multi_cols:
    #     features[col] = tf.string_to_number(tf.string_split(features[col], ',').values, out_type=tf.int32)
    return features

def add_seq_cols(feat):
    for m_col in multi_cols:
        name = '{}_len'.format(m_col)
        feat[name] = tf.size(feat[m_col])
        cols.append(name)
    return feat

tf.reset_default_graph()
with tf.Graph().as_default():
    dataset = tf.data.Dataset.from_tensors(data)
    dataset = dataset.map(trans, num_parallel_calls=4)
    dataset = dataset.map(add_seq_cols, num_parallel_calls=4)
    print('cols', cols)
    dataset = dataset.repeat(1)
    dataset = dataset.padded_batch(5, OrderedDict(zip(cols, ([], [], [], [None], [], [None], [], []))))
    inputs = dataset.make_one_shot_iterator().get_next()
    with tf.train.MonitoredTrainingSession() as sess:
        while not sess.should_stop():
            _, = sess.run([inputs])
            # print( sess.run(inputs) )
            pass

NameError: name 'data' is not defined

In [1]:
%%bash
gcloud ml-engine local predict \
    --model-dir=D:/Python/notebook/recomm_prod/repo/foo/model_1518581106.1947258/export/export_foo/1518581138 \
    --package-path recomm_prod

/cygdrive/d/google-cloud-sdk/bin/gcloud: line 113: D:/Python/Anaconda3/envs/py3_5: Is a directory
/cygdrive/d/google-cloud-sdk/bin/gcloud: line 113: exec: D:/Python/Anaconda3/envs/py3_5: cannot execute: Is a directory


In [4]:
%%bash
cd D:/Python/notebook/tensorflow_estimator/census-demo
TRAIN_FILE=data/adult.data.csv
EVAL_FILE=data/adult.test.csv
OUTPUT_DIR=model/census
python -m trainer.task --train-files $TRAIN_FILE \
                       --eval-files $EVAL_FILE \
                       --job-dir $OUTPUT_DIR \
                       --train-steps 1000 \
                       --eval-steps 100

model dir model/census


  from ._conv import register_converters as _register_converters
INFO:tensorflow:Using config: {'_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001F871BA2860>, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_is_chief': True, '_model_dir': 'model/census', '_master': '', '_num_ps_replicas': 0, '_task_id': 0, '_save_summary_steps': 100, '_task_type': 'worker', '_session_config': None, '_num_worker_replicas': 1, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_keep_checkpoint_max': 5}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 600 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from model/census\model.ckpt-1
INFO:tensorflow:Saving checkpoints for 2 into model/census\mode

In [35]:
%%bash
cd D:/Python/notebook/recomm_prod
TRAIN_FILE=trainer/data/movielens.tr
EVAL_FILE=trainer/data/movielens.vl
OUTPUT_DIR=trainer/model/reco_mf_dnn

gcloud ml-engine local train \
    --module-name trainer.task \
    --job-dir $OUTPUT_DIR
    -- \
    --train-files $TRAIN_DATA \
    --eval-files $EVAL_DATA \
    --train-steps 1000 \
    --eval-steps 100 \

/cygdrive/d/google-cloud-sdk/bin/gcloud: line 113: exec: D:\Python\Anaconda3\envs\py2_7\python.exe: not found
bash: line 10: --: command not found


<br/>
<br/>
<br/>
## Dataset

In [None]:
def make_datasets(fpath_ary, schema, n_batch=128, n_epoch=1):
    def to_dense(sp):
        dense = tf.sparse_to_dense(sp.indices, sp.dense_shape, sp.values, '')
        return tf.reshape(tf.to_int32(tf.string_to_number(dense)), [-1])

    def to_sparse(dense):
        idx = tf.where(tf.not_equal(dense, 0))
        return tf.SparseTensor(indices=idx, dense_shape=dense.get_shape(), values=tf.gather_nd(dense, idx))

    def parse_csv(value):
        data = tf.decode_csv(value, record_defaults=defaults)
        features = OrderedDict(zip(cols, data))
        multi_cols = df_conf.query("{} == '{}' and {} == True".format(schema.M_DTYPE, schema.CATG, schema.IS_MULTI)).id.values
        for col in multi_cols:
            features[col] = tf.string_split([features[col]], ',')
            features[col] = to_dense(features[col])
            # features['{}_lens'.format(col)] = tf.size(features[col])
        return features 
    
    df_conf = schema.df_conf_.query('{}.notnull()'.format(schema.TYPE))
    cols = schema.cols
    defaults = []
    for _, r in df_conf.iterrows():
        if r[schema.M_DTYPE] == schema.CATG:
            defaults.append([''] if r[schema.IS_MULTI] else [0])
        else:
            defaults.append([])
    dataset = tf.data.TextLineDataset(fpath_ary)
    dataset = dataset.map(parse_csv, num_parallel_calls=4)
    has_multi = (df_conf[schema.M_DTYPE] == schema.CATG) & (df_conf[schema.IS_MULTI] == True)
    if sum(has_multi):
        multi_cols = df_conf[has_multi].id.values
        dataset = dataset.padded_batch(n_batch, OrderedDict( zip(cols, tuple([None] if e else [] for e in has_multi))) )
    else:
        dataset = dataset.batch(n_batch)
    dataset = dataset.shuffle(n_batch * 10, seed=seed).repeat(n_epoch)
    features = dataset.make_one_shot_iterator().get_next()
    return features, features.pop(schema.label[0])
                                
# tf.reset_default_graph()
with tf.Graph().as_default():
    inputs = make_datasets(['./movielens.tr'], loader.schema, n_batch=30)
    query_lens = tf.sequence_mask([1, 2, 3])
    ctx = []
    with tf.train.MonitoredTrainingSession() as sess:
        while not sess.should_stop():
            _, = sess.run([inputs])
            # print( sess.run(inputs) )
            pass

## Feature Columns with tf.feature_column.input_layer

In [None]:
a = pd.Series(minmax_scale(np.random.normal(0, 1, size=1000)))
a.hist(bins=50)

In [None]:
%%time
tf.reset_default_graph()
with tf.Graph().as_default():
    user_id = tf.feature_column.categorical_column_with_hash_bucket('user_id', hash_bucket_size=1000, dtype=tf.int32)
    user_id = tf.feature_column.embedding_column(user_id, dimension=8)
    avg_rating = tf.feature_column.numeric_column('avg_rating')
    columns = [user_id, avg_rating]
    
    def make_datasets(fpath_ary):
        cols = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
        defaults = [[0], [''], [''], [], [], [0], []]

        def parse_csv(value):
            data = tf.decode_csv(value, record_defaults=defaults)
            features = OrderedDict(zip(cols, data))
            # print(features)
            return features
        
        dataset = tf.data.TextLineDataset(fpath_ary)
        dataset = (dataset.map(parse_csv, num_parallel_calls=4)
                          .batch(3)
                          # .padded_batch(3, OrderedDict(zip(cols, ([], [None], [None], [], [], [], []))))
                          .shuffle(10, seed=seed)
                          .repeat(1)
                  )
        return dataset.make_one_shot_iterator().get_next()
    
    inputs = make_datasets(['./te_processed.batch.csv'])
    inputs = tf.feature_column.input_layer(inputs, columns)
    # features = tf.parse_example(serialized_example, features=tf.feature_column.make_parse_example_spec(columns))
    ctx = []
    with tf.train.MonitoredTrainingSession() as sess:
        while not sess.should_stop():
            print(sess.run(inputs))

### Make Example

In [None]:
%%time
cols = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
is_multi = [False, True, True, False, False, False, False]
pd_dtypes = [int, str, str, float, float, int, float]
types = ['int64_list', 'int64_list', 'int64_list', 'float_list', 'float_list', 'int64_list', 'float_list']
tf_types = [tf.int64, tf.int64, tf.int64, tf.float32, tf.float32, tf.int64, tf.float32]
def persist_example(fpath, tfpath):
    with tf.python_io.TFRecordWriter(tfpath) as w:
        for chunk in pd.read_csv(fpath, names=cols, dtype=dict(zip(cols, pd_dtypes)), chunksize=1000):
            chunk['query_movie_ids'] = chunk.query_movie_ids.map(lambda r: map(int, r.split(',')))
            chunk['genres'] = chunk.genres.map(lambda r: map(int, r.split(',')))
            
            for idx, r in chunk.iterrows():
                ex = tf.train.Example()
                for multi, col, tpe in zip(is_multi, cols, types):
                    val = r[col]
                    # ex.features.feature[col].int64_list or float_list or bytes_list
                    feat_type = getattr(ex.features.feature[col], tpe)
                    # extend function for multivalent columns, otherwise append
                    append_or_extend = 'append' if not multi else 'extend'                    
                    getattr(feat_type.value, append_or_extend)(val)
                w.write(ex.SerializePartialToString())

persist_example('./te_processed.csv', './data.tfrecord')

In [None]:
def decode_example(ser_example):
    # queue = tf.train.string_input_producer([fpath], num_epochs=1)
    # _, ser_example = tf.TFRecordReader().read(queue)
    # ser_example = tf.train.batch([ser_example], batch_size=10)
    ctx_features = {col: tf.FixedLenFeature([], tf_tpe)
                    for col, tf_tpe in zip(cols, tf_types) if col not in ('query_movie_ids', 'genres')}
    seq_features = {col: tf.FixedLenSequenceFeature([], tf_tpe) 
                    for col, tf_tpe in [('query_movie_ids', tf.int64), ('genres', tf.int64)]}
    context_dict, sequence_dict = tf.parse_single_sequence_example(ser_example, 
                                                                   context_features=ctx_features, 
                                                                   sequence_features=seq_features)
    # for col, tpe in zip(cols, tf_types):
    #     val = feature_dict[col]
    #     feature_dict[col] = tf.sparse_to_dense(val.indices, val.dense_shape, val.values, name=col)
    feature_dict = {}
    feature_dict.update(context_dict)
    feature_dict.update(sequence_dict)
    ret = OrderedDict()
    for c in cols:
        ret[c] = feature_dict[c]
    return tuple(ret.values())

tf.reset_default_graph()
with tf.Graph().as_default():
    dataset = tf.data.TFRecordDataset(['./data.tfrecord'])
    dataset = dataset.map(decode_example).padded_batch(10, padded_shapes=([], [None], [None], [], [], [], []))
    # dataset = dataset.batch(3)
    iters = dataset.make_one_shot_iterator()
    r = iters.get_next()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print( sess.run(r) )

## Traditional parse_example
1. tf.train.Coordinator + tf.train.start_queue_runners

In [None]:
from tensorflow.python.framework import sparse_tensor
import re

def to_sparse(dense):
    idx = tf.where(tf.not_equal(dense, 0))
    return tf.SparseTensor(idx, tf.gather_nd(dense, idx), dense.get_shape())

def make_example(val):
    example = tf.train.Example(features=tf.train.Features(
        feature = {
            'query_movie_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=val)),
            'genres': tf.train.Feature(int64_list=tf.train.Int64List(value=val))
        }
    ))
    return example

tf.reset_default_graph()
with tf.Graph().as_default():
    
    filename = "tmp.tfrecords"
    if not os.path.exists(filename):
        # os.remove(filename)
        writer = tf.python_io.TFRecordWriter(filename)
        with writer:
            for idx, r in teProcessed.head().iterrows():
                for col in ('query_movie_ids', 'genres'):
                    val = list(map(int, re.split(',\s*', r[col])))
                    ex = make_example(val)
                    writer.write(ex.SerializeToString())

    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer(["tmp.tfrecords"], num_epochs=1)
    _, serialized_example = reader.read(filename_queue)

    batch = tf.train.batch(tensors=[serialized_example], batch_size=1)
    features = {
        'query_movie_ids': tf.VarLenFeature(tf.int64),
        'genres': tf.VarLenFeature(tf.int64)
    }
    data = tf.parse_example(batch, features)
    query_movie_ids = data['query_movie_ids']
    embbedding = tf.Variable(tf.glorot_uniform_initializer()([9125]), dtype=tf.float32)
    print(query_movie_ids.dense_shape)
    # r = tf.layers.dense(query_movie_ids, 10)
    # emb_query = tf.nn.embedding_lookup_sparse([embbedding], query_movie_ids, None, combiner='sqrtn')
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        try:
            print(sess.run(data))
            pass
        except tf.errors.OutOfRangeError as e:
            coord.request_stop(e)
        finally:
            coord.request_stop()
            coord.join(threads)
    

## Test

In [None]:
tf.reset_default_graph()
with tf.Graph().as_default():
    labels = tf.constant(np.ones([10, 8]))
    pred = tf.concat([tf.Variable(tf.ones(shape=[1, 8]), trainable=False), tf.Variable(tf.truncated_normal([9, 8]))], 0)
    loss = tf.losses.mean_squared_error(predictions=pred, labels=labels)
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        print(pred.eval())
        for i in range(1000):
            sess.run([train_op])
        print()
        print(pred.eval())

In [None]:
tf.zeros