In [23]:
%matplotlib inline
import os, sys, numpy as np, pandas as pd, tensorflow as tf, re, codecs, seaborn as sns, json, time, csv, datetime as dt
import pickle, collections, random, math, numbers, scipy.sparse as sp, matplotlib.pyplot as plt, scipy.sparse as sp

def reload(mName):
    import importlib
    if mName in sys.modules:
        del sys.modules[mName]
    return importlib.import_module(mName)


from collections import deque, defaultdict, OrderedDict
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, minmax_scale
from matplotlib import pyplot as plt
plt.style.use('ggplot')

# classpath
ctx = os.path.abspath('..')
cps = [ctx]
_ = [sys.path.insert(0, cp) for cp in cps if cp not in sys.path]

# data path
datapath = '/'.join([ctx, 'data'])

seed = 88
utils = reload('utils.utils')
np.set_printoptions(precision=4, suppress=True, linewidth=100)
np.random.seed(seed)

## Test

In [2]:
headers = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
teProcessed = pd.read_csv('./te_processed.csv', names=headers)
# teProcessed['query_movie_ids'] = teProcessed.query_movie_ids.str.replace('\[(.+)\]', '\\1')
# teProcessed['genres'] = teProcessed.genres.str.replace('\[(.+)\]', '\\1')
teProcessed.head() # .to_csv('./te_processed.csv', index=False, header=None)

Unnamed: 0,user_id,query_movie_ids,genres,avg_rating,year,candidate_movie_id,rating
0,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",458,0.661939,0.701754,1665,4.0
1,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",51,0.669753,0.684211,1708,3.0
2,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",116,0.763441,0.631579,2925,3.0
3,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",782,0.643026,0.736842,1962,2.5
4,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",459,0.600529,0.754386,1743,2.0


### Shema

In [50]:
reco_flex = reload('reco_mf_dnn.reco_mf_dnn_flex_shema')
schema = reco_flex.Schema()
print(schema.df_conf.dtypes)
schema.df_conf

id              object
data_type       object
date_format    float64
model_dtype     object
n_unique       float64
is_multi        object
sep             object
aux            float64
ref             object
type            object
dtype: object


Unnamed: 0,id,data_type,date_format,model_dtype,n_unique,is_multi,sep,aux,ref,type
0,query_movie_ids,str,,catg,9125.0,True,",",,candidate_movie_id,user
1,rating,,,catg,,,,,,label
2,genres,str,,catg,20.0,True,",",,,item
3,avg_rating,float,,cont,,,,,,item
4,year,float,,cont,,,,,,item
5,candidate_movie_id,,,,9125.0,,,,,item


In [3]:
# set([batch[batch.composer.str.contains('\|', na=False)].iloc[0].lyricist])
mapper = utils.PartialMapper(10000, keep_order=False).partial_fit(batch.composer)
mapper.inverse_transform(mapper.transform(batch.lyricist))[:10]
mapper.enc

{nan: 0,
 'David Lowen| Daniel Navarro': 2501,
 '邱聖倫': 2804,
 '詹雅雯': 3407,
 'Giuliano Sacchetto-Giordano Trivellato': 1,
 'Lenka Kirpac| Kevin Griffin': 2787,
 'Dawin| Timothy Mingo| Ricky Hawk': 2503,
 'Sang Hyuck Lim| Young Zin Son (임상혁| 손영진)': 2504,
 '林隆璇': 3,
 '謝銘祐 陳皓宇': 4537,
 'Kygo| RHODES| Natalie Salter': 3935,
 'Julian Casablancas': 4285,
 '伍樂城': 4,
 'Andrew Williams| Jeremy Williams| Jamie Goodwin': 5,
 '이종현 (Lee Jong Hyun) | 박은우 (Park Eun Woo)': 2949,
 'Anton Zaslavski|Matthew Koma|Miriam Bryant|Victor Rådström': 6,
 '周恆毅': 1251,
 'Alessandro Lindblad/Theo Hutchcraft/Calvin Harris': 7,
 'Andrew Taggart|Jon Sandler|Luke Moellman': 2506,
 '陳建寧 Ian Chen': 2951,
 'Enya| Nicky Ryan| Roma Ryan': 8,
 'Alex San': 2502,
 '王雁盟': 9,
 'Chris Brown| Aaron Lamont Small| Wesley Dees| Edward Griffin| Kenneth Franklin| Andre Blake| Santiago Bauza| Gabrielle Nowee| Warren Griffin III| Nathaniel Hale| Gwendolyn Chisolm| Cheryl Cook| Sylvia Robinson| Angela Stone': 2507,
 'Loptimist': 3143,
 'J

# tf.decode_csv + tf.data.TextLineDataset

## Simple Data Preprocess

In [71]:
ratings = pd.read_csv("{}/ml-latest-small/ratings.csv".format(datapath))
tr, te = utils.split_ratings(ratings)

movies = pd.read_csv("{}/ml-latest-small/movies.csv".format(datapath))
movies["avg_rating"] = ratings.groupby("movieId").rating.mean()
movies["avg_rating"] = minmax_scale(movies.avg_rating.fillna(ratings.rating.mean()))
movies["year"] = movies.title.str.findall("\(\s*(\d+)\s*\)").map(lambda lst: int(lst[-1]) if len(lst) else None)
movies["year"] = minmax_scale(movies.year.fillna(movies.year.median()))

In [90]:
def preprocess(data, movie_trans, train_hist=None, is_train=True):
    queue = []
    data = data.merge(movie_trans, how="left", on="movieId")
    columns=["user_id", "query_movie_ids", 
             "genres", "avg_rating", "year", "candidate_movie_id",
             "rating"]
    
    list2str = lambda lst: ','.join(map(str, lst))
    for u, df in data.groupby("userId"):
        df = df.sort_values("rating", ascending=False)
        if not is_train:
            user_movies_hist = train_hist.query("userId == {}".format(u)).movieId
        for i, (_, r) in enumerate(df.iterrows()):
            if is_train:
                query_hist = df.movieId[:i].tolist() + df.movieId[i + 1:].tolist()
                query_hist = list2str(query_hist)
                queue.append([int(r.userId), query_hist, r.genres, r.avg_rating, r.year, int(r.movieId), r.rating])
            else:
                all_hist = set(user_movies_hist.tolist())
                query_hist = list(all_hist - set([int(r.movieId)]))
                query_hist = list2str(query_hist)
                queue.append([int(r.userId), query_hist, r.genres, r.avg_rating, r.year, int(r.movieId), r.rating])
    return pd.DataFrame(queue, columns=columns)
    
tr_merged = preprocess(tr, movies)
tr_merged.to_csv('./tr_movielens.csv', index=False, header=None)

te_merged = preprocess(te, movies, tr, is_train=False)
te_merged.to_csv('./te_movielens.csv', index=False, header=None)
te_merged.head()

Unnamed: 0,user_id,query_movie_ids,genres,avg_rating,year,candidate_movie_id,rating
0,1,"1953,1061,1029,1287,1343,1129,1293,1263,1371,1...",Action|Adventure|Sci-Fi,0.611111,0.701754,2105,4.0
1,1,"1953,1061,1029,1287,1343,1129,1293,1263,1371,1...",Adventure|Comedy,0.676357,0.684211,2150,3.0
2,1,"1953,1061,1029,1287,1343,1129,1293,1263,1371,1...",Comedy|Western,0.825397,0.631579,3671,3.0
3,1,"1953,1061,1029,1287,1343,1129,1293,1263,1371,1...",Drama|Horror|Sci-Fi|Thriller,0.694444,0.736842,2455,2.5
4,1,"1953,1061,1029,1287,1343,1129,1293,1263,1371,1...",Action|Adventure|Fantasy,0.676357,0.754386,2193,2.0


## Write Config File

In [129]:
# self.conf = OrderedDict(
#     user = [{Schema.ID: 'query_movie_ids', Schema.DTYPE: 'str', Schema.MODEL_DTYPE: 'catg',
#              Schema.N_UNIQUE: 9125, Schema.IS_MULTI: True, Schema.SEP: ','}],
#     item = [{Schema.ID: 'genres', Schema.DTYPE: 'str', Schema.MODEL_DTYPE: 'catg', Schema.N_UNIQUE: 20, Schema.IS_MULTI: True, Schema.SEP: ','},
#             {Schema.ID: 'avg_rating', Schema.DTYPE: 'float', Schema.MODEL_DTYPE: 'cont'},
#             {Schema.ID: 'year', Schema.DTYPE: 'float', Schema.MODEL_DTYPE: 'cont'},
#             {Schema.ID: 'candidate_movie_id', Schema.TYPE: 'catg', Schema.N_UNIQUE: 9125}],
#     label = [{Schema.ID: 'rating', Schema.MODEL_DTYPE: 'catg'}]
# )
reco_flex = reload('reco_mf_dnn.reco_mf_dnn_flex_shema')
conf = '''
{
    "label": [{"id": "target", "dtype": "float", "model_dtype": "catg"}],
    "user": [{"id": "query_movie_ids", "dtype": "int", "model_dtype": "catg", "is_multi": true, "n_unique": 9125}],
    "item": [{"id": "candidate_movie_id", "dtype": "int", "model_dtype": "catg"},
             {"id": "genres", "dtype": "str", "model_dtype": "catg", "is_multi": true, "sep": "|", "n_unique": 20},
             {"id": "avg_rating", "dtype": "float", "model_dtype": "cont"},
             {"id": "year", "dtype": "float", "model_dtype": "cont"}]
}
'''.strip()
schema = reco_flex.Schema(conf)
schema.df_conf

AssertionError: [query_movie_ids] multivalent columns expect [sep] attr, actual [None]

In [125]:
a = schema.df_conf
for _, r in a[(a['is_multi'] == True) & pd.isnull(a['sep'])].iterrows():
    print(pd.isnull(r['sep']))

True
True


In [30]:
%%time
def make_datasets(fpath_ary):
    cols = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
    defaults = [[0], [''], [''], [], [], [0], []]

    def to_dense(sp):
        dense = tf.sparse_to_dense(sp.indices, sp.dense_shape, sp.values, '')
        return tf.reshape(tf.to_int32(tf.string_to_number(dense)), [-1])

    def to_sparse(dense):
        idx = tf.where(tf.not_equal(dense, 0))
        return tf.SparseTensor(indices=idx, dense_shape=dense.get_shape(), values=tf.gather_nd(dense, idx))

    def parse_csv(value):
        data = tf.decode_csv(value, record_defaults=defaults)
        features = OrderedDict(zip(cols, data))
        for col in ('query_movie_ids', 'genres'):
            features[col] = tf.string_split([features[col]], ',')
            features[col] = to_sparse(to_dense(features[col]))
        return features 

    dataset = tf.data.TextLineDataset(fpath_ary)
    dataset = (dataset.map(parse_csv, num_parallel_calls=4)
                      .padded_batch(3, OrderedDict(zip(cols, ([], [None], [None], [], [], [], []))))
                      .shuffle(10, seed=seed)
                      .repeat(2)
              )
    return dataset.make_one_shot_iterator().get_next()

tf.reset_default_graph()
with tf.Graph().as_default():
    inputs = make_datasets(['./te_processed.batch.csv'])
    ctx = []
    with tf.train.MonitoredTrainingSession() as sess:
        while not sess.should_stop():
            print(sess.run(inputs)['user_id'])

Help on function weighted_categorical_column in module tensorflow.python.feature_column.feature_column:

weighted_categorical_column(categorical_column, weight_feature_key, dtype=tf.float32)
    Applies weight values to a `_CategoricalColumn`.
    
    Use this when each of your sparse inputs has both an ID and a value. For
    example, if you're representing text documents as a collection of word
    frequencies, you can provide 2 parallel sparse input features ('terms' and
    'frequencies' below).
    
    Example:
    
    Input `tf.Example` objects:
    
    ```proto
    [
      features {
        feature {
          key: "terms"
          value {bytes_list {value: "very" value: "model"}}
        }
        feature {
          key: "frequencies"
          value {float_list {value: 0.3 value: 0.1}}
        }
      },
      features {
        feature {
          key: "terms"
          value {bytes_list {value: "when" value: "course" value: "human"}}
        }
        feature {
      

## Feature Columns with tf.feature_column.input_layer

In [49]:
a = pd.read_csv('./te_processed.batch.csv', 
                names=['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating'])
a.head()

Unnamed: 0,user_id,query_movie_ids,genres,avg_rating,year,candidate_movie_id,rating
0,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",458,0.661939,0.701754,1665,4.0
1,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",51,0.669753,0.684211,1708,3.0
2,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",116,0.763441,0.631579,2925,3.0
3,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",782,0.643026,0.736842,1962,2.5
4,0,"833,931,1083,906,1515,1041,1140,1111,1047,1017...",459,0.600529,0.754386,1743,2.0


In [50]:
%%time
tf.reset_default_graph()
with tf.Graph().as_default():
    user_id = tf.feature_column.categorical_column_with_hash_bucket('user_id', hash_bucket_size=1000, dtype=tf.int32)
    user_id = tf.feature_column.embedding_column(user_id, dimension=8)
    avg_rating = tf.feature_column.numeric_column('avg_rating')
    columns = [user_id, avg_rating]
    
    def make_datasets(fpath_ary):
        cols = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
        defaults = [[0], [''], [''], [], [], [0], []]

        def parse_csv(value):
            data = tf.decode_csv(value, record_defaults=defaults)
            features = OrderedDict(zip(cols, data))
            # print(features)
            return features
        
        dataset = tf.data.TextLineDataset(fpath_ary)
        dataset = (dataset.map(parse_csv, num_parallel_calls=4)
                          .batch(3)
                          # .padded_batch(3, OrderedDict(zip(cols, ([], [None], [None], [], [], [], []))))
                          .shuffle(10, seed=seed)
                          .repeat(1)
                  )
        return dataset.make_one_shot_iterator().get_next()
    
    inputs = make_datasets(['./te_processed.batch.csv'])
    inputs = tf.feature_column.input_layer(inputs, columns)
    # features = tf.parse_example(serialized_example, features=tf.feature_column.make_parse_example_spec(columns))
    ctx = []
    with tf.train.MonitoredTrainingSession() as sess:
        while not sess.should_stop():
            print(sess.run(inputs))

[[ 0.6822 -0.0843  0.1698  0.5523  0.5349  0.2882  0.1485 -0.2189  0.4389]
 [ 0.644   0.0625 -0.0403 -0.1782  0.4948  0.0477 -0.3435  0.0557  0.3814]
 [ 0.7     0.0625 -0.0403 -0.1782  0.4948  0.0477 -0.3435  0.0557  0.3814]]
[[ 0.643  -0.0843  0.1698  0.5523  0.5349  0.2882  0.1485 -0.2189  0.4389]
 [ 0.6005 -0.0843  0.1698  0.5523  0.5349  0.2882  0.1485 -0.2189  0.4389]
 [ 0.6164 -0.0843  0.1698  0.5523  0.5349  0.2882  0.1485 -0.2189  0.4389]]
[[ 0.6619 -0.0843  0.1698  0.5523  0.5349  0.2882  0.1485 -0.2189  0.4389]
 [ 0.6698 -0.0843  0.1698  0.5523  0.5349  0.2882  0.1485 -0.2189  0.4389]
 [ 0.7634 -0.0843  0.1698  0.5523  0.5349  0.2882  0.1485 -0.2189  0.4389]]
[[ 0.5718  0.0625 -0.0403 -0.1782  0.4948  0.0477 -0.3435  0.0557  0.3814]]
Wall time: 784 ms


### Make Example

In [3]:
%%time
cols = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
is_multi = [False, True, True, False, False, False, False]
pd_dtypes = [int, str, str, float, float, int, float]
types = ['int64_list', 'int64_list', 'int64_list', 'float_list', 'float_list', 'int64_list', 'float_list']
tf_types = [tf.int64, tf.int64, tf.int64, tf.float32, tf.float32, tf.int64, tf.float32]
def persist_example(fpath, tfpath):
    with tf.python_io.TFRecordWriter(tfpath) as w:
        for chunk in pd.read_csv(fpath, names=cols, dtype=dict(zip(cols, pd_dtypes)), chunksize=1000):
            chunk['query_movie_ids'] = chunk.query_movie_ids.map(lambda r: map(int, r.split(',')))
            chunk['genres'] = chunk.genres.map(lambda r: map(int, r.split(',')))
            
            for idx, r in chunk.iterrows():
                ex = tf.train.Example()
                for multi, col, tpe in zip(is_multi, cols, types):
                    val = r[col]
                    # ex.features.feature[col].int64_list or float_list or bytes_list
                    feat_type = getattr(ex.features.feature[col], tpe)
                    # extend function for multivalent columns, otherwise append
                    append_or_extend = 'append' if not multi else 'extend'                    
                    getattr(feat_type.value, append_or_extend)(val)
                w.write(ex.SerializePartialToString())

persist_example('./te_processed.csv', './data.tfrecord')

In [None]:
def decode_example(ser_example):
    # queue = tf.train.string_input_producer([fpath], num_epochs=1)
    # _, ser_example = tf.TFRecordReader().read(queue)
    # ser_example = tf.train.batch([ser_example], batch_size=10)
    ctx_features = {col: tf.FixedLenFeature([], tf_tpe)
                    for col, tf_tpe in zip(cols, tf_types) if col not in ('query_movie_ids', 'genres')}
    seq_features = {col: tf.FixedLenSequenceFeature([], tf_tpe) 
                    for col, tf_tpe in [('query_movie_ids', tf.int64), ('genres', tf.int64)]}
    context_dict, sequence_dict = tf.parse_single_sequence_example(ser_example, 
                                                                   context_features=ctx_features, 
                                                                   sequence_features=seq_features)
    # for col, tpe in zip(cols, tf_types):
    #     val = feature_dict[col]
    #     feature_dict[col] = tf.sparse_to_dense(val.indices, val.dense_shape, val.values, name=col)
    feature_dict = {}
    feature_dict.update(context_dict)
    feature_dict.update(sequence_dict)
    ret = OrderedDict()
    for c in cols:
        ret[c] = feature_dict[c]
    return tuple(ret.values())

tf.reset_default_graph()
with tf.Graph().as_default():
    dataset = tf.data.TFRecordDataset(['./data.tfrecord'])
    dataset = dataset.map(decode_example).padded_batch(10, padded_shapes=([], [None], [None], [], [], [], []))
    # dataset = dataset.batch(3)
    iters = dataset.make_one_shot_iterator()
    r = iters.get_next()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print( sess.run(r) )

## Traditional parse_example
1. tf.train.Coordinator + tf.train.start_queue_runners

In [None]:
from tensorflow.python.framework import sparse_tensor
import re

def to_sparse(dense):
    idx = tf.where(tf.not_equal(dense, 0))
    return tf.SparseTensor(idx, tf.gather_nd(dense, idx), dense.get_shape())

def make_example(val):
    example = tf.train.Example(features=tf.train.Features(
        feature = {
            'query_movie_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=val)),
            'genres': tf.train.Feature(int64_list=tf.train.Int64List(value=val))
        }
    ))
    return example

tf.reset_default_graph()
with tf.Graph().as_default():
    
    filename = "tmp.tfrecords"
    if not os.path.exists(filename):
        # os.remove(filename)
        writer = tf.python_io.TFRecordWriter(filename)
        with writer:
            for idx, r in teProcessed.head().iterrows():
                for col in ('query_movie_ids', 'genres'):
                    val = list(map(int, re.split(',\s*', r[col])))
                    ex = make_example(val)
                    writer.write(ex.SerializeToString())

    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer(["tmp.tfrecords"], num_epochs=1)
    _, serialized_example = reader.read(filename_queue)

    batch = tf.train.batch(tensors=[serialized_example], batch_size=1)
    features = {
        'query_movie_ids': tf.VarLenFeature(tf.int64),
        'genres': tf.VarLenFeature(tf.int64)
    }
    data = tf.parse_example(batch, features)
    query_movie_ids = data['query_movie_ids']
    embbedding = tf.Variable(tf.glorot_uniform_initializer()([9125]), dtype=tf.float32)
    print(query_movie_ids.dense_shape)
    # r = tf.layers.dense(query_movie_ids, 10)
    # emb_query = tf.nn.embedding_lookup_sparse([embbedding], query_movie_ids, None, combiner='sqrtn')
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        try:
            print(sess.run(data))
            pass
        except tf.errors.OutOfRangeError as e:
            coord.request_stop(e)
        finally:
            coord.request_stop()
            coord.join(threads)
    

## Test

In [290]:
tf.reset_default_graph()
with tf.Graph().as_default():
    labels = tf.constant(np.ones([10, 8]))
    pred = tf.concat([tf.Variable(tf.ones(shape=[1, 8]), trainable=False), tf.Variable(tf.truncated_normal([9, 8]))], 0)
    loss = tf.losses.mean_squared_error(predictions=pred, labels=labels)
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        print(pred.eval())
        for i in range(1000):
            sess.run([train_op])
        print()
        print(pred.eval())

[[ 1.      1.      1.      1.      1.      1.      1.      1.    ]
 [ 0.389  -0.8997 -0.1771 -1.3736 -0.5522  1.085   0.2084 -0.6331]
 [ 0.3051  0.141  -0.8393  0.0532 -0.2783  0.4222 -1.1649 -1.2179]
 [ 0.2309  0.3492 -0.9968  1.2853 -1.1383  0.9017 -0.0005  0.9116]
 [-1.7894 -0.7522 -0.7565 -1.9722 -0.6737 -0.9124  0.1356 -1.3875]
 [ 0.6158 -0.2479  0.2636  0.2511  0.1735  0.0047  0.602   1.1785]
 [-0.8968  1.0958 -0.1999 -0.2517 -1.0478 -1.015  -1.0509  1.6131]
 [ 0.2981 -0.5663 -0.6425 -0.7068  0.4924  0.6517  0.1711 -0.8053]
 [-0.9977  0.9528 -0.7456 -1.5973  0.6737  0.7269  0.9841 -0.3262]
 [ 0.1359 -0.6149  1.5493 -1.0061  0.5749 -1.5367  0.4196  1.1141]]

[[ 1.      1.      1.      1.      1.      1.      1.      1.    ]
 [ 0.95    0.8446  0.9037  0.8058  0.873   1.007   0.9352  0.8664]
 [ 0.9431  0.9297  0.8495  0.9225  0.8954  0.9527  0.8228  0.8185]
 [ 0.9371  0.9468  0.8366  1.0233  0.825   0.992   0.9181  0.9928]
 [ 0.7717  0.8566  0.8563  0.7568  0.863   0.8435  0.9293  0

In [281]:
tf.zeros