In [5]:
%matplotlib inline
import os, sys, numpy as np, pandas as pd, tensorflow as tf, re, codecs, seaborn as sns, json, time, csv, datetime as dt
import pickle, collections, random, math, numbers, scipy.sparse as sp, matplotlib.pyplot as plt, scipy.sparse as sp

def reload(mName):
    """Import ``mName`` from scratch, discarding any cached copy first.

    The entry for ``mName`` is evicted from ``sys.modules`` (when present) and
    the module is imported again, so the returned object reflects the current
    source on disk. Only the named entry is evicted.

    Returns the freshly imported module object.
    """
    from importlib import import_module

    # Evict the cached module; a no-op when it has never been imported.
    sys.modules.pop(mName, None)
    return import_module(mName)


from collections import deque, defaultdict, OrderedDict
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, minmax_scale
from matplotlib import pyplot as plt
plt.style.use('ggplot')

# classpath: make the notebook's parent directory importable so that project
# packages referenced below (e.g. `utils`) can be resolved.
ctx = os.path.abspath('..')
cps = [ctx]
# Prepend each path at most once; the comprehension is used only for its side effect.
_ = [sys.path.insert(0, cp) for cp in cps if cp not in sys.path]

# data path: <parent directory>/data
datapath = '/'.join([ctx, 'data'])

# Re-import the project helper module from scratch (see `reload` above).
utils = reload('utils.utils')
# Compact, non-scientific numpy printing, and a fixed seed for numpy's global RNG.
np.set_printoptions(precision=4, suppress=True, linewidth=100)
np.random.seed(42)

## Data Prepare

In [2]:
# Load the three MovieLens "ml-latest-small" tables from the data directory.
ratings = pd.read_csv("{}/ml-latest-small/ratings.csv".format(datapath))
movies = pd.read_csv("{}/ml-latest-small/movies.csv".format(datapath))
tags = pd.read_csv("{}/ml-latest-small/tags.csv".format(datapath))

uidEnc, midEnc = LabelEncoder(), LabelEncoder()
# Map raw user/movie ids onto contiguous integer indices (0..n-1) so they can
# be used as matrix coordinates. The movie encoder is fit on the movies table,
# the user encoder on the ratings table.
midEnc.fit(movies.movieId)
uidEnc.fit(ratings.userId)

# Overwrite the raw ids in each table with their encoded indices.
ratings["userId"] = uidEnc.transform(ratings.userId)
ratings["movieId"] = midEnc.transform(ratings.movieId)

movies["movieId"] = midEnc.transform(movies.movieId)

tags["userId"] = uidEnc.transform(tags.userId)
tags["movieId"] = midEnc.transform(tags.movieId)

# Lookup: encoded movie index -> title.
midMap = pd.Series(dict(zip(movies.movieId, movies.title)))

# Number of distinct users / movies known to the encoders.
nUsers, nMovies = len(uidEnc.classes_), len(midEnc.classes_)

print(ratings.shape)
ratings.head()

(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,0,30,2.5,1260759144
1,0,833,3.0,1260759179
2,0,859,3.0,1260759182
3,0,906,2.0,1260759185
4,0,931,4.0,1260759205


In [3]:
# Pre-split train / test interactions (ids are already label-encoded, see the
# recorded output below).
tr = pd.read_csv("{}/ml-latest-small/movielens.tr.csv".format(datapath))
te = pd.read_csv("{}/ml-latest-small/movielens.te.csv".format(datapath))

# state = utils.loadPickle("./data/ml-latest-small/state.h")
# uidEnc, midEnc, midMap, nUsers, nMovies = \
#     (state["uidEnc"], state["midEnc"], state["midMap"], state["nUsers"], state["nMovies"])

# Dense (user x movie) rating matrices; a 0 entry means "no rating".
# train data rating matrix
trRatingMat = np.zeros((nUsers, nMovies))
# test data rating matrix
teRatingMat = np.zeros((nUsers, nMovies))
# Fill both matrices with one vectorised index assignment instead of a Python
# loop over `iterrows()`. For a repeated (user, movie) pair the last row wins,
# exactly as it did with the row-by-row loop.
trRatingMat[tr.userId.values.astype(int), tr.movieId.values.astype(int)] = tr.rating.values
teRatingMat[te.userId.values.astype(int), te.movieId.values.astype(int)] = te.rating.values

print("train interaction matrix shape: ", trRatingMat.shape, "test interaction matrix shape: ", teRatingMat.shape)
print("train.shape: ", tr.shape, "test.shape: ", te.shape)
print()
print(tr.head())
print()
print(te.head())

train interaction matrix shape:  (671, 9125) test interaction matrix shape:  (671, 9125)
train.shape:  (69399, 4) test.shape:  (30605, 4)

   userId  movieId  rating   timestamp
0       0      931     4.0  1260759205
1       0     1515     4.0  1260759191
2       0       30     2.5  1260759144
3       0      833     3.0  1260759179
4       0      859     3.0  1260759182

   userId  movieId  rating   timestamp
0       0     1665     4.0  1260759139
1       0     1708     3.0  1260759194
2       0     1743     2.0  1260759198
3       0     1815     2.0  1260759108
4       0     1962     2.5  1260759113


In [None]:
def preprocess(data, movie_trans, train_hist=None, is_train=True):
    """Expand a ratings table into one (user history, candidate movie) row per rating.

    Parameters
    ----------
    data : DataFrame
        Ratings with at least ``userId``, ``movieId`` and ``rating`` columns.
    movie_trans : DataFrame
        Per-movie features keyed by ``movieId``; must provide ``genres``,
        ``avg_rating`` and ``year``. Left-joined onto ``data``.
    train_hist : DataFrame, optional
        Training ratings (``userId``, ``movieId``). Required when
        ``is_train`` is False.
    is_train : bool
        True:  the query history of a row is every *other* movie of the same
               user in ``data`` (ordered by rating, highest first).
        False: the query history is the user's movies in ``train_hist`` minus
               the candidate movie itself.

    Returns
    -------
    DataFrame with columns ``user_id, query_movie_ids, genres, avg_rating,
    year, candidate_movie_id, rating``.

    Raises
    ------
    ValueError
        If ``is_train`` is False and ``train_hist`` is not given.
    """
    if not is_train and train_hist is None:
        raise ValueError("train_hist is required when is_train=False")

    columns=["user_id", "query_movie_ids",
             "genres", "avg_rating", "year", "candidate_movie_id",
             "rating"]
    data = data.merge(movie_trans, how="left", on="movieId")

    # Group the training history once instead of running a DataFrame.query per user.
    hist_by_user = {}
    if not is_train:
        hist_by_user = {uid: set(ids.tolist())
                        for uid, ids in train_hist.groupby("userId").movieId}

    queue = []
    for u, df in data.groupby("userId"):
        df = df.sort_values("rating", ascending=False)
        # Materialise the user's movie list once; it is sliced per row below.
        movie_ids = df.movieId.tolist()
        # Users without any training history get an empty query history.
        train_seen = hist_by_user.get(u, set())
        for i, (_, r) in enumerate(df.iterrows()):
            candidate = int(r.movieId)
            if is_train:
                # Leave-one-out: everything the user rated except row i.
                query_ids = movie_ids[:i] + movie_ids[i + 1:]
            else:
                query_ids = list(train_seen - set([candidate]))
            queue.append([int(r.userId), query_ids, r.genres, r.avg_rating, r.year, candidate, r.rating])
    return pd.DataFrame(queue, columns=columns)

movie_trans, genres_enc = utils.doMovies(movies)
# Mean rating per movie, looked up by movieId so the result does not depend on
# movie_trans's row index happening to equal the encoded movie id.
avg_by_movie = ratings.groupby("movieId").rating.mean()
movie_trans["avg_rating"] = movie_trans.movieId.map(avg_by_movie)
# Movies without any rating fall back to the global mean before scaling to [0, 1].
movie_trans["avg_rating"] = minmax_scale(movie_trans.avg_rating.fillna(ratings.rating.mean()))
# Release year = last parenthesised number in the title; raw string keeps the
# regex free of invalid-escape warnings.
movie_trans["year"] = movie_trans.title.str.findall(r"\(\s*(\d+)\s*\)").map(lambda lst: int(lst[-1]) if len(lst) else None)
# Titles without a year get the median year before scaling to [0, 1].
movie_trans["year"] = minmax_scale(movie_trans.year.fillna(movie_trans.year.median()))
n_genres = len(genres_enc.enc_)

# Expand both splits into (history, candidate) rows; the test split takes its
# histories from the training ratings.
trProcessed = preprocess(tr, movie_trans)
teProcessed = preprocess(te, movie_trans, tr, is_train=False)
trProcessed.head()

In [None]:
# Serialise the list-valued columns as comma-separated strings on a *copy*,
# so `teProcessed` keeps its list columns for the batching / training cells
# below and this cell can be re-run safely.
teExport = teProcessed.assign(
    query_movie_ids=teProcessed.query_movie_ids.map(lambda ids: ','.join(map(str, ids))),
    genres=teProcessed.genres.map(lambda ids: ','.join(map(str, ids))),
)
# Written without index and without a header row.
teExport.to_csv('./te_processed.csv', index=False, header=False)

## Data Function

In [None]:
def do_multi(df, multi_cols):
    """Turn a DataFrame into an ordered mapping of column name -> array-like.

    Multivalent columns (those listed in ``multi_cols``, holding one sequence
    per row) are zero-padded on the right to the longest sequence in ``df``,
    and an extra ``<column>_len`` entry records each row's original length.
    All other columns are passed through as their ``.values`` array.

    Returns an ``OrderedDict`` that follows the column order of ``df``.
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences
    ret = OrderedDict()
    # `DataFrame.items()` replaces `iteritems()`, which was removed in pandas 2.0.
    for colname, col in df.items():
        if colname in multi_cols:
            lens = col.map(len)
            ret[colname] = list(pad(col, padding="post", maxlen=lens.max()))
            ret[colname + "_len"] = lens.values
        else:
            ret[colname] = col.values
    return ret

def dataFn(data, n_batch=128, shuffle=False):
    """Build a zero-argument generator factory that yields mini-batches of ``data``.

    Each call of the returned ``fn`` copies ``data``, asks
    ``utils.get_minibatches_idx`` for the row positions of every batch, and
    yields ``do_multi`` of those rows with ``query_movie_ids`` and ``genres``
    treated as multivalent columns.

    Parameters
    ----------
    data : DataFrame of preprocessed rows.
    n_batch : batch size forwarded to ``utils.get_minibatches_idx``.
    shuffle : forwarded to ``utils.get_minibatches_idx``.
    """
    def fn():
        dataInner = data.copy()
        indices = utils.get_minibatches_idx(len(dataInner), n_batch, shuffle=shuffle)
        for ind in indices:
            yield do_multi(dataInner.iloc[ind], ["query_movie_ids", "genres"])
    return fn

# Smoke test: pull only the first shuffled mini-batch (5 rows) and display it.
for i, e in enumerate(dataFn(trProcessed, n_batch=5, shuffle=True)(), 1):
    break
pd.DataFrame(e)

## MF with DNN Model

In [14]:
# Checkpoint directory and hyper-parameters; `dim` and `lr` are forwarded to
# ModelMfDNN as `dim` / `learning_rate`, `n_batch` is used by the training cell.
model_dir='./model/reco_mf_dnn'
dim = 16
lr = 0.005
n_batch = 128

# Re-import the model module, clear TF's default graph, then build the model.
# NOTE(review): the output recorded for this cell is
# "AttributeError: can't set attribute" — confirm the model builds on a fresh kernel.
reco_mf_dnn = reload('reco_mf_dnn.reco_mf_dnn_flex_shema')
tf.reset_default_graph()
model = reco_mf_dnn.ModelMfDNN(nUsers, nMovies, dim=dim, learning_rate=lr, model_dir=model_dir)

AttributeError: can't set attribute

## Train

In [None]:
# Train inside a session bound to the model's own graph. The first generator
# factory iterates the training rows (shuffled), the second the test rows.
with tf.Session(graph=model.graph) as sess:
    model.fit(sess, 
              dataFn(trProcessed, n_batch, shuffle=True), 
              dataFn(teProcessed, n_batch, shuffle=False), reset=True, n_epoch=10)
# 0.526  (NOTE(review): bare number left by the author — presumably a metric from an earlier run; confirm which one)

### 觀察單一user與預測分布圖

In [None]:
def user_item_data(data, uids, movie_trans, n_batch=128):
    """Yield ``(user_batch, items)`` pairs for scoring every movie per user.

    For each user in ``uids`` that appears in ``data``, the first row of that
    user is taken and its ``candidate_movie_id`` is appended to its
    ``query_movie_ids`` to form the user's query history. Users are grouped
    into batches of at most ``n_batch``.

    Parameters
    ----------
    data : DataFrame with ``user_id``, ``query_movie_ids`` (list per row) and
        ``candidate_movie_id`` columns.
    uids : iterable of user ids to include.
    movie_trans : DataFrame of movie features with ``movieId`` and ``genres``.
    n_batch : maximum number of users per yielded batch.

    Yields
    ------
    (u_data, items)
        ``u_data`` is ``do_multi`` of the batched user rows; ``items`` is
        ``do_multi`` of ``movie_trans`` with ``movieId`` renamed to
        ``candidate_movie_id`` (the same object on every yield).
        Nothing is yielded when no requested user is present in ``data``.
    """
    u_col = ["user_id", "query_movie_ids"]
    items = do_multi(movie_trans, ["genres"])
    items["candidate_movie_id"] = items.pop("movieId")

    def to_batch(rows):
        # Pad the variable-length histories of one batch of users.
        return do_multi(pd.DataFrame(data=rows, columns=u_col), ["query_movie_ids"])

    pending = []
    for uid, df in data[data.user_id.isin(uids)].groupby("user_id"):
        u_rec = df.iloc[0]
        # Build a new list rather than mutating the row (Series.set_value no
        # longer exists in pandas >= 1.0).
        full_hist = u_rec.query_movie_ids + [u_rec.candidate_movie_id]
        pending.append([u_rec.user_id, full_hist])
        if len(pending) >= n_batch:
            yield to_batch(pending), items
            pending = []
    # Flush the remainder only when there is one, so a full final batch is not
    # emitted twice and an empty selection yields nothing.
    if pending:
        yield to_batch(pending), items

In [None]:
from sklearn.preprocessing import minmax_scale

# user id from 0 ~ 670
uid = 22
# Build the query batch for this single user plus the movie catalogue features,
# then score the catalogue for that user.
u_queries, movies_meta = list(user_item_data(trProcessed, [uid], movie_trans))[0]
with tf.Session(graph=model.graph) as sess:
    pred = model.predict(sess, u_queries, movies_meta)
print("shape: ", pred.shape, pred)

# Positions of the movies this user rated in the test matrix.
nnzCoord = teRatingMat[uid].nonzero()
f, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].set_title("pred distribution")
pd.Series(pred.ravel()[nnzCoord]).hist(bins=30, ax=ax[0])
ax[1].set_title("real distribution")
# The ratings are already a numpy array; no element-wise identity map is needed.
pd.Series(teRatingMat[uid][nnzCoord]).hist(bins=30, ax=ax[1])
plt.show()

In [None]:
# 可給定user id細看每個user的rating與model預測效果
# valid user id from 0 ~ 670
uid = 23
with tf.Session(graph=model.graph) as sess:
    u_queries, movies_meta = list(user_item_data(teProcessed, [uid], movie_trans))[0]
    recomm = model.predict(
        sess, 
        u_queries, 
        movies_meta
    ).ravel()
recommDf = pd.DataFrame(data={
              "userId": uid,
              "movieId": range(len(recomm)), 
              "title": midMap[np.arange(len(recomm))].values, 
              "rating": teRatingMat[uid, range(len(recomm))],
              "predRating": recomm},
             columns=("userId", "movieId", "title", "rating", "predRating"))
# ascending 可以調整True or False觀察結果
recommDf.query("rating != 0").sort_values("rating", ascending=False).head(20)

In [None]:
# Same rated movies as above, now ordered by the predicted score instead of the true rating.
recommDf.query("rating != 0").sort_values("predRating", ascending=False).head(20)

### ROC AUC

In [None]:
from sklearn.metrics import roc_curve, auc
def drawRocCurve(y, predProba):
    """Print the ROC AUC of ``predProba`` against ``y`` and plot the ROC curve.

    Parameters
    ----------
    y : array-like of binary labels; 1 is treated as the positive class.
    predProba : array-like of scores, higher meaning "more likely positive".

    The figure shows the chance diagonal and the ROC curve, with the AUC in
    the title. Nothing is returned.
    """
    fprRf, tprRf, _ = roc_curve(y, predProba, pos_label=1)
    aucScr = auc(fprRf, tprRf)
    print("auc:", aucScr)
    _, ax = plt.subplots(1, 1, figsize=(6, 6))

    # Dashed diagonal = performance of a random scorer.
    ax.plot([0, 1], [0, 1], 'k--')
    ax.plot(fprRf, tprRf, label='ROC CURVE')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.set_title('AUC: Area Under Curve (score: {:.4f})'.format(aucScr))
    ax.legend(loc='best')
    plt.show()

with tf.Session(graph=model.graph) as sess:
    # NOTE(review): presumably restores the saved checkpoint from model_dir — confirm in ModelMfDNN.ckpt.
    model.ckpt(sess, model.model_dir)
    labels, preds = [], []
    # Collect the true ratings and the model's `gmf` output batch by batch over the test rows.
    for data in dataFn(teProcessed)():
        labels.append(data['rating'])
        preds.append(sess.run(model.gmf, model.feed_dict(data, mode="eval")).ravel())
# Treat a rating >= 4 as "the user likes this movie" (positive class).
drawRocCurve((np.concatenate(labels) >= 4).astype(int), np.concatenate(preds))

### NDCG: Normalized Discounted Cumulative Gain
1. A measure of ranking quality.
2. loop 每一位user, prediction score排序後計算NDCG
    <br/>$$ DCG_p = \sum^p_{i = 1} \frac{2^{rel_i} - 1}{\log_2(i + 1)} $$<br/>
3. IDCG: Ideal DCG, 為理想狀態下的DCG分數, 即model全部命中的DCG分數, 而NDCG: Normalized DCG, 公式如下
    <br/>$$ NDCG_p = \frac{DCG_p}{IDCG_p} $$<br/>
4. 所以NDCG是一個比值, 介於0 ~ 1之間

In [None]:
def strict_condition(label):
    """Strict per-user filter for the NDCG evaluation.

    ``label`` is one user's rating vector where 0 means "not rated". The user
    qualifies when they have at least 10 ratings, at least one rating >= 4,
    and no more ratings >= 4 than ratings < 4.
    """
    rated = label[label != 0]
    if len(rated) < 10:
        return False
    n_liked = (rated >= 4).sum()
    n_disliked = (rated < 4).sum()
    return n_liked <= n_disliked and n_liked > 0
    
# Report how many users satisfy strict_condition (message: ">= 10 ratings and
# negative ratings >= positive ratings"; the function also requires at least one positive).
print("rating數量 >= 10 且 負評價數量 >= 正評價數量 有 [{}] 人".format(sum(strict_condition(label) for label in teRatingMat)))

def norm_condition(label):
    """Lenient per-user filter for the NDCG evaluation.

    ``label`` is one user's rating vector where 0 means "not rated". The user
    qualifies when they have at least one rating >= 4 and at least one
    rating < 4.
    """
    rated = label[label != 0]
    has_liked = (rated >= 4).any()
    has_disliked = (rated < 4).any()
    return has_liked and has_disliked
# Report how many users satisfy norm_condition. The message now states the
# strict "> 0" comparisons the function actually applies (it previously said ">= 0").
print("rating正評價數量 > 0 且 rating負評價數量 > 0 有 [{}] 人".format(sum(norm_condition(label) for label in teRatingMat)))

def single_user_ndcg(label, score, label_thres=4, k=10):
    """NDCG@k for one user, computed over the movies that user actually rated.

    ``label`` is the user's rating vector (0 = not rated) and ``score`` the
    model's score vector of the same length. Ratings are binarised to
    1 when ``>= label_thres`` and 0 otherwise, then handed to
    ``utils.ndcg_score`` together with the matching scores and ``k``.
    """
    rated_idx = label.nonzero()[0]
    relevance = (label[rated_idx] >= label_thres).astype(int)
    return utils.ndcg_score(relevance, score[rated_idx], k)

def all_user_ndcg(label, pred_mat, cond_fn, label_thres=4, k=10):
    """Average per-user NDCG@k over the users accepted by ``cond_fn``.

    Parameters
    ----------
    label : 2-D array-like, one rating vector per user (0 = not rated).
    pred_mat : 2-D array-like of model scores, row-aligned with ``label``.
    cond_fn : callable taking one user's rating vector and returning whether
        that user should be included.
    label_thres : ratings ``>= label_thres`` count as relevant.
    k : rank cut-off.

    Returns
    -------
    Mean NDCG over the included users, or ``nan`` when no user is included
    (instead of raising ZeroDivisionError).
    """
    tot_ndcg, actual_cnt = 0, 0
    # Iterate the matrix that was passed in, not a notebook-level global.
    for user_label, user_score in zip(label, pred_mat):
        if not cond_fn(user_label):
            continue

        # Forward the caller's threshold and cut-off rather than fixed values.
        ndcg = single_user_ndcg(user_label, user_score, label_thres=label_thres, k=k)
        if ndcg is not None:
            tot_ndcg += ndcg
            actual_cnt += 1
    if actual_cnt == 0:
        return float("nan")
    return tot_ndcg / actual_cnt

with tf.Session(graph=model.graph) as sess:
    pred_mat = []
    # Score the movie catalogue for every user, 128 users per predict call,
    # and stack the per-batch results into one matrix.
    for u_data, items in user_item_data(teProcessed, np.arange(nUsers), movie_trans, n_batch=128):
        pred_mat.append(model.predict(sess, u_data, items))
    pred_mat = np.vstack(pred_mat)
    
# NDCG@10 averaged over the users passing the strict / lenient filter.
strict_ndcg = all_user_ndcg(teRatingMat, pred_mat, strict_condition, label_thres=4, k=10)
norm_ndcg = all_user_ndcg(teRatingMat, pred_mat, norm_condition, label_thres=4, k=10)
print("strict condition ndcg at 10: ", strict_ndcg)
print("norm condition ndcg at 10: ", norm_ndcg)

## Test

In [6]:
# Column names for './te_processed.csv', which is read here without a header row.
headers = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']
# NOTE: this rebinds `teProcessed`; as the recorded output shows, query_movie_ids
# and genres are comma-separated strings in this frame, not lists.
teProcessed = pd.read_csv('./te_processed.csv', names=headers)
# teProcessed['query_movie_ids'] = teProcessed.query_movie_ids.str.replace('\[(.+)\]', '\\1')
# teProcessed['genres'] = teProcessed.genres.str.replace('\[(.+)\]', '\\1')
teProcessed.head() # .to_csv('./te_processed.csv', index=False, header=None)

Unnamed: 0,user_id,query_movie_ids,genres,avg_rating,year,candidate_movie_id,rating
0,0,"833, 931, 1083, 906, 1515, 1041, 1140, 1111, 1...","3, 5, 8",0.661939,0.701754,1665,4.0
1,0,"833, 931, 1083, 906, 1515, 1041, 1140, 1111, 1...","5, 1",0.669753,0.684211,1708,3.0
2,0,"833, 931, 1083, 906, 1515, 1041, 1140, 1111, 1...","1, 16",0.763441,0.631579,2925,3.0
3,0,"833, 931, 1083, 906, 1515, 1041, 1140, 1111, 1...","0, 7, 8, 2",0.643026,0.736842,1962,2.5
4,0,"833, 931, 1083, 906, 1515, 1041, 1140, 1111, 1...","3, 5, 9",0.600529,0.754386,1743,2.0


In [None]:
from tensorflow.python.framework import sparse_tensor
import re

def to_sparse(dense):
    """Convert a dense tensor into a ``tf.SparseTensor`` of its non-zero entries.

    ``dense`` is a tensor of any rank. The result keeps the coordinates and
    values of every element that is not equal to 0 and has the same dense
    shape as the input.
    """
    idx = tf.where(tf.not_equal(dense, 0))
    # Use the dynamic int64 shape so inputs with unknown static dimensions
    # (e.g. a variable batch size) work too; a partially-defined static shape
    # cannot be converted into the dense_shape tensor.
    return tf.SparseTensor(idx, tf.gather_nd(dense, idx), tf.shape(dense, out_type=tf.int64))

def make_example(val, genres=None):
    """Build a ``tf.train.Example`` with two int64-list features.

    Parameters
    ----------
    val : sequence of ints stored under ``'query_movie_ids'``.
    genres : sequence of ints stored under ``'genres'``. When omitted, ``val``
        is stored under both keys, which keeps single-argument calls working
        as before.
    """
    if genres is None:
        genres = val
    example = tf.train.Example(features=tf.train.Features(
        feature = {
            'query_movie_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=val)),
            'genres': tf.train.Feature(int64_list=tf.train.Int64List(value=genres))
        }
    ))
    return example

tf.reset_default_graph()
with tf.Graph().as_default():
    
    filename = "tmp.tfrecords"
    # The records are written only once; delete the file to regenerate it.
    if not os.path.exists(filename):
        # os.remove(filename)
        writer = tf.python_io.TFRecordWriter(filename)
        with writer:
            for idx, r in teProcessed.head().iterrows():
                # One Example per row, carrying both multivalent columns parsed
                # from their comma-separated string form.
                query_ids, genre_ids = (
                    list(map(int, re.split(r',\s*', r[col])))
                    for col in ('query_movie_ids', 'genres')
                )
                ex = make_example(query_ids, genre_ids)
                writer.write(ex.SerializeToString())

    # Queue-based input pipeline: read serialized Examples one at a time for a single epoch.
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer([filename], num_epochs=1)
    _, serialized_example = reader.read(filename_queue)

    batch = tf.train.batch(tensors=[serialized_example], batch_size=1)
    features = {
        'query_movie_ids': tf.VarLenFeature(tf.int64),
        'genres': tf.VarLenFeature(tf.int64)
    }
    data = tf.parse_example(batch, features)
    query_movie_ids = data['query_movie_ids']
    # One scalar weight per movie id (9125 entries); the sparse lookup combines
    # the weights of each row's ids with the 'sqrtn' combiner.
    embbedding = tf.Variable(tf.glorot_uniform_initializer()([9125]), dtype=tf.float32)
    emb_query = tf.nn.embedding_lookup_sparse([embbedding], query_movie_ids, None, combiner='sqrtn')
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        # Local variables hold the epoch counter created by num_epochs=1.
        tf.local_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        try:
            emb_query_ = sess.run(emb_query)
            print(emb_query_)
        except tf.errors.OutOfRangeError as e:
            coord.request_stop(e)
        finally:
            # Always stop and join the queue-runner threads.
            coord.request_stop()
            coord.join(threads)
    

In [13]:
emb_query

<tf.Tensor 'embedding_lookup_sparse:0' shape=(?,) dtype=float32>

In [None]:
import re
# Column names of the header-less './te_processed.csv'.
headers = ['user_id', 'query_movie_ids', 'genres', 'avg_rating', 'year', 'candidate_movie_id', 'rating']

tf.reset_default_graph()
with tf.Graph().as_default():
    # reader = tf.TextLineReader()
    # _, value = reader.read(tf.train.string_input_producer(['./te_processed.csv']))
    def parse_csv(value):
        """Decode one CSV line into an ordered {column name: tensor} mapping."""
        cols = tf.decode_csv(value, record_defaults=[[0], [''], [''], [], [], [0], []])
        return OrderedDict(zip(headers, cols))
    
    # Named `split_multi` (not `do_multi`) so that running this cell does not
    # rebind the notebook-level `do_multi` helper used by dataFn / user_item_data.
    def split_multi(*inputs):
        """Split the comma-separated ``query_movie_ids`` field of one record.

        Receives the record's columns positionally (in ``headers`` order) and
        returns them as a tuple with ``query_movie_ids`` replaced by the list
        of its comma-separated tokens.
        """
        r = OrderedDict(zip(headers, inputs))
        # NOTE(review): under Python 3, tf.py_func presumably passes string
        # tensors as bytes, which a str pattern cannot split — confirm and decode if so.
        r['query_movie_ids'] = re.split(r',\s*', r['query_movie_ids'])
        return tuple(r.values())

    dataset = tf.data.TextLineDataset(['./te_processed.csv'])
    dataset = dataset.map(parse_csv)\
                     .map(lambda r: tf.py_func(split_multi, 
                                               [v for v in r.values()], 
                                               [tf.int32, tf.string, tf.string, tf.float32, tf.float32, tf.int32, tf.float32]))\
                     .batch(10)
    iters = dataset.make_one_shot_iterator()
    data = iters.get_next()
    
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        tf.tables_initializer().run()
        # Pull the first batch of 10 records.
        data_ = sess.run(data)
        
        # coord = tf.train.Coordinator()
        # threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        # try:
        #     cur_data = sess.run(data)
        #     print(cur_data)
        # except tf.errors.OutOfRangeError as e:
        #     coord.request_stop(e)
        # finally:
        #     coord.request_stop()
        #     coord.join(threads)

In [None]:
# Interactive API lookup; prints the docstring of tf.train.match_filenames_once.
help(tf.train.match_filenames_once)

In [None]:
tf.reset_default_graph()
with tf.Graph().as_default():
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer(["tmp.tfrecords"], num_epochs=1)
    _, serialized_example = reader.read(filename_queue)
    # 'query_movie_ids', 'genres'
    genres = tf.feature_column.categorical_column_with_hash_bucket('genres', hash_bucket_size=1000, dtype=tf.int64)
    genres = tf.feature_column.embedding_column(genres, dimension=8)
    query_movie_ids = tf.feature_column.categorical_column_with_hash_bucket('query_movie_ids', hash_bucket_size=1000, dtype=tf.int64)
    query_movie_ids = tf.feature_column.embedding_column(query_movie_ids, dimension=8)
    columns = [query_movie_ids, genres]
    features = tf.parse_example(serialized_example, features=tf.feature_column.make_parse_example_spec(columns))
    with tf.Session() as sess:
        tf.global_variables_initializer().run()