# FM practice

## Ref
本段代码来自：https://github.com/babakx/fm_tensorflow/blob/master/fm_tensorflow.ipynb

关于FM模型的说明，见博文 https://codingcat.cn/article/75

## 数据集
https://grouplens.org/datasets/movielens/100k/  
数据格式是每行一条信息，每行包含四列：用户id，电影id，打分，时间。

## load data

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer

# laod data with pandas
cols = ['user', 'item', 'rating', 'timestamp']
train = pd.read_csv('./data/ua.base', delimiter='\t', names=cols)
test = pd.read_csv('./data/ua.test', delimiter='\t', names=cols)

In [4]:
train.head(10)

Unnamed: 0,user,item,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
5,1,6,5,887431973
6,1,7,4,875071561
7,1,8,1,875072484
8,1,9,5,878543541
9,1,10,3,875693118


## 预处理

下面👇方法的作用，是将每条原始数据（电影id、用户id）转换为FM所需要的onehot形式。

In [6]:
from itertools import count 
from collections import defaultdict
from scipy.sparse import csr    

def vectorize_dic(dic, ix=None, p=None):
    """ 
    Creates a scipy csr matrix from a list of lists (each inner list is a set of values corresponding to a feature) 
    
    parameters:
    -----------
    dic -- dictionary of feature lists. Keys are the name of features
    ix -- index generator (default None)
    p -- dimension of featrure space (number of columns in the sparse matrix) (default None)
    """
    if (ix == None):
        d = count(0)
        ix = defaultdict(lambda: next(d)) 
        
    n = len(list(dic.values())[0]) # num samples
    g = len(list(dic.keys())) # num groups
    nz = n * g # number of non-zeros

    col_ix = np.empty(nz, dtype=int)     
    
    i = 0
    for k, lis in dic.items():     
        # append index el with k in order to prevet mapping different columns with same id to same index
        col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]
        i += 1
        
    row_ix = np.repeat(np.arange(0, n), g)      
    data = np.ones(nz)
    
    if (p == None):
        p = len(ix)
        
    ixx = np.where(col_ix < p)

    return csr.csr_matrix((data[ixx],(row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix

为便于理解上面方法的作用，示例如下。

In [41]:
data_, ix_ = vectorize_dic({'a':[1,2,2], 'b':[2,4,3]})
data_.todense()

matrix([[1., 0., 1., 0., 0.],
        [0., 1., 0., 1., 0.],
        [0., 1., 0., 0., 1.]])

In [42]:
ix_

defaultdict(<function __main__.vectorize_dic.<locals>.<lambda>()>,
            {'1a': 0, '2a': 1, '2b': 2, '3b': 4, '4b': 3})



所以，经过上面的vectorize方法，将训练数据（一共3条），转换成FM所需的onehot形式了。data_中的每一列是一个特征，ix_ 存储了特征的名称及对应的列号，data的每一行是一条训练数据。

| 样本号 | 1a | 2a | 2b | 4b | 3b |
|  ---  | -- | -- | -- | -- | -- |
|   0   |  1 |  0 |  1 |  0 |  0 |
|   1   |  0 |  1 |  0 |  1 |  0 |
|   2   |  0 |  1 |  0 |  0 |  1 |


此外需要注意一点，vectorize方法返回的是csr_matrix，是一种压缩了的稀疏矩阵格式，所以需要使用todense转换成我们容易看懂的矩阵格式。关于csr，更多信息可以看[这里](https://link.jianshu.com/?t=https%3A%2F%2Fblog.csdn.net%2Fu012871493%2Farticle%2Fdetails%2F51593451)

---
ok，理解了上面的函数之后，可以用它将本次用到的数据转换为FM所需的onehot形式了。

In [7]:
# vectorize data and convert them to csr matrix
X_train, ix = vectorize_dic({'users': train.user.values, 'items': train.item.values})
X_test, ix = vectorize_dic({'users': test.user.values, 'items': test.item.values}, ix, X_train.shape[1])
y_train = train.rating.values
y_test= test.rating.values

In [44]:
X_train = X_train.todense()
X_test = X_test.todense()

# print shape of data
print(X_train.shape)
print(X_test.shape)

(90570, 2623)
(9430, 2623)


## 模型定义

### 定义需要用到的参数

In [46]:
import tensorflow as tf

n, p = X_train.shape
# n  训练样本数量
# p  特征数量

# number of latent factors
k = 10

# design matrix
X = tf.placeholder('float', shape=[None, p])
# target vector
y = tf.placeholder('float', shape=[None, 1])

# bias and weights
w0 = tf.Variable(tf.zeros([1]))
W = tf.Variable(tf.zeros([p]))

# interaction factors, randomly initialized 
V = tf.Variable(tf.random_normal([k, p], stddev=0.01))

# estimate of y, initialized to 0.
y_hat = tf.Variable(tf.zeros([n, 1]))

### 定义FM的公式
这里使用经过推导之后的那一版计算公式。

In [47]:
from IPython.display import display, Math, Latex
display(Math(r'\hat{y}(\mathbf{x}) = w_0 + \sum_{j=1}^{p}w_jx_j + \frac{1}{2} \sum_{f=1}^{k} ((\sum_{j=1}^{p}v_{j,f}x_j)^2-\sum_{j=1}^{p}v_{j,f}^2 x_j^2)'))

<IPython.core.display.Math object>

In [48]:
# Calculate output with FM equation
linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, X), 1, keep_dims=True))
pair_interactions = (tf.multiply(0.5,
                    tf.reduce_sum(
                        tf.subtract(
                            tf.pow( tf.matmul(X, tf.transpose(V)), 2),
                            tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(V, 2)))),
                        1, keep_dims=True)))
y_hat = tf.add(linear_terms, pair_interactions)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


### 定义**损失函数**

In [51]:
display(Math(r'L = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 + \lambda_w ||W||^2 + \lambda_v ||V||^2'))

<IPython.core.display.Math object>

In [52]:
# L2 regularized sum of squares loss function over W and V
lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')

l2_norm = (tf.reduce_sum(
            tf.add(
                tf.multiply(lambda_w, tf.pow(W, 2)),
                tf.multiply(lambda_v, tf.pow(V, 2)))))

error = tf.reduce_mean(tf.square(tf.subtract(y, y_hat)))
loss = tf.add(error, l2_norm)

使用梯度下降进行参数更新。

In [53]:
display(Math(r'\Theta_{i+1} = \Theta_{i} - \eta \frac{\delta L}{\delta \Theta}'))

<IPython.core.display.Math object>

In [54]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)

## 分batch

In [55]:
def batcher(X_, y_=None, batch_size=-1):
    n_samples = X_.shape[0]

    if batch_size == -1:
        batch_size = n_samples
    if batch_size < 1:
       raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))

    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        ret_y = None
        if y_ is not None:
            ret_y = y_[i:i + batch_size]
            yield (ret_x, ret_y)

## 模型训练

In [57]:
from tqdm import tqdm_notebook as tqdm

epochs = 10
batch_size = 1000

# Launch the graph
init = tf.global_variables_initializer()
sess = tf.Session()

sess.run(init)

for epoch in tqdm(range(epochs), unit='epoch'):
    perm = np.random.permutation(X_train.shape[0])
    # iterate over batches
    for bX, bY in batcher(X_train[perm], y_train[perm], batch_size):
        sess.run(optimizer, feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)})

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




## 模型评估

In [58]:

errors = []
for bX, bY in batcher(X_test, y_test):
    errors.append(sess.run(error, feed_dict={X: bX.reshape(-1, p), y: bY.reshape(-1, 1)}))

RMSE = np.sqrt(np.array(errors).mean())
print(RMSE)

1.1134949


In [59]:
sess.close()