In [1]:
from numpy import *
from pathlib import Path

In [4]:
# Singular value decomposition (SVD) is applied to latent semantic indexing (LSI, i.e. identifying document topics) and to recommender systems.

In [5]:
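## The singular values are the diagonal entries of the diagonal matrix Sigma in the factorization A = U * Sigma * V^T.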

In [10]:
def loadExData():
    """Return the small example rating table as a fresh list of rows.

    The table has 7 rows and 5 columns of integer ratings; 0 marks a
    missing rating. A new nested list is built on every call.
    """
    ratingRows = [
        [0, 0, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1],
        [1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 1, 0, 0],
    ]
    return ratingRows

In [11]:
# Load the example rating table and factor it with a full SVD.
# Sigma comes back as a 1-D array of singular values, not as a matrix.
Data = loadExData()
U, Sigma, VT = linalg.svd(Data)

In [12]:
# Display the singular values returned by the SVD above.
Sigma

array([9.64365076e+00, 5.29150262e+00, 7.80307960e-16, 4.02009206e-17,
       1.38744823e-33])

In [15]:
# Rebuild the matrix from the three leading singular values only:
# place them on the diagonal of a 3x3 matrix, then multiply the matching
# three columns of U and three rows of VT around it.
Sig3 = asmatrix(eye(3) * Sigma[:3])
U[:, :3] * Sig3 * VT[:3, :]

matrix([[-1.04793449e-15,  2.51142317e-16,  7.96792175e-16,
          2.00000000e+00,  2.00000000e+00],
        [-6.54487160e-16,  4.92555506e-16,  1.61931655e-16,
          3.00000000e+00,  3.00000000e+00],
        [-2.26768726e-16,  1.63098445e-16,  6.36702813e-17,
          1.00000000e+00,  1.00000000e+00],
        [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         -4.08233644e-17, -4.08233644e-17],
        [ 2.00000000e+00,  2.00000000e+00,  2.00000000e+00,
         -8.16467288e-17, -8.16467288e-17],
        [ 5.00000000e+00,  5.00000000e+00,  5.00000000e+00,
          1.62949893e-16,  1.62949893e-16],
        [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00,
         -4.08233644e-17, -4.08233644e-17]])

In [18]:
#相似度计算，有欧式距离、皮尔逊相关系数、余弦相似度
def ecludSim(inA, inB):
    """Similarity derived from Euclidean distance.

    Returns 1 / (1 + ||inA - inB||), so identical inputs score 1.0 and the
    score falls towards 0 as the inputs move apart.
    """
    distance = linalg.norm(inA - inB)
    return 1.0 / (1.0 + distance)

def pearsSim(inA, inB):
    """Pearson-correlation similarity rescaled from [-1, 1] to [0, 1].

    Parameters
    ----------
    inA, inB : column vectors (or 1-D arrays) of equal length.

    Returns
    -------
    float
        1.0 when fewer than 3 observations are supplied (too few points to
        correlate), 0.5 when the correlation is undefined because an input
        has zero variance, otherwise 0.5 + 0.5 * r.
    """
    if len(inA) < 3:
        return 1.0
    # A constant input has zero standard deviation, so corrcoef divides by
    # zero and yields NaN; silence the warnings and handle NaN explicitly.
    with errstate(divide='ignore', invalid='ignore'):
        r = corrcoef(inA, inB, rowvar=0)[0][1]
    if isnan(r):
        # Undefined correlation is treated as "no correlation" (r = 0) so a
        # NaN never leaks into the caller's weighted average.
        return 0.5
    return 0.5 + 0.5 * r

def cosSim(inA, inB):
    """Cosine similarity rescaled from [-1, 1] to [0, 1].

    Parameters
    ----------
    inA, inB : column matrices or 1-D arrays of equal length. Both are
        flattened before use, so either layout is accepted.

    Returns
    -------
    float
        0.5 + 0.5 * cos(angle between inA and inB). When either input has
        zero norm the cosine is undefined; it is treated as 0 and the
        neutral score 0.5 is returned instead of NaN.
    """
    vecA = asarray(inA, dtype=float).ravel()
    vecB = asarray(inB, dtype=float).ravel()
    denom = linalg.norm(vecA) * linalg.norm(vecB)
    if denom == 0:
        # Avoid 0/0 -> NaN, which would poison the caller's weighted sums.
        return 0.5
    # dot() of two 1-D vectors is already a scalar, so no conversion of a
    # 1x1 matrix to float (deprecated in recent NumPy) is needed.
    num = float(dot(vecA, vecB))
    return 0.5 + 0.5 * (num / denom)

In [20]:
# Wrap the nested list in a matrix so that Data[:, k] slices out a column
# vector, then compare the first and the last column.
Data = asmatrix(Data)
cosSim(Data[:, 0], Data[:, 4])

0.5

In [27]:
## Similarity can be item-based or user-based; which one to use depends on which side (items or users) has fewer entries.
def standEst(dataMat, user, simMeas, item):
    """Estimate `user`'s rating of `item` from item-to-item similarity.

    Parameters
    ----------
    dataMat : 2-D rating matrix indexed as dataMat[user, item]; 0 means
        "not rated".
    user : row index of the user the estimate is for.
    simMeas : callable(colA, colB) -> similarity score.
    item : column index of the item whose rating is estimated.

    Returns
    -------
    The similarity-weighted average of the user's existing ratings, or 0
    when no rated item contributes any similarity.
    """
    n = shape(dataMat)[1]
    simTotal, ratSimTotal = 0.0, 0.0

    # Rows (users) that rated the target item; the same for every j, so it
    # is computed once outside the loop.
    ratedTarget = asarray(dataMat[:, item]) > 0

    for j in range(n):
        userRating = dataMat[user, j]
        # Skip items the user has not rated, and skip the target item itself
        # (as svdEst does) so its own rating never feeds its estimate.
        if userRating == 0 or j == item:
            continue

        # Users who rated BOTH the target item and item j.
        overLap = nonzero(logical_and(ratedTarget,
                                      asarray(dataMat[:, j]) > 0))[0]
        if len(overLap) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[overLap, item],
                                 dataMat[overLap, j])

        simTotal += similarity
        # Weight the user's rating of item j by its similarity to the target.
        ratSimTotal += similarity * userRating

    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

In [28]:
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Return up to N unrated items for `user`, best estimated rating first.

    Every column holding a 0 in the user's row is scored with
    estMethod(dataMat, user, simMeas, item). The result is a list of
    (item, estimatedScore) tuples sorted by score in descending order, or
    the string 'you rated everything' when the user has no unrated item.
    """
    candidateCols = nonzero(dataMat[user, :].A == 0)[1]
    if len(candidateCols) == 0:
        return 'you rated everything'

    scored = [(item, estMethod(dataMat, user, simMeas, item))
              for item in candidateCols]
    # Keep only the N highest-scoring unrated items.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:N]

In [25]:
# Overwrite a few entries of the example matrix in place.
Data[0, 1] = 4
Data[0, 0] = 4
Data[1, 0] = 4
Data[2, 0] = 4
Data[3, 3] = 2

In [29]:
# Recommended items and their estimated ratings for user 2.
recommend(Data, user=2)

[(2, 2.5), (1, 2.0243290220056256)]

In [30]:
#利用SVD提高推荐的效果

In [31]:
def loadExData2():
    """Return the larger example rating table as a fresh list of rows.

    The table has 11 rows and 11 columns of integer ratings; 0 marks a
    missing rating. A new nested list is built on every call.
    """
    ratingRows = [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
    return ratingRows

In [32]:
# Load the larger rating table and factor it with a full SVD.
myData = loadExData2()
U, Sigma, VT = linalg.svd(asmatrix(myData))

In [34]:
def svdEst(dataMat, user, simMeas, item, numSV=4):
    """Estimate `user`'s rating of `item` in an SVD-reduced item space.

    Parameters
    ----------
    dataMat : rating matrix (numpy matrix) indexed as dataMat[user, item];
        0 means "not rated".
    user : row index of the user the estimate is for.
    simMeas : callable(colA, colB) -> similarity score; it receives column
        vectors holding each item's reduced coordinates.
    item : column index of the item whose rating is estimated.
    numSV : maximum number of leading singular values to keep (default 4).
        This is a fixed count, not an energy threshold. Fewer are used when
        the matrix has fewer singular values or some are numerically zero.

    Returns
    -------
    The similarity-weighted average of the user's existing ratings, or 0
    when nothing contributes any similarity.

    Raises
    ------
    ValueError
        If numSV is smaller than 1.
    """
    if numSV < 1:
        raise ValueError('numSV must be at least 1')

    n = shape(dataMat)[1]
    simTotal, ratSimTotal = 0.0, 0.0

    U, Sigma, _ = linalg.svd(dataMat)

    # Singular values are sorted in descending order. Keep at most numSV of
    # them and drop those that are numerically zero, since inverting them
    # below would blow up.
    leading = Sigma[:numSV]
    largest = leading[0] if len(leading) else 0.0
    tol = finfo(float).eps * max(shape(dataMat)) * largest
    k = int(count_nonzero(leading > tol))
    if k == 0:
        return 0

    SigK = asmatrix(eye(k) * Sigma[:k])      # k x k diagonal matrix
    # Project every item (column of dataMat) into the k-dimensional space.
    xformedItems = dataMat.T * U[:, :k] * SigK.I

    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T,
                             xformedItems[j, :].T)
        simTotal += similarity
        # Weight the user's rating of item j by its similarity to the target.
        ratSimTotal += similarity * userRating

    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

In [35]:
# Recommendations for user 1, estimated in the SVD-reduced item space.
recommend(asmatrix(myData), user=1, estMethod=svdEst)

[(4, 3.344714938469228), (7, 3.3294020724526967), (9, 3.328100876390069)]

In [36]:
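# One way to handle the cold-start problem is to treat recommendation as a search problem.
# The internal implementation differs, but the difference is transparent to the user.
# To treat recommendation as search, we may need to use the attributes (tags) of the items to be recommended.
# These attributes can also serve as the data for the similarity computation; this is known as
# content-based recommendation. Content-based recommendation may not perform as well as the
# collaborative filtering described above, but having it is a good start.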