# 14.1 SVD的应用

## 14.1.1 隐性语义索引

## 14.1.2 推荐系统

# 14.2 矩阵分解

$$
Data_{m\times n} = U_{m\times m}\Sigma_{m\times n}V^T_{n\times n}
$$

# 14.3 利用Python实现SVD

In [1]:
from numpy import *

In [2]:
U,Sigma,VT = linalg.svd([[1,1],[7,7]])

In [3]:
U

array([[-0.14142136, -0.98994949],
       [-0.98994949,  0.14142136]])

In [4]:
Sigma

array([ 10.,   0.])

In [5]:
VT

array([[-0.70710678, -0.70710678],
       [-0.70710678,  0.70710678]])

In [6]:
def load_ex_data():
    """Return the small 7x5 example matrix used for the SVD demos, as nested lists."""
    example_rows = [
        [1, 1, 1, 0, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 0, 2, 2],
        [0, 0, 0, 3, 3],
        [0, 0, 0, 1, 1],
    ]
    return example_rows

In [7]:
Data = load_ex_data()

In [8]:
U,Sigma,VT = linalg.svd(Data)

In [9]:
Sigma

array([  9.72140007e+00,   5.29397912e+00,   6.84226362e-01,
         1.31137664e-15,   8.23101664e-32])

$$
Data_{m\times n} \approx U_{m\times 3}\Sigma_{3\times 3}V^T_{3\times n}
$$

In [10]:
# Build a 3x3 diagonal matrix from the first three singular values; the
# remaining entries of Sigma shown above are numerically zero.
Sig3 = mat([[Sigma[0],0,0],
            [0,Sigma[1],0],
            [0,0,Sigma[2]]])

In [11]:
U[:,:3]*Sig3*VT[:3,:]

matrix([[  1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          -1.51029363e-16,  -1.56558794e-16],
        [  2.00000000e+00,   2.00000000e+00,   2.00000000e+00,
           4.89625701e-16,   4.78566839e-16],
        [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          -1.89843800e-16,  -2.02420546e-16],
        [  5.00000000e+00,   5.00000000e+00,   5.00000000e+00,
          -4.29344060e-17,  -9.15066634e-17],
        [  1.00000000e+00,   1.00000000e+00,  -3.88578059e-16,
           2.00000000e+00,   2.00000000e+00],
        [  1.94289029e-16,   5.82867088e-16,  -6.10622664e-16,
           3.00000000e+00,   3.00000000e+00],
        [  2.77555756e-17,   1.52655666e-16,  -1.24900090e-16,
           1.00000000e+00,   1.00000000e+00]])

In [12]:
U[:,:3]*Sig3*VT[:3,:]-Data

matrix([[  4.44089210e-16,   6.66133815e-16,   1.33226763e-15,
          -1.51029363e-16,  -1.56558794e-16],
        [  0.00000000e+00,   4.44089210e-16,   0.00000000e+00,
           4.89625701e-16,   4.78566839e-16],
        [  0.00000000e+00,   2.22044605e-16,   0.00000000e+00,
          -1.89843800e-16,  -2.02420546e-16],
        [ -8.88178420e-16,   0.00000000e+00,  -8.88178420e-16,
          -4.29344060e-17,  -9.15066634e-17],
        [  0.00000000e+00,   6.66133815e-16,  -3.88578059e-16,
          -6.66133815e-16,  -2.22044605e-16],
        [  1.94289029e-16,   5.82867088e-16,  -6.10622664e-16,
           4.44089210e-16,   8.88178420e-16],
        [  2.77555756e-17,   1.52655666e-16,  -1.24900090e-16,
           0.00000000e+00,   2.22044605e-16]])

# 14.4 基于协同过滤的推荐引擎

## 14.4.1 相似度计算

**程序清单14-1** 相似度计算

In [13]:
from numpy import linalg as la

In [14]:
def eclud_sim(inA, inB):
    """Return a similarity in (0, 1] derived from the Euclidean distance.

    The distance ``d`` between ``inA`` and ``inB`` is mapped to
    ``1 / (1 + d)``, so identical vectors score 1.0 and the score
    shrinks as the vectors move apart.
    """
    distance = la.norm(inA - inB)
    return 1.0 / (1.0 + distance)

In [15]:
def pears_sim(inA, inB):
    """Return the Pearson correlation of two vectors rescaled to [0, 1].

    Inputs with fewer than three entries are treated as perfectly
    similar (1.0). Otherwise the correlation coefficient r (in [-1, 1])
    is mapped to ``0.5 + 0.5 * r``.
    """
    if len(inA) >= 3:
        # rowvar=0: each column is a variable, each row an observation.
        correlation = corrcoef(inA, inB, rowvar=0)
        return 0.5 + 0.5 * correlation[0][1]
    return 1.0

In [16]:
def cos_sim(inA, inB):
    """Return the cosine similarity of two vectors rescaled to [0, 1].

    Both inputs are flattened first, so column matrices (as used
    elsewhere in this file), row matrices and plain 1-D arrays are all
    accepted as long as they hold the same number of elements.

    The cosine (in [-1, 1]) is mapped to ``0.5 + 0.5 * cos``: 1.0 for
    vectors pointing the same way, 0.5 for orthogonal vectors and 0.0
    for opposite vectors.
    """
    vec_a = asarray(inA).ravel()
    vec_b = asarray(inB).ravel()
    # dot() on 1-D arrays yields a true scalar; this avoids float() on a
    # 1x1 matrix, which NumPy deprecated in 1.25.
    num = float(dot(vec_a, vec_b))
    denom = la.norm(vec_a) * la.norm(vec_b)
    return 0.5 + 0.5 * (num / denom)

In [17]:
my_mat = mat(load_ex_data())

In [18]:
eclud_sim(my_mat[:,0], my_mat[:,4])

0.13367660240019172

In [19]:
eclud_sim(my_mat[:,0], my_mat[:,0])

1.0

In [20]:
cos_sim(my_mat[:,0], my_mat[:,4])

0.54724555912615336

In [21]:
cos_sim(my_mat[:,0], my_mat[:,0])

0.99999999999999989

In [22]:
pears_sim(my_mat[:,0], my_mat[:,4])

0.23768619407595826

In [23]:
pears_sim(my_mat[:,0], my_mat[:,0])

1.0

## 14.4.2 基于物品的相似度还是基于用户的相似度？

## 14.4.3 推荐引擎的评价

# 14.5 示例：餐馆菜肴推荐引擎

## 14.5.1 推荐未尝过的菜肴

**程序清单14-2** 基于物品相似度的推荐引擎

In [25]:
def stand_est(data_mat, user, sim_meas, item):
    """Estimate ``user``'s rating of ``item`` from item-to-item similarity.

    For every other item the user has rated (non-zero entry), the
    similarity between that item and ``item`` is computed with
    ``sim_meas`` over the rows where both items have a rating > 0.
    The estimate is the similarity-weighted average of the user's
    ratings.

    Args:
        data_mat: numpy matrix of ratings (rows index users, columns
            index items); 0 means "not rated".
        user: row index of the user.
        sim_meas: callable taking two column vectors and returning a
            similarity score.
        item: column index of the item to estimate.

    Returns:
        The weighted-average rating, or 0 when the accumulated
        similarity is zero.
    """
    num_items = shape(data_mat)[1]
    weight_sum = 0.0
    weighted_ratings = 0.0
    for other in range(num_items):
        rating = data_mat[user, other]
        if rating == 0:
            continue
        # Rows in which both `item` and `other` carry a rating.
        both_rated = logical_and(data_mat[:, item].A > 0,
                                 data_mat[:, other].A > 0)
        shared_rows = nonzero(both_rated)[0]
        if len(shared_rows) == 0:
            # No common raters: contributes nothing to either total.
            continue
        weight = sim_meas(data_mat[shared_rows, item],
                          data_mat[shared_rows, other])
        weight_sum += weight
        weighted_ratings += weight * rating
    return 0 if weight_sum == 0 else weighted_ratings / weight_sum

In [26]:
def recommend(data_mat, user, N=3, sim_meas=cos_sim, est_method=stand_est):
    """Return the top-``N`` unrated items for ``user`` with estimated scores.

    Args:
        data_mat: numpy matrix of ratings; 0 means "not rated".
        user: row index of the user.
        N: maximum number of recommendations to return.
        sim_meas: similarity function forwarded to ``est_method``.
        est_method: callable ``(data_mat, user, sim_meas, item)`` that
            returns an estimated rating.

    Returns:
        A list of ``(item_index, estimated_score)`` tuples sorted by
        score, highest first, or the string 'You rated everything' when
        the user has no unrated items.
    """
    candidates = nonzero(data_mat[user, :].A == 0)[1]
    if len(candidates) == 0:
        return 'You rated everything'
    scored = [(candidate, est_method(data_mat, user, sim_meas, candidate))
              for candidate in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:N]

In [31]:
my_mat = matrix([[4,4,0,2,2],
                 [4,0,0,3,3],
                 [4,0,0,1,1],
                 [1,1,1,2,0],
                 [2,2,2,0,0],
                 [1,1,1,0,0],
                 [5,5,5,0,0]])

In [32]:
my_mat

matrix([[4, 4, 0, 2, 2],
        [4, 0, 0, 3, 3],
        [4, 0, 0, 1, 1],
        [1, 1, 1, 2, 0],
        [2, 2, 2, 0, 0],
        [1, 1, 1, 0, 0],
        [5, 5, 5, 0, 0]])

In [33]:
recommend(my_mat, 2)

[(2, 2.5), (1, 2.0243290220056256)]

In [34]:
recommend(my_mat, 2, sim_meas=eclud_sim)

[(2, 3.0), (1, 2.8266504712098603)]

In [35]:
recommend(my_mat, 2, sim_meas=pears_sim)

[(2, 2.5), (1, 2.0)]

## 14.5.2 利用SVD提高推荐的效果

In [37]:
def load_ex_data2():
    """Return the sparser 11x11 example ratings matrix as nested lists."""
    ratings = [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
    return ratings

In [38]:
U, Sigma, VT = la.svd(mat(load_ex_data2()))

In [39]:
Sigma

array([ 15.77075346,  11.40670395,  11.03044558,   4.84639758,
         3.09292055,   2.58097379,   1.00413543,   0.72817072,
         0.43800353,   0.22082113,   0.07367823])

In [40]:
# Squared singular values; the following cells compare partial sums of these
# against 90% of their total.
Sig2 = Sigma**2

In [41]:
sum(Sig2)

541.99999999999955

In [42]:
sum(Sig2)*0.9

487.79999999999961

In [43]:
sum(Sig2[:2])

378.82955951135784

In [44]:
sum(Sig2[:3])

500.50028912757921

**程序清单14-3** 基于SVD的评分估计

In [45]:
def svd_est(data_mat, user, sim_meas, item, num_sv=4):
    """Estimate ``user``'s rating of ``item`` in an SVD-reduced item space.

    The ratings matrix is decomposed with SVD and every item (column)
    is projected onto the leading ``num_sv`` singular directions.
    Similarities between ``item`` and the items the user has already
    rated are measured in that reduced space, and the estimate is the
    similarity-weighted average of the user's ratings.

    Args:
        data_mat: ratings matrix (rows index users, columns index
            items); 0 means "not rated".
        user: row index of the user.
        sim_meas: callable taking two column vectors and returning a
            similarity score.
        item: column index of the item to estimate.
        num_sv: number of singular values to keep (default 4). Clamped
            to the number of singular values actually available, so
            small matrices no longer raise a broadcasting error.

    Returns:
        The weighted-average rating, or 0 when the accumulated
        similarity is zero.

    Raises:
        ValueError: if ``num_sv`` is smaller than 1.
    """
    if num_sv < 1:
        raise ValueError('num_sv must be at least 1')
    # asmatrix keeps `*` as matrix multiplication and, unlike `mat`,
    # still exists in NumPy 2.x.
    data = asmatrix(data_mat)
    n = shape(data)[1]
    U, Sigma, VT = la.svd(data)
    k = min(num_sv, len(Sigma))
    sig_k = asmatrix(diag(Sigma[:k]))
    # One row per item, k columns: the items expressed in the reduced space.
    xformed_items = data.T * U[:, :k] * sig_k.I
    sim_total = 0.0
    rat_sim_total = 0.0
    for j in range(n):
        user_rating = data[user, j]
        if user_rating == 0 or j == item:
            continue
        similarity = sim_meas(xformed_items[item, :].T,
                              xformed_items[j, :].T)
        print('The {} and {} similarity is: {}'.format(item, j, similarity))
        sim_total += similarity
        rat_sim_total += similarity * user_rating
    if sim_total == 0:
        return 0
    return rat_sim_total / sim_total

In [51]:
my_mat = mat(load_ex_data2())

In [53]:
recommend(my_mat, 1, est_method=svd_est)

The 0 and 3 similarity is: 0.4909504362809655
The 0 and 5 similarity is: 0.48427360863288366
The 0 and 10 similarity is: 0.5127549449063614
The 1 and 3 similarity is: 0.49129446474255783
The 1 and 5 similarity is: 0.4815163106117995
The 1 and 10 similarity is: 0.5097088624883723
The 2 and 3 similarity is: 0.49157334483312753
The 2 and 5 similarity is: 0.4823464228706552
The 2 and 10 similarity is: 0.5105839479452583
The 4 and 3 similarity is: 0.4504947733305868
The 4 and 5 similarity is: 0.5067948022540267
The 4 and 10 similarity is: 0.512895628760011
The 6 and 3 similarity is: 0.7436990825417141
The 6 and 5 similarity is: 0.468365960377396
The 6 and 10 similarity is: 0.4394646501300239
The 7 and 3 similarity is: 0.4821750213197406
The 7 and 5 similarity is: 0.4947163013621476
The 7 and 10 similarity is: 0.5249702098778072
The 8 and 3 similarity is: 0.4913072090951114
The 8 and 5 similarity is: 0.4912277621682813
The 8 and 10 similarity is: 0.5202895926889711
The 9 and 3 similarity is:

[(4, 3.3447149384692283), (7, 3.3294020724526971), (9, 3.3281008763900686)]

In [54]:
recommend(my_mat, 1, est_method=svd_est, sim_meas=pears_sim)

The 0 and 3 similarity is: 0.3419417929706251
The 0 and 5 similarity is: 0.12413197970004564
The 0 and 10 similarity is: 0.11669809771025108
The 1 and 3 similarity is: 0.3455600227334865
The 1 and 5 similarity is: 0.1264564541116927
The 1 and 10 similarity is: 0.11889179903888569
The 2 and 3 similarity is: 0.3451491807013254
The 2 and 5 similarity is: 0.12619038994686232
The 2 and 10 similarity is: 0.1186402291926914
The 4 and 3 similarity is: 0.45012552146129753
The 4 and 5 similarity is: 0.5285042654803261
The 4 and 10 similarity is: 0.5446471536184855
The 6 and 3 similarity is: 0.923822438331844
The 6 and 5 similarity is: 0.7248398199313778
The 6 and 10 similarity is: 0.7108959162553993
The 7 and 3 similarity is: 0.31948180898110834
The 7 and 5 similarity is: 0.11832446978388333
The 7 and 10 similarity is: 0.11337047794658778
The 8 and 3 similarity is: 0.33491047792850354
The 8 and 5 similarity is: 0.11967253306435388
The 8 and 10 similarity is: 0.11249675489082672
The 9 and 3 simil

[(4, 3.3469521867021741), (9, 3.3353796573274699), (6, 3.3071930278130366)]

In [55]:
recommend(my_mat, 1, est_method=svd_est, sim_meas=eclud_sim)

The 0 and 3 similarity is: 0.6123840274755334
The 0 and 5 similarity is: 0.5542623150427824
The 0 and 10 similarity is: 0.5572078635070534
The 1 and 3 similarity is: 0.6364208585338235
The 1 and 5 similarity is: 0.5691521960328635
The 1 and 10 similarity is: 0.5709292803881685
The 2 and 3 similarity is: 0.6077081015315077
The 2 and 5 similarity is: 0.5506128833241689
The 2 and 10 similarity is: 0.5536597775730837
The 4 and 3 similarity is: 0.5267757811863999
The 4 and 5 similarity is: 0.5045183157085598
The 4 and 10 similarity is: 0.5037094173413517
The 6 and 3 similarity is: 0.5435113288743953
The 6 and 5 similarity is: 0.4557411700443217
The 6 and 10 similarity is: 0.4478192737557705
The 7 and 3 similarity is: 0.605924260756124
The 7 and 5 similarity is: 0.5537170888296632
The 7 and 10 similarity is: 0.5573720982304103
The 8 and 3 similarity is: 0.6068608782573052
The 8 and 5 similarity is: 0.5521249466837764
The 8 and 10 similarity is: 0.5554943599370405
The 9 and 3 similarity is: 0

[(4, 3.3286756747000452), (9, 3.3247038080937834), (7, 3.3224884985810177)]

## 14.5.3 构建推荐引擎面临的挑战

# 14.6 基于SVD的图像压缩

**程序清单14-4** 图像压缩函数

In [80]:
def print_mat(in_mat, thresh=0.8):
    """Print a 2-D matrix as rows of '1'/'0' characters.

    Each entry strictly greater than ``thresh`` is printed as '1',
    everything else as '0'; one output line per matrix row. The
    dimensions are taken from ``in_mat`` itself, so matrices of any
    size are supported (a 32x32 input prints exactly as before).

    Args:
        in_mat: 2-D numpy matrix or array.
        thresh: cut-off separating '1' from '0'.
    """
    n_rows, n_cols = shape(in_mat)
    for i in range(n_rows):
        print(''.join('1' if float(in_mat[i, k]) > thresh else '0'
                      for k in range(n_cols)))

In [81]:
def img_compress(numSV=3, thresh=0.8):
    """Show the digit image in '0_5.txt' before and after SVD compression.

    The file is read as lines of digit characters (the first 32
    characters of every line are used), printed as-is, then rebuilt
    from only the ``numSV`` leading singular values and printed again.

    Args:
        numSV: number of singular values kept for the reconstruction;
            values larger than the number available use all of them.
        thresh: threshold passed to ``print_mat`` to binarise entries.
    """
    # The context manager closes the file; the original left it open.
    with open('0_5.txt') as image_file:
        pixel_rows = [[int(line[i]) for i in range(32)]
                      for line in image_file]
    # asmatrix instead of mat: the `mat` alias was removed in NumPy 2.0.
    my_mat = asmatrix(pixel_rows)
    print('****original matrix******')
    print_mat(my_mat, thresh)
    U, Sigma, VT = la.svd(my_mat)
    # Diagonal matrix holding the leading singular values.
    Sig_recon = asmatrix(diag(Sigma[:numSV]))
    recon_mat = U[:, :numSV] * Sig_recon * VT[:numSV, :]
    print('****reconstructed matrix using {} singular values******'.format(numSV))
    print_mat(recon_mat, thresh)

In [82]:
img_compress(2)

****original matrix******
00000000000000110000000000000000
00000000000011111100000000000000
00000000000111111110000000000000
00000000001111111111000000000000
00000000111111111111100000000000
00000001111111111111110000000000
00000000111111111111111000000000
00000000111111100001111100000000
00000001111111000001111100000000
00000011111100000000111100000000
00000011111100000000111110000000
00000011111100000000011110000000
00000011111100000000011110000000
00000001111110000000001111000000
00000011111110000000001111000000
00000011111100000000001111000000
00000001111100000000001111000000
00000011111100000000001111000000
00000001111100000000001111000000
00000001111100000000011111000000
00000000111110000000001111100000
00000000111110000000001111100000
00000000111110000000001111100000
00000000111110000000011111000000
00000000111110000000111111000000
00000000111111000001111110000000
00000000011111111111111110000000
00000000001111111111111110000000
00000000001111111111111110000000
00000000000111111

# 14.7 本章小结