# 协作型过滤

# 搜集偏好

In [1]:
# Switch the kernel's working directory with the IPython %cd magic.
# NOTE(review): hard-coded absolute path that only exists on the author's machine;
# presumably this is where recommendations.py lives -- confirm and adjust locally.
%cd /Users/wonderful/Desktop

/Users/wonderful/Desktop


In [2]:
from recommendations import critics

In [3]:
# Show every rating stored for one critic (a film-title -> score mapping).
critics['Lisa Rose']

{'Just My Luck': 3.0,
 'Lady in the Water': 2.5,
 'Snakes on a Plane': 3.5,
 'Superman Returns': 3.5,
 'The Night Listener': 3.0,
 'You, Me and Dupree': 2.5}

In [4]:
# Nested lookup: first key is the critic, second key is the film title.
critics['Lisa Rose']['Lady in the Water']

2.5

In [5]:
# Read Toby's current rating for a single film.
critics['Toby']['Snakes on a Plane']

4.5

In [6]:
# Write a rating into the imported critics dict in place. The lookup in the
# preceding cell already shows 4.5, so this assignment leaves the data unchanged.
critics['Toby']['Snakes on a Plane'] = 4.5

In [7]:
# Show all of Toby's ratings.
critics['Toby']

{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}

# 寻找相近的用户

## 欧几里得距离评价

In [8]:
from math import sqrt

In [9]:
# Worked example: Euclidean distance between the points (4.5, 1) and (4, 2).
sqrt(pow(4.5 - 4, 2) + pow(1 - 2, 2))

1.118033988749895

In [10]:
# Turn the distance into a score in (0, 1]: the +1 avoids division by zero
# and the inversion makes closer points score higher.
1 / (1 + sqrt(pow(4.5 - 4, 2) + pow(1 - 2, 2)))

0.4721359549995794

In [11]:
from math import sqrt

def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance-based similarity score for two people.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        person1: Key of the first person in ``prefs``.
        person2: Key of the second person in ``prefs``.

    Returns:
        ``0`` when the two people have no rated item in common; otherwise
        ``1 / (1 + d)``, where ``d`` is the Euclidean distance between their
        ratings over the shared items. Identical ratings therefore give 1.0,
        and the score shrinks towards 0 as the ratings diverge.
    """
    # Items rated by both people, in the iteration order of person1's ratings.
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared:
        return 0

    squared_gap_total = 0
    for item in shared:
        squared_gap_total += (prefs[person1][item] - prefs[person2][item]) ** 2

    # Adding 1 keeps the denominator non-zero when the distance is 0.
    return 1 / (1 + sqrt(squared_gap_total))

In [12]:
import recommendations

In [13]:
# Distance-based similarity between two critics from the sample data.
sim_distance(recommendations.critics, 'Lisa Rose', 'Gene Seymour')

0.29429805508554946

## 皮尔逊相关度评价

In [14]:
def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient between two people's ratings.

    Only items rated by both people are considered.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        p1: Key of the first person in ``prefs``.
        p2: Key of the second person in ``prefs``.

    Returns:
        A value in [-1, 1]. ``0`` is returned when the correlation is
        undefined: the two people share no items, or at least one of them
        gave the same rating to every shared item (zero variance).
    """
    shared = [item for item in prefs[p1] if item in prefs[p2]]
    n = len(shared)

    # Without any overlap every term below would divide by zero.
    if n == 0:
        return 0

    # Plain sums of each person's ratings over the shared items.
    sum1 = sum(prefs[p1][it] for it in shared)
    sum2 = sum(prefs[p2][it] for it in shared)

    # Sums of the squared ratings.
    sum1Sq = sum(prefs[p1][it] ** 2 for it in shared)
    sum2Sq = sum(prefs[p2][it] ** 2 for it in shared)

    # Sum of the pairwise products.
    pSum = sum(prefs[p1][it] * prefs[p2][it] for it in shared)

    num = pSum - (sum1 * sum2 / n)
    variance_product = (sum1Sq - sum1 ** 2 / n) * (sum2Sq - sum2 ** 2 / n)

    # A zero product means one side has no variance. A slightly negative one
    # can only come from floating-point rounding and would make sqrt() raise.
    if variance_product <= 0:
        return 0

    return num / sqrt(variance_product)

In [15]:
import recommendations
# Pearson-correlation similarity for the same pair of critics scored with sim_distance above.
sim_pearson(recommendations.critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

## 应该选择哪一种相似性度量方法

## 为评论者打分

In [16]:
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    """Return the people in ``prefs`` most similar to ``person``.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        person: Key of the person to compare everyone else against.
        n: Maximum number of matches to return.
        similarity: Callable ``(prefs, a, b) -> score`` used for ranking.

    Returns:
        A list of at most ``n`` ``(score, other_person)`` tuples, highest
        score first.
    """
    scored = []
    for other in prefs:
        # Never compare the person with themselves.
        if other != person:
            scored.append((similarity(prefs, person, other), other))

    # Ascending sort followed by a reversal puts the best matches first.
    ranked = sorted(scored)
    ranked.reverse()
    return ranked[:n]

In [17]:
import recommendations

In [18]:
# The three critics most similar to Toby, ranked with the default sim_pearson.
topMatches(recommendations.critics, 'Toby', n=3)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

# 推荐物品

In [19]:
def getRecommendations(prefs, person, similarity=sim_pearson):
    """Recommend items for ``person`` using a similarity-weighted average.

    Every other person whose similarity to ``person`` is positive contributes
    their ratings, weighted by that similarity, for items that ``person`` has
    not rated (or has rated 0).

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        person: Key of the person to produce recommendations for.
        similarity: Callable ``(prefs, a, b) -> score``.

    Returns:
        A list of ``(predicted_rating, item)`` tuples, highest prediction
        first.
    """
    totals = {}   # item -> sum of (rating * similarity)
    simSums = {}  # item -> sum of the similarities that contributed

    for other in prefs:
        # Don't compare the person with themselves.
        if other == person:
            continue

        sim = similarity(prefs, person, other)
        # Ignore people with zero or negative similarity.
        if sim <= 0:
            continue

        for item, rating in prefs[other].items():
            # Only score items the person has not rated yet.
            if item not in prefs[person] or prefs[person][item] == 0:
                totals[item] = totals.get(item, 0) + rating * sim
                simSums[item] = simSums.get(item, 0) + sim

    # Normalise by total similarity so that items rated by many people
    # are not favoured just for having more contributors.
    rankings = [(total / simSums[item], item) for item, total in totals.items()]
    rankings.sort(reverse=True)
    return rankings

In [20]:
import recommendations
# Recommendations for Toby using the default similarity (sim_pearson).
getRecommendations(recommendations.critics, 'Toby')

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [21]:
# Same recommendation run, but ranking critics with the Euclidean-distance score.
getRecommendations(recommendations.critics, 'Toby', similarity=sim_distance)

[(3.457128694491423, 'The Night Listener'),
 (2.778584003814924, 'Lady in the Water'),
 (2.422482042361917, 'Just My Luck')]

In [22]:
# NOTE(review): this repeats the call in the preceding cell and yields the same ranking.
getRecommendations(recommendations.critics, 'Toby', similarity=sim_distance)

[(3.457128694491423, 'The Night Listener'),
 (2.778584003814924, 'Lady in the Water'),
 (2.422482042361917, 'Just My Luck')]