In [3]:
import os

In [4]:
# data from http://grouplens.org/datasets/movielens/
# local download folder: /Users/wangdongdong/Downloads/ml-latest-small

data_folder = os.path.join(os.path.expanduser("~"), "Downloads", "ml-latest-small")
ratings_filename = os.path.join(data_folder, "ratings.csv")

In [5]:
ratings_filename

'/Users/wangdongdong/Downloads/ml-latest-small/ratings.csv'

In [6]:
import pandas as pd

#all_ratings = pd.read_csv(ratings_filename, delimiter="\t", header=None, names = ["UserID", "MovieID", "Rating", "Datetime"])
all_ratings = pd.read_csv(ratings_filename,delimiter=",",header=0,names = ["UserID", "MovieID", "Rating", "Datetime"] )

In [7]:
all_ratings['Datetime'] = pd.to_datetime(all_ratings['Datetime'],unit='s')
all_ratings[:5] # fliter first 5 rows

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04
3,1,47,5.0,2000-07-30 19:03:35
4,1,50,5.0,2000-07-30 18:48:51


In [8]:
# 首先要确定用户是不是喜欢某一部电影。为此创建新特征Favorable，若用户喜欢该电影，值为True。

all_ratings['Favorable'] = all_ratings['Rating'] > 3 #创建新列，代表是否喜欢

In [9]:
all_ratings[10:15]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
10,1,163,5.0,2000-07-30 19:00:50,True
11,1,216,5.0,2000-07-30 18:20:08,True
12,1,223,3.0,2000-07-30 18:16:25,False
13,1,231,5.0,2000-07-30 18:19:39,True
14,1,235,4.0,2000-07-30 18:15:08,True


In [30]:
# 从数据集中提取一部分作为训练集

ratings = all_ratings[all_ratings['UserID'].isin(range(150))]
ratings.shape

(22277, 5)

In [31]:
# 然后，创建一个数据集，只包含用户喜欢某部电影的数据行

favorable_ratings = ratings[ratings['Favorable']]

In [32]:
dict((k, frozenset(v.values)) for k, v in favorable_ratings.\
    groupby('UserID')['MovieID']) 
# 把v.values存储为frozenset，便于快速判断用户是否为某部电影打过分

{1: frozenset({1,
            3,
            6,
            47,
            50,
            101,
            110,
            151,
            157,
            163,
            216,
            231,
            235,
            260,
            333,
            349,
            356,
            362,
            367,
            441,
            457,
            480,
            527,
            543,
            552,
            553,
            590,
            592,
            593,
            596,
            608,
            661,
            733,
            804,
            919,
            923,
            940,
            943,
            954,
            1023,
            1024,
            1025,
            1029,
            1031,
            1032,
            1042,
            1049,
            1060,
            1073,
            1080,
            1089,
            1090,
            1092,
            1097,
            1127,
            1136,
            1196,
            1197,


In [33]:
(favorable_ratings.\
    groupby('UserID')['MovieID'])

<pandas.core.groupby.SeriesGroupBy object at 0x110363940>

In [34]:
# 在生成项集时，需要搜索用户喜欢的电影。因此，我们需要知道每个用户各喜欢哪些电影
# 按照UserID分组，并遍历每个用户看过的每一部电影

# 生成用户看过并评分的电影

favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.\
    groupby('UserID')['MovieID']) 

In [35]:
len(favorable_reviews_by_users)

149

In [36]:
# 创建一个数据框，以便了解每部电影的影迷数量

num_favorable_by_movie = ratings[['MovieID','Favorable']].groupby('MovieID').sum()

In [37]:
# 用以下代码查看最后欢迎的五部电影

num_favorable_by_movie.sort_values('Favorable', ascending=False)[:5]

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
318,72.0
356,70.0
296,62.0
593,60.0
260,58.0


In [26]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [38]:
import sys
frequent_itemsets = {}  # itemsets are sorted by length
min_support = 50

# k=1 candidates are the isbns with more than min_support favourable reviews
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                                for movie_id, row in num_favorable_by_movie.iterrows()
                                if row["Favorable"] > min_support)

print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]

There are 6 movies with more than 50 favorable reviews
I found 15 frequent itemsets of length 2
I found 20 frequent itemsets of length 3
I found 15 frequent itemsets of length 4
I found 6 frequent itemsets of length 5
I found 1 frequent itemsets of length 6
Did not find any frequent itemsets of length 7


In [None]:
# Apriori的实现

# 把发现的频繁项集保存到以项集长度为键 的字典中，便于依据长度查找，
# 这样就可以找到最新发现的频繁项集

frequent_itemsets = {} #初始化一个字典

# 确定项集要成为频繁项集所需的最小支持度
min_support = 50



In [None]:
# 为每一部电影生成只包含他自己的项集，检测他是否够频繁
# 电影编号用frozenset，后面要用到集合操作。另外frozenset可以用作字典的键

frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                                for movie_id, row in num_favorable_by_movie.iterrows()
                                if row["Favorable"] > min_support)

In [None]:
for itemset in frequent_itemsets[2]:
    print(itemset)

In [None]:
# 使用下面函数接收新发现的频繁项集，创建超集，检测频繁程度

from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        #遍历前面的项集，判断他们是否是当前评分项集的子集。
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                # 接下来遍历用户打过分却没有出现在项集里面的电影，用它生成超集，更新该项集的计数
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    
    return dict([(itemset, frequency) for itemset, frequency in counts.items() \
                 if frequency >= min_support])