In [1]:
import numpy as np
import pandas as pd

In [2]:
# The ratings file is tab-separated and has no header row, so the column
# names are supplied explicitly.
all_ratings = pd.read_csv(
    './Data/u.data',
    sep='\t',
    header=None,
    names=['UserID', 'MovieID', 'Rating', 'Datetime'],
)

In [3]:
# The raw 'Datetime' values are interpreted as seconds since the Unix epoch
# and replaced by pandas timestamps.
all_ratings['Datetime'] = pd.to_datetime(
    all_ratings['Datetime'],
    unit='s',
)

In [4]:
# Preview the first ten rows of the loaded ratings.
all_ratings.head(10)

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16
5,298,474,4,1998-01-07 14:20:06
6,115,265,2,1997-12-03 17:51:28
7,253,465,5,1998-04-03 18:34:27
8,305,451,3,1998-02-01 09:20:17
9,6,86,3,1997-12-31 21:16:53


In [5]:
# A rating counts as favorable only when it is strictly greater than 3.
all_ratings['Favorable'] = all_ratings['Rating'].gt(3)

In [6]:
# Spot-check the new 'Favorable' column on the first three rows.
all_ratings.head(3)

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
0,196,242,3,1997-12-04 15:55:49,False
1,186,302,3,1998-04-04 19:22:22,False
2,22,377,1,1997-11-07 07:18:36,False


In [7]:
# Work on a subset: only rows whose UserID is one of 0..199.
ratings = all_ratings.loc[all_ratings['UserID'].isin(range(200))]

In [8]:
# New dataset containing only the rows where the user liked the movie.
favorable_ratings = ratings.loc[ratings['Favorable']]

In [9]:
# Number of favorable rating rows in the subset.
favorable_ratings.shape[0]

11043

In [10]:
# Total number of rating rows in the subset, for comparison.
ratings.shape[0]

19531

In [11]:
# Map each user to the immutable set of movies that user rated favorably.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby('UserID')['MovieID']
}

In [12]:
# Summing the boolean 'Favorable' flag per movie gives the number of
# favorable ratings each movie received (one-column frame indexed by MovieID).
num_favorable_by_movie = ratings.groupby('MovieID')[['Favorable']].sum()

In [13]:
# Small hand-written frame with an 'id' and a boolean 'favor' column.
# NOTE(review): no other visible cell references `data`; it looks like
# leftover scratch data — confirm before keeping it in the notebook.
data = pd.DataFrame({
    'id': [1, 2, 3, 4, 2, 1, 1, 3, 4, 1],
    'favor': [True, False, True, False, True, False, False, True, True, False],
})

In [14]:
# Container for the discovered itemsets; a later cell stores the
# single-movie itemsets under key 1.
frequent_itemsets = dict()

In [15]:
# Support threshold: the per-movie favorable-rating counts are compared
# against this value when the single-movie itemsets are selected.
min_support = 50

In [17]:
# Seed the search with single-movie itemsets: each movie whose number of
# favorable ratings reaches the support threshold becomes a 1-element
# frozenset mapped to that count.
# The comparison is `>=` so that this step agrees with find_frequent_itemsets,
# which also keeps itemsets whose frequency equals min_support.
frequent_itemsets[1] = {
    frozenset((movie_id,)): num_favorable
    for movie_id, num_favorable in num_favorable_by_movie['Favorable'].items()
    if num_favorable >= min_support
}

In [20]:
from collections import defaultdict

In [38]:
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    """Extend itemsets by one movie and keep the extensions that are frequent.

    Parameters
    ----------
    favorable_reviews_by_users : mapping
        Maps each user to the set (e.g. ``frozenset``) of movies that user
        reviewed favorably.
    k_1_itemsets : iterable of frozenset
        Itemsets from the previous round. Only the itemsets themselves are
        iterated, so a dict keyed by itemset can be passed directly.
    min_support : int
        Minimum number of users whose favorable reviews must contain an
        itemset for it to be returned.

    Returns
    -------
    dict
        Maps each superset (a ``frozenset`` one movie larger than the input
        itemsets) to the number of users whose favorable reviews contain it,
        restricted to supersets with a count of at least ``min_support``.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # Collect this user's candidate supersets in a set before counting.
        # The same superset can be reached from several of its subsets
        # (e.g. {A, B} from {A} + B and from {B} + A); counting every path
        # would credit one user several times and inflate the support.
        user_supersets = {
            itemset | frozenset((other_reviewed_movie,))
            for itemset in k_1_itemsets
            if itemset.issubset(reviews)
            for other_reviewed_movie in reviews - itemset
        }
        for current_superset in user_supersets:
            counts[current_superset] += 1
    return {itemset: frequency for itemset, frequency in counts.items() if frequency >= min_support}