In [1]:
import random
import heapq
from collections import defaultdict
import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori, association_rules
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

# 14.1 关联规则

e.g.1:手机保护壳交易数据库中的关联规则

In [2]:
# Load the faceplate transactions and use the 'Transaction' column as the row
# index, leaving one column per faceplate colour.
fp_df = pd.read_csv(r'Data/Faceplate.csv').set_index('Transaction')
fp_df

Unnamed: 0_level_0,Red,White,Blue,Orange,Green,Yellow
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,0,1,0
2,0,1,0,1,0,0
3,0,1,1,0,0,0
4,1,1,0,1,0,0
5,1,0,1,0,0,0
6,0,1,1,0,0,0
7,1,0,1,0,0,0
8,1,1,1,0,1,0
9,1,1,1,0,0,0
10,0,0,0,0,0,1


In [3]:
# Mine the frequent itemsets whose support is at least 20%.
# The 0/1 flags are cast to bool first: apriori expects a boolean one-hot
# frame and falls back to a slower, deprecated path (with a warning) when it is
# handed integers. The resulting supports are unchanged.
itemsets = apriori(fp_df.astype(bool), min_support=0.2, use_colnames=True)
itemsets



Unnamed: 0,support,itemsets
0,0.6,(Red)
1,0.7,(White)
2,0.6,(Blue)
3,0.2,(Orange)
4,0.2,(Green)
5,0.4,"(White, Red)"
6,0.4,"(Blue, Red)"
7,0.2,"(Green, Red)"
8,0.4,"(White, Blue)"
9,0.2,"(White, Orange)"


In [4]:
# Derive the association rules whose confidence is at least 50%, then show the
# five rules with the highest lift.
rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)
# The display columns are selected by name rather than by position: the column
# order of association_rules() output depends on the installed mlxtend version
# (newer releases add metric columns), so positional indices can silently pick
# a different metric.
(
    rules.sort_values(by='lift', ascending=False)
    .loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']]
    .head()
)

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
13,"(White, Red)",(Green),0.2,0.5,2.5,0.12
15,(Green),"(White, Red)",0.2,1.0,2.5,0.12
4,(Green),(Red),0.2,1.0,1.666667,0.08
12,"(White, Green)",(Red),0.2,1.0,1.666667,0.08
7,(Orange),(White),0.2,1.0,1.428571,0.06


e.g.2:图书购买交易数据库中的关联规则

In [5]:
# Load the Charles Book Club dataset from CSV and display it.
book_df = pd.read_csv(r'Data/CharlesBookClub.csv')
book_df

Unnamed: 0,Seq#,ID#,Gender,M,R,F,FirstPurch,ChildBks,YouthBks,CookBks,...,ItalCook,ItalAtlas,ItalArt,Florence,Related Purchase,Mcode,Rcode,Fcode,Yes_Florence,No_Florence
0,1,25,1,297,14,2,22,0,1,1,...,0,0,0,0,0,5,4,2,0,1
1,2,29,0,128,8,2,10,0,0,0,...,0,0,0,0,0,4,3,2,0,1
2,3,46,1,138,22,7,56,2,1,2,...,1,0,0,0,2,4,4,3,0,1
3,4,47,1,228,2,1,2,0,0,0,...,0,0,0,0,0,5,1,1,0,1
4,5,51,1,257,10,1,10,0,0,0,...,0,0,0,0,0,5,3,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3996,32950,0,141,2,2,6,0,0,0,...,0,0,0,0,0,4,1,2,0,1
3996,3997,32954,1,357,16,9,52,1,1,2,...,0,0,0,0,1,5,4,3,0,1
3997,3998,32955,0,48,12,1,12,0,0,0,...,0,0,0,0,0,2,3,1,0,1
3998,3999,32976,0,214,14,7,38,1,1,3,...,0,0,0,0,0,5,4,3,0,1


In [6]:
# Build the binary incidence matrix from the columns at positions 7-17
# (ChildBks through Florence in the displayed frame): every positive value is
# collapsed to 1.
# DataFrame.mask replaces the deprecated, element-by-element applymap call. It
# overwrites only the entries where the condition holds, so zero, negative and
# missing values pass through unchanged, exactly as the original lambda did.
count_books = book_df.iloc[:, 7:18].mask(lambda counts: counts > 0, 1)
display(count_books)
# apriori expects boolean input, so keep a bool-typed copy for the mining step.
count_books_bool = count_books.astype(bool)

Unnamed: 0,ChildBks,YouthBks,CookBks,DoItYBks,RefBks,ArtBks,GeogBks,ItalCook,ItalAtlas,ItalArt,Florence
0,0,1,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,0,1,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
3995,0,0,0,0,0,0,0,0,0,0,0
3996,1,1,1,1,1,0,1,0,0,0,0
3997,0,0,0,0,0,0,0,0,0,0,0
3998,1,1,1,1,0,0,0,0,0,0,0


In [7]:
# Mine the frequent itemsets whose support is at least 0.05, then derive the
# association rules whose confidence is at least 0.5.
itemsets = apriori(count_books_bool, min_support=0.05, use_colnames=True)
rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)
# Show the 25 highest-lift rules. The display columns are selected by name
# rather than by position: the column order of association_rules() output
# depends on the installed mlxtend version (newer releases add metric columns),
# so positional indices can silently pick a different metric.
(
    rules.sort_values(by='lift', ascending=False)
    .loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']]
    .head(25)
)

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
64,"(YouthBks, RefBks)","(ChildBks, CookBks)",0.05525,0.68,2.809917,0.035588
73,"(DoItYBks, RefBks)","(ChildBks, CookBks)",0.06125,0.662162,2.736207,0.038865
60,"(YouthBks, DoItYBks)","(ChildBks, CookBks)",0.067,0.64891,2.681448,0.042014
80,"(GeogBks, RefBks)","(ChildBks, CookBks)",0.05025,0.614679,2.539995,0.030467
69,"(YouthBks, GeogBks)","(ChildBks, CookBks)",0.06325,0.605263,2.501087,0.037961
77,"(DoItYBks, GeogBks)","(ChildBks, CookBks)",0.0605,0.59901,2.475248,0.036058
68,"(ChildBks, GeogBks, CookBks)",(YouthBks),0.06325,0.577626,2.424452,0.037162
72,"(ChildBks, CookBks, RefBks)",(DoItYBks),0.06125,0.591787,2.323013,0.034883
49,"(DoItYBks, GeogBks)",(YouthBks),0.0545,0.539604,2.264864,0.030437
63,"(ChildBks, CookBks, RefBks)",(YouthBks),0.05525,0.533816,2.240573,0.030591


# 14.2 协同过滤

In [8]:
# Setup: build a reproducible table of synthetic ratings.
random.seed(0)
nratings = 5000
# Each column is filled completely before the next one starts (userID, then
# itemID, then rating), which fixes the order of draws from the seeded
# generator. Every value is drawn uniformly from 1..upper inclusive.
randomData = pd.DataFrame({
    column: [random.randint(1, upper) for _ in range(nratings)]
    for column, upper in (('userID', 99), ('itemID', 999), ('rating', 5))
})

def get_top_n(predictions, n=10):
    """Return the n highest-estimated predictions for every user.

    Args:
        predictions: iterable of objects exposing ``uid`` (the user the
            prediction belongs to) and ``est`` (the estimated rating).
        n: maximum number of predictions kept per user.

    Returns:
        A ``defaultdict(list)`` mapping each ``uid`` to its predictions with
        the largest ``est`` values, ordered from highest to lowest. Users
        appear in the order they are first seen in ``predictions``.
    """
    grouped = defaultdict(list)
    for pred in predictions:
        grouped[pred.uid].append(pred)

    def estimated_rating(pred):
        return pred.est

    # Snapshot the keys, then replace each user's full list with its top n.
    for user_id in list(grouped):
        grouped[user_id] = heapq.nlargest(n, grouped[user_id], key=estimated_rating)

    return grouped

In [9]:
# Wrap the synthetic ratings (scale 1 to 5) in a surprise Dataset and hold out
# 25% of them as the test set.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.25, random_state=1)

# Fit a user-based collaborative filtering model using cosine similarity and
# predict the held-out ratings.
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
predictions = algo.test(testset)

# Keep the 4 highest-estimated predictions per user and print them for the
# first 5 users only.
top_n = get_top_n(predictions, n=4)
print("Top-4 recommendations for each user:")
for uid, user_ratings in list(top_n.items())[:5]:
    print(f"\nUser {uid}")
    # All of a user's items go on one line, each entry followed by a single
    # space and no trailing newline; the next header supplies the line break.
    print(
        ''.join(
            f"\tItem {prediction.iid}: {prediction.est:.2f}" + ' '
            for prediction in user_ratings
        ),
        end='',
    )
    

Computing the cosine similarity matrix...
Done computing similarity matrix.
Top-4 recommendations for each user:

User 68
	Item 208: 5.00 	Item 958: 5.00 	Item 514: 5.00 	Item 946: 4.53 
User 6
	Item 910: 4.02 	Item 717: 4.00 	Item 223: 3.99 	Item 547: 3.67 
User 46
	Item 32: 4.06 	Item 798: 3.52 	Item 27: 3.41 	Item 297: 3.03 
User 79
	Item 253: 4.00 	Item 891: 4.00 	Item 164: 3.73 	Item 168: 3.69 
User 53
	Item 769: 4.00 	Item 688: 4.00 	Item 930: 4.00 	Item 27: 3.85 