In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

### 电影题材关联
- 数据集：[MovieLens (small)](http://files.grouplens.org/datasets/movielens/ml-latest-small.zip)

In [2]:
# Load the MovieLens (small) movie catalogue from the local data directory.
movies = pd.read_csv(filepath_or_buffer='../data/ml-latest-small/movies.csv')

In [3]:
# Preview the first ten rows of the raw catalogue.
movies.head(n=10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


#### 1、将数据集转换为one-hot格式

In [5]:
# One-hot encode the pipe-delimited genre strings: one 0/1 column per genre.
genres = movies['genres'].str.get_dummies(sep='|')

In [6]:
# Swap the raw `genres` text column for the one-hot genre columns.
# `columns=` is used instead of a positional axis (`drop('genres', 1)`):
# the positional form was deprecated in pandas 1.3 and removed in pandas 2.0.
movies_solved = movies.drop(columns='genres').join(genres)

In [7]:
# Preview the first ten rows of the one-hot encoded frame.
movies_solved.head(n=10)

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,Heat (1995),0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
6,7,Sabrina (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7,8,Tom and Huck (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,Sudden Death (1995),0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
# Show the (rows, columns) dimensions of the encoded frame.
movies_solved.shape

(9742, 22)

#### 2、重新设置索引

In [9]:
# Move the identifying columns into the index so that only the 0/1 genre
# indicator columns remain as data columns.
index_columns = ['movieId', 'title']
movies_solved = movies_solved.set_index(index_columns)

In [10]:
# Preview the first five rows after re-indexing.
movies_solved.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### 3、获取频繁项集

In [11]:
# Mine genre combinations that occur in at least 2.5% of the movies.
# The indicator columns hold 0/1 integers; cast them to bool because mlxtend
# expects boolean input and recent releases emit a DeprecationWarning for
# other dtypes. The mined itemsets are the same for 0/1 data.
# use_colnames=True reports itemsets by genre name rather than column index.
frequent_itemsets = apriori(
    movies_solved.astype(bool),
    min_support=0.025,
    use_colnames=True,
)

##### apriori
- use_colnames: bool，True 表示用列名表示项集；False（默认）表示用列索引表示项集
- min_support: 最小支持度，在数据量较大时需慎重选择，否则可能导致筛选出的频繁项集为空

In [12]:
# Display the mined frequent itemsets together with their support values.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.187641,(Action)
1,0.129645,(Adventure)
2,0.062718,(Animation)
3,0.068158,(Children)
4,0.385547,(Comedy)
5,0.123075,(Crime)
6,0.045165,(Documentary)
7,0.447649,(Drama)
8,0.079963,(Fantasy)
9,0.10039,(Horror)


#### 3、计算关联规则
- metric: 一般可指定lift(提升度), confidence(置信度), support(支持度)
- min_threshold: 最小阈值，根据数据量需谨慎选择

In [13]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose lift is at least 1.25.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1.25,
    metric='lift',
)

In [14]:
# Display the generated rules and their metrics (support, confidence, lift, ...).
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Adventure),(Action),0.129645,0.187641,0.062615,0.482977,2.57394,0.038289,1.571224
1,(Action),(Adventure),0.187641,0.129645,0.062615,0.333698,2.57394,0.038289,1.306247
2,(Crime),(Action),0.123075,0.187641,0.042907,0.348624,1.857929,0.019813,1.247142
3,(Action),(Crime),0.187641,0.123075,0.042907,0.228665,1.857929,0.019813,1.136892
4,(Action),(Sci-Fi),0.187641,0.100595,0.046294,0.246718,2.452576,0.027419,1.193981
5,(Sci-Fi),(Action),0.100595,0.187641,0.046294,0.460204,2.452576,0.027419,1.504937
6,(Thriller),(Action),0.194416,0.187641,0.067235,0.345829,1.843034,0.030754,1.241814
7,(Action),(Thriller),0.187641,0.194416,0.067235,0.358315,1.843034,0.030754,1.25542
8,(Animation),(Adventure),0.062718,0.129645,0.025354,0.404255,3.118175,0.017223,1.460953
9,(Adventure),(Animation),0.129645,0.062718,0.025354,0.195566,3.118175,0.017223,1.165145


#### 4、再根据其他条件筛选出合适的数据
- 如: 获取提升度大于5的关联规则

In [15]:
# Keep only the strongly associated rules (lift greater than 5).
rules.loc[rules['lift'] > 5]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
16,(Animation),(Children),0.062718,0.068158,0.031,0.494272,7.251799,0.026725,1.842573
17,(Children),(Animation),0.068158,0.062718,0.031,0.454819,7.251799,0.026725,1.719213


##### Children(儿童题材)和Animation(动画题材)这两种题材的关联性最强，这一点根据常识也可以判断出来

#### 5、查看数据集中同时包含以上两种题材的电影信息

In [17]:
# Select the movies whose genre string mentions both Children and Animation.
movies.loc[
    movies['genres'].str.contains('Children')
    & movies['genres'].str.contains('Animation')
]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
12,13,Balto (1995),Adventure|Animation|Children
44,48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
205,239,"Goofy Movie, A (1995)",Animation|Children|Comedy|Romance
272,313,"Swan Princess, The (1994)",Animation|Children
...,...,...,...
9629,178827,Paddington 2 (2017),Adventure|Animation|Children|Comedy
9657,180987,Ferdinand (2017),Animation|Children|Comedy
9664,182293,Hare-um Scare-um (1939),Animation|Children|Comedy
9666,182299,Porky's Hare Hunt (1938),Animation|Children|Comedy
