# Frequent patterns on MovieLens 25M dataset using Apriori and FP-Growth
This notebook provides code to mine frequent patterns on MovieLens 25M.  
Only good ratings (> 3.0) are kept, and the Apriori and FP-Growth algorithms (as implemented by [mlxtend](http://rasbt.github.io/mlxtend/api_subpackages/mlxtend.frequent_patterns/)) are applied to subsets of increasing size (1k, 10k, 100k, 1M, 2M, 5M, 10M, and all ~15M ratings).  
The dataset is downloaded from https://grouplens.org/datasets/movielens/, extracted and copied to the directory `/opt/spark/data`.

The association rules and the elapsed times are shown below.  
Please ignore the execution counts of the cells; they are out of order because I had to restart the notebook several times.

In [1]:
!pip install mlxtend

Collecting mlxtend
  Using cached mlxtend-0.17.2-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.17.2


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules
import time

In [2]:
df = pd.read_csv('/opt/spark/data/ratings.csv')

In [3]:
df.shape

(25000095, 4)

In [4]:
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [5]:
# Keep only "good" ratings (strictly above 3.0), drop exact duplicate rows,
# and renumber the index from 0.
df = df.loc[df['rating'] > 3.0].drop_duplicates().reset_index(drop=True)

In [6]:
df.shape

(15630129, 4)

In [7]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


## 1000 ratings

In [9]:
df_1k = df.iloc[:1000, :]

In [10]:
df_1k.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [11]:
# One row per user: gather the movieId values of that user's rows in df_1k into a list.
movie_rating = df_1k.groupby('userId')['movieId'].agg(list).reset_index()

In [12]:
movie_rating.head()

Unnamed: 0,userId,movieId
0,1,"[296, 306, 307, 665, 899, 1088, 1175, 1217, 12..."
1,2,"[1, 110, 150, 151, 236, 260, 318, 333, 349, 35..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[260, 296, 541, 589, 924, 1036, 1080, 1136, 11..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."


In [13]:
records = movie_rating.movieId.values.tolist()

In [14]:
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)

In [15]:
train_df = pd.DataFrame(te_ary, columns=te.columns_)
train_df

Unnamed: 0,1,3,10,16,17,18,19,28,29,32,...,176371,179401,179819,182715,187541,187593,189333,195159,200818,203375
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,True,True,...,True,False,True,False,True,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,True,True,True,True,True,True
4,True,False,False,False,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
7,True,True,True,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
%timeit association_rules(fpgrowth(train_df, min_support=0.5), min_threshold=0.5)

5.23 ms ± 91.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
association_rules(fpgrowth(train_df, min_support=0.5), min_threshold=0.5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(377),(356),0.5,0.5,0.5,1.0,2.0,0.25,inf
1,(356),(377),0.5,0.5,0.5,1.0,2.0,0.25,inf
2,(43),(131),0.625,0.625,0.625,1.0,1.6,0.234375,inf
3,(131),(43),0.625,0.625,0.625,1.0,1.6,0.234375,inf
4,(0),(60),0.5,0.5,0.5,1.0,2.0,0.25,inf
5,(60),(0),0.5,0.5,0.5,1.0,2.0,0.25,inf
6,(49),(86),0.75,0.625,0.5,0.666667,1.066667,0.03125,1.125
7,(86),(49),0.625,0.75,0.5,0.8,1.066667,0.03125,1.25


In [19]:
%timeit association_rules(apriori(train_df, min_support=0.5), min_threshold=0.5)

6.68 ms ± 63.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
association_rules(apriori(train_df, min_support=0.5), min_threshold=0.5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(0),(60),0.5,0.5,0.5,1.0,2.0,0.25,inf
1,(60),(0),0.5,0.5,0.5,1.0,2.0,0.25,inf
2,(43),(131),0.625,0.625,0.625,1.0,1.6,0.234375,inf
3,(131),(43),0.625,0.625,0.625,1.0,1.6,0.234375,inf
4,(49),(86),0.75,0.625,0.5,0.666667,1.066667,0.03125,1.125
5,(86),(49),0.625,0.75,0.5,0.8,1.066667,0.03125,1.25
6,(377),(356),0.5,0.5,0.5,1.0,2.0,0.25,inf
7,(356),(377),0.5,0.5,0.5,1.0,2.0,0.25,inf


## 10000 ratings

In [22]:
df_10k = df.iloc[:10000, :]

In [23]:
df_10k.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [24]:
# One row per user: gather the movieId values of that user's rows in df_10k into a list.
movie_rating = df_10k.groupby('userId')['movieId'].agg(list).reset_index()

In [25]:
movie_rating.head()

Unnamed: 0,userId,movieId
0,1,"[296, 306, 307, 665, 899, 1088, 1175, 1217, 12..."
1,2,"[1, 110, 150, 151, 236, 260, 318, 333, 349, 35..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[260, 296, 541, 589, 924, 1036, 1080, 1136, 11..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."


In [26]:
records = movie_rating.movieId.values.tolist()

In [27]:
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)

In [28]:
train_df = pd.DataFrame(te_ary, columns=te.columns_)
train_df

Unnamed: 0,1,2,3,5,6,7,10,11,14,16,...,189333,192803,194400,194448,194728,195159,195777,195921,200818,203375
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,True,False,False,True,True
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
116,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
117,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
118,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [34]:
%timeit association_rules(fpgrowth(train_df, min_support=0.25), min_threshold=0.5)

12.9 ms ± 125 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
association_rules(fpgrowth(train_df, min_support=0.25), min_threshold=0.5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(120),(150),0.425,0.425,0.258333,0.607843,1.430219,0.077708,1.46625
1,(150),(120),0.425,0.425,0.258333,0.607843,1.430219,0.077708,1.46625
2,(120),(967),0.425,0.383333,0.266667,0.627451,1.636829,0.10375,1.655263
3,(967),(120),0.383333,0.425,0.266667,0.695652,1.636829,0.10375,1.889286
4,(150),(239),0.425,0.366667,0.25,0.588235,1.604278,0.094167,1.538095
5,(239),(150),0.366667,0.425,0.25,0.681818,1.604278,0.094167,1.807143
6,(120),(471),0.425,0.333333,0.308333,0.72549,2.176471,0.166667,2.428571
7,(471),(120),0.333333,0.425,0.308333,0.925,2.176471,0.166667,7.666667
8,(120),(484),0.425,0.308333,0.291667,0.686275,2.225755,0.160625,2.204688
9,(484),(120),0.308333,0.425,0.291667,0.945946,2.225755,0.160625,10.6375


In [35]:
%timeit association_rules(apriori(train_df, min_support=0.25), min_threshold=0.5)

11.3 ms ± 123 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [36]:
association_rules(apriori(train_df, min_support=0.25), min_threshold=0.5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(120),(150),0.425,0.425,0.258333,0.607843,1.430219,0.077708,1.46625
1,(150),(120),0.425,0.425,0.258333,0.607843,1.430219,0.077708,1.46625
2,(120),(471),0.425,0.333333,0.308333,0.72549,2.176471,0.166667,2.428571
3,(471),(120),0.333333,0.425,0.308333,0.925,2.176471,0.166667,7.666667
4,(120),(484),0.425,0.308333,0.291667,0.686275,2.225755,0.160625,2.204688
5,(484),(120),0.308333,0.425,0.291667,0.945946,2.225755,0.160625,10.6375
6,(120),(967),0.425,0.383333,0.266667,0.627451,1.636829,0.10375,1.655263
7,(967),(120),0.383333,0.425,0.266667,0.695652,1.636829,0.10375,1.889286
8,(138),(266),0.408333,0.391667,0.258333,0.632653,1.615284,0.098403,1.656019
9,(266),(138),0.391667,0.408333,0.258333,0.659574,1.615284,0.098403,1.738021


## 100,000 ratings

In [37]:
n_ratings = 10 ** 5  # subset size for this section: the first 100,000 rows of df

In [38]:
df_100k = df.iloc[:n_ratings, :]

In [39]:
df_100k.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [40]:
# One row per user: gather the movieId values of that user's rows in df_100k into a list.
movie_rating = df_100k.groupby('userId')['movieId'].agg(list).reset_index()

In [41]:
movie_rating.head()

Unnamed: 0,userId,movieId
0,1,"[296, 306, 307, 665, 899, 1088, 1175, 1217, 12..."
1,2,"[1, 110, 150, 151, 236, 260, 318, 333, 349, 35..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[260, 296, 541, 589, 924, 1036, 1080, 1136, 11..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."


In [42]:
records = movie_rating.movieId.values.tolist()

In [43]:
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)

In [44]:
train_df = pd.DataFrame(te_ary, columns=te.columns_)
train_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,204704,204780,205072,205106,205156,205383,205499,205557,207309,208002
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1124,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1125,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1126,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [50]:
%timeit association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)

360 ms ± 5.99 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [51]:
association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(3705),(3274),0.232270,0.261525,0.206560,0.889313,3.400492,0.145816,6.671741
1,"(3705, 242)",(3274),0.120567,0.261525,0.112589,0.933824,3.570688,0.081057,11.159180
2,"(3274, 242)",(3705),0.139184,0.232270,0.112589,0.808917,3.482666,0.080260,4.017790
3,"(3705, 242)",(1799),0.120567,0.377660,0.103723,0.860294,2.277962,0.058190,4.454647
4,"(3705, 1799)",(3274),0.177305,0.261525,0.164894,0.930000,3.556068,0.118524,10.549645
...,...,...,...,...,...,...,...,...,...
504,"(48, 484)",(474),0.135638,0.369681,0.113475,0.836601,2.263037,0.063332,3.857553
505,"(48, 484)",(242),0.135638,0.392730,0.113475,0.836601,2.130217,0.060206,3.716489
506,"(1194, 474)",(262),0.128546,0.429965,0.111702,0.868966,2.021017,0.056432,4.350271
507,"(296, 1194)",(262),0.130319,0.429965,0.105496,0.809524,1.882769,0.049464,2.992686


In [52]:
%timeit association_rules(apriori(train_df, min_support=0.1), min_threshold=0.8)

352 ms ± 4.48 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
association_rules(apriori(train_df, min_support=0.1), min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(844),(214),0.285461,0.323582,0.233156,0.816770,2.524156,0.140786,3.691640
1,(857),(214),0.255319,0.323582,0.211879,0.829861,2.564612,0.129263,3.975684
2,(786),(242),0.164894,0.392730,0.141844,0.860215,2.190344,0.077085,4.344313
3,(868),(622),0.166667,0.262411,0.149823,0.898936,3.425676,0.106087,7.298246
4,(936),(846),0.179965,0.263298,0.153369,0.852217,3.236702,0.105985,4.985018
...,...,...,...,...,...,...,...,...,...
504,"(4266, 2078, 1799)","(3705, 3274)",0.117021,0.206560,0.101950,0.871212,4.217714,0.077778,6.160826
505,"(3705, 2078, 1799)","(3274, 4266)",0.118794,0.199468,0.101950,0.858209,4.302488,0.078255,5.645857
506,"(3274, 4266, 2078)","(3705, 1799)",0.120567,0.177305,0.101950,0.845588,4.769118,0.080573,5.327930
507,"(3705, 4266, 2078)","(3274, 1799)",0.122340,0.196809,0.101950,0.833333,4.234234,0.077873,4.819149


## 1M ratings

In [54]:
n_ratings = 10 ** 6  # subset size for this section: the first 1,000,000 rows of df

In [55]:
df_1M = df.iloc[:n_ratings, :]

In [56]:
df_1M.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [57]:
# One row per user: gather the movieId values of that user's rows in df_1M into a list.
movie_rating = df_1M.groupby('userId')['movieId'].agg(list).reset_index()

In [58]:
movie_rating.head()

Unnamed: 0,userId,movieId
0,1,"[296, 306, 307, 665, 899, 1088, 1175, 1217, 12..."
1,2,"[1, 110, 150, 151, 236, 260, 318, 333, 349, 35..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[260, 296, 541, 589, 924, 1036, 1080, 1136, 11..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."


In [59]:
records = movie_rating.movieId.values.tolist()

In [60]:
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)

In [61]:
train_df = pd.DataFrame(te_ary, columns=te.columns_)
train_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,207367,207405,207642,207830,208002,208112,208737,208793,208939,209163
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10636,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10637,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10638,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10639,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [62]:
%timeit association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)

3.98 s ± 41.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [64]:
%timeit association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.5)

3.92 s ± 40.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [63]:
association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(5242),(4470),0.259280,0.282492,0.232591,0.897064,3.175535,0.159346,6.970436
1,(4470),(5242),0.282492,0.259280,0.232591,0.823353,3.175535,0.159346,4.193228
2,(5242),(6231),0.259280,0.260220,0.223287,0.861182,3.309438,0.155817,5.329121
3,(6231),(5242),0.260220,0.259280,0.223287,0.858072,3.309438,0.155817,5.218965
4,"(5242, 287)",(4470),0.142844,0.282492,0.131379,0.919737,3.255795,0.091026,8.939441
...,...,...,...,...,...,...,...,...,...
928,"(1096, 463)",(253),0.122169,0.343859,0.112489,0.920769,2.677755,0.070481,8.281396
929,"(253, 463)",(1096),0.134198,0.292360,0.112489,0.838235,2.867137,0.073255,4.374503
930,"(1096, 463)",(1109),0.122169,0.266422,0.106287,0.870000,3.265492,0.073738,5.642905
931,"(1109, 463)",(1096),0.118692,0.292360,0.106287,0.895487,3.062963,0.071586,6.770831


In [65]:
%timeit association_rules(apriori(train_df, min_support=0.1), min_threshold=0.8)

14.5 s ± 840 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [66]:
association_rules(apriori(train_df, min_support=0.1), min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1096),(253),0.292360,0.343859,0.250540,0.856959,2.492184,0.150010,4.587091
1,(1109),(253),0.266422,0.343859,0.226107,0.848677,2.468099,0.134495,4.336039
2,(1006),(287),0.173198,0.407856,0.149610,0.863809,2.117924,0.078970,4.347891
3,(1120),(793),0.179964,0.279485,0.165962,0.922193,3.299616,0.115665,9.260310
4,(1109),(1096),0.266422,0.292360,0.213890,0.802822,2.746007,0.135999,3.588837
...,...,...,...,...,...,...,...,...,...
928,"(2664, 2305, 4470)","(5242, 6231)",0.135608,0.223287,0.111456,0.821899,3.680903,0.081176,4.361076
929,"(2664, 2305, 6231)","(5242, 4470)",0.128183,0.232591,0.111456,0.869501,3.738329,0.081641,5.880595
930,"(2664, 2305, 5242)","(4470, 6231)",0.127338,0.228644,0.111456,0.875277,3.828122,0.082341,6.184541
931,"(2664, 4470, 6231)","(2305, 5242)",0.138051,0.188422,0.111456,0.807352,4.284804,0.085444,4.212749


## 2M ratings

In [76]:
n_ratings = 2 * 10 ** 6  # subset size for this section: the first 2,000,000 rows of df

In [77]:
df_2M = df.iloc[:n_ratings, :]

In [78]:
df_2M.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [79]:
# One row per user: gather the movieId values of that user's rows in df_2M into a list.
movie_rating = df_2M.groupby('userId')['movieId'].agg(list).reset_index()

In [80]:
movie_rating.head()

Unnamed: 0,userId,movieId
0,1,"[296, 306, 307, 665, 899, 1088, 1175, 1217, 12..."
1,2,"[1, 110, 150, 151, 236, 260, 318, 333, 349, 35..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[260, 296, 541, 589, 924, 1036, 1080, 1136, 11..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."


In [81]:
records = movie_rating.movieId.values.tolist()

In [82]:
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)

In [83]:
train_df = pd.DataFrame(te_ary, columns=te.columns_)
train_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,208737,208793,208795,208800,208939,209049,209053,209055,209103,209163
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21017,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
21018,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
21019,True,True,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
21020,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [84]:
%timeit association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)

8.89 s ± 418 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [85]:
association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(5499),(4644),0.256874,0.280040,0.230283,0.896481,3.201263,0.158348,6.954891
1,(4644),(5499),0.280040,0.256874,0.230283,0.822320,3.201263,0.158348,4.182394
2,(5499),(6572),0.256874,0.257635,0.220103,0.856852,3.325838,0.153923,5.185992
3,(6572),(5499),0.257635,0.256874,0.220103,0.854321,3.325838,0.153923,5.101105
4,"(288, 5499)",(4644),0.143088,0.280040,0.132052,0.922872,3.295502,0.091982,9.334654
...,...,...,...,...,...,...,...,...,...
957,"(467, 254)",(1127),0.134764,0.291409,0.113120,0.839393,2.880463,0.073848,4.411952
958,"(467, 1127)",(254),0.123775,0.343878,0.113120,0.913912,2.657666,0.070556,7.621562
959,"(467, 1140)",(254),0.118162,0.343878,0.106222,0.898953,2.614165,0.065589,6.493257
960,"(467, 1140)",(1127),0.118162,0.291409,0.106127,0.898148,3.082088,0.071693,6.957075


In [86]:
%timeit association_rules(apriori(train_df, min_support=0.1), min_threshold=0.8)

35.7 s ± 1.12 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
# NOTE(review): the saved output of this cell (execution count 53) is identical to the
# apriori result shown in the 100,000-ratings section, so it appears to be stale;
# re-run this cell on the 2M subset to refresh it.
association_rules(apriori(train_df, min_support=0.1), min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(844),(214),0.285461,0.323582,0.233156,0.816770,2.524156,0.140786,3.691640
1,(857),(214),0.255319,0.323582,0.211879,0.829861,2.564612,0.129263,3.975684
2,(786),(242),0.164894,0.392730,0.141844,0.860215,2.190344,0.077085,4.344313
3,(868),(622),0.166667,0.262411,0.149823,0.898936,3.425676,0.106087,7.298246
4,(936),(846),0.179965,0.263298,0.153369,0.852217,3.236702,0.105985,4.985018
...,...,...,...,...,...,...,...,...,...
504,"(4266, 2078, 1799)","(3705, 3274)",0.117021,0.206560,0.101950,0.871212,4.217714,0.077778,6.160826
505,"(3705, 2078, 1799)","(3274, 4266)",0.118794,0.199468,0.101950,0.858209,4.302488,0.078255,5.645857
506,"(3274, 4266, 2078)","(3705, 1799)",0.120567,0.177305,0.101950,0.845588,4.769118,0.080573,5.327930
507,"(3705, 4266, 2078)","(3274, 1799)",0.122340,0.196809,0.101950,0.833333,4.234234,0.077873,4.819149


## 5M ratings

In [8]:
n_ratings = 5 * 10 ** 6  # subset size for this section: the first 5,000,000 rows of df

In [9]:
df_5M = df.iloc[:n_ratings, :]

In [10]:
# One row per user: gather the movieId values of that user's rows in df_5M into a list.
movie_rating = df_5M.groupby('userId')['movieId'].agg(list).reset_index()

In [11]:
movie_rating.head()

Unnamed: 0,userId,movieId
0,1,"[296, 306, 307, 665, 899, 1088, 1175, 1217, 12..."
1,2,"[1, 110, 150, 151, 236, 260, 318, 333, 349, 35..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[260, 296, 541, 589, 924, 1036, 1080, 1136, 11..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."


In [12]:
records = movie_rating.movieId.values.tolist()

In [13]:
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)

In [14]:
train_df = pd.DataFrame(te_ary, columns=te.columns_)
train_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,208813,208915,208939,208955,209049,209051,209053,209055,209103,209163
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51947,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
51948,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
51949,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
51950,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [94]:
%timeit association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)

22.6 s ± 904 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [51]:
# NOTE(review): the saved output of this cell (execution count 51) is identical to the
# fpgrowth result shown in the 100,000-ratings section, so it appears to be stale;
# re-run this cell on the 5M subset to refresh it.
association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(3705),(3274),0.232270,0.261525,0.206560,0.889313,3.400492,0.145816,6.671741
1,"(3705, 242)",(3274),0.120567,0.261525,0.112589,0.933824,3.570688,0.081057,11.159180
2,"(3274, 242)",(3705),0.139184,0.232270,0.112589,0.808917,3.482666,0.080260,4.017790
3,"(3705, 242)",(1799),0.120567,0.377660,0.103723,0.860294,2.277962,0.058190,4.454647
4,"(3705, 1799)",(3274),0.177305,0.261525,0.164894,0.930000,3.556068,0.118524,10.549645
...,...,...,...,...,...,...,...,...,...
504,"(48, 484)",(474),0.135638,0.369681,0.113475,0.836601,2.263037,0.063332,3.857553
505,"(48, 484)",(242),0.135638,0.392730,0.113475,0.836601,2.130217,0.060206,3.716489
506,"(1194, 474)",(262),0.128546,0.429965,0.111702,0.868966,2.021017,0.056432,4.350271
507,"(296, 1194)",(262),0.130319,0.429965,0.105496,0.809524,1.882769,0.049464,2.992686


``` 
%timeit association_rules(apriori(train_df, min_support=0.1), min_threshold=0.8)
```
The program crashed with an out-of-memory error.

## 10M ratings

In [9]:
n_ratings = 10 ** 7  # subset size for this section: the first 10,000,000 rows of df

In [10]:
df_10M = df.iloc[:n_ratings, :]

In [11]:
df_10M.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [12]:
# One row per user: gather the movieId values of that user's rows in df_10M into a list.
movie_rating = df_10M.groupby('userId')['movieId'].agg(list).reset_index()

In [13]:
movie_rating.head()

Unnamed: 0,userId,movieId
0,1,"[296, 306, 307, 665, 899, 1088, 1175, 1217, 12..."
1,2,"[1, 110, 150, 151, 236, 260, 318, 333, 349, 35..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[260, 296, 541, 589, 924, 1036, 1080, 1136, 11..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."


In [14]:
records = movie_rating.movieId.values.tolist()

In [15]:
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)

In [16]:
train_df = pd.DataFrame(te_ary, columns=te.columns_)
train_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,209055,209063,209073,209075,209085,209103,209119,209121,209147,209163
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103581,False,False,False,False,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
103582,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
103583,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
103584,True,False,True,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
%timeit association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)

44.8 s ± 722 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 15M ratings

In [8]:
# One row per user: gather the movieId values of that user's rows in the full filtered df into a list.
movie_rating = df.groupby('userId')['movieId'].agg(list).reset_index()

In [9]:
movie_rating.head()

Unnamed: 0,userId,movieId
0,1,"[296, 306, 307, 665, 899, 1088, 1175, 1217, 12..."
1,2,"[1, 110, 150, 151, 236, 260, 318, 333, 349, 35..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[260, 296, 541, 589, 924, 1036, 1080, 1136, 11..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."


In [10]:
records = movie_rating.movieId.values.tolist()

In [11]:
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)

In [12]:
train_df = pd.DataFrame(te_ary, columns=te.columns_)
train_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,209121,209123,209129,209131,209133,209135,209147,209151,209155,209163
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162409,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
162410,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
162411,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
162412,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


```
%timeit association_rules(fpgrowth(train_df, min_support=0.1), min_threshold=0.8)
```
The program crashed with an out-of-memory error.