# Frequent patterns on MovieLens 25M dataset using FP-Growth
This notebook provides code to mine frequent patterns on MovieLens 25M.  
I select only good ratings (>3.0) and apply the FP-Growth algorithm (as implemented in [PySpark](https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html#fp-growth)) to subsets of different sizes (1k, 10k, 100k, 1M, 2M, 5M, 10M, and all ~15M ratings), with sorting.  
The dataset is downloaded from https://grouplens.org/datasets/movielens/, extracted and copied to the directory `/opt/spark/data`.

The association rules and the elapsed times are shown below.  
Please ignore the execution counts of the cells (they are out of order) because I had to restart the notebook several times.

In [1]:
import pyspark
import os
import socket

In [2]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import functions as F
import time

In [3]:
# Python executable PySpark should use.
os.environ.update({'PYSPARK_PYTHON': 'python3'})
# Resolve this machine's hostname to an IP address; the configuration cell
# passes it to Spark as `spark.driver.host`.
local_hostname = socket.gethostname()
driver_host = socket.gethostbyname(local_hostname)

In [134]:
# Spark configuration targeting the in-cluster Kubernetes API server.
conf = pyspark.SparkConf()
conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")

# Remaining settings, applied in list order. setAll() returns the SparkConf,
# so the value displayed by this cell is the conf object itself.
conf.setAll([
    ("spark.kubernetes.container.image", "gcr.io/spark-operator/spark-py:v2.4.5"),
    ("spark.kubernetes.authenticate.caCertFile", "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"),
    ("spark.kubernetes.authenticate.oauthTokenFile", "/var/run/secrets/kubernetes.io/serviceaccount/token"),
    ("spark.kubernetes.authenticate.driver.serviceAccountName", "spark"),
    ("spark.executor.instances", "2"),
    ("spark.executor.memory", "2g"),
    ("spark.kubernetes.pyspark.pythonVersion", "3"),
    # Address and port the executors use to reach this notebook's driver.
    ("spark.driver.host", driver_host),
    ("spark.driver.port", "29413"),
    ("spark.memory.offHeap.enabled", "false"),
    ("spark.driver.memory", "2g"),
])

<pyspark.conf.SparkConf at 0x7fad9a88d9d0>

In [135]:
# Build the session from `conf`, or reuse one that already exists.
session_builder = pyspark.sql.SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [136]:
# Read the ratings CSV: comma-separated, first line is the header,
# column types are inferred from the data.
ratings_reader = spark.read.format('csv').options(sep=',', inferSchema=True, header=True)
df = ratings_reader.load('/opt/spark/data/ratings.csv')

In [8]:
# Preview the first 10 rows of the loaded ratings.
df.show(10)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
+------+-------+------+----------+
only showing top 10 rows



In [9]:
# Row count of `df` at this point in the notebook.
df.count()

25000095

In [137]:
# Keep only ratings strictly above 3.0, then drop fully identical rows.
df = df.where(F.col('rating') > 3.0).dropDuplicates()

In [71]:
# Order the rows by userId, then movieId.
# Skip for more than 10M
df = df.orderBy('userId', 'movieId')

In [138]:
# Row count of `df` as it stands when this cell runs.
df.count()

15630129

## 1k ratings

In [13]:
# Limit `df` to 1,000 rows for this run.
training_df = df.limit(1000)

In [14]:
# Preview the first 10 rows of the sample.
training_df.show(10)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
+------+-------+------+----------+
only showing top 10 rows



In [15]:
# One row per userId holding the set of distinct movieId values for that user.
movie_set_col = F.collect_set('movieId').alias('movieIds')
movies_rating = training_df.groupBy('userId').agg(movie_set_col)

In [16]:
# Preview up to 10 of the per-user movie sets.
movies_rating.show(10)

+------+--------------------+
|userId|            movieIds|
+------+--------------------+
|     1|[5147, 1250, 306,...|
|     2|[356, 2028, 6311,...|
|     3|[356, 1222, 10484...|
|     4|[70286, 2028, 916...|
|     5|[356, 1120, 104, ...|
|     6|[858, 2396, 902, ...|
|     7|[306, 307, 17, 30...|
|     8|[1220, 356, 589, ...|
+------+--------------------+



In [17]:
# Number of rows in `movies_rating` (one per userId in the sample).
movies_rating.count()

8

In [18]:
# FP-Growth estimator reading item sets from the `movieIds` column.
fpGrowth = FPGrowth(
    itemsCol="movieIds",
    minSupport=0.5,     # minimum support threshold for a frequent itemset
    minConfidence=0.5,  # minimum confidence threshold for an association rule
)

In [19]:
%%timeit
model = fpGrowth.fit(movies_rating)

21.4 s ± 446 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
# Time one fit and keep the fitted model for the cells below.
# perf_counter() is the clock intended for measuring elapsed time; unlike
# time.time() it is not affected by wall-clock adjustments.
start = time.perf_counter()
model = fpGrowth.fit(movies_rating)
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

21.20 s


In [21]:
# Time how long counting the association rules takes (the count itself is
# discarded). perf_counter() is used because it is not affected by
# wall-clock adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.count()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

0.48 s


In [22]:
# Display generated association rules and time the call.
# perf_counter() is used because it is not affected by wall-clock
# adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.show()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

+----------+----------+------------------+------------------+
|antecedent|consequent|        confidence|              lift|
+----------+----------+------------------+------------------+
|    [6539]|    [5952]|               1.0|               2.0|
|     [593]|     [296]|               0.8|1.0666666666666667|
|    [1196]|     [260]|               1.0|               1.6|
|     [356]|       [1]|               1.0|               2.0|
|     [260]|    [1196]|               1.0|               1.6|
|     [296]|     [593]|0.6666666666666666|1.0666666666666667|
|       [1]|     [356]|               1.0|               2.0|
|    [5952]|    [6539]|               1.0|               2.0|
+----------+----------+------------------+------------------+

0.12 s


## 10k ratings

In [23]:
# Limit `df` to 10,000 rows for this run.
training_df = df.limit(10000)

In [24]:
# Preview the first 10 rows of the sample.
training_df.show(10)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
+------+-------+------+----------+
only showing top 10 rows



In [25]:
# One row per userId holding the set of distinct movieId values for that user.
movie_set_col = F.collect_set('movieId').alias('movieIds')
movies_rating = training_df.groupBy('userId').agg(movie_set_col)

In [26]:
# Number of rows in `movies_rating` (one per userId in the sample).
movies_rating.count()

120

In [27]:
# FP-Growth estimator reading item sets from the `movieIds` column.
fpGrowth = FPGrowth(
    itemsCol="movieIds",
    minSupport=0.25,    # minimum support threshold for a frequent itemset
    minConfidence=0.5,  # minimum confidence threshold for an association rule
)

In [28]:
%%timeit
# Benchmark the fit; the result is not stored here (the next cell fits
# again and keeps the model).
fpGrowth.fit(movies_rating)

22 s ± 170 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# Time one fit and keep the fitted model for the cells below.
# perf_counter() is the clock intended for measuring elapsed time; unlike
# time.time() it is not affected by wall-clock adjustments.
start = time.perf_counter()
model = fpGrowth.fit(movies_rating)
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

21.83 s


In [30]:
# Time how long counting the association rules takes (the count itself is
# discarded). perf_counter() is used because it is not affected by
# wall-clock adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.count()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

0.50 s


In [31]:
# Display generated association rules and time the call.
# perf_counter() is used because it is not affected by wall-clock
# adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.show()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

+------------+----------+------------------+------------------+
|  antecedent|consequent|        confidence|              lift|
+------------+----------+------------------+------------------+
|       [527]|     [318]|0.6818181818181818|1.6042780748663101|
|       [593]|     [296]|0.6595744680851063| 1.615284411636995|
|       [593]|     [318]|0.6595744680851063|1.5519399249061325|
|      [2571]|     [260]|0.6956521739130435|1.6368286445012787|
| [1210, 260]|    [1196]|0.8857142857142857| 2.657142857142857|
|[1210, 1196]|     [260]|           0.96875|2.2794117647058822|
|      [1196]|    [1210]|               0.8|2.5945945945945947|
|      [1196]|     [260]|             0.925|2.1764705882352944|
|      [1210]|    [1196]|0.8648648648648649|2.5945945945945947|
|      [1210]|     [260]|0.9459459459459459|2.2257551669316378|
|       [260]|    [1210]|0.6862745098039216|2.2257551669316373|
|       [260]|    [2571]|0.6274509803921569|1.6368286445012787|
|       [260]|    [1196]|0.7254901960784

In [32]:
# Number of association rules produced by the fitted model.
model.associationRules.count()

19

## 100k ratings

In [33]:
# Limit `df` to 100,000 rows for this run.
training_df = df.limit(100000)

In [34]:
# One row per userId holding the set of distinct movieId values for that user.
movie_set_col = F.collect_set('movieId').alias('movieIds')
movies_rating = training_df.groupBy('userId').agg(movie_set_col)

In [35]:
# Number of rows in `movies_rating` (one per userId in the sample).
movies_rating.count()

1128

In [36]:
# FP-Growth estimator reading item sets from the `movieIds` column.
fpGrowth = FPGrowth(
    itemsCol="movieIds",
    minSupport=0.1,     # minimum support threshold for a frequent itemset
    minConfidence=0.8,  # minimum confidence threshold for an association rule
)

In [37]:
%%timeit
# Benchmark the fit; the result is not stored here (the next cell fits
# again and keeps the model).
fpGrowth.fit(movies_rating)

24.9 s ± 316 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
# Time one fit and keep the fitted model for the cells below.
# perf_counter() is the clock intended for measuring elapsed time; unlike
# time.time() it is not affected by wall-clock adjustments.
start = time.perf_counter()
model = fpGrowth.fit(movies_rating)
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

25.26 s


In [39]:
# Time how long counting the association rules takes (the count itself is
# discarded). perf_counter() is used because it is not affected by
# wall-clock adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.count()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

2.79 s


In [40]:
# Display generated association rules and time the call.
# perf_counter() is used because it is not affected by wall-clock
# adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.show()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

+------------------+----------+------------------+------------------+
|        antecedent|consequent|        confidence|              lift|
+------------------+----------+------------------+------------------+
|       [1270, 589]|    [2571]|0.8308823529411765| 2.200082850041425|
|[1210, 4993, 1196]|    [5952]|0.8625954198473282|3.7137695938465125|
|[1210, 4993, 1196]|     [260]|0.9007633587786259|2.7837289553487397|
|       [541, 1196]|     [260]|0.8778625954198473|2.7129561852975006|
| [1196, 2571, 296]|     [260]|0.8880597014925373|2.7444694336536495|
|   [260, 296, 318]|     [593]|0.8085106382978723|  2.18705035971223|
|   [260, 296, 318]|    [1196]|0.8723404255319149|3.0559006211180124|
|   [527, 593, 356]|     [318]|0.8260869565217391| 1.921290900941282|
|  [260, 2571, 318]|    [1196]|0.8819444444444444| 3.089544513457557|
|[7153, 4993, 2959]|    [2571]|0.8823529411764706| 2.336371168185584|
|[7153, 4993, 2959]|    [5952]|0.9411764705882353|  4.05208801077683|
| [1198, 1196, 356]|

In [41]:
# Number of association rules produced by the fitted model.
model.associationRules.count()

436

## 1M ratings

In [42]:
# Limit `df` to 1,000,000 rows for this run.
training_df = df.limit(1000000)

In [43]:
# One row per userId holding the set of distinct movieId values for that user.
movie_set_col = F.collect_set('movieId').alias('movieIds')
movies_rating = training_df.groupBy('userId').agg(movie_set_col)

In [44]:
# Number of rows in `movies_rating` (one per userId in the sample).
movies_rating.count()

10641

In [45]:
# FP-Growth estimator reading item sets from the `movieIds` column.
fpGrowth = FPGrowth(
    itemsCol="movieIds",
    minSupport=0.1,     # minimum support threshold for a frequent itemset
    minConfidence=0.8,  # minimum confidence threshold for an association rule
)

In [46]:
# Time one fit and keep the fitted model for the cells below.
# perf_counter() is the clock intended for measuring elapsed time; unlike
# time.time() it is not affected by wall-clock adjustments.
start = time.perf_counter()
model = fpGrowth.fit(movies_rating)
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

27.23 s


In [47]:
# Time how long counting the association rules takes (the count itself is
# discarded). perf_counter() is used because it is not affected by
# wall-clock adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.count()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

8.87 s


In [48]:
# Display generated association rules and time the call.
# perf_counter() is used because it is not affected by wall-clock
# adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.show()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

+--------------------+----------+------------------+------------------+
|          antecedent|consequent|        confidence|              lift|
+--------------------+----------+------------------+------------------+
|  [1210, 4993, 1196]|     [260]|0.9416224412433661|2.7383996712956162|
|  [1210, 4993, 1196]|    [2571]|0.8498862774829417|  2.25865131835564|
|  [1210, 4993, 1196]|    [5952]|0.8946171341925702| 3.450388156920311|
|  [1210, 4993, 1196]|    [7153]|0.8605003790750568|3.3068199832927703|
|   [1196, 2571, 296]|     [260]|0.8978374347501864| 2.611065357523021|
|     [260, 296, 318]|    [1196]|0.8198263386396527|2.8041697426758416|
|        [3578, 1196]|     [260]|0.8902243589743589|2.5889252265225893|
|        [3578, 1196]|    [2571]|0.8653846153846154|  2.29983958349343|
|        [2762, 2959]|    [2571]|0.8323024054982818| 2.211920553673131|
|           [47, 110]|     [296]|0.8140043763676149|1.9958111909971867|
|    [260, 2571, 318]|    [1196]|0.8641638225255973|2.9558236051

In [49]:
# Number of association rules produced by the fitted model.
model.associationRules.count()

769

## 2M ratings

In [50]:
# Limit `df` to 2,000,000 rows for this run.
training_df = df.limit(2000000)

In [51]:
# One row per userId holding the set of distinct movieId values for that user.
movie_set_col = F.collect_set('movieId').alias('movieIds')
movies_rating = training_df.groupBy('userId').agg(movie_set_col)

In [52]:
# Number of rows in `movies_rating` (one per userId in the sample).
movies_rating.count()

21022

In [53]:
# FP-Growth estimator reading item sets from the `movieIds` column.
fpGrowth = FPGrowth(
    itemsCol="movieIds",
    minSupport=0.1,     # minimum support threshold for a frequent itemset
    minConfidence=0.8,  # minimum confidence threshold for an association rule
)

In [66]:
%%timeit
# Benchmark the fit; the result is not stored here (a separate cell fits
# again and keeps the model).
fpGrowth.fit(movies_rating)

29.5 s ± 439 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [54]:
# Time one fit and keep the fitted model for the cells below.
# perf_counter() is the clock intended for measuring elapsed time; unlike
# time.time() it is not affected by wall-clock adjustments.
start = time.perf_counter()
model = fpGrowth.fit(movies_rating)
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

30.81 s


In [55]:
# Time how long counting the association rules takes (the count itself is
# discarded). perf_counter() is used because it is not affected by
# wall-clock adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.count()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

16.77 s


In [56]:
# Display generated association rules and time the call.
# perf_counter() is used because it is not affected by wall-clock
# adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.show()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

+--------------------+----------+------------------+------------------+
|          antecedent|consequent|        confidence|              lift|
+--------------------+----------+------------------+------------------+
|  [1210, 4993, 1196]|     [260]|0.9481481481481482|2.7572237336243424|
|  [1210, 4993, 1196]|    [2571]|0.8565302144249513| 2.268041084222361|
|  [1210, 4993, 1196]|    [7153]|0.8643274853801169| 3.354854578593209|
|  [1210, 4993, 1196]|    [5952]|0.8935672514619883| 3.478624214858133|
|         [608, 2858]|     [296]|0.8265384615384616| 2.022993542724594|
|   [1196, 2571, 296]|     [260]|0.9016086793864572| 2.621886520689183|
|     [260, 296, 318]|    [1196]|0.8201077199281868|2.8142841149739377|
|        [3578, 1196]|     [260]|0.8958926392842619|2.6052642223037425|
|        [3578, 1196]|    [2571]|0.8674257828385522| 2.296891901603734|
|        [2762, 2959]|    [2571]|0.8283453732370141| 2.193409300439414|
|           [47, 110]|     [296]|0.8065217391304348|1.9740016299

In [57]:
# Number of association rules produced by the fitted model.
model.associationRules.count()

801

## 5M ratings

In [73]:
# Limit `df` to 5,000,000 rows for this run.
training_df = df.limit(5000000)

In [74]:
# One row per userId holding the set of distinct movieId values for that user.
movie_set_col = F.collect_set('movieId').alias('movieIds')
movies_rating = training_df.groupBy('userId').agg(movie_set_col)

In [None]:
# Number of rows in `movies_rating` (one per userId in the sample).
# NOTE(review): this cell has no execution count or output in the saved notebook.
movies_rating.count()

In [75]:
# Preview up to 10 of the per-user movie sets.
movies_rating.show(10)

+------+--------------------+
|userId|            movieIds|
+------+--------------------+
|     1|[5147, 1250, 306,...|
|     2|[356, 2028, 6311,...|
|     3|[356, 1222, 10484...|
|     4|[70286, 2028, 916...|
|     5|[356, 1120, 104, ...|
|     6|[858, 2396, 902, ...|
|     7|[306, 307, 17, 30...|
|     8|[1220, 356, 589, ...|
|     9|[356, 277, 256, 2...|
|    10|[110, 356, 589, 1...|
+------+--------------------+
only showing top 10 rows



In [76]:
# FP-Growth estimator reading item sets from the `movieIds` column.
fpGrowth = FPGrowth(
    itemsCol="movieIds",
    minSupport=0.1,     # minimum support threshold for a frequent itemset
    minConfidence=0.8,  # minimum confidence threshold for an association rule
)

In [74]:
%%timeit
# Benchmark the fit; the result is not stored here (a separate cell fits
# again and keeps the model).
fpGrowth.fit(movies_rating)

44.8 s ± 1.64 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [77]:
# Time one fit and keep the fitted model for the cells below.
# perf_counter() is the clock intended for measuring elapsed time; unlike
# time.time() it is not affected by wall-clock adjustments.
start = time.perf_counter()
model = fpGrowth.fit(movies_rating)
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

44.41 s


In [78]:
# Time how long counting the association rules takes (the count itself is
# discarded). perf_counter() is used because it is not affected by
# wall-clock adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.count()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

58.54 s


In [79]:
# Display generated association rules and time the call.
# perf_counter() is used because it is not affected by wall-clock
# adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.show()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

+--------------------+----------+------------------+------------------+
|          antecedent|consequent|        confidence|              lift|
+--------------------+----------+------------------+------------------+
|  [1210, 4993, 1196]|     [260]| 0.949064130101258|2.7531285770852953|
|  [1210, 4993, 1196]|    [2571]|0.8588524087143296| 2.266654830456025|
|  [1210, 4993, 1196]|    [7153]|0.8659097882786131| 3.309965809774888|
|  [1210, 4993, 1196]|    [5952]|0.8933722000613685|3.4111768732609304|
|         [608, 2858]|     [296]|0.8264399509803921|2.0153590092627365|
|   [1196, 2571, 296]|     [260]|0.8999551368326604| 2.610668896573252|
|     [260, 296, 318]|    [1196]|0.8220769789397241| 2.804790386148062|
|        [3578, 1196]|     [260]|0.8969638242894057|  2.60199143444543|
|        [3578, 1196]|    [2571]|0.8675710594315246| 2.289664804652607|
|        [2762, 2959]|    [2571]|0.8210063238933186| 2.166772696921803|
|           [47, 110]|     [296]|0.8031542056074766| 1.958574318

In [80]:
# Number of association rules produced by the fitted model.
model.associationRules.count()

916

## 10M ratings

In [145]:
# Limit `df` to 10,000,000 rows for this run.
training_df = df.limit(10000000)

In [149]:
# One row per userId holding the set of distinct movieId values for that user.
# Built from `training_df` (the 10,000,000-row sample defined in the previous
# cell), matching the other sections, instead of repeating the limit here.
movies_rating = training_df.groupBy('userId').agg(F.collect_set('movieId').alias('movieIds'))

In [150]:
# Preview up to 10 of the per-user movie sets.
movies_rating.show(10)

+------+--------------------+
|userId|            movieIds|
+------+--------------------+
|     1|[5147, 1250, 306,...|
|     2|[356, 2028, 6311,...|
|     3|[356, 1222, 10484...|
|     4|[70286, 2028, 916...|
|     5|[356, 1120, 104, ...|
|     6|[858, 2396, 902, ...|
|     7|[306, 307, 17, 30...|
|     8|[1220, 356, 589, ...|
|     9|[356, 277, 256, 2...|
|    10|[110, 356, 589, 1...|
+------+--------------------+
only showing top 10 rows



In [151]:
# FP-Growth estimator reading item sets from the `movieIds` column.
fpGrowth = FPGrowth(
    itemsCol="movieIds",
    minSupport=0.1,     # minimum support threshold for a frequent itemset
    minConfidence=0.8,  # minimum confidence threshold for an association rule
)

In [152]:
# Time one fit and keep the fitted model for the cells below.
# perf_counter() is the clock intended for measuring elapsed time; unlike
# time.time() it is not affected by wall-clock adjustments.
start = time.perf_counter()
model = fpGrowth.fit(movies_rating)
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

61.94 s


In [154]:
# Time how long counting the association rules takes (the count itself is
# discarded). perf_counter() is used because it is not affected by
# wall-clock adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.count()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

131.21 s


In [155]:
# Display generated association rules and time the call.
# perf_counter() is used because it is not affected by wall-clock
# adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.show()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

+--------------------+----------+------------------+------------------+
|          antecedent|consequent|        confidence|              lift|
+--------------------+----------+------------------+------------------+
|  [1210, 4993, 1196]|     [260]|0.9492220113851992| 2.732647192244657|
|  [1210, 4993, 1196]|    [2571]| 0.857988614800759|2.2541241922682214|
|  [1210, 4993, 1196]|    [7153]|0.8614800759013282|  3.28633995515633|
|  [1210, 4993, 1196]|    [5952]|0.8924478178368122|3.3988418566287004|
|         [608, 2858]|     [296]|0.8294031799677395|2.0243769509928904|
|   [1196, 2571, 296]|     [260]|0.9035687732342007|2.6012193581301184|
|     [260, 296, 318]|    [1196]|0.8249856486796785| 2.791525280244771|
|        [3578, 1196]|     [260]|0.8997562956945573| 2.590243889884287|
|        [3578, 1196]|    [2571]|0.8656376929325751|2.2742199974666155|
|        [2762, 2959]|    [2571]|0.8209027108848218|2.1566913921506328|
|    [260, 2571, 318]|    [1196]|0.8643940940327958|2.9248726562

In [156]:
# Number of association rules produced by the fitted model.
model.associationRules.count()

1006

## 15M ratings

In [157]:
# Use all of `df` for this run (no row limit).
training_df = df

In [158]:
# One row per userId holding the set of distinct movieId values for that user.
movie_set_col = F.collect_set('movieId').alias('movieIds')
movies_rating = training_df.groupBy('userId').agg(movie_set_col)

In [159]:
# Preview up to 10 of the per-user movie sets.
movies_rating.show(10)

+------+--------------------+
|userId|            movieIds|
+------+--------------------+
|   148|[110, 356, 2186, ...|
|   463|[799, 785, 648, 3...|
|   471|[356, 103141, 922...|
|   496|[1220, 1947, 4995...|
|   833|[44195, 356, 4551...|
|  1088|[356, 589, 8604, ...|
|  1238|[110, 356, 4262, ...|
|  1342|[356, 4008, 306, ...|
|  1580|[1271, 2273, 277,...|
|  1591|[3863, 2701, 2890...|
+------+--------------------+
only showing top 10 rows



In [160]:
# FP-Growth estimator reading item sets from the `movieIds` column.
fpGrowth = FPGrowth(
    itemsCol="movieIds",
    minSupport=0.1,     # minimum support threshold for a frequent itemset
    minConfidence=0.8,  # minimum confidence threshold for an association rule
)

In [161]:
# Time one fit and keep the fitted model for the cells below.
# perf_counter() is the clock intended for measuring elapsed time; unlike
# time.time() it is not affected by wall-clock adjustments.
start = time.perf_counter()
model = fpGrowth.fit(movies_rating)
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

91.64 s


In [162]:
# Time how long counting the association rules takes (the count itself is
# discarded). perf_counter() is used because it is not affected by
# wall-clock adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.count()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

1625.45 s


In [163]:
# Display generated association rules and time the call.
# perf_counter() is used because it is not affected by wall-clock
# adjustments, unlike time.time().
start = time.perf_counter()
model.associationRules.show()
end = time.perf_counter()
print('{:.2f} s'.format(end - start))

+--------------------+----------+------------------+------------------+
|          antecedent|consequent|        confidence|              lift|
+--------------------+----------+------------------+------------------+
|         [858, 2959]|     [296]|0.8097227341606258|1.9737698021276604|
|        [1136, 1196]|     [260]|0.9076998050682261|2.6046494017729835|
|[1291, 1210, 1198...|    [1196]|0.9666776823088786|3.2663155406934945|
|          [608, 593]|     [296]|0.8216192698651268| 2.002768645722954|
|         [480, 1210]|     [260]|0.9049160117121282| 2.596661291982572|
|         [480, 1210]|    [1196]|0.8930497765449221| 3.017533576211683|
|           [1, 1210]|     [260]|0.9071204345009717| 2.602986894859378|
|           [1, 1210]|    [1196]|0.8460311923862673|2.8586620775214433|
|  [7153, 1198, 2571]|    [4993]|0.9319274475524476|3.2543122869658836|
|  [7153, 1198, 2571]|    [5952]|0.9122049825174825|  3.47194553877471|
|        [7153, 2959]|    [2571]|0.8447453155914187| 2.221513717

In [164]:
# Number of association rules produced by the fitted model.
model.associationRules.count()

1020

In [165]:
# Show up to 50 of the generated association rules.
model.associationRules.show(50)

+--------------------+----------+------------------+------------------+
|          antecedent|consequent|        confidence|              lift|
+--------------------+----------+------------------+------------------+
|         [858, 2959]|     [296]|0.8097227341606258|1.9737698021276604|
|        [1136, 1196]|     [260]|0.9076998050682261|2.6046494017729835|
|[1291, 1210, 1198...|    [1196]|0.9666776823088786|3.2663155406934945|
|          [608, 593]|     [296]|0.8216192698651268| 2.002768645722954|
|         [480, 1210]|     [260]|0.9049160117121282| 2.596661291982572|
|         [480, 1210]|    [1196]|0.8930497765449221| 3.017533576211683|
|           [1, 1210]|     [260]|0.9071204345009717| 2.602986894859378|
|           [1, 1210]|    [1196]|0.8460311923862673|2.8586620775214433|
|  [7153, 1198, 2571]|    [4993]|0.9319274475524476|3.2543122869658836|
|  [7153, 1198, 2571]|    [5952]|0.9122049825174825|  3.47194553877471|
|        [7153, 2959]|    [2571]|0.8447453155914187| 2.221513717

In [166]:
# Stop the Spark session now that all runs are finished.
spark.stop()