In [1]:
from pyspark.ml.fpm import FPGrowth
from numpy import array
from math import sqrt
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql import functions as F
import pandas as pd

In [2]:
# Spark session configuration.
# The settings are built on a fresh SparkConf so this cell does not depend on a
# `spark` variable that already exists in the kernel (Restart & Run All safe).
# NOTE(review): memory settings can only take effect when this cell is the one
# that actually starts the session; if a session already exists, getOrCreate()
# returns it and these values may be ignored — confirm in the Spark UI.
conf = SparkConf().setAll([
    ("spark.executor.memory", "16g"),
    ("spark.driver.memory", "16g"),
    ("spark.memory.offHeap.enabled", "true"),
    ("spark.memory.offHeap.size", "4g"),
    # The previous ("spark.driver.extraJavaOptions", "12g") entry was removed:
    # "12g" is not a JVM option, and driver heap size is already controlled by
    # spark.driver.memory above.
])

# getOrCreate() already returns a SparkSession; it must not be wrapped in
# SparkSession(...) again, because that constructor expects a SparkContext.
spark = SparkSession.builder.appName('dat500').config(conf=conf).getOrCreate()

In [3]:
%%time
# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files

path="/mydataset/proc_netflix/pre_part-02.json"
# .option("multiline", "true")
init_df = spark.read.json(path)



CPU times: user 780 µs, sys: 11.3 ms, total: 12.1 ms
Wall time: 17.3 s


                                                                                

In [4]:
%%time
# root
#  |-- movie: string (nullable = true)
#  |-- rating: string (nullable = true)
#  |-- review_date: string (nullable = true)
#  |-- review_id: string (nullable = true)
#  |-- reviewer: string (nullable = true)

# set up data frame
# remove null
df = init_df.na.drop()
# remove uesless colums
df = df.drop("review_summary","spoiler_tag","review_detail","helpful","review_date").dropDuplicates(['review_id'])

# only take the rating > 5, group by reviewer
df = df.filter(F.col("rating") >= 5).sort(F.desc("rating"))
df=df.groupBy("reviewer").agg(F.collect_set("movie"))

# df = df.groupBy("reviewer").agg(F.collect_list(F.struct(F.col("movie"),F.col("rating"))).alias("review_info"))
# df = df.groupBy("reviewer").agg(F.collect_list(F.struct(F.col("review_id"),F.col("review_info"))).alias("review"))
# df = df.withColumn("review", F.map_from_entries(F.col("review_map_list"))).drop("review")

df.printSchema()

root
 |-- reviewer: long (nullable = true)
 |-- collect_set(movie): array (nullable = false)
 |    |-- element: string (containsNull = false)

CPU times: user 9.84 ms, sys: 0 ns, total: 9.84 ms
Wall time: 327 ms


In [5]:
# Count the grouped rows (one per reviewer) and report the total.
row = df.count()
print(f'The number of reviewers are: {row}')

# Preview the first ten reviewers with their full, untruncated movie sets.
df.show(n=10, truncate=False)

                                                                                

The number of reviewers are: 304418




+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|reviewer|collect_set(movie)                                                                                                                                                                                                                                                                                        |
+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|109433  |[Hemlock Grove (2013–2015), Meeting Evil (2012)]            

                                                                                

In [6]:
%%time
# fp-growth

fpGrowth = FPGrowth(itemsCol="collect_set(movie)",minSupport=0.0009, minConfidence=0.06)
model = fpGrowth.fit(df)
# Display frequent itemsets.
model.freqItemsets.show(20,truncate=False)

                                                                                

+-------------------------------------------+----+
|items                                      |freq|
+-------------------------------------------+----+
|[鋒迴路轉 (2019)]                          |284 |
|[Dil Bechara (2020)]                       |7349|
|[小丑 (2019)]                              |5749|
|[Normal People (2020)]                     |283 |
|[STAR WARS：天行者的崛起 (2019)]           |3536|
|[曼達洛人 (2019– )]                        |283 |
|[Gunjan Saxena: The Kargil Girl (2020)]    |3361|
|[A Christmas Carol (2019)]                 |282 |
|[Scam 1992: The Harshad Mehta Story (2020)]|2942|
|[Underwater (2020)]                        |281 |
|[Mrs. Serial Killer (2020)]                |2737|
|[Stranger Things (2016– )]                 |280 |
|[Batman v Superman: Dawn of Justice (2016)]|2231|
|[RoboCop (2014)]                           |280 |
|[獵魔士 (2019– )]                          |2191|
|[Utopia (2020)]                            |280 |
|[Tenet (2020)]                             |1990|


In [7]:
# Keep handles on both outputs of the fitted model.
items = model.freqItemsets
rules = model.associationRules

# Preview the first 20 generated association rules.
rules.show(n=20, truncate=False)

                                                                                

+-------------------------------------------------------------------------------------------+---------------------------+------------------+------------------+---------------------+
|antecedent                                                                                 |consequent                 |confidence        |lift              |support              |
+-------------------------------------------------------------------------------------------+---------------------------+------------------+------------------+---------------------+
|[Taish (2020), Comedy Couple (2020), Atkan Chatkan (2020)]                                 |[Naxalbari (2020– )]       |0.9579288025889967|559.7135704923938 |9.723472330808296E-4 |
|[Taish (2020), Comedy Couple (2020), Atkan Chatkan (2020)]                                 |[London Confidental (2020)]|0.9546925566343042|562.1384887920727 |9.690622762123134E-4 |
|[Taish (2020), Comedy Couple (2020), Atkan Chatkan (2020)]                               

In [8]:
# transform() examines each reviewer's items against all the association rules
# and summarizes the matching consequents in a "prediction" column.
# It is called once and the result is reused for both the preview and the
# `transformed` variable, instead of invoking transform() twice.
transformed = model.transform(df)
transformed.show(n=20, truncate=False)

                                                                                

+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|reviewer|collect_set(movie)                                                                                                                                                                                                                                                                                        |prediction|
+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|109433  |[Hemlock Grove (2013–2015),

                                                                                