In [1]:
import glob
import os
import sys

# Work from the project directory for the rest of the notebook.
# (The bare `os.curdir` expression that used to follow was a no-op: it is
# just the constant "." and was never displayed, so it has been dropped.)
os.chdir("C:/dataanalytics/python")

# Configure the environment: point SPARK_HOME at the directory where Spark is
# installed, unless it has already been set outside the notebook.
os.environ.setdefault('SPARK_HOME', 'C:\\spark')

# Root path of the Spark installation.
SPARK_HOME = os.environ['SPARK_HOME']

# Locations of the PySpark sources bundled with the Spark installation.
_spark_python = os.path.join(SPARK_HOME, "python")
_spark_lib = os.path.join(_spark_python, "lib")

# The py4j archive name carries a version number that changes between Spark
# releases, so look it up instead of hard-coding it. Fall back to the name
# this notebook was originally written against when nothing matches.
_py4j_candidates = sorted(glob.glob(os.path.join(_spark_lib, "py4j-*-src.zip")))
if _py4j_candidates:
    _py4j_zip = _py4j_candidates[-1]
else:
    _py4j_zip = os.path.join(_spark_lib, "py4j-0.10.6-src.zip")

# Each entry is pushed to the front in turn, so the last one listed here ends
# up first on sys.path (same final order as four separate insert(0, ...) calls).
for _entry in (
    _spark_python,
    _spark_lib,
    _py4j_zip,
    os.path.join(_spark_lib, "pyspark.zip"),
):
    sys.path.insert(0, _entry)

# PySpark only becomes importable once the paths above are in place.
from pyspark import SparkConf, SparkContext

# Optional Spark configuration: app name, two local worker threads, 1 GB
# executor memory.
conf = (
    SparkConf()
    .setAppName("ALS")
    .setMaster("local[2]")
    .set("spark.executor.memory", "1g")
)

# Only one SparkContext can be active at a time. getOrCreate() returns the
# running context when this cell is executed again instead of raising, which
# keeps the cell safe to re-run.
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
from pyspark.sql import DataFrame, SparkSession,Row

In [3]:
# Build (or reuse) the SparkSession, passing in the settings held in `conf`.
# NOTE(review): `conf` requests master "local[2]" while this builder requests
# "local" -- confirm which one is intended.
spark = (
    SparkSession.builder
    .appName("ALS")
    .master("local")
    .config(conf=conf)
    .getOrCreate()
)

In [4]:
from pyspark.ml.fpm import FPGrowth

In [6]:
# Small in-memory example: one row per transaction, with an integer id and
# the list of item ids it contains.
df = spark.createDataFrame(
    data=[
        (0, [1, 2, 5]),
        (1, [1, 3, 5]),
        (2, [1, 2]),
    ],
    schema=["id", "items"],
)

In [7]:
# Configure the FP-growth estimator (reads the `items` column, support
# threshold 0.5, confidence threshold 0.6), then fit it on the example frame.
fpGrowth = FPGrowth(
    minSupport=0.5,
    minConfidence=0.6,
    itemsCol="items",
)
model = fpGrowth.fit(df)

In [8]:
# Display the frequent itemsets found by the fitted model.
# (n=20 and truncate=True spell out show()'s documented defaults.)
model.freqItemsets.show(n=20, truncate=True)

+------+----+
| items|freq|
+------+----+
|   [5]|   2|
|[5, 1]|   2|
|   [1]|   3|
|   [2]|   2|
|[2, 1]|   2|
+------+----+



In [9]:
# Display the association rules generated by the fitted model.
# (n=20 and truncate=True spell out show()'s documented defaults.)
model.associationRules.show(n=20, truncate=True)

+----------+----------+------------------+
|antecedent|consequent|        confidence|
+----------+----------+------------------+
|       [5]|       [1]|               1.0|
|       [2]|       [1]|               1.0|
|       [1]|       [5]|0.6666666666666666|
|       [1]|       [2]|0.6666666666666666|
+----------+----------+------------------+



In [10]:
# transform() examines each row's items against the association rules and
# summarises the matching consequents in a `prediction` column.
# Keep the transformed DataFrame itself: show() only prints and returns None,
# so assigning its result would leave `predictions` bound to None.
predictions = model.transform(df)
predictions.show()

+---+---------+----------+
| id|    items|prediction|
+---+---------+----------+
|  0|[1, 2, 5]|        []|
|  1|[1, 3, 5]|       [2]|
|  2|   [1, 2]|       [5]|
+---+---------+----------+



In [5]:
# Load the sample transactions file from the Spark installation. The path is
# derived from SPARK_HOME (set in the first cell) so it follows the configured
# installation directory instead of assuming C:\spark.
data = sc.textFile(os.path.join(SPARK_HOME, "data", "mllib", "sample_fpgrowth.txt"))

# One transaction per line, items separated by whitespace. split() with no
# argument also trims the line and collapses repeated spaces, so stray
# whitespace never produces empty-string items.
transactions = data.map(lambda line: line.split())

In [16]:
from pyspark.mllib.fpm import FPGrowth

In [17]:
# The notebook imports two different classes named FPGrowth: the DataFrame
# based one from pyspark.ml.fpm and the RDD based one from pyspark.mllib.fpm.
# Bind the RDD based class to its own alias here so this cell does not depend
# on which of the two imports happened to run last.
from pyspark.mllib.fpm import FPGrowth as MLlibFPGrowth

# Mine frequent itemsets from the RDD of transactions (support threshold 0.2,
# 10 partitions).
model1 = MLlibFPGrowth.train(transactions, minSupport=0.2, numPartitions=10)

# Bring the frequent itemsets back to the driver and print them one per line.
result = model1.freqItemsets().collect()
for fi in result:
    print(fi)

FreqItemset(items=['z'], freq=5)
FreqItemset(items=['x'], freq=4)
FreqItemset(items=['x', 'z'], freq=3)
FreqItemset(items=['y'], freq=3)
FreqItemset(items=['y', 'x'], freq=3)
FreqItemset(items=['y', 'x', 'z'], freq=3)
FreqItemset(items=['y', 'z'], freq=3)
FreqItemset(items=['r'], freq=3)
FreqItemset(items=['r', 'x'], freq=2)
FreqItemset(items=['r', 'z'], freq=2)
FreqItemset(items=['s'], freq=3)
FreqItemset(items=['s', 'y'], freq=2)
FreqItemset(items=['s', 'y', 'x'], freq=2)
FreqItemset(items=['s', 'y', 'x', 'z'], freq=2)
FreqItemset(items=['s', 'y', 'z'], freq=2)
FreqItemset(items=['s', 'x'], freq=3)
FreqItemset(items=['s', 'x', 'z'], freq=2)
FreqItemset(items=['s', 'z'], freq=2)
FreqItemset(items=['t'], freq=3)
FreqItemset(items=['t', 'y'], freq=3)
FreqItemset(items=['t', 'y', 'x'], freq=3)
FreqItemset(items=['t', 'y', 'x', 'z'], freq=3)
FreqItemset(items=['t', 'y', 'z'], freq=3)
FreqItemset(items=['t', 's'], freq=2)
FreqItemset(items=['t', 's', 'y'], freq=2)
FreqItemset(items=['t', '