# **Setting up Spark**

In [113]:
# Install PySpark library on google colab
!pip install pyspark



In [114]:
# Check pyspark installation by building spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("Colab")\
        .config('spark.ui.port', '4050').getOrCreate()

spark

In [115]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [116]:
import pandas as pd

data = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/data/PreprocessedDataCoSupplyChainDataset.csv", encoding='latin1')
data.head()

Unnamed: 0.1,Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,Customer Country,Customer Id,Customer Segment,Customer State,Customer Zipcode,Department Id,Department Name,Market,Order City,Order Country,Order Customer Id,order date (DateOrders),Order Id,Order Item Cardprod Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order State,Order Status,Product Card Id,Product Category Id,Product Name,Product Price,Shipping Mode,Customer Name
0,0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,Puerto Rico,20755,Consumer,PR,725.0,2,Fitness,Pacific Asia,Bekasi,Indonesia,20755,1/31/2018 22:56,77202,1360,13.11,0.04,180517,327.75,0.29,1,327.75,314.640015,91.25,Southeast Asia,Java Occidental,COMPLETE,1360,73,Smart watch,327.75,Standard Class,CallyHolloway
1,1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,Puerto Rico,19492,Consumer,PR,725.0,2,Fitness,Pacific Asia,Bikaner,India,19492,1/13/2018 12:27,75939,1360,16.389999,0.05,179254,327.75,-0.8,1,327.75,311.359985,-249.089996,South Asia,RajastÃ¡n,PENDING,1360,73,Smart watch,327.75,Standard Class,IreneLuna
2,2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,EE. UU.,19491,Consumer,CA,95125.0,2,Fitness,Pacific Asia,Bikaner,India,19491,1/13/2018 12:06,75938,1360,18.030001,0.06,179253,327.75,-0.8,1,327.75,309.720001,-247.779999,South Asia,RajastÃ¡n,CLOSED,1360,73,Smart watch,327.75,Standard Class,GillianMaldonado
3,3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,EE. UU.,19490,Home Office,CA,90027.0,2,Fitness,Pacific Asia,Townsville,Australia,19490,1/13/2018 11:45,75937,1360,22.940001,0.07,179252,327.75,0.08,1,327.75,304.809998,22.860001,Oceania,Queensland,COMPLETE,1360,73,Smart watch,327.75,Standard Class,TanaTate
4,4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,Puerto Rico,19489,Corporate,PR,725.0,2,Fitness,Pacific Asia,Townsville,Australia,19489,1/13/2018 11:24,75936,1360,29.5,0.09,179251,327.75,0.45,1,327.75,298.25,134.210007,Oceania,Queensland,PENDING_PAYMENT,1360,73,Smart watch,327.75,Standard Class,OrliHendricks


In [117]:
data['Customer Name'].unique()

array(['CallyHolloway', 'IreneLuna', 'GillianMaldonado', ...,
       'AnikaDavenport', 'YuriSmith', 'HyacinthWitt'], dtype=object)

In [118]:
df = data.copy()
df = df[['Customer Name','Product Name']]
df = df.drop_duplicates(['Customer Name', 'Product Name'])
df = df.groupby('Customer Name')['Product Name'].apply(list)\
.reset_index(name="Products")

# df.rename( columns={1 :'Products'}, inplace=True )
df.head()

Unnamed: 0,Customer Name,Products
0,AaronBerger,[Under Armour Girls' Toddler Spine Surge Runni...
1,AaronBoyle,[Diamondback Women's Serene Classic Comfort Bi...
2,AaronBush,"[Nike Men's CJ Elite 2 TD Football Cleat, Unde..."
3,AaronCalhoun,"[Perfect Fitness Perfect Rip Deck, Pelican Sun..."
4,AaronCarr,"[Pelican Sunstream 100 Kayak, Nike Men's CJ El..."


# **Convert to Spark Dataframe**

In [119]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SQLContext

# Convert the pandas dataframe into a spark dataframe
# sc = SparkContext.getOrCreate()
sqlContext = SQLContext(spark)

spark_frame = sqlContext.createDataFrame(df)

In [120]:
spark_frame

DataFrame[Customer Name: string, Products: array<string>]

# **Applying the FP algorithm**

In [121]:
# Create fpGrowth object by instantiating the FPGrowth constructor with the necessary parameters
fpGrowth = FPGrowth(itemsCol="Products",minSupport=0.015, minConfidence=0.35)

# Fit the dataframe into the fpGrowth object to get the model prepared
model = fpGrowth.fit(spark_frame)

In [122]:
# Display the frequently occuring items
model.freqItemsets.sort('freq',ascending=False).show(truncate=False)

+------------------------------------------------------------------------------------+----+
|items                                                                               |freq|
+------------------------------------------------------------------------------------+----+
|[Perfect Fitness Perfect Rip Deck]                                                  |4851|
|[Nike Men's CJ Elite 2 TD Football Cleat]                                           |4711|
|[Nike Men's Dri-FIT Victory Golf Polo]                                              |4603|
|[O'Brien Men's Neoprene Life Vest]                                                  |4505|
|[Field & Stream Sportsman 16 Gun Fire Safe]                                         |4304|
|[Nike Men's CJ Elite 2 TD Football Cleat, Perfect Fitness Perfect Rip Deck]         |4135|
|[Pelican Sunstream 100 Kayak]                                                       |4108|
|[Nike Men's Dri-FIT Victory Golf Polo, Perfect Fitness Perfect Rip Deck]       

In [123]:
from pyspark.sql.functions import col,round

# Display the generated association rules
model.associationRules\
.withColumn("confidence", round(col("confidence"), 3))\
.withColumn("lift", round(col("lift"), 3))\
.withColumn("support", round(col("support"),10))\
.sort('confidence',ascending=False)\
.show(truncate=True)

+--------------------+--------------------+----------+-----+------------+
|          antecedent|          consequent|confidence| lift|     support|
+--------------------+--------------------+----------+-----+------------+
|[Team Golf St. Lo...|[Nike Men's CJ El...|     0.982|2.924|0.0151785078|
|[Team Golf St. Lo...|[Nike Men's CJ El...|     0.978|2.913|0.0158198532|
|[Titleist Pro V1 ...|[Perfect Fitness ...|     0.977|2.827|0.0153922896|
|[Team Golf St. Lo...|[Nike Men's CJ El...|     0.977|2.911|0.0152497684|
|[Titleist Pro V1 ...|[Nike Men's CJ El...|     0.977|2.911|0.0153922896|
|[ENO Atlas Hammoc...|[Perfect Fitness ...|     0.977|2.826|0.0150359866|
|[ENO Atlas Hammoc...|[Perfect Fitness ...|     0.974|2.817|0.0158198532|
|[Titleist Pro V1x...|[Perfect Fitness ...|     0.973|2.814| 0.015321029|
|[Team Golf St. Lo...|[Nike Men's CJ El...|     0.973|2.898| 0.015321029|
|[Titleist Pro V1 ...|[Nike Men's Dri-F...|     0.973|2.966|0.0153922896|
|[Team Golf St. Lo...|[Nike Men's CJ E