In [14]:
import os

import findspark
import numpy as np
from pyspark.ml.fpm import FPGrowth
from pyspark.mllib.stat import Statistics
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, collect_set
from pyspark.sql.functions import col, count, when
from scipy.stats import ttest_ind

In [15]:
# Initialise findspark so this Python process can locate the local Spark installation.
# NOTE(review): in file order this call comes after the `pyspark` imports in the
# import cell above. Per the findspark docs, init() is meant to run BEFORE pyspark
# is imported; on a fresh kernel run top-to-bottom the imports would have to succeed
# without it. Confirm the intended ordering (or whether this call is needed at all).
findspark.init()

In [2]:
# Create the notebook-wide Spark session, or reuse one that is already running,
# registered under the application name "PatternRecognition".
session_builder = SparkSession.builder.appName("PatternRecognition")
spark = session_builder.getOrCreate()

In [16]:
# Bare last expression: the notebook displays the SparkSession object, which serves
# as a quick check that the session created earlier is available.
spark

In [4]:
# Load the retail transactions CSV into a Spark DataFrame.
#
# The folder holding the data files can be overridden with the DATA_DIR environment
# variable so the notebook is runnable on other machines; when the variable is not
# set, the original local folder is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get("DATA_DIR", r"C:\Users\Robyi\Documents\Data Science Dataset")
retail_csv_path = os.path.join(DATA_DIR, "retail.csv")

ds = spark.read.csv(
    retail_csv_path,
    header=True,            # first line of the file holds the column names
    inferSchema=True,       # let Spark infer the column types from the data
    encoding="ISO-8859-1",  # the file is decoded as Latin-1
)

In [5]:
# Preview the loaded retail data: first 20 rows with long cell values truncated
# (the defaults of show()), followed by the inferred schema.
ds.show(n=20, truncate=True)
ds.printSchema()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [7]:
# Build one "basket" per invoice for market-basket analysis: the set of distinct
# stock codes that appear on that invoice, stored in an array column named "items".
# collect_set de-duplicates, so every basket contains each stock code at most once.
# NOTE(review): the frequent-itemset output further down includes a non-numeric code
# ("DOT"); presumably a non-product line such as postage. Consider whether such codes
# should be filtered out before building baskets -- confirm against the data dictionary.
basket_items = collect_set("StockCode").alias("items")
invoices = ds.groupBy("InvoiceNo")
ds_mba = invoices.agg(basket_items)

# Show the first five baskets without truncating the item arrays.
ds_mba.show(n=5, truncate=False)

+---------+---------------------------------------------------------------------------------------------------------------------+
|InvoiceNo|items                                                                                                                |
+---------+---------------------------------------------------------------------------------------------------------------------+
|536366   |[22632, 22633]                                                                                                       |
|536367   |[22310, 22622, 21755, 22623, 84879, 84969, 21777, 22748, 48187, 22745, 21754, 22749]                                 |
|536371   |[22086]                                                                                                              |
|536374   |[21258]                                                                                                              |
|536375   |[84029E, 21730, 82483, 82482, 20679, 71053, 37370, 82486, 85123A, 84406B, 21071

In [9]:
# FP-Growth configuration:
#   itemsCol      -- array column holding each invoice's basket
#   minSupport    -- an itemset must occur in at least 2% of baskets to be kept
#   minConfidence -- an association rule must reach at least 30% confidence
fp_growth_params = {
    "itemsCol": "items",
    "minSupport": 0.02,
    "minConfidence": 0.3,
}
fp_growth = FPGrowth(**fp_growth_params)

# Fit on the per-invoice baskets; the fitted model exposes the frequent itemsets
# and the association rules derived from them.
model = fp_growth.fit(ds_mba)

In [10]:
# Inspect the first ten frequent itemsets (items + absolute frequency), untruncated.
frequent_itemsets = model.freqItemsets
frequent_itemsets.show(n=10, truncate=False)

+--------+----+
|items   |freq|
+--------+----+
|[22776] |576 |
|[DOT]   |710 |
|[84375] |520 |
|[84970S]|543 |
|[21213] |664 |
|[22621] |613 |
|[22966] |674 |
|[22952] |623 |
|[20971] |526 |
|[21166] |735 |
+--------+----+
only showing top 10 rows



In [11]:
# Inspect the first ten association rules (antecedent -> consequent with their
# confidence, lift and support), untruncated.
association_rules = model.associationRules
association_rules.show(n=10, truncate=False)

+----------+----------+-------------------+------------------+--------------------+
|antecedent|consequent|confidence         |lift              |support             |
+----------+----------+-------------------+------------------+--------------------+
|[20725]   |[20728]   |0.34950248756218905|7.717062598346714 |0.0216988416988417  |
|[20725]   |[85099B]  |0.3656716417910448 |4.436016638120871 |0.022702702702702703|
|[20725]   |[22382]   |0.35012437810945274|7.750616575243441 |0.021737451737451736|
|[20725]   |[20727]   |0.40298507462686567|8.059701492537313 |0.025019305019305018|
|[20725]   |[22384]   |0.3812189054726368 |8.895107794361525 |0.023667953667953667|
|[20725]   |[20726]   |0.3308457711442786 |8.263168247480053 |0.02054054054054054 |
|[20725]   |[22383]   |0.4123134328358209 |8.176813101414824 |0.0255984555984556  |
|[22629]   |[22630]   |0.5925110132158591 |17.844227025919476|0.020772200772200773|
|[20724]   |[22356]   |0.5009523809523809 |17.0719298245614  |0.020308880308

In [12]:
# Load the A/B-test page-view CSV into a Spark DataFrame.
#
# The folder holding the data files can be overridden with the DATA_DIR environment
# variable so the notebook is runnable on other machines; when the variable is not
# set, the original local folder is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get("DATA_DIR", r"C:\Users\Robyi\Documents\Data Science Dataset")
page_csv_path = os.path.join(DATA_DIR, "page.csv")

dab = spark.read.csv(
    page_csv_path,
    header=True,       # first line of the file holds the column names
    inferSchema=True,  # let Spark infer the column types from the data
)

In [13]:
# Check what was loaded for the A/B test: inferred column types first,
# then the first five rows.
dab.printSchema()
dab.show(n=5)

root
 |-- user_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- group: string (nullable = true)
 |-- landing_page: string (nullable = true)
 |-- converted: integer (nullable = true)

+-------+--------------------+---------+------------+---------+
|user_id|           timestamp|    group|landing_page|converted|
+-------+--------------------+---------+------------+---------+
| 851104|2017-01-21 22:11:...|  control|    old_page|        0|
| 804228|2017-01-12 08:01:...|  control|    old_page|        0|
| 661590|2017-01-11 16:55:...|treatment|    new_page|        0|
| 853541|2017-01-08 18:28:...|treatment|    new_page|        0|
| 864975|2017-01-21 01:52:...|  control|    old_page|        1|
+-------+--------------------+---------+------------+---------+
only showing top 5 rows



In [17]:
# Per experiment group, count the converted users and the total number of rows.
# when(...) without an otherwise() yields NULL for non-matching rows, and count()
# skips NULLs, so counting the indicator counts only rows with converted == 1.
converted_indicator = F.when(F.col("converted") == 1, 1)

dab_grouped = dab.groupBy("group").agg(
    F.count(converted_indicator).alias("converted_count"),
    F.count("*").alias("total_users"),
)

In [18]:
# Conversion rate per group = converted users / all users in that group.
# withColumn replaces a same-named column, so re-running this cell is harmless.
conversion_rate = F.col("converted_count") / F.col("total_users")
dab_grouped = dab_grouped.withColumn("conversion_rate", conversion_rate)

dab_grouped.show()

+---------+---------------+-----------+-------------------+
|    group|converted_count|total_users|    conversion_rate|
+---------+---------------+-----------+-------------------+
|  control|          17723|     147202|0.12039917935897611|
|treatment|          17514|     147276|0.11891957956489856|
+---------+---------------+-----------+-------------------+



In [21]:
def _converted_flags(group_name):
    """Return the `converted` values of every row in the given experiment group.

    The rows are filtered on the `group` column, reduced to the single
    `converted` column, flattened from one-field rows to bare values and
    collected to the driver as a plain Python list.
    """
    group_rows = dab.filter(dab["group"] == group_name).select("converted")
    return group_rows.rdd.flatMap(lambda row: row).collect()


# Pull both samples to the driver so they can be handed to SciPy.
control = _converted_flags("control")
treatment = _converted_flags("treatment")

In [22]:
# Two-sample t-test (SciPy defaults) on the per-user conversion flags of the two
# groups; only the p-value is reported.
# NOTE(review): the samples are 0/1 outcomes; a two-proportion z-test or chi-square
# test is the more conventional choice for conversion rates -- confirm the intended test.
test_result = ttest_ind(control, treatment)
t_stat = test_result.statistic
p_value = test_result.pvalue

print(f"T-Test: p-value = {p_value}")

T-Test: p-value = 0.21611748562490837
