## Example of Spark support in `ambrosia` core classes

In [2]:
import os

import pandas as pd
import pyspark

from ambrosia.designer import Designer

In [3]:
# Point SPARK_LOCAL_IP at the loopback address; this must be set before the session is built.
os.environ['SPARK_LOCAL_IP'] = '127.0.0.1'

# Local master with a single worker thread ("local[1]").
spark = (
    pyspark.sql.SparkSession.builder
    .master("local[1]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/14 21:32:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Create Spark data

In [4]:
# Read the LTV / retention test CSV with pandas, then convert it into a Spark DataFrame.
ltv_and_retention_dataset = pd.read_csv(
    filepath_or_buffer="./../tests/test_data/ltv_retention.csv",
)
sdf = spark.createDataFrame(data=ltv_and_retention_dataset)

In [5]:
# Print the Spark DataFrame schema to check the column names and types.
sdf.printSchema()

root
 |-- LTV: double (nullable = true)
 |-- retention: double (nullable = true)



## Spark experiment theory design

In [6]:
# Designer over the Spark DataFrame for the 'LTV' metric,
# configured with two effects and two sample sizes.
designer = Designer(
    dataframe=sdf,
    effects=[1.05, 1.2],
    sizes=[100, 1000],
    metrics='LTV',
)

In [7]:
# 'theory' method, 'size' mode: the resulting table has one row per effect.
# No error levels are passed to Designer here, so the error pair shown in the
# output presumably comes from library defaults - TODO confirm.
designer.run('size', 'theory')

                                                                                

errors,(0.05; 0.2)
effects,Unnamed: 1_level_1
5.0%,6205
20.0%,388


In [8]:
# 'theory' method, 'effect' mode: the resulting table has one row per sample size
# given to Designer (100 and 1000).
designer.run('effect', 'theory')

errors,(0.05; 0.2)
sample_sizes,Unnamed: 1_level_1
100,39.4%
1000,12.5%


In [9]:
# 'theory' method, 'power' mode: the resulting table has one row per effect
# and one column per sample size.
designer.run('power', 'theory')

Unnamed: 0_level_0,sample sizes,100,1000
First type error,Effect,Unnamed: 2_level_1,Unnamed: 3_level_1
0.05,5.0%,5.4%,20.2%
0.05,20.0%,29.6%,99.4%


## Spark experiment empirical design

In [10]:
# Rebuild the designer for the 'LTV' metric with a single effect (1.2)
# and a second-type error of 0.5.
designer = Designer(
    dataframe=sdf,
    second_type_errors=0.5,
    effects=1.2,
    metrics='LTV',
)

In [11]:
# 'empiric' method, 'size' mode, with bootstrap_size=1.
# NOTE(review): no random seed is passed here, so this output may not be
# reproducible between runs - confirm whether run() accepts a seed.
designer.run('size', 'empiric', bootstrap_size=1)

  0%|          | 0/1 [00:00<?, ?it/s]

errors,"(0.5, 0.05)"
effect,Unnamed: 1_level_1
20.0%,242


In [12]:
# 'empiric' method, 'effect' mode; the group size (155) is supplied per call
# because this Designer was built without `sizes`.
designer.run('effect', 'empiric', sizes=155, bootstrap_size=5)

  0%|          | 0/1 [00:00<?, ?it/s]

errors,"(0.5, 0.05)"
group_sizes,Unnamed: 1_level_1
155,34.7%


In [13]:
# 'empiric' method, 'power' mode for a group size of 10.
# NOTE(review): the recorded output shows a `0it` progress bar and 0.0% power -
# confirm this is the intended demonstration and not a silently skipped run.
designer.run('power', 'empiric', sizes=10, bootstrap_size=5)

0it [00:00, ?it/s]

sample sizes,10
effect,Unnamed: 1_level_1
1.2,0.0%


## Binary metrics Spark design

In [14]:
# Rebuild the designer for the 'retention' metric with a single effect (1.2)
# and a second-type error of 0.5.
designer = Designer(
    dataframe=sdf,
    second_type_errors=0.5,
    effects=1.2,
    metrics='retention',
)

In [15]:
# 'binary' method, 'size' mode for the 'retention' metric.
designer.run('size', 'binary')

Unnamed: 0_level_0,$\delta$-relative,1.2
$\alpha$,$\beta$,Unnamed: 2_level_1
0.05,0.5,295


In [16]:
# 'binary' method, 'effect' mode for a group size of 50.
# NOTE(review): the recorded result is a negative raw number (-0.18895) rather
# than a percentage like the other 'effect' tables - confirm how to interpret it.
designer.run('effect', 'binary', sizes=50)

Unnamed: 0_level_0,Sample size,50
$\alpha$,$\beta$,Unnamed: 2_level_1
0.05,0.5,-0.18895


In [17]:
# 'binary' method, 'power' mode for a group size of 500.
designer.run('power', 'binary', sizes=500)

sample sizes,500
$\delta$-relative,Unnamed: 1_level_1
1.2,0.7363


In [18]:
# Stop the Spark session now that all examples have run.
spark.stop()