In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Create a spark session
spark = (
    SparkSession.builder.appName("analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

22/09/30 19:30:00 WARN Utils: Your hostname, AryansLaptop resolves to a loopback address: 127.0.1.1; using 172.25.9.137 instead (on interface eth0)
22/09/30 19:30:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/30 19:30:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/30 19:30:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/30 19:30:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# Profit per merchant per day

In [2]:
finaldf = spark.read.parquet("../data/curated/finaldf.parquet/")
finaldf = finaldf.withColumn('profit', round(finaldf.dollar_value * (finaldf.take_rate / 100), 2))
finaldf.limit(5)

                                                                                

22/09/30 19:30:13 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%,profit
2021-08-21,94729574738,16982,4821,23684,156.3091245999424,407ce815-54e1-4ae...,QLD,Female,Scelerisque Corpo...,computer programm...,4.51,b,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,7.05
2022-04-24,45572698303,16982,4821,23684,115.38672513864692,72160f0d-06c5-432...,QLD,Female,Libero Proin Corp.,shoe shops,3.3,b,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,3.81
2021-08-21,34695415993,16982,4821,23684,35.730567249104645,d4524ef6-1f75-4f2...,QLD,Female,Ultrices Vivamus ...,"cable, satellite,...",6.02,a,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,2.15
2022-04-24,67400260923,16982,4821,23684,73.85811747094266,7acb9a9f-3ef6-4d4...,QLD,Female,Eleifend PC,computer programm...,5.97,a,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,4.41
2021-08-19,86578477987,16982,4821,23684,41.45540307953735,e8be544a-a311-432...,QLD,Female,Leo In Consulting,"watch, clock, and...",6.43,a,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,2.67


In [3]:
from pyspark.sql.functions import *

modeldf = finaldf.withColumn("year", date_format(col("order_datetime"), "yyyy").cast("long"))
modeldf = modeldf.withColumn("month", date_format(col("order_datetime"), "MM").cast("long"))
modeldf = modeldf.withColumn("day", date_format(col("order_datetime"), "dd").cast("long"))

for field in ("postcode","take_rate"):
    modeldf = modeldf.withColumn(
        field,
        col(field).cast('double')
    )

modeldf = modeldf.drop("order_datetime","order_id","company_name","consumer_fraud_%","merchant_fraud_%")

modeldf.limit(5)

merchant_abn,user_id,postcode,consumer_id,dollar_value,state,gender,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,profit,year,month,day
94729574738,16982,4821.0,23684,156.3091245999424,QLD,Female,computer programm...,4.51,b,3128,468,271,707,947,735,1972,44,45102,50599,7.05,2021,8,21
45572698303,16982,4821.0,23684,115.38672513864692,QLD,Female,shoe shops,3.3,b,3128,468,271,707,947,735,1972,44,45102,50599,3.81,2022,4,24
34695415993,16982,4821.0,23684,35.730567249104645,QLD,Female,"cable, satellite,...",6.02,a,3128,468,271,707,947,735,1972,44,45102,50599,2.15,2021,8,21
67400260923,16982,4821.0,23684,73.85811747094266,QLD,Female,computer programm...,5.97,a,3128,468,271,707,947,735,1972,44,45102,50599,4.41,2022,4,24
86578477987,16982,4821.0,23684,41.45540307953735,QLD,Female,"watch, clock, and...",6.43,a,3128,468,271,707,947,735,1972,44,45102,50599,2.67,2021,8,19


In [4]:
modeldf = modeldf.groupBy("year","month","day","merchant_abn").agg(
    round(sum("profit"),2).alias("daily_profit"),
)

modeldf.limit(5)

                                                                                

year,month,day,merchant_abn,daily_profit
2021,3,2,45899477665,90.78
2022,6,24,60978195146,39.79
2021,9,27,55179323922,4.48
2022,2,13,71041015148,45.42
2021,7,3,46804135891,241.59


In [5]:
modeldf.write.mode("overwrite").parquet("../data/curated/profitmodeldf.parquet")

[Stage 27:>                                                       (0 + 16) / 17]

22/09/30 19:31:06 WARN MemoryManager: Total allocation exceeds 95.00% (1,843,370,370 bytes) of heap memory
Scaling row group sizes to 98.10% for 14 writers
22/09/30 19:31:06 WARN MemoryManager: Total allocation exceeds 95.00% (1,843,370,370 bytes) of heap memory
Scaling row group sizes to 91.56% for 15 writers
22/09/30 19:31:06 WARN MemoryManager: Total allocation exceeds 95.00% (1,843,370,370 bytes) of heap memory
Scaling row group sizes to 85.84% for 16 writers
22/09/30 19:31:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,843,370,370 bytes) of heap memory
Scaling row group sizes to 91.56% for 15 writers
22/09/30 19:31:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,843,370,370 bytes) of heap memory
Scaling row group sizes to 98.10% for 14 writers


                                                                                

In [5]:
from scipy.stats import f_oneway

df = modeldf.toPandas()

print(f_oneway(df["year"],df["daily_profit"]))
print(f_oneway(df["month"],df["daily_profit"]))
print(f_oneway(df["day"],df["daily_profit"]))
print(f_oneway(df["merchant_abn"],df["daily_profit"]))

                                                                                

F_onewayResult(statistic=309172028.9907342, pvalue=0.0)
F_onewayResult(statistic=277257.99111675523, pvalue=0.0)
F_onewayResult(statistic=196631.0444930565, pvalue=0.0)
F_onewayResult(statistic=5292918.117913812, pvalue=0.0)


In [13]:
sdf, tdf = modeldf.randomSplit([0.9, 0.1], seed=22)
print(sdf.count(),tdf.count())
sdf.limit(5)

                                                                                

1086970 120611


                                                                                

year,month,day,merchant_abn,daily_profit
2021,2,28,11802512399,70.58
2021,2,28,12497320096,10.95
2021,2,28,13118172970,56.0
2021,2,28,14602793938,30.88
2021,2,28,15061716544,66.61


In [14]:
# VectorAssembler creates new vectors from existing columns
from pyspark.ml.feature import VectorAssembler

features = 'features'
input_cols = ["year","month","day","merchant_abn"]

assembler = VectorAssembler(
    # which column to combine
    inputCols=input_cols, 
    # How should the combined columns be named
    outputCol=features
)

model_sdf = assembler.transform(sdf.dropna('any'))
# Display the features and targets for our model
#model_sdf.select('features').head(5), model_sdf.select('daily_profit').head(5)

In [15]:
from pyspark.ml.regression import DecisionTreeRegressor

model = DecisionTreeRegressor(
    featuresCol='features', 
    labelCol='daily_profit',
    maxDepth=12,
    maxBins=1500     
).fit(model_sdf)

                                                                                

In [16]:
# preprocess for predictions
predict_test = tdf.select(*input_cols)

assembler = VectorAssembler(
    inputCols=input_cols, 
    outputCol=features
)

predict_sdf = assembler.transform(predict_test).select(features)

#predict_sdf.show(1, vertical=True)

In [17]:
predictions = model.transform(predict_sdf)
#predictions.limit(5)

In [18]:
import numpy as np
ytest =  np.array(tdf.select("daily_profit").collect())
ypred = np.array(predictions.select("prediction").collect())
#ypred

                                                                                

In [19]:
import numpy as np
from sklearn.metrics import mean_absolute_error as mae
print(np.round(mae(ytest,ypred), 2))
print(ytest.min(),ytest.max(),ytest.mean(),ytest.std())

60.76
0.0 3417.9 65.14528956728657 122.3481086324676
