In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Create a spark session
spark = (
    SparkSession.builder.appName("analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

22/09/30 18:06:35 WARN Utils: Your hostname, AryansLaptop resolves to a loopback address: 127.0.1.1; using 172.25.9.137 instead (on interface eth0)
22/09/30 18:06:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/30 18:06:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/30 18:06:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Profit per merchant per month

In [2]:
finaldf = spark.read.parquet("../data/curated/finaldf.parquet/")
finaldf = finaldf.withColumn('profit', round(finaldf.dollar_value * (finaldf.take_rate / 100), 2))
finaldf.limit(5)

                                                                                

22/09/30 18:06:41 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%,profit
2021-08-21,94729574738,16982,4821,23684,156.3091245999424,407ce815-54e1-4ae...,QLD,Female,Scelerisque Corpo...,computer programm...,4.51,b,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,7.05
2022-04-24,45572698303,16982,4821,23684,115.38672513864692,72160f0d-06c5-432...,QLD,Female,Libero Proin Corp.,shoe shops,3.3,b,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,3.81
2021-08-21,34695415993,16982,4821,23684,35.730567249104645,d4524ef6-1f75-4f2...,QLD,Female,Ultrices Vivamus ...,"cable, satellite,...",6.02,a,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,2.15
2022-04-24,67400260923,16982,4821,23684,73.85811747094266,7acb9a9f-3ef6-4d4...,QLD,Female,Eleifend PC,computer programm...,5.97,a,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,4.41
2021-08-19,86578477987,16982,4821,23684,41.45540307953735,e8be544a-a311-432...,QLD,Female,Leo In Consulting,"watch, clock, and...",6.43,a,3128,468,271,707,947,735,1972,44,45102,50599,0.1,0.1,2.67


In [3]:
from pyspark.sql.functions import *

modeldf = finaldf.withColumn("year", date_format(col("order_datetime"), "yyyy").cast("long"))
modeldf = modeldf.withColumn("month", date_format(col("order_datetime"), "MM").cast("long"))
modeldf = modeldf.withColumn("day", date_format(col("order_datetime"), "dd").cast("long"))

for field in ("postcode","take_rate"):
    modeldf = modeldf.withColumn(
        field,
        col(field).cast('double')
    )

modeldf = modeldf.drop("order_datetime","order_id","company_name","consumer_fraud_%","merchant_fraud_%")

modeldf.limit(5)

merchant_abn,user_id,postcode,consumer_id,dollar_value,state,gender,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,profit,year,month,day
94729574738,16982,4821.0,23684,156.3091245999424,QLD,Female,computer programm...,4.51,b,3128,468,271,707,947,735,1972,44,45102,50599,7.05,2021,8,21
45572698303,16982,4821.0,23684,115.38672513864692,QLD,Female,shoe shops,3.3,b,3128,468,271,707,947,735,1972,44,45102,50599,3.81,2022,4,24
34695415993,16982,4821.0,23684,35.730567249104645,QLD,Female,"cable, satellite,...",6.02,a,3128,468,271,707,947,735,1972,44,45102,50599,2.15,2021,8,21
67400260923,16982,4821.0,23684,73.85811747094266,QLD,Female,computer programm...,5.97,a,3128,468,271,707,947,735,1972,44,45102,50599,4.41,2022,4,24
86578477987,16982,4821.0,23684,41.45540307953735,QLD,Female,"watch, clock, and...",6.43,a,3128,468,271,707,947,735,1972,44,45102,50599,2.67,2021,8,19


In [4]:
modeldf = modeldf.groupBy("year","month","merchant_abn").agg(
    round(sum("profit"),2).alias("monthly_profit"),
)

modeldf.limit(5)

                                                                                

year,month,merchant_abn,monthly_profit
2022,6,38736067045,2822.71
2021,11,90477374366,1090.76
2022,4,52160665475,3216.54
2021,5,80779820715,2377.41
2021,11,74019238521,1415.73


In [5]:
from scipy.stats import f_oneway

df = modeldf.toPandas()

print(f_oneway(df["year"],df["monthly_profit"]))
print(f_oneway(df["month"],df["monthly_profit"]))
#print(f_oneway(df["day"],df["daily_profit"]))
print(f_oneway(df["merchant_abn"],df["monthly_profit"]))

                                                                                

F_onewayResult(statistic=10804.949047729311, pvalue=0.0)
F_onewayResult(statistic=12478.133604736146, pvalue=0.0)
F_onewayResult(statistic=331106.6266711847, pvalue=0.0)


In [6]:
sdf, tdf = modeldf.randomSplit([0.99, 0.01], seed=22)
print(sdf.count(),tdf.count())
sdf.limit(5)

                                                                                

74249 721


                                                                                

year,month,merchant_abn,monthly_profit
2021,2,10930486968,4.31
2021,2,11121775571,60.58
2021,2,11173270682,1.72
2021,2,11215815177,85.82
2021,2,11355018654,53.7


In [7]:
print(sdf.filter(sdf["merchant_abn"] == 10930486968))



+----+-----+------------+--------------+
|year|month|merchant_abn|monthly_profit|
+----+-----+------------+--------------+
|2021|    2| 10930486968|          4.31|
|2022|    8| 10930486968|        191.86|
|2022|    3| 10930486968|         74.97|
|2021|    8| 10930486968|        128.72|
|2021|   10| 10930486968|         147.6|
|2022|    5| 10930486968|        141.13|
|2022|    9| 10930486968|        164.92|
|2021|    5| 10930486968|        111.05|
|2021|   12| 10930486968|         153.6|
|2022|    7| 10930486968|        168.63|
|2021|    3| 10930486968|         66.92|
|2021|    4| 10930486968|        105.94|
|2022|    2| 10930486968|        108.25|
|2021|   11| 10930486968|        162.93|
|2022|    4| 10930486968|        120.22|
|2022|    6| 10930486968|        108.03|
|2021|    7| 10930486968|        126.05|
|2022|    1| 10930486968|          57.6|
|2022|   10| 10930486968|        144.62|
|2021|    6| 10930486968|        112.29|
+----+-----+------------+--------------+
only showing top

                                                                                

In [8]:
# VectorAssembler creates new vectors from existing columns
from pyspark.ml.feature import VectorAssembler

features = 'features'
input_cols = ["year","month","merchant_abn"]

assembler = VectorAssembler(
    # which column to combine
    inputCols=input_cols, 
    # How should the combined columns be named
    outputCol=features
)

model_sdf = assembler.transform(sdf.dropna('any'))
# Display the features and targets for our model
model_sdf.select('features').head(5), model_sdf.select('monthly_profit').head(5)

                                                                                

([Row(features=DenseVector([2021.0, 2.0, 10930486968.0])),
  Row(features=DenseVector([2021.0, 2.0, 11121775571.0])),
  Row(features=DenseVector([2021.0, 2.0, 11173270682.0])),
  Row(features=DenseVector([2021.0, 2.0, 11215815177.0])),
  Row(features=DenseVector([2021.0, 2.0, 11355018654.0]))],
 [Row(monthly_profit=4.31),
  Row(monthly_profit=60.58),
  Row(monthly_profit=1.72),
  Row(monthly_profit=85.82),
  Row(monthly_profit=53.7)])

In [9]:
from pyspark.ml.regression import DecisionTreeRegressor

model = DecisionTreeRegressor(
    featuresCol='features', 
    labelCol='monthly_profit',
    maxDepth=12,
    maxBins=15000
).fit(model_sdf)

                                                                                

In [10]:
# preprocess for predictions
predict_test = tdf.select(*input_cols)

assembler = VectorAssembler(
    inputCols=input_cols, 
    outputCol=features
)

predict_sdf = assembler.transform(predict_test).select(features)

predict_sdf.show(1, vertical=True)



-RECORD 0------------------------
 features | [2021.0,2.0,7.556... 
only showing top 1 row



                                                                                

In [11]:
predictions = model.transform(predict_sdf)
predictions.limit(5)

                                                                                

features,prediction
"[2021.0,2.0,7.556...",51.4073919107392
"[2021.0,3.0,7.731...",877.9582543146206
"[2021.0,4.0,2.120...",189.06
"[2021.0,4.0,2.386...",815.7596950240772
"[2021.0,4.0,6.095...",877.9582543146206


In [12]:
import numpy as np
ytest =  np.array(tdf.select("monthly_profit").collect())
ypred = np.array(predictions.select("prediction").collect())
ypred

                                                                                

array([[5.14073919e+01],
       [8.77958254e+02],
       [1.89060000e+02],
       [8.15759695e+02],
       [8.77958254e+02],
       [8.77958254e+02],
       [8.77958254e+02],
       [1.02640363e+03],
       [1.03088604e+03],
       [1.83681429e+02],
       [1.03088604e+03],
       [6.53393062e+02],
       [9.13812259e+02],
       [9.04516116e+02],
       [1.03088604e+03],
       [1.03088604e+03],
       [1.03088604e+03],
       [1.03088604e+03],
       [1.03088604e+03],
       [1.03088604e+03],
       [1.35763268e+03],
       [1.35763268e+03],
       [7.77608611e+02],
       [1.35763268e+03],
       [1.35763268e+03],
       [8.26105176e+02],
       [6.33778491e+02],
       [1.04951487e+03],
       [9.13812259e+02],
       [1.06422238e+03],
       [1.35590935e+03],
       [1.24213036e+03],
       [1.24213036e+03],
       [1.24213036e+03],
       [1.18332867e+03],
       [1.24213036e+03],
       [3.21335236e+02],
       [1.24213036e+03],
       [5.47190535e+02],
       [4.87419500e+03],


In [13]:
import numpy as np
from sklearn.metrics import mean_absolute_error as mae
print(np.round(mae(ytest,ypred), 2))
print(ytest.min(),ytest.max(),ytest.mean(),ytest.std())

1165.79
0.15 23479.07 1017.4637725381415 2424.2146302384876
