In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Start (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("analysis")
    # Memory requested for the driver and executor JVMs.
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "6g")
    # Use UTC as the SQL session time zone.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Eager evaluation: a DataFrame left as a cell's last expression is
    # rendered as a table instead of its schema repr.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

# Simulated transaction data

In [39]:
# Curated per-merchant daily transaction counts used to fit the model.
modeldf = spark.read.format("parquet").load("../data/curated/transmodeldf.parquet/")

In [40]:
# 90/10 random split with a fixed seed so the partition is repeatable.
# NOTE(review): tdf (presumably the hold-out set) is not used by any later
# cell visible in this notebook - confirm whether evaluation was intended.
sdf, tdf = modeldf.randomSplit(weights=[0.9, 0.1], seed=22)
print(*(part.count() for part in (sdf, tdf)))
sdf.limit(5)

                                                                                

1086981 120600


                                                                                

year,month,day,merchant_abn,daily_transactions
2021,2,28,10264435225,8
2021,2,28,10323485998,10
2021,2,28,10342410215,1
2021,2,28,10530696903,2
2021,2,28,10901349044,1


In [41]:
# Disabled draft kept as a bare string literal: nothing inside it executes, the
# cell only echoes the text back as its output. The draft builds a full
# year x month x day x merchant cross product for 2025 in driver-side Python
# lists before handing it to Spark.
# NOTE(review): if this is ever re-enabled, range(1,31,1) yields days 1..30, so
# day 31 is never generated, and every month is paired with every day, which
# produces impossible dates such as 30 February.
"""
year = [2025]

month = []
for a in range(1,13,1):
    month.append(a)

day = []
for b in range(1,31,1):
    day.append(b)
#print(day)

merchants = []
merchantdf = spark.read.parquet("../data/tables/tbl_merchants.parquet").toPandas()
for ind in merchantdf.index:
    merchants.append(merchantdf['merchant_abn'][ind])
#print(merchants)

columns = ["year","month","day","merchant_abn"]
data = []

for i in range(len(year)):
    for j in range(len(month)):
        for k in range(len(day)):
            for l in range(len(merchants)):
                data.append((year[i],month[j],day[k],merchants[l]))

futuredf = spark.createDataFrame(data).toDF(*columns)
futuredf.orderBy("day", ascending = False)
"""

'\nyear = [2025]\n\nmonth = []\nfor a in range(1,13,1):\n    month.append(a)\n\nday = []\nfor b in range(1,31,1):\n    day.append(b)\n#print(day)\n\nmerchants = []\nmerchantdf = spark.read.parquet("../data/tables/tbl_merchants.parquet").toPandas()\nfor ind in merchantdf.index:\n    merchants.append(merchantdf[\'merchant_abn\'][ind])\n#print(merchants)\n\ncolumns = ["year","month","day","merchant_abn"]\ndata = []\n\nfor i in range(len(year)):\n    for j in range(len(month)):\n        for k in range(len(day)):\n            for l in range(len(merchants)):\n                data.append((year[i],month[j],day[k],merchants[l]))\n\nfuturedf = spark.createDataFrame(data).toDF(*columns)\nfuturedf.orderBy("day", ascending = False)\n'

In [42]:
# Scoring frame for 2025: reuse the (month, day, merchant_abn) combinations
# present in modeldf, drop the label and overwrite the year.
#
# Overwriting `year` makes rows that came from different source years
# identical, so distinct() keeps one row per merchant/day. Without it the same
# 2025 date would be scored once per historical year it appears in, which
# inflates any later per-merchant aggregation of the predictions.
# (If modeldf only covers a single year, distinct() changes nothing.)
#
# F is the pyspark.sql.functions alias imported in the first cell, so no
# in-cell import is needed.
futuredf = (
    modeldf
    .drop("daily_transactions")
    .withColumn("year", F.lit(2025))
    .distinct()
    .orderBy("merchant_abn")
)
futuredf.limit(5)

year,month,day,merchant_abn
2025,10,4,10023283211
2025,1,3,10023283211
2025,6,9,10023283211
2025,7,15,10023283211
2025,9,23,10023283211


In [43]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns, and the name of the single vector column they are packed
# into for Spark ML.
input_cols = ["year", "month", "day", "merchant_abn"]
features = "features"

assembler = VectorAssembler(outputCol=features, inputCols=input_cols)

# Discard every row that contains a null before assembling the vectors.
model_sdf = assembler.transform(sdf.dropna(how="any"))
# To inspect the result:
#   model_sdf.select("features", "daily_transactions").head(5)

In [44]:
from pyspark.ml.regression import DecisionTreeRegressor

# Fit a regression tree that predicts daily_transactions from the assembled
# feature vector.
model = (
    DecisionTreeRegressor(
        labelCol="daily_transactions",
        featuresCol="features",
        maxBins=1000,
        maxDepth=12,
    )
    .fit(model_sdf)
)

                                                                                

In [45]:
# Build the scoring input: keep only the predictor columns of futuredf and pack
# them into the vector column the fitted model reads.
assembler = VectorAssembler(outputCol=features, inputCols=input_cols)
predict_test = futuredf.select(input_cols)
predict_sdf = assembler.transform(predict_test)

# To inspect a single assembled row: predict_sdf.show(1, vertical=True)

In [46]:
# Score the future rows, then drop the vector column so only the key columns
# and the prediction remain.
predictions = model.transform(predict_sdf).drop(features)
predictions.limit(5)

year,month,day,merchant_abn,prediction
2025,7,10,10023283211,6.308565210632676
2025,9,23,10023283211,6.308565210632676
2025,6,9,10023283211,6.308565210632676
2025,10,4,10023283211,6.308565210632676
2025,7,15,10023283211,6.308565210632676


# Simulated profit data

In [47]:
# Curated per-merchant daily profit figures used to fit the model.
# NOTE(review): this rebinds modeldf, which previously held the transaction data.
modeldf = spark.read.format("parquet").load("../data/curated/profitmodeldf.parquet/")

In [48]:
# 90/10 random split of the profit data with a fixed seed so the partition is
# repeatable.
# NOTE(review): tdf (presumably the hold-out set) is not used by any later
# cell visible in this notebook - confirm whether evaluation was intended.
sdf, tdf = modeldf.randomSplit(weights=[0.9, 0.1], seed=22)
print(*(part.count() for part in (sdf, tdf)))
sdf.limit(5)

                                                                                

1086992 120589


                                                                                

year,month,day,merchant_abn,daily_profit
2021,2,28,10264435225,25.95
2021,2,28,10323485998,73.77
2021,2,28,10342410215,34.67
2021,2,28,10530696903,78.15
2021,2,28,10901349044,2.26


In [49]:
# Scoring frame for 2025: reuse the (month, day, merchant_abn) combinations
# present in modeldf, drop the label and overwrite the year.
#
# Overwriting `year` makes rows that came from different source years
# identical, so distinct() keeps one row per merchant/day. Without it the same
# 2025 date would be scored once per historical year it appears in, which
# inflates any later per-merchant aggregation of the predictions.
# (If modeldf only covers a single year, distinct() changes nothing.)
#
# F is the pyspark.sql.functions alias imported in the first cell, so no
# in-cell import is needed.
futuredf = (
    modeldf
    .drop("daily_profit")
    .withColumn("year", F.lit(2025))
    .distinct()
    .orderBy("merchant_abn")
)
futuredf.limit(5)

year,month,day,merchant_abn
2025,10,4,10023283211
2025,1,3,10023283211
2025,6,9,10023283211
2025,7,15,10023283211
2025,9,23,10023283211


In [50]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns, and the name of the single vector column they are packed
# into for Spark ML.
input_cols = ["year", "month", "day", "merchant_abn"]
features = "features"

assembler = VectorAssembler(outputCol=features, inputCols=input_cols)

# Discard every row that contains a null before assembling the vectors.
model_sdf = assembler.transform(sdf.dropna(how="any"))
# To inspect the result:
#   model_sdf.select("features", "daily_profit").head(5)

In [51]:
from pyspark.ml.regression import DecisionTreeRegressor

# Fit a regression tree that predicts daily_profit from the assembled feature
# vector.
model = (
    DecisionTreeRegressor(
        labelCol="daily_profit",
        featuresCol="features",
        maxBins=1500,
        maxDepth=12,
    )
    .fit(model_sdf)
)

                                                                                

In [52]:
# Build the scoring input: keep only the predictor columns of futuredf and pack
# them into the vector column the fitted model reads.
assembler = VectorAssembler(outputCol=features, inputCols=input_cols)
predict_test = futuredf.select(input_cols)
predict_sdf = assembler.transform(predict_test)

# To inspect a single assembled row: predict_sdf.show(1, vertical=True)

In [53]:
# Score the future rows, then drop the vector column so only the key columns
# and the prediction remain.
predictions = model.transform(predict_sdf).drop('features')
predictions.limit(5)

year,month,day,merchant_abn,prediction
2025,7,10,10023283211,1.9113333333333329
2025,9,23,10023283211,1.8523636363636369
2025,6,9,10023283211,1.9113333333333329
2025,10,4,10023283211,2.201038961038961
2025,7,15,10023283211,1.9113333333333329
