In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Start (or reuse) the Spark session that every later cell relies on.
def _start_spark_session():
    """Return the SparkSession named "analysis" with this notebook's settings applied."""
    settings = {
        # Eager evaluation lets a DataFrame left as a cell's last expression render as a table.
        "spark.sql.repl.eagerEval.enabled": True,
        "spark.sql.parquet.cacheMetadata": "true",
        "spark.sql.session.timeZone": "Etc/UTC",
        "spark.driver.memory": "2g",
        "spark.executor.memory": "6g",
    }
    builder = SparkSession.builder.appName("analysis")
    # Apply the options in the order they are listed above.
    for option, value in settings.items():
        builder = builder.config(option, value)
    return builder.getOrCreate()


spark = _start_spark_session()

22/10/02 20:00:45 WARN Utils: Your hostname, DESKTOP-JJJD94T resolves to a loopback address: 127.0.1.1; using 192.168.177.62 instead (on interface eth0)
22/10/02 20:00:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/02 20:00:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/02 20:00:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Simulated transaction data

In [19]:
# Curated table with one row per (year, month, day, merchant_abn) and its daily_transactions.
transactions_path = "../data/curated/transmodeldf.parquet/"
modeldf = spark.read.parquet(transactions_path)

In [20]:
# Seeded 90% / 10% random split; sdf is the part the model is fitted on further down.
# NOTE(review): tdf is only counted here and is not used again in the cells visible in this notebook.
splits = modeldf.randomSplit([0.9, 0.1], seed=22)
sdf, tdf = splits[0], splits[1]

# Report the size of each part, then preview the first rows of the larger one.
train_rows = sdf.count()
test_rows = tdf.count()
print(train_rows, test_rows)
sdf.limit(5)

                                                                                

1086508 121073


                                                                                

year,month,day,merchant_abn,daily_transactions
2021,2,28,10323485998,10
2021,2,28,10342410215,1
2021,2,28,10530696903,2
2021,2,28,10714068705,2
2021,2,28,10945019164,2


In [41]:
# Disabled experiment, kept as a bare string literal so that none of it executes;
# the cell merely evaluates (and displays) the string.
# The text inside would build a 2025 grid of (year, month, day, merchant_abn) rows
# from Python lists and turn it into a Spark DataFrame.
# NOTE(review): range(1,31,1) in the text yields days 1-30 only, so day 31 would be
# missing if this code were ever re-enabled.
"""
year = [2025]

month = []
for a in range(1,13,1):
    month.append(a)

day = []
for b in range(1,31,1):
    day.append(b)
#print(day)

merchants = []
merchantdf = spark.read.parquet("../data/tables/tbl_merchants.parquet").toPandas()
for ind in merchantdf.index:
    merchants.append(merchantdf['merchant_abn'][ind])
#print(merchants)

columns = ["year","month","day","merchant_abn"]
data = []

for i in range(len(year)):
    for j in range(len(month)):
        for k in range(len(day)):
            for l in range(len(merchants)):
                data.append((year[i],month[j],day[k],merchants[l]))

futuredf = spark.createDataFrame(data).toDF(*columns)
futuredf.orderBy("day", ascending = False)
"""

'\nyear = [2025]\n\nmonth = []\nfor a in range(1,13,1):\n    month.append(a)\n\nday = []\nfor b in range(1,31,1):\n    day.append(b)\n#print(day)\n\nmerchants = []\nmerchantdf = spark.read.parquet("../data/tables/tbl_merchants.parquet").toPandas()\nfor ind in merchantdf.index:\n    merchants.append(merchantdf[\'merchant_abn\'][ind])\n#print(merchants)\n\ncolumns = ["year","month","day","merchant_abn"]\ndata = []\n\nfor i in range(len(year)):\n    for j in range(len(month)):\n        for k in range(len(day)):\n            for l in range(len(merchants)):\n                data.append((year[i],month[j],day[k],merchants[l]))\n\nfuturedf = spark.createDataFrame(data).toDF(*columns)\nfuturedf.orderBy("day", ascending = False)\n'

In [21]:
from pyspark.sql.functions import lit

# Build the frame to score: every (month, day, merchant_abn) seen in modeldf,
# relabelled as year 2025 and without the target column.
futuredf = (
    modeldf
    .drop("daily_transactions")
    # "year" already exists, so withColumn overwrites it in place (column order is kept).
    .withColumn("year", lit(2025))
    # Overwriting the year can map rows from different source years onto the same
    # (month, day, merchant_abn); keep a single row so each future day is predicted
    # and written exactly once.
    .dropDuplicates()
    .orderBy("merchant_abn")
)
futuredf.limit(5)

year,month,day,merchant_abn
2025,4,30,10023283211
2025,3,10,10023283211
2025,11,13,10023283211
2025,8,2,10023283211
2025,1,23,10023283211


In [22]:
from pyspark.ml.feature import VectorAssembler

# Column names reused by the prediction cells below.
input_cols = ["year", "month", "day", "merchant_abn"]
features = "features"

# VectorAssembler packs the input columns into one vector column named `features`.
assembler = VectorAssembler(outputCol=features, inputCols=input_cols)

# Rows containing any null are dropped before the feature vector is assembled.
model_sdf = assembler.transform(sdf.dropna(how="any"))

# To inspect features and target: model_sdf.select('features', 'daily_transactions').head(5)

In [23]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision tree regressor that learns daily_transactions from the assembled feature vector.
transactions_tree = DecisionTreeRegressor(
    labelCol="daily_transactions",
    featuresCol="features",
    maxBins=1000,
    maxDepth=12,
)
model = transactions_tree.fit(model_sdf)

                                                                                

In [24]:
# Prepare the 2025 rows for scoring: same input columns, same output vector column as training.
assembler = VectorAssembler(outputCol=features, inputCols=input_cols)

predict_test = futuredf.select(*input_cols)
predict_sdf = assembler.transform(predict_test)

# Uncomment to inspect one assembled row:
# predict_sdf.show(1, vertical=True)

In [25]:
# Score the 2025 rows, then discard the assembled vector so only the input columns
# and the model's "prediction" column remain.
predictions = model.transform(predict_sdf).drop(features)
predictions.limit(5)

year,month,day,merchant_abn,prediction
2025,2,5,10023283211,6.192745873052778
2025,8,9,10023283211,6.192745873052778
2025,7,28,10023283211,6.192745873052778
2025,8,2,10023283211,6.192745873052778
2025,4,30,10023283211,6.192745873052778


In [26]:
# Persist the transaction forecasts, replacing the output of any earlier run.
predictions.write.parquet("../data/curated/futuretrans.parquet", mode="overwrite")

                                                                                

# Simulated profit data

In [27]:
# Curated table with one row per (year, month, day, merchant_abn) and its daily_profit.
# Note: this rebinds modeldf, which previously held the transaction table.
profit_path = "../data/curated/profitmodeldf.parquet/"
modeldf = spark.read.parquet(profit_path)

In [28]:
# Seeded 90% / 10% random split of the profit table; sdf is the part used for fitting below.
# NOTE(review): tdf is only counted here and is not used again in the cells visible in this notebook.
splits = modeldf.randomSplit([0.9, 0.1], seed=22)
sdf, tdf = splits[0], splits[1]

# Report the size of each part, then preview the first rows of the larger one.
train_rows = sdf.count()
test_rows = tdf.count()
print(train_rows, test_rows)
sdf.limit(5)

                                                                                

1086583 120998


                                                                                

year,month,day,merchant_abn,daily_profit
2021,2,28,11802512399,70.58
2021,2,28,12014115545,9.52
2021,2,28,12426913844,6.42
2021,2,28,12497320096,10.95
2021,2,28,13376463476,0.78


In [29]:
from pyspark.sql.functions import lit

# Build the frame to score: every (month, day, merchant_abn) seen in modeldf,
# relabelled as year 2025 and without the target column.
futuredf = (
    modeldf
    .drop("daily_profit")
    # "year" already exists, so withColumn overwrites it in place (column order is kept).
    .withColumn("year", lit(2025))
    # Overwriting the year can map rows from different source years onto the same
    # (month, day, merchant_abn); keep a single row so each future day is predicted
    # and written exactly once.
    .dropDuplicates()
    .orderBy("merchant_abn")
)
futuredf.limit(5)

year,month,day,merchant_abn
2025,7,6,10023283211
2025,10,2,10023283211
2025,7,16,10023283211
2025,12,24,10023283211
2025,9,7,10023283211


In [30]:
from pyspark.ml.feature import VectorAssembler

# Column names reused by the prediction cells below.
input_cols = ["year", "month", "day", "merchant_abn"]
features = "features"

# VectorAssembler packs the input columns into one vector column named `features`.
assembler = VectorAssembler(outputCol=features, inputCols=input_cols)

# Rows containing any null are dropped before the feature vector is assembled.
model_sdf = assembler.transform(sdf.dropna(how="any"))

# To inspect features and target: model_sdf.select('features', 'daily_profit').head(5)

In [31]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision tree regressor that learns daily_profit from the assembled feature vector.
profit_tree = DecisionTreeRegressor(
    labelCol="daily_profit",
    featuresCol="features",
    maxBins=1500,
    maxDepth=12,
)
model = profit_tree.fit(model_sdf)

                                                                                

In [32]:
# Prepare the 2025 rows for scoring: same input columns, same output vector column as training.
assembler = VectorAssembler(outputCol=features, inputCols=input_cols)

predict_test = futuredf.select(*input_cols)
predict_sdf = assembler.transform(predict_test)

# Uncomment to inspect one assembled row:
# predict_sdf.show(1, vertical=True)

In [33]:
# Score the 2025 rows, then discard the assembled vector so only the input columns
# and the model's "prediction" column remain.
scored = model.transform(predict_sdf)
predictions = scored.drop("features")
predictions.limit(5)

year,month,day,merchant_abn,prediction
2025,8,16,10023283211,1.7604580152671752
2025,12,24,10023283211,2.128461538461538
2025,10,22,10023283211,2.128461538461538
2025,10,18,10023283211,1.444736842105263
2025,7,6,10023283211,2.7110000000000007


In [34]:
# Persist the profit forecasts, replacing the output of any earlier run.
predictions.write.parquet("../data/curated/futureprofit.parquet", mode="overwrite")

                                                                                