In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Create a spark session
spark = (
    SparkSession.builder.appName("analysis")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

22/09/30 20:58:53 WARN Utils: Your hostname, AryansLaptop resolves to a loopback address: 127.0.1.1; using 172.25.9.137 instead (on interface eth0)
22/09/30 20:58:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/30 20:58:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Simulated transaction data

In [2]:
modeldf = spark.read.parquet("../data/curated/transmodeldf.parquet/")

                                                                                

In [3]:
sdf, tdf = modeldf.randomSplit([0.9, 0.1], seed=22)
print(sdf.count(),tdf.count())
sdf.limit(5)

                                                                                

1086863 120718


year,month,day,merchant_abn,daily_transactions
2021,2,28,10142254217,1
2021,2,28,11173270682,3
2021,2,28,11285988014,7
2021,2,28,11439466003,31
2021,2,28,13120157976,3


In [None]:
"""
year = [2025]

month = []
for a in range(1,13,1):
    month.append(a)

day = []
for b in range(1,31,1):
    day.append(b)
#print(day)

merchants = []
merchantdf = spark.read.parquet("../data/tables/tbl_merchants.parquet").toPandas()
for ind in merchantdf.index:
    merchants.append(merchantdf['merchant_abn'][ind])
#print(merchants)

columns = ["year","month","day","merchant_abn"]
data = []

for i in range(len(year)):
    for j in range(len(month)):
        for k in range(len(day)):
            for l in range(len(merchants)):
                data.append((year[i],month[j],day[k],merchants[l]))

futuredf = spark.createDataFrame(data).toDF(*columns)
futuredf.orderBy("day", ascending = False)
"""

In [4]:
from pyspark.sql.functions import lit

futuredf = modeldf.withColumn('year', lit(2025))
futuredf = futuredf.drop("daily_transactions")
futuredf = futuredf.orderBy("merchant_abn")
futuredf.limit(5)

year,month,day,merchant_abn
2025,10,26,10023283211
2025,4,19,10023283211
2025,7,11,10023283211
2025,4,17,10023283211
2025,8,15,10023283211


In [5]:
# VectorAssembler creates new vectors from existing columns
from pyspark.ml.feature import VectorAssembler

features = 'features'
input_cols = ["year","month","day","merchant_abn"]

assembler = VectorAssembler(
    # which column to combine
    inputCols=input_cols, 
    # How should the combined columns be named
    outputCol=features
)

model_sdf = assembler.transform(sdf.dropna('any'))
# Display the features and targets for our model
#model_sdf.select('features').head(5), model_sdf.select('daily_transactions').head(5)

In [6]:
from pyspark.ml.regression import DecisionTreeRegressor

model = DecisionTreeRegressor(
    featuresCol='features', 
    labelCol='daily_transactions',
    maxDepth=12,
    maxBins=1000
).fit(model_sdf)

                                                                                

In [7]:
# preprocess for predictions
predict_test = futuredf.select(*input_cols)

assembler = VectorAssembler(
    inputCols=input_cols, 
    outputCol=features
)

predict_sdf = assembler.transform(predict_test).select(features)

#predict_sdf.show(1, vertical=True)

In [8]:
predictions = model.transform(predict_sdf)
predictions.limit(5)

                                                                                

features,prediction
"[2025.0,5.0,6.0,1...",5.471064094586185
"[2025.0,6.0,7.0,1...",5.471064094586185
"[2025.0,4.0,24.0,...",5.471064094586185
"[2025.0,4.0,17.0,...",5.471064094586185
"[2025.0,10.0,26.0...",5.471064094586185


# Simulated profit data

In [12]:
modeldf = spark.read.parquet("../data/curated/profitmodeldf.parquet/")

In [13]:
sdf, tdf = modeldf.randomSplit([0.9, 0.1], seed=22)
print(sdf.count(),tdf.count())
sdf.limit(5)

                                                                                

1086953 120628


year,month,day,merchant_abn,daily_profit
2021,2,28,10023283211,1.26
2021,2,28,10648956813,79.19
2021,2,28,12870663624,54.79
2021,2,28,13285958344,49.39
2021,2,28,15130143460,24.57


In [14]:
from pyspark.sql.functions import lit

futuredf = modeldf.withColumn('year', lit(2025))
futuredf = futuredf.drop("daily_profit")
futuredf = futuredf.orderBy("merchant_abn")
futuredf.limit(5)

year,month,day,merchant_abn
2025,8,6,10023283211
2025,10,31,10023283211
2025,8,27,10023283211
2025,3,14,10023283211
2025,8,2,10023283211


In [15]:
# VectorAssembler creates new vectors from existing columns
from pyspark.ml.feature import VectorAssembler

features = 'features'
input_cols = ["year","month","day","merchant_abn"]

assembler = VectorAssembler(
    # which column to combine
    inputCols=input_cols, 
    # How should the combined columns be named
    outputCol=features
)

model_sdf = assembler.transform(sdf.dropna('any'))
# Display the features and targets for our model
#model_sdf.select('features').head(5), model_sdf.select('daily_transactions').head(5)

In [17]:
from pyspark.ml.regression import DecisionTreeRegressor

model = DecisionTreeRegressor(
    featuresCol='features', 
    labelCol='daily_profit',
    maxDepth=12,
    maxBins=1500
).fit(model_sdf)

                                                                                

In [18]:
# preprocess for predictions
predict_test = futuredf.select(*input_cols)

assembler = VectorAssembler(
    inputCols=input_cols, 
    outputCol=features
)

predict_sdf = assembler.transform(predict_test).select(features)

#predict_sdf.show(1, vertical=True)

In [19]:
predictions = model.transform(predict_sdf)
predictions.limit(5)

features,prediction
"[2025.0,12.0,9.0,...",2.2672463768115945
"[2025.0,3.0,14.0,...",1.9050442477876104
"[2025.0,10.0,6.0,...",2.2672463768115945
"[2025.0,8.0,6.0,1...",2.2672463768115945
"[2025.0,8.0,27.0,...",2.2672463768115945
