In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from pyspark.sql.functions import *
from pyspark.sql.types import DateType
from pyspark.sql import SparkSession, DataFrame

In [2]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

from pyspark.ml import Pipeline
from pyspark.sql.types import FloatType
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

In [3]:
sp = (
    SparkSession.builder.appName("Model")
    .config("spark.sql.session.timeZone", "+11")
    #.config("spark.driver.memory", "10g")
    #.config("spark.executor.memory", "10g")
    #.config('spark.sql.parquet.cacheMetadata', 'True')
    .getOrCreate()
)
sp

22/10/08 22:33:18 WARN Utils: Your hostname, J-L resolves to a loopback address: 127.0.1.1; using 172.28.113.244 instead (on interface eth0)
22/10/08 22:33:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/08 22:33:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
transactions = sp.read.option("inferSchema", True).parquet("../data/processed/transactions")
merchants = sp.read.option("inferSchema", True).parquet("../data/processed/merchants")
customers = sp.read.option("inferSchema", True).parquet("../data/processed/customers")

                                                                                

In [9]:
transactions.show(1)
merchants.show(1)

+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
|order_id|user_id|merchant_abn|dollar_value|order_datetime|Natural_var|Potential_Outlier|holiday|dayofmonth|month|dayofweek|
+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
|       3|      3| 60956456424|      136.68|    2021-08-20|          0|                0|      0|        20|    8|        6|
+--------+-------+------------+------------+--------------+-----------+-----------------+-------+----------+-----+---------+
only showing top 1 row

+------------+-------------+--------------+--------+----+---------------+---------------+----------------+-----------------+
|merchant_abn|         name|Earnings_Class|BNPL_Fee|tags|avg_monthly_inc|monthly_entropy|postcode_entropy|          revenue|
+------------+-------------+--------------+--------+----+---------------+---------------+------------

In [11]:
transactions = transactions.withColumn("year", year(col("order_datetime"))).drop("order_datetime")
transactions.show(1)

+--------+-------+------------+------------+-----------+-----------------+-------+----------+-----+---------+----+
|order_id|user_id|merchant_abn|dollar_value|Natural_var|Potential_Outlier|holiday|dayofmonth|month|dayofweek|year|
+--------+-------+------------+------------+-----------+-----------------+-------+----------+-----+---------+----+
|       3|      3| 60956456424|      136.68|          0|                0|      0|        20|    8|        6|2021|
+--------+-------+------------+------------+-----------+-----------------+-------+----------+-----+---------+----+
only showing top 1 row



In [18]:
monthly_revenue = transactions.groupBy(["merchant_abn", "year", "month"]).sum("dollar_value").withColumnRenamed("sum(dollar_value)", "prev_month_val")

In [20]:
merged = monthly_revenue.join(merchants, on="merchant_abn").drop("name", "revenue")
merged.show(1)

                                                                                

+------------+----+-----+-----------------+--------------+--------+----+---------------+---------------+----------------+
|merchant_abn|year|month|   prev_month_val|Earnings_Class|BNPL_Fee|tags|avg_monthly_inc|monthly_entropy|postcode_entropy|
+------------+----+-----+-----------------+--------------+--------+----+---------------+---------------+----------------+
| 41383736952|2021|    8|58710.59999227524|             c|    2.25|   1|    -0.71428573|      2.9871547|       7.8581905|
+------------+----+-----+-----------------+--------------+--------+----+---------------+---------------+----------------+
only showing top 1 row



In [22]:
merged = merged.filter(((col("year") == 2021) & (col("month") > 2)) | ((col("year") == 2022) & (col("month") < 9)))

In [39]:
from pyspark.sql.types import IntegerType, DoubleType

@udf(returnType=IntegerType())
def split6(year: int, month: int):
    """
    Function to split into 3 x 6 month halves
    """
    if year == 2021:
        if month < 9:
            return 0
        return 1
    else:   # year is 2022
        if month < 3:
            return 1
    return 2

In [40]:
merged = merged.withColumn("semi_annual", split6(col("year"), col("month")))
merged.show(1)

+------------+----+-----+-----------------+--------------+--------+----+---------------+---------------+----------------+-----------+
|merchant_abn|year|month|   prev_month_val|Earnings_Class|BNPL_Fee|tags|avg_monthly_inc|monthly_entropy|postcode_entropy|semi_annual|
+------------+----+-----+-----------------+--------------+--------+----+---------------+---------------+----------------+-----------+
| 41383736952|2021|    8|58710.59999227524|             c|    2.25|   1|    -0.71428573|      2.9871547|       7.8581905|          0|
+------------+----+-----+-----------------+--------------+--------+----+---------------+---------------+----------------+-----------+
only showing top 1 row



In [41]:
base = merged.groupBy(["merchant_abn", "semi_annual"]).avg("avg_monthly_inc", "avg_monthly_inc", "avg_monthly_inc", "prev_month_val")
base.show(1)

+------------+-----------+--------------------+--------------------+--------------------+-------------------+
|merchant_abn|semi_annual|avg(avg_monthly_inc)|avg(avg_monthly_inc)|avg(avg_monthly_inc)|avg(prev_month_val)|
+------------+-----------+--------------------+--------------------+--------------------+-------------------+
| 68559320474|          0|  -1.952380895614624|  -1.952380895614624|  -1.952380895614624| 331634.50163896877|
+------------+-----------+--------------------+--------------------+--------------------+-------------------+
only showing top 1 row



In [48]:
base_pd = base.join(merchants.select("merchant_abn", "Earnings_Class", "BNPL_Fee", "tags"), on="merchant_abn").toPandas()
base_pd.head()

Unnamed: 0,merchant_abn,semi_annual,avg(avg_monthly_inc),avg(avg_monthly_inc).1,avg(avg_monthly_inc).2,avg(prev_month_val),Earnings_Class,BNPL_Fee,tags
0,68559320474,0,-1.952381,-1.952381,-1.952381,331634.501639,b,4.2,11
1,58688453868,2,-0.095238,-0.095238,-0.095238,11676.213308,b,4.4,18
2,33437657911,1,-0.047619,-0.047619,-0.047619,10473.841675,c,1.91,5
3,70223714517,2,0.095238,0.095238,0.095238,53736.749983,b,3.51,6
4,13118172970,0,-0.52381,-0.52381,-0.52381,53856.799963,b,4.57,5


In [57]:
outcome = []
for key, row in base_pd.iterrows():
    semi = row["semi_annual"]

    # Last half
    if semi == 2:
        outcome.append(np.nan)
        continue

    small = base_pd[base_pd["merchant_abn"] == row["merchant_abn"]]
    if (semi + 1) in small["semi_annual"]:
        outcome.append(small[small["semi_annual"] == (semi + 1)]["avg(prev_month_val)"].values[0])
        continue

    outcome.append(0)
    

base_pd["outcome"] = outcome


In [58]:
base_pd.head()

Unnamed: 0,merchant_abn,semi_annual,avg(avg_monthly_inc),avg(avg_monthly_inc).1,avg(avg_monthly_inc).2,avg(prev_month_val),Earnings_Class,BNPL_Fee,tags,outcome
0,68559320474,0,-1.952381,-1.952381,-1.952381,331634.501639,b,4.2,11,0.0
1,58688453868,2,-0.095238,-0.095238,-0.095238,11676.213308,b,4.4,18,
2,33437657911,1,-0.047619,-0.047619,-0.047619,10473.841675,c,1.91,5,11503.426658
3,70223714517,2,0.095238,0.095238,0.095238,53736.749983,b,3.51,6,
4,13118172970,0,-0.52381,-0.52381,-0.52381,53856.799963,b,4.57,5,0.0


In [60]:
base_pd[base_pd["outcome"] != 0].count()

merchant_abn            3999
semi_annual             3999
avg(avg_monthly_inc)    3999
avg(avg_monthly_inc)    3999
avg(avg_monthly_inc)    3999
avg(prev_month_val)     3999
Earnings_Class          3999
BNPL_Fee                3999
tags                    3999
outcome                    2
dtype: int64

In [62]:
base_pd[base_pd["outcome"] != 0].dropna()

Unnamed: 0,merchant_abn,semi_annual,avg(avg_monthly_inc),avg(avg_monthly_inc).1,avg(avg_monthly_inc).2,avg(prev_month_val),Earnings_Class,BNPL_Fee,tags,outcome
2,33437657911,1,-0.047619,-0.047619,-0.047619,10473.841675,c,1.91,5,11503.426658
11469,58688453868,0,-0.095238,-0.095238,-0.095238,9619.079993,b,4.4,18,11819.85999


22/10/09 14:47:05 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 53928439 ms exceeds timeout 120000 ms
22/10/09 14:47:05 WARN SparkContext: Killing executors is not supported by current scheduler.
