In [3]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, month, dayofmonth, dayofweek

In [4]:
# Start a Spark session named "Entropy", or reuse the one already running.
sp = (
    SparkSession.builder
    .appName("Entropy")
    .getOrCreate()
)

22/10/05 17:39:22 WARN Utils: Your hostname, J-L resolves to a loopback address: 127.0.1.1; using 172.18.71.108 instead (on interface eth0)
22/10/05 17:39:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 17:39:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load the curated transactions table. Parquet files carry their own schema,
# so no schema-inference option is needed ("inferSchema" is a CSV/JSON reader
# option and has no effect on a Parquet read).
trans = sp.read.parquet("../data/curated/transactions")
trans.show(3)

                                                                                

+-------+------------+------------+--------------+-----------+
|user_id|merchant_abn|dollar_value|order_datetime|   order_id|
+-------+------------+------------+--------------+-----------+
|  14935| 79417999332|      136.07|    2021-11-26|68719476736|
|      1| 46451548968|       72.62|    2021-11-26|68719476737|
|  14936| 89518629617|        3.08|    2021-11-26|68719476738|
+-------+------------+------------+--------------+-----------+
only showing top 3 rows



In [6]:
# Print the column names, types and nullability of the transactions table.
trans.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: float (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- order_id: long (nullable = true)



### Checking Entropy

In [12]:
from scipy.stats import entropy
from pyspark.sql import DataFrame
from pyspark.sql.functions import date_format

def compute_postcode_entropy(transactions: DataFrame, customers: DataFrame) -> dict:
    '''
    Compute, for each merchant, the entropy of its transactions across
    customer postcodes.

    Each transaction is tagged with the postcode of the customer who made it
    (default inner join on ``user_id``, so transactions without a matching
    customer row are dropped). Transactions are then counted per
    (merchant_abn, postcode) pair, and the counts of each merchant are passed
    to ``scipy.stats.entropy``, which normalises them to a probability
    distribution and returns the Shannon entropy in nats.

    Parameters
    ----------
    transactions : DataFrame
        Spark DataFrame with at least ``user_id`` and ``merchant_abn`` columns.
    customers : DataFrame
        Spark DataFrame with at least ``user_id`` and ``postcode`` columns.

    Returns
    -------
    dict
        Maps each merchant_abn to its postcode entropy, in order of first
        appearance in the collected counts. A merchant whose transactions all
        come from one postcode scores 0.
    '''
    customer_postcodes = customers.select(["user_id", "postcode"])
    trans_with_postcode = transactions.join(customer_postcodes, on="user_id")
    by_postcode = (
        trans_with_postcode.groupBy("merchant_abn", "postcode").count().toPandas()
    )

    # One grouped pass over the collected counts instead of re-scanning the
    # whole frame with a boolean mask for every merchant. sort=False keeps the
    # merchants in first-appearance order; rows with a null merchant_abn are
    # skipped by the grouping.
    counts_by_merchant = by_postcode.groupby("merchant_abn", sort=False)["count"]
    return {abn: entropy(counts) for abn, counts in counts_by_merchant}
    
    

def compute_monthly_entropy(transactions: DataFrame) -> dict:
    '''
    Compute, for each merchant, the entropy of its transactions across
    calendar months.

    Every transaction is labelled with the ``yyyy-MM`` month of its
    ``order_datetime``. Transactions are then counted per
    (merchant_abn, order_month) pair, and the counts of each merchant are
    passed to ``scipy.stats.entropy``, which normalises them to a probability
    distribution and returns the Shannon entropy in nats.

    Parameters
    ----------
    transactions : DataFrame
        Spark DataFrame with at least ``merchant_abn`` and ``order_datetime``
        columns.

    Returns
    -------
    dict
        Maps each merchant_abn to its monthly entropy, in order of first
        appearance in the collected counts. A merchant that traded in only
        one month scores 0.
    '''
    with_month = transactions.withColumn(
        "order_month", date_format("order_datetime", "yyyy-MM")
    )
    monthly = with_month.groupBy("merchant_abn", "order_month").count().toPandas()

    # One grouped pass over the collected counts instead of re-scanning the
    # whole frame with a boolean mask for every merchant. sort=False keeps the
    # merchants in first-appearance order; rows with a null merchant_abn are
    # skipped by the grouping.
    counts_by_merchant = monthly.groupby("merchant_abn", sort=False)["count"]
    return {abn: entropy(counts) for abn, counts in counts_by_merchant}

In [8]:
# Load the curated consumer details. Parquet is self-describing, so the
# "inferSchema" option (a CSV/JSON reader option) is not needed here.
custs = sp.read.parquet("../data/curated/consumer_details/")
custs.printSchema()
# NOTE(review): show() prints names and addresses; consider clearing this
# cell's output before sharing the notebook.
custs.show(2)

root
 |-- consumer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- user_id: long (nullable = true)

+-----------+-----------------+--------------------+-----+--------+------+-------+
|consumer_id|             name|             address|state|postcode|gender|user_id|
+-----------+-----------------+--------------------+-----+--------+------+-------+
|     870353|    Charles Davis|     048 Ward Common|   SA|    5261|  Male| 213579|
|     923963|Jacqueline Nelson|151 Lynn Gateway ...|  QLD|    4744|Female| 213580|
+-----------+-----------------+--------------------+-----+--------+------+-------+
only showing top 2 rows



In [21]:
# Per-merchant entropy over customer postcodes, then over calendar months.
postcode_entropy = compute_postcode_entropy(transactions=trans, customers=custs)
m_entropy = compute_monthly_entropy(transactions=trans)

[Stage 21:>                                                         (0 + 8) / 8]

22/10/05 17:52:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/05 17:52:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/05 17:52:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/05 17:52:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/05 17:52:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/05 17:52:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/05 17:52:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/05 17:52:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/05 17:52:25 WARN RowBasedKeyValueBatch: Calling spill() on

                                                                                

In [24]:
# Turn each {abn: entropy} mapping into a two-column pandas frame.
m_entropy_pd = pd.DataFrame(
    {"abn": list(m_entropy.keys()), "monthly entropy": list(m_entropy.values())}
)
p_entropy_pd = pd.DataFrame(
    {"abn": list(postcode_entropy.keys()), "postcode entropy": list(postcode_entropy.values())}
)

# Default inner merge: only merchants present in both frames are kept.
final_entropy = m_entropy_pd.merge(p_entropy_pd, on="abn")

In [25]:
# Per-column flag: does any row hold a missing value?
final_entropy.isna().any()

abn                 False
monthly entropy     False
postcode entropy    False
dtype: bool

In [34]:
# Replace the space-separated pandas headers with snake_case names; "merchant_abn"
# matches the key column used to join against the transactions table below.
# Note: this renames the columns of final_entropy in place.
final_entropy.columns = ["merchant_abn", "monthly_entropy", "postcode_entropy"]
# Hand the pandas frame back to Spark so it can be joined with Spark data.
final_entropy_sp = sp.createDataFrame(final_entropy)

In [35]:
# Attach each merchant's total transaction value ("revenue") to its entropy scores.
merchant_data = final_entropy_sp.join(
    trans.groupBy("merchant_abn").agg(F.sum("dollar_value").alias("revenue")),
    on="merchant_abn",
)
merchant_data.show(2)

                                                                                

+------------+------------------+-----------------+-----------------+
|merchant_abn|   monthly_entropy| postcode_entropy|          revenue|
+------------+------------------+-----------------+-----------------+
| 83412691377|2.9862164292283113|7.875183515435049|498536.9797888398|
| 38700038932|2.9892000618696457|7.755078148797462|9546185.241102219|
+------------+------------------+-----------------+-----------------+
only showing top 2 rows



In [36]:
from pyspark.sql.types import FloatType

# Store both entropy columns as single-precision floats (revenue is left as is).
merchant_data = (
    merchant_data
    .withColumn("monthly_entropy", col("monthly_entropy").cast(FloatType()))
    .withColumn("postcode_entropy", col("postcode_entropy").cast(FloatType()))
)

merchant_data.show(2)

+------------+---------------+----------------+-----------------+
|merchant_abn|monthly_entropy|postcode_entropy|          revenue|
+------------+---------------+----------------+-----------------+
| 83412691377|      2.9862165|       7.8751836|498536.9797888398|
| 38700038932|         2.9892|       7.7550783|9546185.241102219|
+------------+---------------+----------------+-----------------+
only showing top 2 rows



In [43]:
# Persist the per-merchant table, replacing any previous export at this path.
merchant_data.write.mode("overwrite").parquet("../data/tables/entropy")

                                                                                