# Testing environment to transfer functions to script

In [1]:
from collections import defaultdict
import os
os.chdir("/Users/oliver/Documents/GitHub/generic-buy-now-pay-later-project-group-19/scripts")
import sys
import argparse
import re
# ... TODO: Add to this as necessary

# External Libraries
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, DataFrame
# ... TODO: Add to this as necessary

# Our Modules
from utilities.log_utilities import logger
import utilities.print_utilities as PRINT
import utilities.read_utilities as READ
import utilities.clean_utilities as CLEAN
import utilities.agg_utilities as AGG
import utilities.write_utilities as WRITE

[nltk_data] Downloading package omw-1.4 to /Users/oliver/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oliver/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /Users/oliver/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /Users/oliver/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/oliver/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Build the notebook's Spark session one option at a time; getOrCreate() reuses
# an already-running session instead of starting a second one.
spark_builder = SparkSession.builder.appName("MAST30034 Project 2")
spark_builder = spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
spark_builder = spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
spark_builder = spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark_builder = spark_builder.config("spark.driver.memory", "4g")
spark = spark_builder.getOrCreate()

In [3]:
os.getcwd()

'/Users/oliver/Documents/GitHub/generic-buy-now-pay-later-project-group-19/scripts'

In [4]:
data_dict = READ.read_data(spark, "../data/tables")

In [5]:
CLEAN.clean_data(spark, data_dict)

defaultdict(<function utilities.read_utilities.read_data.<locals>.<lambda>()>,
            {'transactions': +-------+------------+------------------+--------------------+--------------+------------------+
             |user_id|merchant_abn|      dollar_value|            order_id|order_datetime| log(dollar_value)|
             +-------+------------+------------------+--------------------+--------------+------------------+
             |  18482| 89295426212|132.46099397329627|5cea6247-6cac-42c...|    2021-08-12| 4.886288216979278|
             |      1| 74019238521| 5.228330955001773|9138e9c7-3c1a-40d...|    2021-08-12|1.6540920980815574|
             |      2| 64203420245|30.681990243789528|690d590e-caa3-489...|    2021-08-12|3.4236758454442224|
             |  18484| 96513857365|231.25294636553346|c838a359-60f4-40f...|    2021-08-12| 5.443512117351513|
             |      6| 60836029312| 536.4105511922465|e9de1d4a-3eb4-4fa...|    2021-08-12| 6.284899821594837|
             |  18485| 49

In [6]:
data_dict.keys()

dict_keys(['transactions', 'consumer_user_mappings', 'consumers', 'merchants', 'postcodes', 'census', 'merchant_tags'])

In [None]:
data_dict_2 = data_dict.copy()

In [None]:
spark.catalog.clearCache()

In [None]:
AGG.compute_aggregates(spark, data_dict)

In [None]:
data_dict.keys()

In [24]:
from pyspark.sql import functions as F
import pandas as pd
from datetime import datetime

def _as_calendar_date(value):
    """Coerce a min/max ``order_datetime`` value to a ``datetime.date``.

    Accepts an ISO ``YYYY-MM-DD`` string, a ``datetime`` or a ``date``; the
    latter is returned unchanged.
    """
    if isinstance(value, str):
        return datetime.strptime(value, "%Y-%m-%d").date()
    if isinstance(value, datetime):
        return value.date()
    return value


def compute_merchant_metric(spark: SparkSession, merchant_sales: DataFrame,
                           merchant: DataFrame) -> pd.DataFrame:
    """Summarise each merchant's average daily sales and commission.

    Args:
        spark: Active Spark session (not used directly; kept for a uniform
            signature with the other aggregation helpers).
        merchant_sales: Spark DataFrame with the columns ``merchant_abn``,
            ``order_datetime``, ``sales_revenue`` and ``no_orders``.
        merchant: Spark DataFrame with ``merchant_abn`` and ``take_rate``.
            ``take_rate`` is divided by 100 below, i.e. treated as a percentage.

    Returns:
        A *pandas* DataFrame (the join result is collected with ``toPandas``)
        holding every column of ``merchant`` plus ``avg_daily_rev``,
        ``avg_value_per_order``, ``avg_daily_order``, ``avg_daily_commission``
        and ``avg_commission_per_order``. Merchants without any sales keep
        nulls in the computed columns because of the left join.

    Raises:
        ValueError: if ``merchant_sales`` contains no ``order_datetime`` values.
    """
    # This scan over the whole sales table is the slow part of the function.
    date_range = merchant_sales.select(
        F.min(F.col("order_datetime")),
        F.max(F.col("order_datetime"))
    ).first()

    if date_range is None or date_range[0] is None or date_range[1] is None:
        raise ValueError("merchant_sales has no order_datetime values to aggregate")

    # The column may arrive as a string or as a date/timestamp; handle both.
    min_date = _as_calendar_date(date_range[0])
    max_date = _as_calendar_date(date_range[1])

    # Count both end points: sales on the first and the last day are included
    # in the sums, and a single-day range must give 1 rather than 0.
    num_days = (max_date - min_date).days + 1

    # Group first to reduce the table size before joining
    merchant_totals = merchant_sales.groupby('merchant_abn').agg(
        (F.sum(F.col('sales_revenue')) / num_days).alias('avg_daily_rev'),
        (F.sum(F.col('sales_revenue')) / F.sum(F.col('no_orders'))).alias('avg_value_per_order'),
        (F.sum(F.col('no_orders')) / num_days).alias('avg_daily_order')
    )

    # Left join keeps merchants that have no sales rows at all.
    merchant_daily_sales = merchant.join(
        merchant_totals,
        on=["merchant_abn"],
        how='left'
    ).toPandas()

    # Commission = revenue * take rate (percent -> fraction).
    take_fraction = merchant_daily_sales['take_rate'] / 100
    merchant_daily_sales['avg_daily_commission'] = merchant_daily_sales['avg_daily_rev'] * take_fraction
    merchant_daily_sales['avg_commission_per_order'] = merchant_daily_sales['avg_value_per_order'] * take_fraction

    return merchant_daily_sales
    

In [None]:
data_dict['merchant_summary'] = compute_merchant_metric(spark, data_dict['merchant_sales'], data_dict['merchants'])

In [None]:
date_range = data_dict['merchant_sales'].select(F.min(F.col("order_datetime")), F.max(F.col("order_datetime"))).first()

In [9]:
def compute_merchant_consumer(spark: SparkSession, transaction_df: DataFrame) -> DataFrame:
    """Aggregate transactions to one row per (merchant_abn, user_id) pair.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        transaction_df: Transactions with ``merchant_abn``, ``user_id``,
            ``dollar_value`` and ``order_id`` columns.

    Returns:
        DataFrame with ``merchant_abn``, ``user_id``, ``dollar_spent`` (sum of
        ``dollar_value``) and ``no_orders`` (count of ``order_id``).
    """
    pair_totals = transaction_df.groupby(['merchant_abn', 'user_id']).agg(
        {'dollar_value': 'sum', 'order_id': 'count'}
    )

    # Replace Spark's auto-generated aggregate names with readable ones.
    column_renames = (
        ('sum(dollar_value)', 'dollar_spent'),
        ('count(order_id)', 'no_orders'),
    )
    for generated_name, readable_name in column_renames:
        pair_totals = pair_totals.withColumnRenamed(generated_name, readable_name)

    return pair_totals


    

In [10]:
# The helper defined in the previous cell is `compute_merchant_consumer`;
# the old `compute_merchant_customer` name no longer exists on a fresh kernel.
merchant_customer = compute_merchant_consumer(spark, data_dict['transactions'])

In [11]:
merchant_customer

merchant_abn,user_id,dollar_spent,no_orders
89502033586,18672,79.39589714207766,1
81877438808,250,21.67800035261789,1
37459245212,18792,226.32665157972545,3
47086412084,18819,64.71157289167199,3
37379915451,391,274.04849398273313,2
21439773999,18949,798.2492487164099,9
57566534349,19070,196.09081098506056,2
96152467973,1039,111.24787724380104,5
79417999332,20076,226.10664454357268,5
64203420245,1688,214.3562613262408,8


In [12]:
merchant_customer.count()

8361769

In [52]:
data_dict['consumer_user_mappings']

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975
6,407340
7,511685
8,448088
9,650435
10,1058499


In [63]:
data_dict['postcodes'].select('sa2_code').distinct().count()

2222

In [57]:
# Imported in this cell so it runs on a fresh kernel; the other IntegerType
# imports in this notebook sit in cells further down.
from pyspark.sql.types import IntegerType

# consumer -> postcode -> SA2 region(s), then attach the transaction user_id.
consumer_postcodes = data_dict['consumers'].select(['consumer_id', 'postcode'])
consumer_sa2_data = consumer_postcodes.join(data_dict['postcodes'], 'postcode', 'left')
consumer_sa2_data = consumer_sa2_data.withColumn(
    'sa2_code',
    consumer_sa2_data['sa2_code'].cast(IntegerType())
)
consumer_sa2_data = consumer_sa2_data.join(data_dict['consumer_user_mappings'], 'consumer_id', 'left')

In [59]:
merchant_cust_detail = merchant_customer.select(['merchant_abn', 'user_id']).join(consumer_sa2_data, 'user_id', 'left')

In [64]:
# 10% sample to keep the distinct-count below fast while prototyping; the seed
# is fixed so the sampled rows (and the counts derived from them) are repeatable.
merchant_cust_detail_v2 = merchant_cust_detail.sample(fraction=0.1, seed=42)

In [65]:
merchant_cust_detail_v2.groupby('merchant_abn').agg(F.countDistinct('sa2_code').alias('sa2_region_count'))

merchant_abn,sa2_region_count
38700038932,1092
73841664453,329
83412691377,1446
24406529929,849
73256306726,964
35344855546,467
60654402457,66
15613631617,528
48214071373,206
19839532017,258


In [None]:
from pyspark.sql.types import IntegerType

# Select the income lookup first, then cast its key. The cast previously sat in
# a cell above the assignment and raised NameError on a top-to-bottom run.
census_data = data_dict['census'].select(['sa2_code','median_tot_prsnl_inc_weekly'])
census_data = census_data.withColumn("sa2_code", census_data['sa2_code'].cast(IntegerType()))

In [73]:
# Example used for the spot check below: consumer_id 1213496 maps to user_id 9389.

In [85]:
consumer_sa2_data

consumer_id,postcode,sa2_code,user_id
1213496,2040,120021388,9389
1213496,2040,120021389,9389
238808,2040,120021388,9742
238808,2040,120021389,9742
245147,2040,120021388,11893
245147,2040,120021389,11893
532404,2040,120021388,12185
532404,2040,120021389,12185
749917,2040,120021388,19947
749917,2040,120021389,19947


In [67]:
inc_join = consumer_sa2_data.join(data_dict['census'].select(['sa2_code','median_tot_prsnl_inc_weekly']), 'sa2_code','left')

In [71]:
inc_agg = inc_join.groupby('user_id').agg({'median_tot_prsnl_inc_weekly':'mean'})

In [74]:
inc_agg.where(F.col('user_id') == 9389)

user_id,avg(median_tot_prsnl_inc_weekly)
9389,1542.0


In [43]:
from pyspark.sql.types import IntegerType

# Path is relative to the working directory (the repo's scripts/ folder), the
# same root that READ.read_data is given ("../data/tables"), so it no longer
# depends on one user's home directory.
census_csv_path = "../data/tables/SA2/AUS/2021Census_G02_AUST_SA2.csv"

census_df = spark.read.csv(census_csv_path, header=True)

# Normalise every column name to lower case.
census_df = census_df.select([
    F.col(colname).alias(colname.lower()) for colname in census_df.columns
])

# The SA2 key column carries a year suffix (sa2_code_YYYY); find it by pattern.
sa2_code_matches = [
    colname for colname in census_df.columns
    if re.search(r'sa2_code_\d{4}', colname) is not None
]
if not sa2_code_matches:
    # Fail here with a clear message instead of later on an empty column name.
    raise ValueError(
        f'No column matching "sa2_code_<year>" found in {census_csv_path}'
    )
for colname in sa2_code_matches:
    logger.debug(f'The SA2 colname is "{colname}"')
# Keep the original behaviour of using the last matching column.
sa2_code_colname = sa2_code_matches[-1]

# The CSV is read without schema inference, so cast the key to an integer
# before it is used in joins.
census_df = census_df.withColumn(
    sa2_code_colname,
    census_df[sa2_code_colname].cast(IntegerType())
)

# Columns are already lower-cased above; only the key needs renaming.
data_dict['census'] = census_df.withColumnRenamed(sa2_code_colname, 'sa2_code')

In [91]:
def compute_consumer_region(spark: SparkSession, consumers: DataFrame, 
                            postcodes: DataFrame, user_mapping: DataFrame) -> DataFrame:
    """Attach SA2 region codes and user ids to each consumer.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        consumers: Consumers with ``consumer_id`` and ``postcode`` columns.
        postcodes: Lookup joined on ``postcode``; must provide ``sa2_code``.
        user_mapping: Lookup joined on ``consumer_id`` (presumably supplying
            ``user_id`` - confirm against the caller).

    Returns:
        The consumers left-joined to ``postcodes`` and then to
        ``user_mapping``, with ``sa2_code`` cast to an integer. A consumer
        appears once per matching row in ``postcodes``.
    """
    consumer_postcodes = consumers.select(['consumer_id', 'postcode'])

    # Left joins throughout so consumers without a match are kept.
    with_regions = consumer_postcodes.join(postcodes, 'postcode', 'left')
    with_int_codes = with_regions.withColumn(
        'sa2_code',
        F.col('sa2_code').cast(IntegerType())
    )

    return with_int_codes.join(user_mapping, 'consumer_id', 'left')


def compute_region_income(spark: SparkSession, consumer_region: DataFrame,
                         census: DataFrame) -> DataFrame:
    """Estimate a weekly income figure for every user from census data.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        consumer_region: Rows carrying ``user_id`` and ``sa2_code``.
        census: Census table with ``sa2_code`` and
            ``median_tot_prsnl_inc_weekly``.

    Returns:
        DataFrame with ``user_id`` and ``median_weekly_income``, the latter
        being the mean of ``median_tot_prsnl_inc_weekly`` over all rows joined
        to that user.
    """
    income_lookup = census.select([
        'sa2_code',
        'median_tot_prsnl_inc_weekly'
    ])

    # Left join: users whose region is missing from the census stay in the result.
    users_with_income = consumer_region.join(income_lookup, 'sa2_code', 'left')

    per_user_income = users_with_income.groupby('user_id').agg(
        {'median_tot_prsnl_inc_weekly': 'mean'}
    )

    return per_user_income.withColumnRenamed(
        'avg(median_tot_prsnl_inc_weekly)',
        'median_weekly_income'
    )
    
    
def compute_merchant_region(spark: SparkSession, merchant_consumer: DataFrame,
                           consumer_region: DataFrame) -> DataFrame:
    """Count how many distinct SA2 regions each merchant's customers come from.

    Args:
        spark: Active Spark session (unused; kept for a uniform signature).
        merchant_consumer: Rows with ``merchant_abn`` and ``user_id``.
        consumer_region: Rows with ``user_id`` and ``sa2_code``.

    Returns:
        DataFrame with ``merchant_abn`` and ``sa2_region_count`` (distinct
        ``sa2_code`` values among the merchant's users).
    """
    merchant_user_pairs = merchant_consumer.select([
        'merchant_abn',
        'user_id'
    ])

    # Left join keeps merchant/user pairs that have no region information.
    pairs_with_regions = merchant_user_pairs.join(consumer_region, 'user_id', 'left')

    distinct_region_count = F.countDistinct('sa2_code').alias('sa2_region_count')
    return pairs_with_regions.groupby('merchant_abn').agg(distinct_region_count)


In [84]:
region = compute_consumer_region(spark, data_dict['consumers'], data_dict['postcodes'], data_dict['consumer_user_mappings'])

In [92]:
inc = compute_region_income(spark, region, census_data)

In [93]:
inc

user_id,median_weekly_income
206429,1542.0
59861,
25084,549.4074074074074
30428,549.4074074074074
431338,549.4074074074074
271690,751.6
302881,950.0
367355,699.0
167219,770.875
219901,770.875
