# Test environment for prototyping functions before transferring them to scripts

In [1]:
from collections import defaultdict
import os
os.chdir("/Users/oliver/Documents/GitHub/generic-buy-now-pay-later-project-group-19/scripts")
import sys
import argparse
import re
# ... TODO: Add to this as necessary

# External Libraries
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, DataFrame
# ... TODO: Add to this as necessary

# Our Modules
from utilities.log_utilities import logger
import utilities.print_utilities as PRINT
import utilities.read_utilities as READ
import utilities.clean_utilities as CLEAN
import utilities.agg_utilities as AGG
import utilities.write_utilities as WRITE

[nltk_data] Downloading package omw-1.4 to /Users/oliver/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oliver/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /Users/oliver/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /Users/oliver/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/oliver/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Spark session used by every cell below. The settings are applied one by one
# to the builder before the session is created (or an existing one is reused).
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

In [None]:
# Show the working directory; the relative data path used below depends on it.
os.getcwd()

In [None]:
# Load the tables found under ../data/tables. The result is used as a mapping
# (it is indexed by table name and has .keys()/.copy()) in the cells below.
data_dict = READ.read_data(spark, "../data/tables")

In [None]:
# The return value is discarded, so clean_data presumably updates data_dict
# in place - TODO confirm against utilities.clean_utilities.
CLEAN.clean_data(spark, data_dict)

In [None]:
# Table names available after cleaning.
data_dict.keys()

In [None]:
# Copy taken before aggregation.
# NOTE(review): if data_dict is a plain dict this is a shallow copy, so both
# names share the same underlying table objects - confirm that is intended.
data_dict_2 = data_dict.copy()

In [None]:
# Drop everything currently cached in this Spark session.
spark.catalog.clearCache()

In [None]:
# The return value is discarded, so compute_aggregates presumably adds its
# results to data_dict in place - TODO confirm against utilities.agg_utilities.
AGG.compute_aggregates(spark, data_dict)

In [None]:
# Table names available after aggregation (compare with the listing above).
data_dict.keys()

In [16]:
from pyspark.sql import functions as F
import pandas as pd
from datetime import datetime

def compute_merchant_metric(spark: SparkSession, merchant_sales: DataFrame,
                           merchant: DataFrame) -> pd.DataFrame:
    """Compute per-merchant daily averages of revenue, orders and commission.

    Args:
        spark: Active Spark session (not used directly; kept for a uniform
            signature with the other compute_* helpers).
        merchant_sales: Spark DataFrame with the columns `merchant_abn`,
            `order_datetime`, `sales_revenue` and `no_orders`.
        merchant: Spark DataFrame with the columns `merchant_abn` and
            `take_rate`. `take_rate` is divided by 100 below, so it is
            presumably stored as a percentage - TODO confirm.

    Returns:
        A *pandas* DataFrame (the Spark result is collected with `.toPandas()`)
        holding every column of `merchant` plus `avg_daily_rev`,
        `avg_value_per_order`, `avg_daily_order`, `avg_daily_commission` and
        `avg_commission_per_order`. Merchants without any sales keep their row
        (left join) with missing values in the computed columns.

    Raises:
        ValueError: If `merchant_sales` contains no non-null `order_datetime`,
            because no averaging window can be derived.
    """
    # This part is taking a while: it triggers a full pass over merchant_sales.
    date_range = merchant_sales.select(
        F.min(F.col("order_datetime")),
        F.max(F.col("order_datetime")),
    ).first()

    if date_range[0] is None or date_range[1] is None:
        raise ValueError(
            "merchant_sales has no order_datetime values; "
            "cannot derive the averaging window"
        )

    # The column may hold 'YYYY-MM-DD' strings or real date/timestamp values.
    # Only strings need parsing; date-like objects already support subtraction.
    min_date, max_date = (
        datetime.strptime(bound, "%Y-%m-%d") if isinstance(bound, str) else bound
        for bound in date_range
    )

    # Both the first and the last day carry sales, so the window is inclusive.
    # The +1 also keeps the divisor non-zero when all sales fall on one day.
    num_days = (max_date - min_date).days + 1

    # Group first to reduce the table size before joining
    sales_per_merchant = merchant_sales.groupby('merchant_abn').agg(
        (F.sum(F.col('sales_revenue')) / num_days).alias('avg_daily_rev'),
        (F.sum(F.col('sales_revenue')) / F.sum(F.col('no_orders'))).alias('avg_value_per_order'),
        (F.sum(F.col('no_orders')) / num_days).alias('avg_daily_order')
    )

    merchant_daily_sales = merchant.join(
        sales_per_merchant,
        on=["merchant_abn"],
        how='left'
    ).toPandas()

    # Commission is the take-rate share of revenue.
    take_rate_fraction = merchant_daily_sales['take_rate'] / 100
    merchant_daily_sales['avg_daily_commission'] = merchant_daily_sales['avg_daily_rev'] * take_rate_fraction
    merchant_daily_sales['avg_commission_per_order'] = merchant_daily_sales['avg_value_per_order'] * take_rate_fraction

    return merchant_daily_sales
    

In [None]:
# Stored alongside the other tables. Note that compute_merchant_metric returns
# the result of .toPandas(), so this entry is a pandas frame, not a Spark one.
data_dict['merchant_summary'] = compute_merchant_metric(spark, data_dict['merchant_sales'], data_dict['merchants'])

In [None]:
# Stand-alone run of the same min/max order_datetime query that
# compute_merchant_metric performs internally.
date_range = data_dict['merchant_sales'].select(F.min(F.col("order_datetime")), F.max(F.col("order_datetime"))).first()

In [7]:
def compute_merchant_consumer(spark: SparkSession, transaction_df: DataFrame) -> DataFrame:
    """Summarise spending for every (merchant, user) pair.

    Args:
        spark: Active Spark session (not used directly).
        transaction_df: Spark DataFrame with the columns `merchant_abn`,
            `user_id`, `dollar_value` and `order_id`.

    Returns:
        Spark DataFrame with one row per (`merchant_abn`, `user_id`) carrying
        `dollar_spent` (sum of `dollar_value`) and `no_orders` (count of
        `order_id`).
    """
    pair_keys = ['merchant_abn', 'user_id']
    pair_totals = transaction_df.groupby(pair_keys).agg(
        {'dollar_value':'sum', 'order_id':'count'}
    )

    # Replace the auto-generated aggregate column names with readable ones.
    with_spend = pair_totals.withColumnRenamed('sum(dollar_value)', 'dollar_spent')
    return with_spend.withColumnRenamed('count(order_id)', 'no_orders')


    

In [None]:
# The helper defined above is compute_merchant_consumer; the previous call used
# the undefined name compute_merchant_customer and raised NameError.
merchant_customer = compute_merchant_consumer(spark, data_dict['transactions'])

In [None]:
# Display the per-(merchant, user) summary.
merchant_customer

In [None]:
# Number of (merchant, user) rows in the summary.
merchant_customer.count()

In [None]:
# Display the consumer/user mapping table (joined on consumer_id further down).
data_dict['consumer_user_mappings']

In [None]:
# Number of distinct SA2 codes present in the postcode table.
data_dict['postcodes'].select('sa2_code').distinct().count()

In [None]:
# Attach an SA2 code (via postcode) and the mapping columns (via consumer_id)
# to every consumer.
consumer_sa2_data = data_dict['consumers'].select(['consumer_id','postcode']).join(data_dict['postcodes'], 'postcode', 'left')
# The cast uses the type name 'int' (equivalent to IntegerType()) because
# IntegerType is only imported in a later cell; referencing it here raised
# NameError on a fresh kernel.
consumer_sa2_data = consumer_sa2_data.withColumn('sa2_code', consumer_sa2_data['sa2_code'].cast('int'))
consumer_sa2_data = consumer_sa2_data.join(data_dict['consumer_user_mappings'], 'consumer_id', 'left')

In [None]:
merchant_cust_detail = merchant_customer.select(['merchant_abn', 'user_id']).join(consumer_sa2_data, 'user_id', 'left')

In [None]:
merchant_cust_detail_v2 = merchant_cust_detail.sample(0.1)

In [None]:
merchant_cust_detail_v2.groupby('merchant_abn').agg(F.countDistinct('sa2_code').alias('sa2_region_count'))

In [None]:
from pyspark.sql.types import IntegerType
census_data = census_data.withColumn("sa2_code", census_data['sa2_code'].cast(IntegerType()))

In [None]:
census_data = data_dict['census'].select(['sa2_code','median_tot_prsnl_inc_weekly'])

In [None]:
# Spot-check example: consumer_id 1213496 / user_id 9389 (presumably the same
# person; user_id 9389 is the value filtered on at the end of this section).

In [None]:
# Display the consumer -> SA2 lookup built earlier.
consumer_sa2_data

In [None]:
# Add the census income column to each consumer row by matching on sa2_code.
inc_join = consumer_sa2_data.join(data_dict['census'].select(['sa2_code','median_tot_prsnl_inc_weekly']), 'sa2_code','left')

In [None]:
# One row per user_id holding the mean of the joined income values.
inc_agg = inc_join.groupby('user_id').agg({'median_tot_prsnl_inc_weekly':'mean'})

In [None]:
# Inspect the aggregated income for the example user.
inc_agg.where(F.col('user_id') == 9389)

In [None]:
from pyspark.sql.types import IntegerType

# Relative to the scripts/ working directory set in the first cell, in line
# with the "../data/tables" root used when loading the other tables. The
# previous absolute path only existed on one machine.
census_path = "../data/tables/SA2/AUS/2021Census_G02_AUST_SA2.csv"
census_df = spark.read.csv(census_path, header = True)

# Lower-case every column name once; later steps rely on lower-case names.
census_df = census_df.select([
    F.col(colname).alias(colname.lower()) for colname in census_df.columns
])

# Find the year-suffixed SA2 code column (e.g. sa2_code_2021). If several
# columns match, the last one wins, as before.
sa2_code_colname = ''
for colname in census_df.columns:
    if re.search(r'sa2_code_\d{4}', colname.lower()) is not None:
        logger.debug(f'The SA2 colname is "{colname}"')
        sa2_code_colname = colname.lower()

# Fail with a clear message instead of passing an empty column name to Spark.
if not sa2_code_colname:
    raise ValueError(
        f'No column matching sa2_code_<year> found in {census_path}'
    )

census_df = census_df.withColumn(
    sa2_code_colname,
    census_df[sa2_code_colname].cast(IntegerType())
)

# Column names are already lower-case, so only the SA2 rename remains.
data_dict['census'] = census_df.withColumnRenamed(sa2_code_colname, 'sa2_code')

In [54]:
def compute_consumer_region(spark: SparkSession, consumers: DataFrame, 
                            postcodes: DataFrame, user_mapping: DataFrame) -> DataFrame:
    """Link every consumer to an SA2 region and to the user mapping.

    Args:
        spark: Active Spark session (not used directly).
        consumers: Spark DataFrame with `consumer_id` and `postcode`.
        postcodes: Spark DataFrame with `postcode` and `sa2_code`.
        user_mapping: Spark DataFrame with `consumer_id` (and, judging by the
            downstream joins on `user_id`, a `user_id` column).

    Returns:
        Spark DataFrame built from left joins, so every consumer row is kept;
        `sa2_code` is cast to an integer column.
    """
    consumer_postcodes = consumers.select(['consumer_id','postcode'])

    # Postcode -> SA2 lookup; consumers with an unmatched postcode are kept.
    with_sa2 = consumer_postcodes.join(postcodes, 'postcode', 'left')
    with_int_sa2 = with_sa2.withColumn(
        'sa2_code',
        F.col('sa2_code').cast(IntegerType())
    )

    return with_int_sa2.join(user_mapping, 'consumer_id', 'left')


def compute_region_income(spark: SparkSession, consumer_region: DataFrame,
                         census: DataFrame) -> DataFrame:
    """Attach a weekly income figure to every user via their SA2 region.

    Args:
        spark: Active Spark session (not used directly).
        consumer_region: Spark DataFrame with `user_id` and `sa2_code`.
        census: Spark DataFrame with `sa2_code` and
            `median_tot_prsnl_inc_weekly`.

    Returns:
        Spark DataFrame with one row per `user_id` and a
        `median_weekly_income` column, which is the mean of the census income
        values joined to that user.
    """
    income_by_sa2 = census.select([
        'sa2_code',
        'median_tot_prsnl_inc_weekly'
    ])
    users_with_income = consumer_region.join(income_by_sa2, 'sa2_code', 'left')

    per_user = users_with_income.groupby('user_id').agg(
        {'median_tot_prsnl_inc_weekly':'mean'}
    )

    # Give the auto-generated aggregate column a readable name.
    return per_user.withColumnRenamed(
        'avg(median_tot_prsnl_inc_weekly)',
        'median_weekly_income'
    )
    
    
def compute_merchant_region(spark: SparkSession, merchant_consumer: DataFrame,
                           consumer_region: DataFrame) -> DataFrame:
    """Count how many distinct SA2 regions each merchant's customers span.

    Args:
        spark: Active Spark session (not used directly).
        merchant_consumer: Spark DataFrame with `merchant_abn` and `user_id`.
        consumer_region: Spark DataFrame with `user_id` and `sa2_code`.

    Returns:
        Spark DataFrame with one row per `merchant_abn` and a
        `sa2_region_count` column.
    """
    merchant_users = merchant_consumer.select(['merchant_abn', 'user_id'])
    users_with_region = merchant_users.join(consumer_region, 'user_id', 'left')

    distinct_regions = F.countDistinct('sa2_code').alias('sa2_region_count')
    return users_with_region.groupby('merchant_abn').agg(distinct_regions)

def compute_merchant_customer_income(spark: SparkSession, merchant_consumer: DataFrame,
                           consumer_region_income: DataFrame) -> DataFrame:
    """Average the income figure of each merchant's customers.

    Args:
        spark: Active Spark session (not used directly).
        merchant_consumer: Spark DataFrame with `merchant_abn` and `user_id`.
        consumer_region_income: Spark DataFrame with `user_id` and
            `median_weekly_income`.

    Returns:
        Spark DataFrame with one row per `merchant_abn` and a
        `median_customer_income` column. Despite the name, the value is the
        mean of the customers' `median_weekly_income`.
    """
    merchant_users = merchant_consumer.select(['merchant_abn', 'user_id'])
    users_with_income = merchant_users.join(consumer_region_income, 'user_id', 'left')

    mean_income = F.mean(F.col('median_weekly_income')).alias('median_customer_income')
    return users_with_income.groupby('merchant_abn').agg(mean_income)

def compute_returning_customer(spark: SparkSession, 
                               merchant_consumer: DataFrame) -> DataFrame:
    """Per-merchant returning-customer count and customer-spend statistics.

    Args:
        spark: Active Spark session (not used directly).
        merchant_consumer: Spark DataFrame with `merchant_abn`, `no_orders`
            and `dollar_spent` (one row per merchant/user pair).

    Returns:
        Spark DataFrame with one row per `merchant_abn` and the columns
        `returning_customer` (rows with `no_orders` > 2), `mean_spending` and
        `std_spending` (mean / standard deviation of `dollar_spent`).
    """
    # NOTE(review): the threshold is strictly greater than 2 orders, i.e. a
    # customer needs at least three orders to count - confirm this is intended.
    repeat_buyer_flag = F.when(F.col('no_orders')>2, True)
    customer_spend = F.col('dollar_spent')

    return merchant_consumer.groupby('merchant_abn').agg(
        # The flag is null unless the condition holds, and count skips nulls.
        F.count(repeat_buyer_flag).alias('returning_customer'),
        F.mean(customer_spend).alias('mean_spending'),
        F.stddev(customer_spend).alias('std_spending')
    )


def compute_vip_customer(spark: SparkSession, merchant_consumer: DataFrame,
                        merchant_statistics: DataFrame) -> DataFrame:
    """Count each merchant's unusually high-spending ("VIP") customers.

    A customer counts as VIP when their `dollar_spent` exceeds both 100 and
    the merchant's `mean_spending` plus two `std_spending`.

    Args:
        spark: Active Spark session (not used directly).
        merchant_consumer: Spark DataFrame with `merchant_abn` and
            `dollar_spent` (one row per merchant/user pair).
        merchant_statistics: Spark DataFrame with `merchant_abn`,
            `mean_spending` and `std_spending`.

    Returns:
        Spark DataFrame with one row per `merchant_abn` and a `vip_customer`
        count column.
    """
    customer_spend = F.col('dollar_spent')
    outlier_cutoff = F.col('mean_spending') + 2 * F.col('std_spending')
    is_vip = (customer_spend > 100) & (customer_spend > outlier_cutoff)

    pairs_with_stats = merchant_consumer.join(
        merchant_statistics,
        'merchant_abn',
        'left'
    )

    # The flag is null unless the condition holds, and count skips nulls.
    vip_count = F.count(F.when(is_vip, True)).alias('vip_customer')
    return pairs_with_stats.groupby('merchant_abn').agg(vip_count)

In [None]:
# Consumer -> SA2 region lookup built with the helper defined above.
region = compute_consumer_region(spark, data_dict['consumers'], data_dict['postcodes'], data_dict['consumer_user_mappings'])

In [None]:
# Per-user income figure; census_data is the two-column census selection
# created in an earlier cell.
inc = compute_region_income(spark, region, census_data)

In [None]:
# Display the per-user income table.
inc

In [4]:
# Relative to the scripts/ working directory set in the first cell, in line
# with the "../data/tables" root used when loading the other tables. The
# previous absolute path only existed on one machine.
path = "../data/tables/transactions_20210228_20210827_snapshot/"
# Parquet files carry their own schema, so the CSV-only `header` option that
# used to be passed here has been dropped.
transaction = spark.read.parquet(path)


In [8]:
# Per-(merchant, user) spend and order counts for the transaction snapshot read above.
merc_cons = compute_merchant_consumer(spark, transaction)

In [44]:
# Per-merchant mean and standard deviation of customer spend: the two columns
# compute_vip_customer reads from its statistics argument.
merchant_statistics = merc_cons.groupby(
    'merchant_abn'
).agg(
    F.mean(F.col('dollar_spent')).alias('mean_spending'),
    F.stddev(F.col('dollar_spent')).alias('std_spending')
)

# Call the helper defined above rather than keeping a second copy of its
# join/aggregate logic in this cell; the displayed result is the same table.
compute_vip_customer(spark, merc_cons, merchant_statistics)

merchant_abn,vip_customer
12516851436,0
15613631617,10
19839532017,2
24406529929,19
28767881738,0
34440496342,1
35344855546,5
37935728745,0
38700038932,18
38986645707,0


In [48]:
# Rebinds merchant_statistics to the helper's output, which carries
# returning_customer in addition to mean_spending and std_spending.
merchant_statistics = compute_returning_customer(spark, merc_cons)

In [55]:
# Scratch check: add a derived column 'new' equal to mean_spending * 100.
merchant_statistics.withColumn('new', F.col('mean_spending')*100)

merchant_abn,returning_customer,mean_spending,std_spending,new
24406529929,0,68.45607596038252,63.91682510434578,6845.607596038252
15613631617,0,302.4760492581066,200.53183984696304,30247.60492581066
83412691377,11,37.62677301716173,27.28524252497756,3762.677301716173
38700038932,0,1361.0240586198474,795.7031236859461,136102.4058619847
73256306726,0,301.32681089641665,251.345104976122,30132.681089641665
35344855546,0,88.94095273560168,64.37579959825493,8894.095273560168
48214071373,0,292.4918702910962,209.91786087901653,29249.187029109617
96946925998,0,960.4679852638474,675.4122858954569,96046.79852638474
73841664453,0,89.56630681095595,55.29437738358223,8956.630681095596
19839532017,0,158.67021276595744,16.149957896584137,15867.021276595744


In [53]:
# Number of rows in the VIP summary (the helper groups by merchant_abn, so one row per merchant).
compute_vip_customer(spark, merc_cons, merchant_statistics).count()

4359