## Analyzing Customer Demographic

In [1]:
# Start Spark Session
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder.appName("MAST30034 Project 2 BNPL")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/05 15:28:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [127]:
# Opening 'BNPL' dataset
merchants = spark.read.parquet("/Users/roseline/Documents/GitHub/generic-buy-now-pay-later-project-group-11/data/tables/tbl_merchants.parquet")
consumer = spark.read.csv("/Users/roseline/Documents/GitHub/generic-buy-now-pay-later-project-group-11/data/tables/tbl_consumer.csv", header=True, sep="|")
details = spark.read.parquet("/Users/roseline/Documents/GitHub/generic-buy-now-pay-later-project-group-11/data/tables/consumer_user_details.parquet")

In [128]:
# load all transactions datasets
paths=['/Users/roseline/Documents/GitHub/generic-buy-now-pay-later-project-group-11/data/tables/transactions_20210228_20210827_snapshot',
       '/Users/roseline/Documents/GitHub/generic-buy-now-pay-later-project-group-11/data/tables/transactions_20210828_20220227_snapshot']

first = 1
for path in paths:
    if first:
        transactions = spark.read.parquet(path)
        print(f'added {path.split("/")[3]}')
        first = 0
    else:
        append_transactions = spark.read.parquet(path)
        transactions = transactions.union(append_transactions)
        print(f'added {path.split("/")[3]}')

                                                                                

added Documents


                                                                                

added Documents


In [129]:
# rename columns
merchants = merchants.withColumnRenamed('name', 'merchant_name')
consumer = consumer.withColumnRenamed('name', 'consumer_name')

In [130]:
# Join consumers with their respective details
consumer_detail = consumer.join(details, on="consumer_id")

# Join consumers with their respective transactions
consumer_trx = consumer_detail.join(transactions, on="user_id")

# Join transactions with the respective merchants
df_trx = consumer_trx.join(merchants, on="merchant_abn")

---
### Consumer gender proportion

In [3]:
# Opening 'tbl_consumer.csv' dataset
consumer = spark.read.csv("/Users/roseline/Documents/GitHub/generic-buy-now-pay-later-project-group-11/data/tables/tbl_consumer.csv", header=True, sep="|")

                                                                                

In [132]:
# Find distinct values on the 'gender' column
consumer.select('gender').distinct().show()



+-----------+
|     gender|
+-----------+
|Undisclosed|
|     Female|
|       Male|
+-----------+



                                                                                



In [18]:
# Count the proportion on each of the distinct values in 'gender' (on percentage)
total_consumer = consumer.select('gender').count()
female_proportion = (consumer.select('gender').where(consumer.gender=='Female').count() / total_consumer)*100
male_proportion = (consumer.select('gender').where(consumer.gender=='Male').count() / total_consumer)*100
undisclosed_proportion = (consumer.select('gender').where(consumer.gender=='Undisclosed').count() / total_consumer)*100


                                                                                

44.989289978579954

---
### Calculate proportion of merchants level

In [40]:
from pyspark.sql.functions import split

In [126]:
# Opening 'BNPL' dataset
merchants = spark.read.parquet("/Users/roseline/Documents/GitHub/generic-buy-now-pay-later-project-group-11/data/tables/tbl_merchants.parquet")
consumer = spark.read.csv("/Users/roseline/Documents/GitHub/generic-buy-now-pay-later-project-group-11/data/tables/tbl_consumer.csv", header=True, sep="|")
details = spark.read.parquet("/Users/roseline/Documents/GitHub/generic-buy-now-pay-later-project-group-11/data/tables/consumer_user_details.parquet")

In [137]:
# Extracting the merchants level from 'tags' column
from pyspark.sql.functions import substring, length, col, expr
merchants = merchants.withColumn('temp_level',expr("substring(tags, 1, length(tags)-21)"))
merchants = merchants.withColumn('level', merchants.temp_level.substr(-1,1))
merchants.select('level').show(truncate=False)

+-----+
|level|
+-----+
|e    |
|b    |
|b    |
|b    |
|a    |
|a    |
|b    |
|c    |
|a    |
|a    |
|a    |
|b    |
|b    |
|b    |
|a    |
|a    |
|a    |
|b    |
|a    |
|c    |
+-----+
only showing top 20 rows



                                                                                



In [144]:
merchants.select('level').distinct().show()

+-----+
|level|
+-----+
|    e|
|    d|
|    c|
|    b|
|    a|
+-----+



In [145]:
# Proportion of merchants level
total_merchants = merchants.count()
merchants_level_a = (merchants.select('level').where(merchants.level == 'a').count() / total_merchants)*100
merchants_level_b = (merchants.select('level').where(merchants.level == 'b').count() / total_merchants)*100
merchants_level_c = (merchants.select('level').where(merchants.level == 'c').count() / total_merchants)*100
merchants_level_d = (merchants.select('level').where(merchants.level == 'd').count() / total_merchants)*100
merchants_level_e = (merchants.select('level').where(merchants.level == 'e').count() / total_merchants)*100

---
### Transaction Frequency (Monthly)

In [49]:
transactions.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [74]:
# Splitting the 'order_datetime' column based on year, month, and date
transactions = transactions.withColumn('year', split(transactions['order_datetime'], '-').getItem(0)) \
                           .withColumn('month', split(transactions['order_datetime'], '-').getItem(1)) \
                           .withColumn('date', split(transactions['order_datetime'], '-').getItem(2))
transactions.show(truncate=False)

+-------+------------+------------------+------------------------------------+--------------+----+-----+----+
|user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|year|month|date|
+-------+------------+------------------+------------------------------------+--------------+----+-----+----+
|18478  |62191208634 |63.255848959735246|949a63c8-29f7-4ab0-ada4-99ac50a88952|2021-08-20    |2021|08   |20  |
|2      |15549624934 |130.3505283105634 |6a84c3cf-612a-4574-835b-144a47353eff|2021-08-20    |2021|08   |20  |
|18479  |64403598239 |120.15860593212783|b10dcc33-e53f-4254-863c-de5266810cbc|2021-08-20    |2021|08   |20  |
|3      |60956456424 |136.6785200286976 |0f09c5a5-784e-4477-b049-8ee4dd069b7b|2021-08-20    |2021|08   |20  |
|18479  |94493496784 |72.96316578355305 |f6c78c1a-4600-4c5f-8e97-6e9eb534b586|2021-08-20    |2021|08   |20  |
|3      |76819856970 |448.529684285612  |5ace6a24-cdf0-4aa3-b571-1d9406b352b5|2021-08-20    |2021|08   |20  |
|18479  |6

In [93]:
# Divide transactions according to year
tx_2021 = transactions.filter(transactions.year == '2021')
tx_2022 = transactions.filter(transactions.year == '2022')

In [101]:
# Find distinct month on both year '2021' and '2022'
tx_2021.select('month').distinct().show()
tx_2022.select('month').distinct().show()

                                                                                

+-----+
|month|
+-----+
|   07|
|   05|
|   08|
|   06|
|   04|
|   03|
|   02|
|   11|
|   12|
|   09|
|   10|
+-----+



[Stage 255:>                                                        (0 + 4) / 4]

+-----+
|month|
+-----+
|   01|
|   02|
+-----+



                                                                                

In [112]:
# Count transactions monthly for year '2021'
tx_02_2021 = tx_2021.select('month').where(tx_2021.month == 2).count()
tx_03_2021 = tx_2021.select('month').where(tx_2021.month == 3).count()
tx_04_2021 = tx_2021.select('month').where(tx_2021.month == 4).count()
tx_05_2021 = tx_2021.select('month').where(tx_2021.month == 5).count()
tx_06_2021 = tx_2021.select('month').where(tx_2021.month == 6).count()
tx_07_2021 = tx_2021.select('month').where(tx_2021.month == 7).count()
tx_08_2021 = tx_2021.select('month').where(tx_2021.month == 8).count()
tx_09_2021 = tx_2021.select('month').where(tx_2021.month == 9).count()
tx_10_2021 = tx_2021.select('month').where(tx_2021.month == 10).count()
tx_11_2021 = tx_2021.select('month').where(tx_2021.month == 11).count()
tx_12_2021 = tx_2021.select('month').where(tx_2021.month == 12).count()

17107

In [103]:
# Count transactions monthly for year '2022'
tx_01_2022 = tx_2022.select('month').where(tx_2022.month == 1).count()
tx_02_2022 = tx_2022.select('month').where(tx_2022.month == 2).count()

In [108]:
# Create new dataframe to store transaction frequency data
from pyspark.sql.types import StructType, StructField, StringType

data = [('2021', '02', tx_02_2021), ('2021', '03', tx_03_2021), ('2021', '04', tx_04_2021), ('2021', '05', tx_05_2021),
        ('2021', '06', tx_06_2021), ('2021', '07', tx_07_2021), ('2021', '08', tx_08_2021), ('2021', '09', tx_09_2021),
        ('2021', '10', tx_10_2021), ('2021', '11', tx_11_2021), ('2021', '12', tx_12_2021), ('2022', '01', tx_01_2022),
        ('2022', '02', tx_02_2022)]

schema = StructType([\
    StructField('year',StringType(),True), \
    StructField('month',StringType(),True), \
    StructField('transaction_frequency',StringType(),True), \
        ])

tx_frequency = spark.createDataFrame(data=data,schema=schema)
tx_frequency.printSchema()
tx_frequency.show(truncate=False)

root
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- transaction_frequency: string (nullable = true)



[Stage 269:>                                                        (0 + 1) / 1]

+----+-----+---------------------+
|year|month|transaction_frequency|
+----+-----+---------------------+
|2021|02   |17107                |
|2021|03   |546333               |
|2021|04   |560680               |
|2021|05   |636666               |
|2021|06   |627148               |
|2021|07   |655279               |
|2021|08   |688822               |
|2021|09   |678524               |
|2021|10   |729619               |
|2021|11   |991226               |
|2021|12   |957647               |
|2022|01   |552478               |
|2022|02   |509843               |
+----+-----+---------------------+



                                                                                

---
### Counting the number of unique customers

In [110]:
# Calculating the number of unique customers in year '2021' and '2022'
from pyspark.sql.functions import countDistinct

tx_2021.select(countDistinct('merchant_abn')).show()
tx_2022.select(countDistinct('merchant_abn')).show()

                                                                                

+----------------------------+
|count(DISTINCT merchant_abn)|
+----------------------------+
|                        4406|
+----------------------------+

+----------------------------+
|count(DISTINCT merchant_abn)|
+----------------------------+
|                        4166|
+----------------------------+



---
### Calculate average transaction amount per merchant 

In [148]:
df_trx.show()



22/09/07 22:48:07 ERROR RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks 
java.io.IOException: Failed to connect to /10.12.51.70:56489
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:154)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.start(RetryingBlockTransferor.java:133)
	at org.apache.spark.network.netty.NettyBlockTransferService.fetchBlocks(NettyBlockTransferService.scala:146)
	at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:102)
	at org.apache.spark.storage.BlockManager.fetchRemoteManagedBuffer(BlockMana

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Users/roseline/opt/anaconda3/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/roseline/opt/anaconda3/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/roseline/opt/anaconda3/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

--- 
### Calculate the median of age from the population dataset

--- 
### Calculate the mode of age group