Start spark session.

In [1]:
from pyspark.sql import SparkSession

# Create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project2 Pre_process")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

22/09/03 16:41:59 WARN Utils: Your hostname, DESKTOP-1ML24G5 resolves to a loopback address: 127.0.1.1; using 172.26.94.46 instead (on interface eth0)
22/09/03 16:41:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/03 16:42:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Merchants data:

In [2]:
merchants = spark.read.parquet("../data/tables/tbl_merchants.parquet")
merchants = merchants.withColumnRenamed("name", "merchant_name")
merchants.show(2, vertical=True)
merchants.count()

                                                                                

-RECORD 0-----------------------------
 merchant_name | Felis Limited        
 tags          | ((furniture, home... 
 merchant_abn  | 10023283211          
-RECORD 1-----------------------------
 merchant_name | Arcu Ac Orci Corp... 
 tags          | ([cable, satellit... 
 merchant_abn  | 10142254217          
only showing top 2 rows



4026

An example of an instance of the "tag" column.

In [3]:
merchants.collect()[0][1]

[Stage 5:>                                                          (0 + 1) / 1]                                                                                

'((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))'

Extract revenue level and take rate from the "tags" column.

In [42]:
from pyspark.sql import functions as F

merchants = merchants.withColumn("tag", F.split("tags", pattern = "(\],\s\[)|(\),\s\()").getItem(0))
merchants = merchants.withColumn("tag", F.expr("substring(tag, 3, length(tag)-2)"))
merchants = merchants.withColumn("revenue_level", F.split("tags", pattern = "(\],\s\[)|(\),\s\()").getItem(1))
merchants = merchants.withColumn("take_rate", F.split("tags", pattern = "(\],\s\[)|(\),\s\()").getItem(2))
merchants = merchants.withColumn("take_rate", F.substring("take_rate", 12, 4))
merchants.show(20, vertical=True, truncate = False)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------
 merchant_name                     | Felis Limited                                                                                                     
 tags                              | ((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18)) 
 merchant_abn                      | 10023283211                                                                                                       
 tag                               | furniture, home furnishings and equipment shops, and manufacturers, except appliances                             
 revenue_level                     | e                                                                                                                 
 take_rate                         | 0.18                                               

In [9]:
# Convert all tags to lower for grammatical consistency.
merchants = merchants.withColumn("tag", F.lower(F.col("tag")))

# Replace unneccessary (multiple) spaces with single space for grammatical consistency.
merchants = merchants.withColumn("tag", F.regexp_replace(F.col("tag"), " +", " "))

In [10]:
# Check modified 'tag' column to compare changes to original data frame above.

merchants.show(20, vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------
 merchant_name | Felis Limited                                                                                                     
 tags          | ((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18)) 
 merchant_abn  | 10023283211                                                                                                       
 tag           | furniture, home furnishings and equipment shops, and manufacturers, except appliances                             
 revenue_level | e                                                                                                                 
 take_rate     | 0.18                                                                                                              
-RECORD 1-------------------------------------------------------------------

In [17]:
# Check distinct tags and number of occurrences from all the merchants.

tag_occurrences = merchants \
                .groupby("tag") \
                .agg(
                    F.count("tag").alias("no_of_occurrences"), \
                ).orderBy(F.count("tag"), ascending=False)

tag_occurrences.show(50, truncate = False)

+-------------------------------------------------------------------------------------+-----------------+
|tag                                                                                  |no_of_occurrences|
+-------------------------------------------------------------------------------------+-----------------+
|digital goods: books, movies, music                                                  |195              |
|artist supply and craft shops                                                        |193              |
|computer programming , data processing, and integrated systems design services       |191              |
|shoe shops                                                                           |185              |
|gift, card, novelty, and souvenir shops                                              |182              |
|furniture, home furnishings and equipment shops, and manufacturers, except appliances|182              |
|computers, computer peripheral equipment, and

In [18]:
# Add row number column to aggregate table and convert to pandas data frame for easier data manipulation.

from pyspark.sql.window import Window

tag_occurrences = tag_occurrences.select("tag", "no_of_occurrences", F.row_number().over(Window.partitionBy(). \
                                          orderBy(tag_occurrences['no_of_occurrences'])).alias("row_num"))

tag_pdf = tag_occurrences.toPandas()
tag_pdf.head()

22/09/03 16:53:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/03 16:53:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/03 16:53:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/03 16:53:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/03 16:53:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/03 16:53:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/09/03 1

Unnamed: 0,tag,no_of_occurrences,row_num
0,"jewelry, watch, clock, and silverware shops",91,1
1,art dealers and galleries,112,2
2,telecom,125,3
3,"antique shops - sales, repairs, and restoratio...",129,4
4,"equipment, tool, furniture, and appliance rent...",134,5


In [20]:
# Convert merchants pyspark data frame to pandas data frame for easier data manipulation.

merchants_pdf = merchants.toPandas()
merchants_pdf.head()

Unnamed: 0,merchant_name,tags,merchant_abn,tag,revenue_level,take_rate
0,Felis Limited,"((furniture, home furnishings and equipment sh...",10023283211,"furniture, home furnishings and equipment shop...",e,0.18
1,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television a...",10142254217,"cable, satellite, and other pay television and...",b,4.22
2,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops]...",10165489824,"jewelry, watch, clock, and silverware shops",b,4.4
3,Ultricies Dignissim Lacus Foundation,"([wAtch, clock, and jewelry repair shops], [b]...",10187291046,"watch, clock, and jewelry repair shops",b,3.29
4,Enim Condimentum PC,"([music shops - musical instruments, pianos, a...",10192359162,"music shops - musical instruments, pianos, and...",a,6.33


https://business.gov.au/planning/industry-information <br>
Using the above link as a reference, we are able to pick out five industry segments that accommodated all the distinct tags above. The five categories are: <br>
1) AGRICULTURE <br>
2) ARTS AND RECREATION <br>
3) INFORMATION MEDIA AND TELECOMMUNICATIONS INDUSTRY <br>
4) RENTAL, HIRING AND REAL ESTATE SERVICES INDUSTRY <br>
5) RETAIL AND WHOLESALE TRADE INDUSTRY

In [21]:
# Convert numbers to their respective tag labels based on their row number from tag_pdf.

def assign_tags(arr_of_numbers):
    tags = []
    for num in arr_of_numbers:
        tags.append(tag_pdf['tag'][num-1])
    return tags

In [22]:
# Populate tag labels for each category.

agriculture = assign_tags([9, 18])
arts_and_recreation = assign_tags([2])
info_media_and_telecommunications = assign_tags([3, 11, 16, 23, 25])
rental_hiring_and_real_estate = assign_tags([5])
retail_and_wholesale_trade = assign_tags([1, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 17, 19, 20, 21, 22, 24])

In [23]:
# Example of tag labels for the retail and wholesale trade industry.

print(retail_and_wholesale_trade)

['jewelry, watch, clock, and silverware shops', 'antique shops - sales, repairs, and restoration services', 'hobby, toy and game shops', 'opticians, optical goods, and eyeglasses', 'motor vehicle supplies and new parts', 'lawn and garden supply outlets, including nurseries', 'stationery, office supplies and printing and writing paper', 'health and beauty spas', 'music shops - musical instruments, pianos, and sheet music', 'watch, clock, and jewelry repair shops', 'bicycle shops - sales and service', 'tent and awning shops', 'computers, computer peripheral equipment, and software', 'gift, card, novelty, and souvenir shops', 'furniture, home furnishings and equipment shops, and manufacturers, except appliances', 'shoe shops', 'artist supply and craft shops']


In [24]:
# Insert new column 'industry_category' to capture which industry a merchant belongs to and intialise
# five new columns to 0 (one for each category) to start the one-hot encode process.

new_col_names = ['industry_category' ,'agriculture', 'arts_and_recreation', 'info_media_and_telecommunications',
                 'rental_hiring_and_real_estate', 'retail_and_wholesale_trade']

for i in range(len(new_col_names)):
    if i == 0:
        merchants_pdf[new_col_names[i]] = ""
    else:
        merchants_pdf[new_col_names[i]] = 0

In [25]:
# Check newly inserted columns.

merchants_pdf.head(3)

Unnamed: 0,merchant_name,tags,merchant_abn,tag,revenue_level,take_rate,industry_category,agriculture,arts_and_recreation,info_media_and_telecommunications,rental_hiring_and_real_estate,retail_and_wholesale_trade
0,Felis Limited,"((furniture, home furnishings and equipment sh...",10023283211,"furniture, home furnishings and equipment shop...",e,0.18,,0,0,0,0,0
1,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television a...",10142254217,"cable, satellite, and other pay television and...",b,4.22,,0,0,0,0,0
2,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops]...",10165489824,"jewelry, watch, clock, and silverware shops",b,4.4,,0,0,0,0,0


In [26]:
# Assign an industry category to each merchant entry and one-hot encode.

for i in range(len(merchants_pdf)):
    if merchants_pdf.loc[i, 'tag'] in agriculture:
        merchants_pdf.loc[i, 'industry_category'] = 'agriculture'
        merchants_pdf.loc[i, 'agriculture'] = 1
    elif merchants_pdf.loc[i, 'tag'] in arts_and_recreation:
        merchants_pdf.loc[i, 'industry_category'] = 'arts_and_recreation'
        merchants_pdf.loc[i, 'arts_and_recreation'] = 1
    elif merchants_pdf.loc[i, 'tag'] in info_media_and_telecommunications:
        merchants_pdf.loc[i, 'industry_category'] = 'info_media_and_telecommunications'
        merchants_pdf.loc[i, 'info_media_and_telecommunications'] = 1
    elif merchants_pdf.loc[i, 'tag'] in rental_hiring_and_real_estate:
        merchants_pdf.loc[i, 'industry_category'] = 'rental_hiring_and_real_estate'
        merchants_pdf.loc[i, 'rental_hiring_and_real_estate'] = 1
    else:
        merchants_pdf.loc[i, 'industry_category'] = 'retail_and_wholesale_trade'
        merchants_pdf.loc[i, 'retail_and_wholesale_trade'] = 1

In [27]:
# Check newly inserted values for industry category and their respective one-hot encoded columns.

merchants_pdf.head(3)

Unnamed: 0,merchant_name,tags,merchant_abn,tag,revenue_level,take_rate,industry_category,agriculture,arts_and_recreation,info_media_and_telecommunications,rental_hiring_and_real_estate,retail_and_wholesale_trade
0,Felis Limited,"((furniture, home furnishings and equipment sh...",10023283211,"furniture, home furnishings and equipment shop...",e,0.18,retail_and_wholesale_trade,0,0,0,0,1
1,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television a...",10142254217,"cable, satellite, and other pay television and...",b,4.22,info_media_and_telecommunications,0,0,1,0,0
2,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops]...",10165489824,"jewelry, watch, clock, and silverware shops",b,4.4,retail_and_wholesale_trade,0,0,0,0,1


Compare the number of merchants and number of unique ABN values. If different, further investigation would be required as <br>
every merchant should have their own unique ABN. In this case, no further investigation is required as every merchant has <br>
their own unique ABN.

In [33]:
print(len(merchants_pdf))
print(merchants_pdf['merchant_abn'].nunique())

4026
4026


In [34]:
# Convert modified merchants pandas data frame to pyspark dataframe.

merchants=spark.createDataFrame(merchants_pdf)

Consumer data:

In [35]:
consumer = spark.read.option("delimiter", "|").option("header", "true").csv("../data/tables/tbl_consumer.csv")
consumer = consumer.withColumnRenamed("name", "consumer_name")
consumer.show(3, vertical=True)
consumer.count()

-RECORD 0-----------------------------
 consumer_name | Yolanda Williams     
 address       | 413 Haney Gardens... 
 state         | WA                   
 postcode      | 6935                 
 gender        | Female               
 consumer_id   | 1195503              
-RECORD 1-----------------------------
 consumer_name | Mary Smith           
 address       | 3764 Amber Oval      
 state         | NSW                  
 postcode      | 2782                 
 gender        | Female               
 consumer_id   | 179208               
-RECORD 2-----------------------------
 consumer_name | Jill Jones MD        
 address       | 40693 Henry Greens   
 state         | NT                   
 postcode      | 862                  
 gender        | Female               
 consumer_id   | 1194530              
only showing top 3 rows



                                                                                

499999

Consumer user details:

In [36]:
userdetails = spark.read.parquet("../data/tables/consumer_user_details.parquet")
userdetails.show(1, vertical=True)

-RECORD 0--------------
 user_id     | 1       
 consumer_id | 1195503 
only showing top 1 row



Transactions:

In [37]:
transactions = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot/")
transactions.show(2, vertical = True)
transactions.count()

                                                                                

-RECORD 0------------------------------
 user_id        | 18478                
 merchant_abn   | 62191208634          
 dollar_value   | 63.255848959735246   
 order_id       | 949a63c8-29f7-4ab... 
 order_datetime | 2021-08-20           
-RECORD 1------------------------------
 user_id        | 2                    
 merchant_abn   | 15549624934          
 dollar_value   | 130.3505283105634    
 order_id       | 6a84c3cf-612a-457... 
 order_datetime | 2021-08-20           
only showing top 2 rows



                                                                                

3643266

1. Use consumer user details to find the consumer_id for each transaction.
2. Use consumer_id to join transaction data and consumer data.
3. Use merchant_abn to join transaction data and merchant data.

In [38]:
transactions = transactions.join(userdetails, on="user_id", how="left")
transactions = transactions.join(consumer, on="consumer_id", how="left")
transactions = transactions.join(merchants, on="merchant_abn", how="left")
transactions.show(2, vertical = True)
transactions.count()

                                                                                

-RECORD 0-------------------------------------------------
 merchant_abn                      | 56946407125          
 consumer_id                       | 1398491              
 user_id                           | 18480                
 dollar_value                      | 26.106408993894117   
 order_id                          | 04357bc2-0a50-400... 
 order_datetime                    | 2021-06-29           
 consumer_name                     | John Fischer         
 address                           | 4770 Mayer Passag... 
 state                             | QLD                  
 postcode                          | 4887                 
 gender                            | Male                 
 merchant_name                     | Quisque Fringilla... 
 tags                              | ([computers, comp... 
 tag                               | computers, comput... 
 revenue_level                     | a                    
 take_rate                         | 5.80               

                                                                                

3643266

Group transactions by merchant name, and show merchants with highest and lowest total sales.

In [39]:
import pandas as pd

df = transactions.groupBy("merchant_name").sum("dollar_value").toPandas()
df = df.sort_values(by = "sum(dollar_value)", ascending = False)
df

                                                                                

Unnamed: 0,merchant_name,sum(dollar_value)
807,,5.162072e+07
2769,Arcu Morbi Institute,2.586773e+06
1083,Amet Risus Inc.,2.569946e+06
1828,Lacus Aliquam Corporation,2.506652e+06
2556,Placerat Orci Institute,2.505284e+06
...,...,...
338,Malesuada Fames Limited,4.010094e+02
828,Ac Orci Ut Foundation,3.840798e+02
3977,Magna Praesent Interdum Industries,3.396543e+02
3936,Fringilla Mi Lacinia Incorporated,3.296614e+02


Save data as a parquet file in the curated data folder

In [40]:
transactions.write.mode('overwrite').parquet('../data/curated/rawdata.parquet')

                                                                                