In [50]:
import sys
sys.path.append('../scripts/')
from utility import read_file, create_folder, temp_record_query, temp_record_sdf

In [51]:
from pyspark.sql import SparkSession

In [52]:
spark = (
    # Create a spark session (which will run spark jobs)
    SparkSession.builder.appName("Project 2")
    # Render a preview table when a DataFrame is the last expression of a cell
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory', '10g')
    .config('spark.driver.memory', '12g')
    # The key is `maxResultSize` (singular); the previous `maxResultsSize`
    # spelling is not a Spark setting, so the limit was never applied.
    # The value uses the same size-string form as the memory settings above.
    .config('spark.driver.maxResultSize', '10g')
    .config('spark.shuffle.file.buffer', '64k')
    # .config("spark.network.timeout", "3600s")
    # .master("local[6]")
    .getOrCreate()
    )

In [53]:
# Load the two source tables with the project helper `read_file`.
# `conn` is the lookup table linking `user_id` to `consumer_id`
# (see the one-row preview printed below).
print("================ Connect Table ================")
conn = read_file(spark, 'consumer_user_details.parquet')

# `sdf` holds the consumer details; the CSV is pipe-separated, hence sep='|'.
print("================ Consumer ================")
sdf = read_file(spark, 'tbl_consumer.csv', type='csv', sep='|')

|> Loading File...
|> Loading Finished!
-RECORD 0--------------
 user_id     | 1       
 consumer_id | 1195503 
only showing top 1 row

|> Loading File...
|> Loading Finished!
-RECORD 0---------------------------------
 name        | Yolanda Williams           
 address     | 413 Haney Gardens Apt. 742 
 state       | WA                         
 postcode    | 6935                       
 gender      | Female                     
 consumer_id | 1195503                    
only showing top 1 row



In [54]:
# Preview the user_id -> consumer_id lookup table. Eager evaluation is enabled
# on the session, so a bare DataFrame renders its first rows.
conn

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975
6,407340
7,511685
8,448088
9,650435
10,1058499


- First, we preview the two tables that were just loaded (`conn` and `sdf`).

In [55]:
# Preview the consumer details table (name, address, state, postcode, gender, consumer_id).
sdf 

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975
Karen Chapman,2706 Stewart Oval...,NSW,2033,Female,407340
Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
Stephen Williams,6804 Wright Crest...,WA,6056,Male,448088
Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female,650435
Jillian Gonzales,461 Ryan Common S...,VIC,3220,Female,1058499


The consumer details table (`sdf`) and the ID lookup table (`conn`) share the `consumer_id` column, so we first merge them on that key.

In [56]:
# Attach each consumer's `user_id` from the lookup table; a left join keeps
# every consumer row even when no matching `user_id` exists.
merge_data = sdf.join(conn, on='consumer_id', how='left')

In [57]:
# Preview the merged table: consumer details plus the joined `user_id` column.
merge_data

consumer_id,name,address,state,postcode,gender,user_id
1195503,Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1
179208,Mary Smith,3764 Amber Oval,NSW,2782,Female,2
1194530,Jill Jones MD,40693 Henry Greens,NT,862,Female,3
154128,Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,4
712975,Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,5
407340,Karen Chapman,2706 Stewart Oval...,NSW,2033,Female,6
511685,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,7
448088,Stephen Williams,6804 Wright Crest...,WA,6056,Male,8
650435,Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female,9
1058499,Jillian Gonzales,461 Ryan Common S...,VIC,3220,Female,10


Next, we check the data for missing (null) values; the checks below are run on the consumer table `sdf`.

In [58]:
import pyspark.sql.functions as f
from functools import reduce

# One "is null" test per column, OR-ed together into a single row filter,
# so the result lists every row that has at least one missing value.
null_checks = [f.col(column_name).isNull() for column_name in sdf.columns]
any_null = reduce(lambda left, right: left | right, null_checks)
sdf.where(any_null).show()

+----+-------+-----+--------+------+-----------+
|name|address|state|postcode|gender|consumer_id|
+----+-------+-----+--------+------+-----------+
+----+-------+-----+--------+------+-----------+



In [59]:
# Count nulls per column inside Spark and collect only the one-row summary,
# instead of pulling the whole table onto the driver with `sdf.toPandas()`.
# `f` is `pyspark.sql.functions`, imported in the previous cell.
# `when` without `otherwise` yields null for non-matching rows and `count`
# skips nulls, so each column's count is its number of null entries.
null_counts = sdf.select(
    [f.count(f.when(f.col(c).isNull(), f.lit(1))).alias(c) for c in sdf.columns]
)
print(null_counts.toPandas().iloc[0])

name           0
address        0
state          0
postcode       0
gender         0
consumer_id    0
dtype: int64


In [60]:
# Show the consumer table again after the null checks (same preview as the earlier cell).
sdf

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975
Karen Chapman,2706 Stewart Oval...,NSW,2033,Female,407340
Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
Stephen Williams,6804 Wright Crest...,WA,6056,Male,448088
Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female,650435
Jillian Gonzales,461 Ryan Common S...,VIC,3220,Female,1058499


Now that we have a basic understanding of the data, we save the merged table so that it can later be joined with the merchant table.

In [61]:
def temp_record_sdf(sdf: 'pyspark.sql.DataFrame', path = '../data/temp', overwrite = False, partition_by = None):
    '''
    Save a Spark DataFrame as parquet so later steps can reload it.

    NOTE(review): this notebook also imports `temp_record_sdf` from `utility`;
    this definition shadows that import.

    Parameters
    ----------
    sdf : spark dataframe
        Data to save (anything exposing the `DataFrame.write` API).
    path : str
        Path to save data, default `../data/temp`
    overwrite : bool
        Whether to replace data that already exists at `path`, default False
    partition_by : str or list of str, optional
        Column name(s) to partition the output by. Default None (no
        partitioning). Previously the overwrite-a-folder branch always
        partitioned by `order_datetime`, which fails for tables without
        that column; pass `partition_by='order_datetime'` to keep that layout.

    Returns
    -------
    None

    Raises
    ------
    Any exception raised by the Spark writer is propagated instead of being
    reported as "folder already exist".

    Examples
    --------
    >>> temp_record_sdf(sdf, path='../data/temp')
    |> Waitting for saving...
    |> Save Successfully!
    >>> temp_record_sdf(sdf, path='../data/temp')
    |> Waitting for saving...
    |> The folder already exist! Change the attr `overwrite` to cover the origin data.
    >>> temp_record_sdf(sdf, path='../data/temp', overwrite=True)
    |> Waitting for saving...
    |> Save Successfully!

    When `path` names an existing file (e.g. `../data/temp.parquet`):

    >>> temp_record_sdf(sdf, path='../data/temp.parquet')
    |> The name duplicated with a file!
       Change the name or change the attr `overwrite` to cover the origin data.
    >>> temp_record_sdf(sdf, path='../data/temp.parquet', overwrite=True)
    |> Waitting for saving...
    |> Save Successfully!
    '''
    # Local import: no cell of this notebook imports `os`, so relying on a
    # module-level name would only work through leftover kernel state.
    import os

    writer = sdf.write
    if partition_by:
        columns = [partition_by] if isinstance(partition_by, str) else list(partition_by)
        writer = writer.partitionBy(*columns)

    # Nothing exists at `path` yet: a plain write is safe.
    if not os.path.exists(path):
        print('|> Waitting for saving...')
        writer.parquet(path)
        print('|> Save Successfully!')

    # A folder already exists at `path`: replace it only on explicit request.
    elif os.path.isdir(path):
        print('|> Waitting for saving...')
        if overwrite:
            writer.parquet(path, mode = 'overwrite')
            print('|> Save Successfully!')
        else:
            print('|> The folder already exist! Change the attr `overwrite` to cover the origin data.')

    # `path` names an existing regular file.
    elif os.path.isfile(path):
        if overwrite:
            print('|> Waitting for saving...')
            writer.parquet(path, mode = 'overwrite')
            print('|> Save Successfully!')
        else:
            print('|> The name duplicated with a file!\n   Change the name or change the attr `overwrite` to cover the origin data.')

    return

In [62]:
# Persist the merged consumer table under ../data/curated. With the default
# overwrite=False, an existing folder at this path is left untouched and a
# message is printed instead (see the output below).
temp_record_sdf(merge_data, '../data/curated/new_consumer_data.parquet')

|> Waitting for saving...
|> The folder already exist! Change the attr `overwrite` to cover the origin data.


Next, we start experimenting with visualizations of how consumers are distributed across states.

In [63]:
# Count consumers per state in Spark, then bring the small summary to pandas.
df = sdf.groupby('state').count().toPandas()

# Nested list of [state, count] rows, the shape the plotting cells consume.
result = df.values.tolist()
result

[['NT', 7764],
 ['ACT', 4664],
 ['SA', 54973],
 ['TAS', 18878],
 ['WA', 79146],
 ['QLD', 72861],
 ['VIC', 117525],
 ['NSW', 144188]]

In [64]:
# Full state/territory names keyed by the abbreviation used in the data.
state_names = {
    'WA': 'Western Australia',
    'NT': 'Northern Territory',
    'ACT': 'Australian Capital Territory',
    'SA': 'South Australia',
    'TAS': 'Tasmania',
    'QLD': 'Queensland',
    'VIC': 'Victoria',
    'NSW': 'New South Wales',
}

# Axis values for the line chart: the state label and its consumer count,
# captured before the labels in `result` are expanded below.
x_vals = [row[0] for row in result]
y_vals = [row[1] for row in result]

# Expand the abbreviations in place so `result` carries full names for the
# map and pie charts; labels without a mapping are left unchanged.
# NOTE: re-running this cell without re-running the previous one picks up the
# already-expanded names in `x_vals`.
for row in result:
    row[0] = state_names.get(row[0], row[0])


In [65]:
# SECURITY NOTE(review): the assignment below swaps Python's default HTTPS
# context factory for the unverified one, which turns off TLS certificate
# verification for code in this kernel that relies on the default context.
# Presumably a workaround so `register_url` can download the map assets;
# confirm, and prefer fixing the local certificate store instead.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

from pyecharts import options as opts
from pyecharts.charts import Map
from pyecharts.faker import Collector, Faker
from pyecharts.datasets import register_url
# Register an extra map source for pyecharts (needs network access).
register_url("https://echarts-maps.github.io/echarts-countries-js/")
# Choropleth of consumer counts per state. `result` holds [state name, count]
# pairs. "澳大利亚" is Chinese for "Australia" and is passed as the map type;
# presumably the name under which the registered source exposes the Australia
# map (TODO confirm).
c = (
    Map()
    .add("The consumer of Australiar", result, "澳大利亚")
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Map-VisualMap"),
        # Colour scale upper bound, just above the largest state count (NSW, 144188).
        visualmap_opts=opts.VisualMapOpts(max_=150000),
    )
)
c.render_notebook()
#c.render("demo11.html")

In [66]:
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.faker import Faker

# Option objects for the donut chart of consumers per state.
pie_title = opts.TitleOpts(title="Pie-Radius")
pie_legend = opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="2%")
pie_labels = opts.LabelOpts(formatter="{b}: {c}")

# `result` holds [state name, count] pairs; inner/outer radius make it a ring.
c = Pie()
c.add("", result, radius=["40%", "75%"])
c.set_global_opts(title_opts=pie_title, legend_opts=pie_legend)
c.set_series_opts(label_opts=pie_labels)
c.render_notebook()

In [67]:
import pyecharts.options as opts
from pyecharts.charts import Line
from pyecharts.faker import Faker

# Line chart of consumer counts per state, using the abbreviations in `x_vals`
# and the matching counts in `y_vals`. The unused random `Faker` sample that
# was generated here before has been removed.
c = (
    Line()
    .add_xaxis(x_vals)
    .add_yaxis("consumer", y_vals, is_connect_nones=True)
    .set_global_opts(title_opts=opts.TitleOpts(title="Line-Australia"))
)
c.render_notebook()