### read_file

In [1]:
from pyspark.sql import SparkSession

In [2]:
def read_file(spark_session: SparkSession, file_name, ptw='../data/tables/', type='parquet', truncate=80, sep=','):
    '''
To read different type of file use spark. And show the first metadata after read. 

Parameters
----------
spark_session : DataFrame
file_name: str
    The full name of the file to read. 
ptw : 
    The relative path of the file to read, default '../data/tables/'
type : {'parquet', 'csv'}, default 'parquet'
truncate : int, default 80
    Parameter of `show` function spark dataframe, which control the maximum 
    number of characters per row.
sep : str, default ','
    For csv reading, control the seperate character.


Returns
-------
Spark DataFrame
    A DataFrame of the read file.


Examples
--------
>>> sdf = read_file(spark, 'tbl_merchants.parquet')
|> Loading File...
|> Loading Finished!
-RECORD 0----------------------------------------------------------------------------------------
 name         | Felis Limited                                                                    
 tags         | ((furniture, home furnishings and equipment shops, and manufacturers, except ... 
 merchant_abn | 10023283211                                                                      
only showing top 1 row

>>> sdf = read_file(spark, 'tbl_merchants.parquet', truncate=20)
|> Loading File...
|> Loading Finished!
-RECORD 0----------------------------
 name         | Felis Limited        
 tags         | ((furniture, home... 
 merchant_abn | 10023283211          
only showing top 1 row

>>> sdf = read_file(spark, 'tbl_consumer.csv', type='csv', sep='|')
|> Loading File...
|> Loading Finished!
-RECORD 0---------------------------------
 name        | Yolanda Williams           
 address     | 413 Haney Gardens Apt. 742 
 state       | WA                         
 postcode    | 6935                       
 gender      | Female                     
 consumer_id | 1195503                    
only showing top 1 row
    '''

    # read file
    print('|> Loading File...')
    if type == 'csv':
        sdf = spark_session.read.csv(f'{ptw}{file_name}', sep=sep, header=True)

    elif type == 'parquet':
        sdf = spark_session.read.parquet(f'{ptw}{file_name}')
    print('|> Loading Finished!')

    # print the first row of data 
    sdf.show(1, vertical=True, truncate=truncate)
    return sdf

In [16]:
spark = (
    # Create a spark session (which will run spark jobs)
    SparkSession.builder.appName("Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory','10g')
    .config('spark.driver.memory','12g')
    .config('spark.driver.maxResultsSize', '10 GiB')
    .config('spark.shuffle.file.buffer', '64k')
    # .config("spark.network.timeout", "3600s")
    # .master("local[6]")
    .getOrCreate()
    )
sdf = read_file(spark, 'tbl_merchants.parquet')

|> Loading File...
|> Loading Finished!
-RECORD 0----------------------------------------------------------------------------------------
 name         | Felis Limited                                                                    
 tags         | ((furniture, home furnishings and equipment shops, and manufacturers, except ... 
 merchant_abn | 10023283211                                                                      
only showing top 1 row



In [17]:
sdf.show(10)

+--------------------+--------------------+------------+
|                name|                tags|merchant_abn|
+--------------------+--------------------+------------+
|       Felis Limited|((furniture, home...| 10023283211|
|Arcu Ac Orci Corp...|([cable, satellit...| 10142254217|
|    Nunc Sed Company|([jewelry, watch,...| 10165489824|
|Ultricies Digniss...|([wAtch, clock, a...| 10187291046|
| Enim Condimentum PC|([music shops - m...| 10192359162|
|       Fusce Company|[(gift, card, nov...| 10206519221|
|Aliquam Enim Inco...|[(computers, comP...| 10255988167|
|    Ipsum Primis Ltd|[[watch, clock, a...| 10264435225|
|Pede Ultrices Ind...|([computer progra...| 10279061213|
|           Nunc Inc.|[(furniture, home...| 10323485998|
+--------------------+--------------------+------------+
only showing top 10 rows



In [18]:
sdf = read_file(spark, 'consumer_user_details.parquet')

|> Loading File...
|> Loading Finished!
-RECORD 0--------------
 user_id     | 1       
 consumer_id | 1195503 
only showing top 1 row



In [19]:
sdf.show(10)

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      1|    1195503|
|      2|     179208|
|      3|    1194530|
|      4|     154128|
|      5|     712975|
|      6|     407340|
|      7|     511685|
|      8|     448088|
|      9|     650435|
|     10|    1058499|
+-------+-----------+
only showing top 10 rows



In [20]:
sdf = read_file(spark, 'transactions_20210228_20210827_snapshot')

|> Loading File...


                                                                                

|> Loading Finished!
-RECORD 0----------------------------------------------
 user_id        | 18478                                
 merchant_abn   | 62191208634                          
 dollar_value   | 63.255848959735246                   
 order_id       | 949a63c8-29f7-4ab0-ada4-99ac50a88952 
 order_datetime | 2021-08-20                           
only showing top 1 row



In [21]:
sdf.show(10)

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|  18478| 62191208634|63.255848959735246|949a63c8-29f7-4ab...|    2021-08-20|
|      2| 15549624934| 130.3505283105634|6a84c3cf-612a-457...|    2021-08-20|
|  18479| 64403598239|120.15860593212783|b10dcc33-e53f-425...|    2021-08-20|
|      3| 60956456424| 136.6785200286976|0f09c5a5-784e-447...|    2021-08-20|
|  18479| 94493496784| 72.96316578355305|f6c78c1a-4600-4c5...|    2021-08-20|
|      3| 76819856970|  448.529684285612|5ace6a24-cdf0-4aa...|    2021-08-20|
|  18479| 67609108741|  86.4040605836911|d0e180f0-cb06-42a...|    2021-08-20|
|      3| 34096466752| 301.5793450525113|6fb1ff48-24bb-4f9...|    2021-08-20|
|  18482| 70501974849| 68.75486276223054|8505fb33-b69a-412...|    2021-08-20|
|      4| 49891706470| 48.89796461900801|ed11e477-b09f-4ae...|  

In [24]:
csv_file = read_file(spark, 'tbl_consumer.csv', type='csv', sep='|')

|> Loading File...
|> Loading Finished!
-RECORD 0---------------------------------
 name        | Yolanda Williams           
 address     | 413 Haney Gardens Apt. 742 
 state       | WA                         
 postcode    | 6935                       
 gender      | Female                     
 consumer_id | 1195503                    
only showing top 1 row



In [25]:
csv_file.show(10)

+-----------------+--------------------+-----+--------+------+-----------+
|             name|             address|state|postcode|gender|consumer_id|
+-----------------+--------------------+-----+--------+------+-----------+
| Yolanda Williams|413 Haney Gardens...|   WA|    6935|Female|    1195503|
|       Mary Smith|     3764 Amber Oval|  NSW|    2782|Female|     179208|
|    Jill Jones MD|  40693 Henry Greens|   NT|     862|Female|    1194530|
|  Lindsay Jimenez|00653 Davenport C...|  NSW|    2780|Female|     154128|
|Rebecca Blanchard|9271 Michael Mano...|   WA|    6355|Female|     712975|
|    Karen Chapman|2706 Stewart Oval...|  NSW|    2033|Female|     407340|
|     Andrea Jones|   122 Brandon Cliff|  QLD|    4606|Female|     511685|
| Stephen Williams|6804 Wright Crest...|   WA|    6056|  Male|     448088|
|  Stephanie Reyes|5813 Denise Land ...|  NSW|    2482|Female|     650435|
| Jillian Gonzales|461 Ryan Common S...|  VIC|    3220|Female|    1058499|
+-----------------+------

### create_folder

In [6]:
import os

In [7]:
def create_folder(path):
    '''
Create folder.

Parameters
----------
path : str
    The relative path of the new folder. 


Examples
--------
>>> create_folder('../data/temp')
|> Create Successfully!

>>> create_folder('../data/tables/consumer_user_details.parquet')
|> The folder name duplicated with a file!
|> Files already exist under the upper folder:
   ['transactions_20210228_20210827_snapshot', '.DS_Store', '.gitkeep', 'consumer_user_details.parquet', 'tbl_consumer.csv', 'tbl_merchants.parquet']

>>> create_folder('../data/tables')
|> The folder already exist!
|> Files already exist under this folder:
   ['transactions_20210228_20210827_snapshot', '.DS_Store', '.gitkeep', 'consumer_user_details.parquet', 'tbl_consumer.csv', 'tbl_merchants.parquet']
    '''

    # folder should not already exist
    if not os.path.exists(path):
        os.makedirs(path)
        print('|> Create Successfully!')
    
    # if the folder aleady created, the print out the files under this folder
    elif os.path.isdir(path):
        print(f'|> The folder already exist!\n|> Files already exist under this folder:\n   {os.listdir(path)}')
    
    # the name of the new folder is the same as a file already exist under the upper folder
    elif os.path.isfile(path):
        upper_path = '/'.join(path.split('/')[:-1])
        print(f'|> The folder name duplicated with a file!\n|> Files already exist under the upper folder:\n   {os.listdir( upper_path )}')
    return 

In [8]:
create_folder('../data/temp')

|> Create Successfully!


### temp_record_sdf

In [10]:
def temp_record_sdf(sdf:SparkSession, path = '../data/temp', overwrite = False):
    '''
Save current progress for future steps

Parameters
----------
sdf : spark dataframe
path : str
    Path to save data, defualt as `../data/temp`
overwrite : bool
    Set if cover the origin data, defualt False

Examples
--------
>>> temp_record_sdf(sdf, path='../data/temp')
>>> temp_record_sdf(sdf, path='../data/temp')
>>> temp_record_sdf(sdf, path='../data/temp', overwrite=True)
|> Waitting for saving...
|> Save Successfully!
--
|> Waitting for saving...
|> The folder already exist! Change the attr `overwrite` to cover the origin data.
-- 
|> Waitting for saving...
|> Save Successfully!

>>> print(os.listdir( '../data' ))
>>> print(os.path.isfile( '../data/temp.parquet' ))
>>> temp_record_sdf(sdf, path='../data/temp.parquet')
>>> temp_record_sdf(sdf, path='../data/temp.parquet', overwrite=True)
['tables', '.gitkeep', 'README.md', 'temp.parquet', 'curated']
--
True
--
|> The name duplicated with a file!
   Change the name or change the attr `overwrite` to cover the origin data.
--
|> Waitting for saving...
|> Save Successfully!
    '''


    # folder should not already exist
    if not os.path.exists(path):
        print('|> Waitting for saving...')
        sdf.write.parquet(path)
        print('|> Save Successfully!')
    
    # if the folder aleady created, the print out the files under this folder
    elif os.path.isdir(path):
        try:
            print('|> Waitting for saving...')
            if (overwrite):
                sdf.write.partitionBy('order_datetime').parquet(path, mode = 'overwrite')
            else:
                sdf.write.parquet(path)
            print('|> Save Successfully!')
        except Exception:
            print('|> The folder already exist! Change the attr `overwrite` to cover the origin data.')
    
    # the name of the new folder is the same as a file already exist under the upper folder
    elif os.path.isfile(path):
        if (overwrite):
            print('|> Waitting for saving...')
            sdf.write.parquet(path, mode = 'overwrite')
            print('|> Save Successfully!')
        else:
            print(f'|> The name duplicated with a file!\n   Change the name or change the attr `overwrite` to cover the origin data.')

    return 

In [11]:
sdf.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [13]:
temp_record_sdf(sdf, path='../data/temp')
# temp_record_sdf(sdf, path='../data/temp')
# temp_record_sdf(sdf, path='../data/temp', overwrite=True)

|> Waitting for saving...
|> The folder already exist! Change the attr `overwrite` to cover the origin data.


### temp_record_query

In [14]:
def temp_record_query(sql_query:SparkSession.sql, *cols,\
    path='../data/temp', overwrite = False):
    '''
Save current progress for future steps

Parameters
----------
sql_query : spark sql query
*cols : 'ColumnsOrName'
    Name of columns.
path : str
    Path to save data, defualt as `../data/temp`
overwrite : bool
    Set if cover the origin data, default False


Examples
--------
>>> sql_query = sdf.orderBy('merchant_abn')
>>> temp_record_query(sql_query, 'name', 'tags', 'merchant_abn')
|> Waitting for saving...
|> Save Successfully!
    '''
    # convert to spark dataframe and save
    temp_record_sdf(sql_query.toDF(*cols), path=path, overwrite=overwrite)
    return 

In [15]:
# sample
sql_query = sdf.orderBy('merchant_abn')
temp_record_query(sql_query, 'name', 'tags', 'merchant_abn', overwrite=True)

IllegalArgumentException: requirement failed: The number of columns doesn't match.
Old column names (5): user_id, merchant_abn, dollar_value, order_id, order_datetime
New column names (3): name, tags, merchant_abn

### transfor between DataFrame and RDD

In [None]:
from pyspark.sql import Row

# DataFrame to RDD
srdd = sdf.rdd.map ( lambda p: Row(user_id=p.user_id, merchant_abn=p.merchant_abn, \
    dollar_value=p.dollar_value, order_id=p.order_id, order_datetime=p.order_datetime) )

# RDD to DataFrame
sdf = spark.createDataFrame( srdd )

In [32]:
import sys
path = os.path.join(os.path.dirname(os.getcwd()), "scripts")
sys.path.append(path)
from utils import Utils_function

In [33]:
sys.path

['/Users/runyuyang/Desktop/generic-buy-now-pay-later-project-group-24/notebooks',
 '/private/var/folders/px/8f9gkhvs7wnf17_m27m6s_fc0000gn/T/spark-5a426a32-7419-4b8f-bd90-2ebf41846fbf/userFiles-08d20a81-81b7-4a4a-841f-513a1ce75fa3',
 '/Users/runyuyang/.vscode/extensions/ms-toolsai.jupyter-2022.6.1201981810/pythonFiles',
 '/Users/runyuyang/.vscode/extensions/ms-toolsai.jupyter-2022.6.1201981810/pythonFiles/lib/python',
 '/Library/Frameworks/Python.framework/Versions/3.7/lib/python37.zip',
 '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7',
 '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/lib-dynload',
 '',
 '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages',
 '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/IPython/extensions',
 '/Users/runyuyang/.ipython',
 'scripts',
 '/Users/runyuyang/Desktop/generic-buy-now-pay-later-project-group-24/scripts']