Packages

In [1]:
import os
import numpy
import pandas as pd
import tqdm
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
import sys
sys.path.append('../scripts/')
from read_utils import read_file, create_folder, temp_record_query, temp_record_sdf

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = (
    # Create (or reuse) the Spark session that runs every job in this notebook.
    SparkSession.builder.appName("Project 2")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory', '10g')
    .config('spark.driver.memory', '12g')
    # The property is `maxResultSize` (singular); the previous spelling
    # `maxResultsSize` is not a Spark setting and was silently ignored.
    # Spark byte sizes take suffixes such as `g`/`gb`, not `GiB`.
    .config('spark.driver.maxResultSize', '10g')
    # .config("spark.network.timeout", "3600s")
    # .master("local[6]")
    .getOrCreate()
    )

22/10/04 09:43:41 WARN Utils: Your hostname, Runyus-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.3.12 instead (on interface en0)
22/10/04 09:43:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 09:43:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the curated transactions table through the project helper.
# NOTE(review): read_file lives in ../scripts/read_utils.py; it presumably
# returns a Spark DataFrame (later cells call .show() on it) — confirm there.
sdf = read_file(
    spark,
    'filled.parquet',
    '../data/curated/',
)

|> Loading File...


                                                                                

|> Loading Finished!




-RECORD 0--------------------------------------------------------------------------
 user_id           | 7                                                             
 merchant_abn      | 17488304283                                                   
 dollar_value      | 67.95495287248738                                             
 order_id          | e637ca66-ed07-42c3-a39b-d49a1f97dde8                          
 merchant_name     | Posuere Cubilia Curae Corporation                             
 tags              | cable, satellite, and other pay television and radio services 
 take_rate         | 6.18                                                          
 type              | a                                                             
 postcode          | 4606                                                          
 consumer_id       | 511685                                                        
 name              | Andrea Jones                                           

                                                                                

In [5]:
# String -> numeric index for each categorical column. With
# handleInvalid="skip" the fitted indexers filter out rows they cannot index
# rather than raising.
gender_indexer = StringIndexer(
    inputCol="gender", outputCol="genderIndex", handleInvalid="skip"
)
type_indexer = StringIndexer(
    inputCol="type", outputCol="typeIndex", handleInvalid="skip"
)
tag_indexer = StringIndexer(
    inputCol="tags", outputCol="tagIndex", handleInvalid="skip"
)

# Only the tag index is additionally expanded into a one-hot vector.
onehotencoder_tag_vector = OneHotEncoder(inputCol="tagIndex", outputCol="tag_vec")

# Stage order matters: the encoder consumes the column produced by tag_indexer.
pipeline = Pipeline(
    stages=[
        gender_indexer,
        type_indexer,
        tag_indexer,
        onehotencoder_tag_vector,
    ]
)

In [6]:
# Peek at the first five rows before any encoding is applied.
sdf.show(n=5)

+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------+--------------------+-----+------+----------+-----------------+---------------+----------+-------+--------------+-----------------+
|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|        name|             address|state|gender|POA_CODE21|        2021_popu|New cases / day|     month|  total|order_datetime|__index_level_0__|
+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------+--------------------+-----+------+----------+-----------------+---------------+----------+-------+--------------+-----------------+
|      7| 17488304283| 67.95495287248738|e637ca66-ed07-42c...|Posuere Cubilia C...|cable, satellite,...|     6.18|   a|    4606|     511685|Andre

In [7]:
# Fit the indexers/encoder on the table, then apply the fitted model to the
# same table to append the index and vector columns.
fitted_pipeline = pipeline.fit(sdf)
sdf_transformed = fitted_pipeline.transform(sdf)
sdf_transformed.show(n=5)

                                                                                

+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------+--------------------+-----+------+----------+-----------------+---------------+----------+-------+--------------+-----------------+-----------+---------+--------+---------------+
|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|        name|             address|state|gender|POA_CODE21|        2021_popu|New cases / day|     month|  total|order_datetime|__index_level_0__|genderIndex|typeIndex|tagIndex|        tag_vec|
+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------+--------------------+-----+------+----------+-----------------+---------------+----------+-------+--------------+-----------------+-----------+---------+--------+---------------+
|   

In [8]:
# Give the two awkwardly named columns descriptive names.
for old_name, new_name in (("2021_popu", 'population'), ('total', 'total_retail')):
    sdf_transformed = sdf_transformed.withColumnRenamed(old_name, new_name)

In [9]:
# Display the frame after the rename (20 rows, truncated cells: show()'s defaults).
sdf_transformed.show(n=20, truncate=True)

+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------------+--------------------+-----+------+----------+-----------------+---------------+----------+------------+--------------+-----------------+-----------+---------+--------+---------------+
|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|              name|             address|state|gender|POA_CODE21|       population|New cases / day|     month|total_retail|order_datetime|__index_level_0__|genderIndex|typeIndex|tagIndex|        tag_vec|
+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------------+--------------------+-----+------+----------+-----------------+---------------+----------+------------+--------------+-----------------+-----------+-------

In [10]:
# Persist the encoded table as parquet, one sub-folder per order_datetime.
path = '../data/curated/changed_data'

# NOTE(review): create_folder comes from read_utils; its result is presumably
# truthy only when the folder was newly created — confirm in that module.
# Truthy result -> append, otherwise overwrite.
write_mode = 'append' if create_folder(path) else 'overwrite'
sdf_transformed.write.partitionBy('order_datetime').parquet(path, mode=write_mode)

|> The folder already exist!
|> Files already exist under this folder:
   ['order_datetime=2021-10-19', 'order_datetime=2021-10-26', 'order_datetime=2021-10-21', 'order_datetime=2021-10-28', 'order_datetime=2021-10-17', 'order_datetime=2021-10-10', 'order_datetime=2022-02-24', 'order_datetime=2022-01-03', 'order_datetime=2022-01-04', 'order_datetime=2022-02-23', 'order_datetime=2022-02-15', 'order_datetime=2022-02-12', 'order_datetime=2021-10-11', 'order_datetime=2021-10-29', 'order_datetime=2021-10-16', 'order_datetime=2021-10-20', 'order_datetime=2021-10-18', 'order_datetime=2021-10-27', 'order_datetime=2022-02-13', 'order_datetime=2022-02-14', 'order_datetime=2022-02-22', 'order_datetime=2022-01-05', 'order_datetime=2022-01-02', 'order_datetime=2022-02-25', 'order_datetime=2021-09-25', 'order_datetime=2021-12-09', 'order_datetime=2021-11-11', 'order_datetime=2021-11-29', 'order_datetime=2021-09-22', 'order_datetime=2021-11-16', 'order_datetime=2021-12-31', 'order_datetime=2021-09-14

                                                                                

- Figure out what `tag_vec` encodes

In [11]:
# Fraction of rows kept for the pandas-side inspection below.
SAMPLE_SIZE = 0.05

# Fixed seed so the draw is repeatable.
# NOTE(review): this rebinds sdf_transformed, so re-running the cell samples
# the already-sampled frame again, and every later cell sees only the sample.
sdf_transformed = sdf_transformed.sample(fraction=SAMPLE_SIZE, seed=0)

In [12]:
# Collect the (sampled) Spark frame to the driver as a pandas DataFrame.
df_transformed = sdf_transformed.toPandas()
df_transformed.head(n=5)

                                                                                

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,take_rate,type,postcode,consumer_id,...,population,New cases / day,month,total_retail,order_datetime,__index_level_0__,genderIndex,typeIndex,tagIndex,tag_vec
0,10203,86578477987,23.913531,54f331ff-9151-4554-b0ff-dd0298f65c53,Leo In Consulting,"watch, clock, and jewelry repair shops",6.43,a,3808,1112199,...,1910.435591030362,1122,2021-08-01,29261.9,2021-08-28,15145,0.0,0.0,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,9792,21772962346,136.351214,f6c7d575-4860-4c2f-9de9-ebf34795e01d,Purus Gravida Sagittis Ltd,"florists supplies, nursery stock, and flowers",6.63,a,6320,470865,...,704.0596653270673,1122,2021-08-01,29261.9,2021-08-28,15078,0.0,0.0,5.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
2,9765,26599529197,18.545336,965c7cde-113e-483d-9c34-84c3241a2836,Neque Company,"digital goods: books, movies, music",3.91,b,6937,1222050,...,25647.94680893945,1122,2021-08-01,29261.9,2021-08-28,15074,0.0,1.0,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,9895,99009287608,62.059175,0e924be2-d6e3-4001-bc6e-761b9530fca6,Nunc Risus LLP,"computer programming , data processing, and in...",3.15,b,3916,511779,...,737.1457842967403,1122,2021-08-01,29261.9,2021-08-28,15096,0.0,1.0,8.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
4,9858,49322182190,89.476384,03ed28e3-e215-41a3-8fd0-ec6eb0f6dc79,Gravida Mauris Incorporated,"watch, clock, and jewelry repair shops",6.35,a,6150,548779,...,16506.44877310965,1122,2021-08-01,29261.9,2021-08-28,15091,1.0,0.0,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


- Figure out which index corresponds to each original category value

gender

In [14]:
# Keep one row per distinct (gender, genderIndex) pair so the label -> index
# mapping can be read off. The filter keeps only names that exist in the frame.
gender_key_cols = [
    name for name in sdf_transformed.columns if name in ('gender', 'genderIndex')
]
gender_correspond = sdf_transformed.dropDuplicates(subset=gender_key_cols)

In [15]:
# Hide the columns that are irrelevant to the gender mapping, then display.
gender_hidden_cols = [
    'order_id', 'merchant_name', 'tags', 'type', 'name', 'address', 'state',
    'population', 'month', '__index_level_0__', 'typeIndex', 'tagIndex', 'tag_vec',
]
gender_correspond.drop(*gender_hidden_cols).show()



+-------+------------+------------------+---------+--------+-----------+-----------+----------+---------------+------------+--------------+-----------+
|user_id|merchant_abn|      dollar_value|take_rate|postcode|consumer_id|     gender|POA_CODE21|New cases / day|total_retail|order_datetime|genderIndex|
+-------+------------+------------------+---------+--------+-----------+-----------+----------+---------------+------------+--------------+-----------+
|   9858| 49322182190| 89.47638379424112|     6.35|    6150|     548779|     Female|      6150|          1,122|     29261.9|    2021-08-28|        1.0|
|  10203| 86578477987|23.913530850853704|     6.43|    3808|    1112199|       Male|      3808|          1,122|     29261.9|    2021-08-28|        0.0|
|  10699| 98545158925| 45.89774538496057|     2.13|    6043|     608778|Undisclosed|      6043|          1,122|     29261.9|    2021-08-28|        2.0|
+-------+------------+------------------+---------+--------+-----------+-----------+----

                                                                                

type

In [16]:
# Keep one row per distinct (type, typeIndex) pair so the label -> index
# mapping can be read off. The filter keeps only names that exist in the frame.
type_key_cols = [
    name for name in sdf_transformed.columns if name in ('type', 'typeIndex')
]
type_correspond = sdf_transformed.dropDuplicates(subset=type_key_cols)

In [17]:
# Hide the columns that are irrelevant to the type mapping, then display.
type_hidden_cols = [
    'order_id', 'merchant_name', 'tags', 'gender', 'name', 'address', 'state',
    'month', '__index_level_0__', 'genderIndex', 'tagIndex', 'tag_vec',
]
type_correspond.drop(*type_hidden_cols).show()



+-------+------------+------------------+---------+----+--------+-----------+----------+-----------------+---------------+------------+--------------+---------+
|user_id|merchant_abn|      dollar_value|take_rate|type|postcode|consumer_id|POA_CODE21|       population|New cases / day|total_retail|order_datetime|typeIndex|
+-------+------------+------------------+---------+----+--------+-----------+----------+-----------------+---------------+------------+--------------+---------+
|  10203| 86578477987|23.913530850853704|     6.43|   a|    3808|    1112199|      3808|1910.435591030362|          1,122|     29261.9|    2021-08-28|      0.0|
|   9765| 26599529197|18.545335792143806|     3.91|   b|    6937|    1222050|      6060|25647.94680893945|          1,122|     29261.9|    2021-08-28|      1.0|
|  10699| 98545158925| 45.89774538496057|     2.13|   c|    6043|     608778|      6043|206.6451754029667|          1,122|     29261.9|    2021-08-28|      2.0|
|   9174| 65453072511|431.93100670

                                                                                

tag

In [18]:
# One row per distinct (tags, tagIndex, tag_vec) combination, so the
# tag -> index -> one-hot mapping can be read off.
# The source column is named 'tags'; the previous filter listed 'tag', which
# matches no column, so the raw label was silently left out of the key.
# NOTE(review): the result is stored under `type_correspond` (overwriting the
# type mapping) because the following display cell reads that name.
type_correspond = sdf_transformed.drop_duplicates(subset=[c for c in sdf_transformed.columns if c in ['tags', 'tagIndex', 'tag_vec']])

In [19]:
# Hide the columns that are irrelevant to the tag mapping, then display.
# `type_correspond` holds the tag mapping at this point (rebound in the cell above).
tag_hidden_cols = [
    'order_id', 'merchant_name', 'type', 'gender', 'name', 'address', 'state',
    'month', '__index_level_0__', 'genderIndex', 'typeIndex',
]
type_correspond.drop(*tag_hidden_cols).show()

22/10/04 09:47:40 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.




+-------+------------+------------------+--------------------+---------+--------+-----------+----------+-----------------+---------------+------------+--------------+--------+---------------+
|user_id|merchant_abn|      dollar_value|                tags|take_rate|postcode|consumer_id|POA_CODE21|       population|New cases / day|total_retail|order_datetime|tagIndex|        tag_vec|
+-------+------------+------------------+--------------------+---------+--------+-----------+----------+-----------------+---------------+------------+--------------+--------+---------------+
|  10590| 17324645993| 4.876447995446519|tent and awning s...|     5.73|    2644|      48738|      2644|4971.786201014342|          1,122|     29261.9|    2021-08-28|     0.0| (24,[0],[1.0])|
|  14059| 94378706737|29.862323249599637|gift, card, novel...|     3.29|    3155|    1219235|      3155|23920.03564972908|          1,122|     29261.9|    2021-08-28|     1.0| (24,[1],[1.0])|
|   9765| 26599529197|18.545335792143806

                                                                                

- Drop unused columns

In [42]:
# Remove identifiers, free-text fields, and the raw categorical columns whose
# indexed counterparts (genderIndex / typeIndex / tagIndex / tag_vec) remain.
unused_cols = (
    'order_id', 'merchant_name', 'tags', 'type', 'name',
    'address', 'state', 'gender', 'month', '__index_level_0__',
)
sdf_transformed = sdf_transformed.drop(*unused_cols)

In [43]:
# Display the slimmed-down frame (20 rows, truncated cells: show()'s defaults).
sdf_transformed.show(n=20, truncate=True)

+-------+------------+------------------+---------+--------+-----------+----------+-----------------+---------------+-------+--------------+-----------+---------+--------+---------------+
|user_id|merchant_abn|      dollar_value|take_rate|postcode|consumer_id|POA_CODE21|        2021_popu|New cases / day|  total|order_datetime|genderIndex|typeIndex|tagIndex|        tag_vec|
+-------+------------+------------------+---------+--------+-----------+----------+-----------------+---------------+-------+--------------+-----------+---------+--------+---------------+
|      7| 17488304283| 67.95495287248738|     6.18|    4606|     511685|      4606|557.0307901493427|          1,122|29261.9|    2021-08-28|        1.0|      0.0|     4.0| (24,[4],[1.0])|
|  10087| 41251795489|43.036903367549215|     2.91|    2573|     624829|      2573|1705.489731489324|          1,122|29261.9|    2021-08-28|        1.0|      2.0|     6.0| (24,[6],[1.0])|
|  10087| 52606993642|17.161402492530158|     2.35|    2573|

In [44]:
from pyspark.sql.functions import year, month, dayofmonth

In [45]:
# Same renames as the earlier rename cell. withColumnRenamed is a no-op when
# the source column is absent, so this changes nothing on a top-to-bottom run.
sdf_transformed = sdf_transformed.withColumnRenamed("2021_popu", 'population')
sdf_transformed = sdf_transformed.withColumnRenamed('total', 'total_retail')

In [48]:
# Display the final modelling frame (20 rows, truncated cells: show()'s defaults).
sdf_transformed.show(n=20, truncate=True)

+-------+------------+------------------+---------+--------+-----------+----------+-----------------+---------------+------------+--------------+-----------+---------+--------+---------------+
|user_id|merchant_abn|      dollar_value|take_rate|postcode|consumer_id|POA_CODE21|       population|New cases / day|total_retail|order_datetime|genderIndex|typeIndex|tagIndex|        tag_vec|
+-------+------------+------------------+---------+--------+-----------+----------+-----------------+---------------+------------+--------------+-----------+---------+--------+---------------+
|      7| 17488304283| 67.95495287248738|     6.18|    4606|     511685|      4606|557.0307901493427|          1,122|     29261.9|    2021-08-28|        1.0|      0.0|     4.0| (24,[4],[1.0])|
|  10087| 41251795489|43.036903367549215|     2.91|    2573|     624829|      2573|1705.489731489324|          1,122|     29261.9|    2021-08-28|        1.0|      2.0|     6.0| (24,[6],[1.0])|
|  10087| 52606993642|17.1614024925