Packages

In [35]:
import os
import numpy
import pandas as pd
import tqdm
import geopandas as gpd
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
import sys
sys.path.append('../scripts/')
from read_utils import read_file, create_folder, temp_record_query, temp_record_sdf

In [36]:
from pyspark.sql import SparkSession

In [37]:
spark = (
    # Create a spark session (which will run spark jobs)
    SparkSession.builder.appName("Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory','10g')
    .config('spark.driver.memory','12g')
    .config('spark.driver.maxResultsSize', '10GiB')
    # .config("spark.network.timeout", "3600s")
    # .master("local[6]")
    .getOrCreate()
    )

In [38]:
sdf = read_file(spark, 'filled.parquet', '../data/curated/')

|> Loading File...
|> Loading Finished!




-RECORD 0--------------------------------------------------------------------------
 user_id           | 7                                                             
 merchant_abn      | 17488304283                                                   
 dollar_value      | 67.95495287248738                                             
 order_id          | e637ca66-ed07-42c3-a39b-d49a1f97dde8                          
 merchant_name     | Posuere Cubilia Curae Corporation                             
 tags              | cable, satellite, and other pay television and radio services 
 take_rate         | 6.18                                                          
 type              | a                                                             
 postcode          | 4606                                                          
 consumer_id       | 511685                                                        
 name              | Andrea Jones                                           

                                                                                

In [48]:
sdf.show(5)

                                                                                

+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+------------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|          name|             address|state|gender|        2021_popu|New cases / day|     month|  total|order_datetime|__index_level_0__|          latitude|         longitude|
+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+------------------+
|      5570|  10056| 8657

In [49]:
# type
type_indexer = StringIndexer(inputCol = "type", outputCol = "typeIndex", handleInvalid="skip")

# gender
gender_indexer = StringIndexer(inputCol = "gender", outputCol = "genderIndex", handleInvalid="skip")

# tag
tag_indexer = StringIndexer(inputCol = "tags",outputCol = "tagIndex", handleInvalid="skip")
onehotencoder_tag_vector = OneHotEncoder(inputCol = "tagIndex", outputCol = "tag_vec")

#Create pipeline and pass all stages
pipeline = Pipeline(stages=[gender_indexer, 
                            type_indexer,
                            tag_indexer,
                            onehotencoder_tag_vector])

In [50]:
sdf.show(5)

                                                                                

+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+------------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|          name|             address|state|gender|        2021_popu|New cases / day|     month|  total|order_datetime|__index_level_0__|          latitude|         longitude|
+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+------------------+
|      5570|  10056| 8657

                                                                                

In [51]:
sdf_transformed = pipeline.fit(sdf).transform(sdf)
sdf_transformed.show(5)



+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+----------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+------------------+-----------+---------+--------+--------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|            name|             address|state|gender|        2021_popu|New cases / day|     month|  total|order_datetime|__index_level_0__|          latitude|         longitude|genderIndex|typeIndex|tagIndex|       tag_vec|
+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+----------------+--------------------+-----+------+-----------------+---------------+----------+-------

                                                                                

In [52]:
sdf_transformed = (sdf_transformed
                   .withColumnRenamed("2021_popu", 'population')
                   .withColumnRenamed('total','total_retail')
                   )

In [53]:
sdf_transformed.show()



+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------------+--------------------+-----+-----------+-----------------+---------------+----------+------------+--------------+-----------------+------------------+-------------------+-----------+---------+--------+---------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|              name|             address|state|     gender|       population|New cases / day|     month|total_retail|order_datetime|__index_level_0__|          latitude|          longitude|genderIndex|typeIndex|tagIndex|        tag_vec|
+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------------+--------------------+-----+-----------+-----------------

                                                                                

In [54]:
path = '../data/curated/changed_data'
if (create_folder(path)):
    sdf_transformed.write.partitionBy('order_datetime').parquet(path, mode='append')
else:
    sdf_transformed.write.partitionBy('order_datetime').parquet(path, mode='overwrite')

|> The folder already exist!
|> Files already exist under this folder:
   ['order_datetime=2021-10-19', 'order_datetime=2021-10-26', 'order_datetime=2021-10-21', 'order_datetime=2021-10-28', 'order_datetime=2021-10-17', 'order_datetime=2021-10-10', 'order_datetime=2022-02-24', 'order_datetime=2022-01-03', 'order_datetime=2022-01-04', 'order_datetime=2022-02-23', 'order_datetime=2022-02-15', 'order_datetime=2022-02-12', 'order_datetime=2021-10-11', 'order_datetime=2021-10-29', 'order_datetime=2021-10-16', 'order_datetime=2021-10-20', 'order_datetime=2021-10-18', 'order_datetime=2021-10-27', 'order_datetime=2022-02-13', 'order_datetime=2022-02-14', 'order_datetime=2022-02-22', 'order_datetime=2022-01-05', 'order_datetime=2022-01-02', 'order_datetime=2022-02-25', 'order_datetime=2021-09-25', 'order_datetime=2021-12-09', 'order_datetime=2021-11-11', 'order_datetime=2021-11-29', 'order_datetime=2021-09-22', 'order_datetime=2021-11-16', 'order_datetime=2021-12-31', 'order_datetime=2021-09-14

                                                                                

- figure out that tag_vec

In [55]:
SAMPLE_SIZE = 0.005
sdf_transformed = sdf_transformed.sample(SAMPLE_SIZE, seed=0)

In [56]:
df_transformed = sdf_transformed.toPandas()
df_transformed.head()

                                                                                

Unnamed: 0,POA_CODE21,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,take_rate,type,postcode,...,month,total_retail,order_datetime,__index_level_0__,latitude,longitude,genderIndex,typeIndex,tagIndex,tag_vec
0,6721,6007,17324645993,7.77052,971730f8-f855-4721-887b-f6a9931bd3c9,Eget Metus In Corporation,tent and awning shops,5.73,a,6721,...,2021-08-01,29261.9,2021-08-28,17388,118.836846,-20.536596,2.0,0.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,3506,10946,54272781746,33.079915,a5afeb04-7429-4ee9-bb98-4198d1a71759,Non Massa Institute,bicycle shops - sales and service,4.54,b,3506,...,2021-08-01,29261.9,2021-08-29,23957,141.365262,-35.188186,1.0,1.0,18.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2453,16578,85139489422,16.731617,b9918232-20b8-4bf2-b1c7-fab2d69a18db,Ut Institute,"cable, satellite, and other pay television and...",4.25,b,2453,...,2021-09-01,29756.7,2021-09-01,104265,152.544599,-30.250198,1.0,1.0,4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2214,3465,86578477987,20.82033,fb1f1249-4b2c-4042-b667-62c3cc28de21,Leo In Consulting,"watch, clock, and jewelry repair shops",6.43,a,2214,...,2021-09-01,29756.7,2021-09-03,137165,150.981488,-33.938837,0.0,0.0,3.0,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4823,11051,31101120643,170.777406,afca9de8-4310-44b2-8133-441596774377,Commodo Hendrerit Donec Corp.,"cable, satellite, and other pay television and...",6.37,a,4823,...,2021-09-01,29756.7,2021-09-03,141013,141.061093,-20.424052,2.0,0.0,4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


- Figure out corresponding index

gender

In [57]:
gender_correspond = sdf_transformed.drop_duplicates(subset=[c for c in sdf_transformed.columns if c in ['gender','genderIndex']])

In [58]:
gender_correspond.drop('order_id', 'merchant_name', 'tags', 'type', 'name', 'address', 'state', 'population','month','__index_level_0__', 'typeIndex', 'tagIndex','tag_vec').show()



+----------+-------+------------+------------------+---------+--------+-----------+-----------+---------------+------------+--------------+------------------+-------------------+-----------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|take_rate|postcode|consumer_id|     gender|New cases / day|total_retail|order_datetime|          latitude|          longitude|genderIndex|
+----------+-------+------------+------------------+---------+--------+-----------+-----------+---------------+------------+--------------+------------------+-------------------+-----------+
|      3506|  10946| 54272781746| 33.07991535733474|     4.54|    3506|     729506|     Female|          1,320|     29261.9|    2021-08-29|141.36526207519685| -35.18818636789844|        1.0|
|      2214|   3465| 86578477987| 20.82033031130609|     6.43|    2214|    1073253|       Male|          1,635|     29756.7|    2021-09-03|  150.981487861616|  -33.9388368430949|        0.0|
|      6721|   6007| 17324645993|7.7705197714

                                                                                

type

In [59]:
type_correspond = sdf_transformed.drop_duplicates(subset=[c for c in sdf_transformed.columns if c in ['type','typeIndex']])

In [60]:
type_correspond.drop('order_id', 'merchant_name', 'tags', 'gender', 'name', 'address', 'state','month','__index_level_0__', 'genderIndex', 'tagIndex','tag_vec').show()



+----------+-------+------------+------------------+---------+----+--------+-----------+-----------------+---------------+------------+--------------+------------------+-------------------+---------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|take_rate|type|postcode|consumer_id|       population|New cases / day|total_retail|order_datetime|          latitude|          longitude|typeIndex|
+----------+-------+------------+------------------+---------+----+--------+-----------+-----------------+---------------+------------+--------------+------------------+-------------------+---------+
|      6721|   6007| 17324645993|7.7705197714145395|     5.73|   a|    6721|     937112|4471.784851175305|          1,122|     29261.9|    2021-08-28| 118.8368457027438|-20.536596488642612|      0.0|
|      3506|  10946| 54272781746| 33.07991535733474|     4.54|   b|    3506|     729506|74.80310587884506|          1,320|     29261.9|    2021-08-29|141.36526207519685| -35.18818636789844|      1.0|


                                                                                

tag

In [61]:
type_correspond = sdf_transformed.drop_duplicates(subset=[c for c in sdf_transformed.columns if c in ['tag','tagIndex', 'tag_vec']])

In [62]:
type_correspond.drop('order_id', 'merchant_name', 'type', 'gender', 'name', 'address', 'state','month','__index_level_0__', 'genderIndex', 'typeIndex').show()



+----------+-------+------------+------------------+--------------------+---------+--------+-----------+-----------------+---------------+------------+--------------+------------------+-------------------+--------+---------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|                tags|take_rate|postcode|consumer_id|       population|New cases / day|total_retail|order_datetime|          latitude|          longitude|tagIndex|        tag_vec|
+----------+-------+------------+------------------+--------------------+---------+--------+-----------+-----------------+---------------+------------+--------------+------------------+-------------------+--------+---------------+
|      6721|   6007| 17324645993|7.7705197714145395|tent and awning s...|     5.73|    6721|     937112|4471.784851175305|          1,122|     29261.9|    2021-08-28| 118.8368457027438|-20.536596488642612|     0.0| (24,[0],[1.0])|
|      2453|   2851| 57471217202|129.65436236629736|gift, card, novel...|   

                                                                                