Packages

In [5]:
import os
import numpy
import pandas as pd
import tqdm
import geopandas as gpd
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
import sys
sys.path.append('../scripts/')
from read_utils import read_file, create_folder, temp_record_query, temp_record_sdf

In [6]:
from pyspark.sql import SparkSession

In [7]:
spark = (
    # Create a spark session (which will run spark jobs)
    SparkSession.builder.appName("Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory','10g')
    .config('spark.driver.memory','12g')
    .config('spark.driver.maxResultsSize', '10GiB')
    # .config("spark.network.timeout", "3600s")
    # .master("local[6]")
    .getOrCreate()
    )

In [4]:
sdf = read_file(spark, 'filled.parquet', '../data/curated/')

|> Loading File...


                                                                                

|> Loading Finished!




-RECORD 0--------------------------------------------------------------------------
 user_id           | 7                                                             
 merchant_abn      | 17488304283                                                   
 dollar_value      | 67.95495287248738                                             
 order_id          | e637ca66-ed07-42c3-a39b-d49a1f97dde8                          
 merchant_name     | Posuere Cubilia Curae Corporation                             
 tags              | cable, satellite, and other pay television and radio services 
 take_rate         | 6.18                                                          
 type              | a                                                             
 postcode          | 4606                                                          
 consumer_id       | 511685                                                        
 name              | Andrea Jones                                           

                                                                                

In [8]:
poa_sf = gpd.read_file('../data/curated/geo_sa2_pos/geo_sa2_pos.shp')
poa_sf['geometry'] = poa_sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
poa_gdf = gpd.GeoDataFrame(poa_sf)

In [9]:
poa_gdf.head()

Unnamed: 0,postcode,POA_CODE21,POA_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,SHAPE_Leng,SHAPE_Area,geometry
0,6935,6055,6055,AUS,Australia,58.81,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.619512,0.005605,"POLYGON ((115.96241 -31.89584, 115.96247 -31.8..."
1,1109,2000,2000,AUS,Australia,4.2871,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.157964,0.000418,"POLYGON ((151.20909 -33.87978, 151.20888 -33.8..."
2,6849,6014,6014,AUS,Australia,9.5822,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.192095,0.000914,"POLYGON ((115.77992 -31.93736, 115.77976 -31.9..."
3,1114,2000,2000,AUS,Australia,4.2871,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.157964,0.000418,"POLYGON ((151.20909 -33.87978, 151.20888 -33.8..."
4,1825,2144,2144,AUS,Australia,8.5637,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.153584,0.000834,"POLYGON ((151.03050 -33.86727, 151.02930 -33.8..."


In [10]:
poa_gdf['longitude'] = poa_gdf['geometry'].apply(lambda x: x.centroid.y)

In [11]:
poa_gdf['latitude'] = poa_gdf['geometry'].apply(lambda x: x.centroid.x)

In [12]:
poa_gdf.head()

Unnamed: 0,postcode,POA_CODE21,POA_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,SHAPE_Leng,SHAPE_Area,geometry,longitude,latitude
0,6935,6055,6055,AUS,Australia,58.81,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.619512,0.005605,"POLYGON ((115.96241 -31.89584, 115.96247 -31.8...",-31.863787,115.986788
1,1109,2000,2000,AUS,Australia,4.2871,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.157964,0.000418,"POLYGON ((151.20909 -33.87978, 151.20888 -33.8...",-33.868384,151.20822
2,6849,6014,6014,AUS,Australia,9.5822,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.192095,0.000914,"POLYGON ((115.77992 -31.93736, 115.77976 -31.9...",-31.936879,115.804782
3,1114,2000,2000,AUS,Australia,4.2871,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.157964,0.000418,"POLYGON ((151.20909 -33.87978, 151.20888 -33.8...",-33.868384,151.20822
4,1825,2144,2144,AUS,Australia,8.5637,http://linked.data.gov.au/dataset/asgsed3/POA/...,0.153584,0.000834,"POLYGON ((151.03050 -33.86727, 151.02930 -33.8...",-33.854687,151.026208


In [13]:
poa_lat_long = poa_gdf[['POA_CODE21', 'latitude', 'longitude']]

In [14]:
poa_spark = spark.createDataFrame(poa_lat_long).toDF(*['POA_CODE21', 'latitude', 'longitude'])

In [15]:
poa_spark.show()

+----------+------------------+-------------------+
|POA_CODE21|          latitude|          longitude|
+----------+------------------+-------------------+
|      6055|115.98678832411451|-31.863786871018736|
|      2000| 151.2082196508253|  -33.8683837718929|
|      6014|115.80478169490398| -31.93687853219304|
|      2000| 151.2082196508253|  -33.8683837718929|
|      2144|151.02620831540813| -33.85468695739709|
|      2000| 151.2082196508253|  -33.8683837718929|
|      6163|115.80009620278315|-32.087758359382995|
|      5067|138.63158740304542| -34.92212750070141|
|      2165|150.94640713083987|-33.870457533229015|
|      3000|144.96238093177988| -37.81314234745685|
|      2000| 151.2082196508253|  -33.8683837718929|
|      4000|153.02369142357665|-27.465171792936314|
|      6760|119.67897879465937| -21.15542547643723|
|      3004|144.97732514913363| -37.83252018902255|
|      6107|115.96624063904017| -32.01711954768857|
|      6151|115.86965302590026|-31.981027071771685|
|      2000|

In [16]:
sdf = sdf.join(poa_spark, on = ['POA_CODE21'], how='left')

In [17]:
sdf.show(5)



+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+-------------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|        name|             address|state|gender|        2021_popu|New cases / day|     month|  total|order_datetime|__index_level_0__|          latitude|          longitude|
+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+-------------------+
|      2573|  10087| 4125179

                                                                                

In [18]:
# type
type_indexer = StringIndexer(inputCol = "type", outputCol = "typeIndex", handleInvalid="skip")

# gender
gender_indexer = StringIndexer(inputCol = "gender", outputCol = "genderIndex", handleInvalid="skip")

# tag
tag_indexer = StringIndexer(inputCol = "tags",outputCol = "tagIndex", handleInvalid="skip")
onehotencoder_tag_vector = OneHotEncoder(inputCol = "tagIndex", outputCol = "tag_vec")

#Create pipeline and pass all stages
pipeline = Pipeline(stages=[gender_indexer, 
                            type_indexer,
                            tag_indexer,
                            onehotencoder_tag_vector])

In [19]:
sdf.show(5)

+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+------------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|          name|             address|state|gender|        2021_popu|New cases / day|     month|  total|order_datetime|__index_level_0__|          latitude|         longitude|
+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+------------------+
|      5570|  10056| 8657

                                                                                

In [20]:
sdf_transformed = pipeline.fit(sdf).transform(sdf)
sdf_transformed.show(5)

                                                                                

22/10/06 18:18:17 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.




+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+----------------+--------------------+-----+------+-----------------+---------------+----------+-------+--------------+-----------------+------------------+------------------+-----------+---------+--------+--------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|            name|             address|state|gender|        2021_popu|New cases / day|     month|  total|order_datetime|__index_level_0__|          latitude|         longitude|genderIndex|typeIndex|tagIndex|       tag_vec|
+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+----------------+--------------------+-----+------+-----------------+---------------+----------+-------

                                                                                

In [21]:
sdf_transformed = (sdf_transformed
                   .withColumnRenamed("2021_popu", 'population')
                   .withColumnRenamed('total','total_retail')
                   )

In [22]:
sdf_transformed.show()



+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------------+--------------------+-----+-----------+-----------------+---------------+----------+------------+--------------+-----------------+------------------+-------------------+-----------+---------+--------+---------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|              name|             address|state|     gender|       population|New cases / day|     month|total_retail|order_datetime|__index_level_0__|          latitude|          longitude|genderIndex|typeIndex|tagIndex|        tag_vec|
+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+------------------+--------------------+-----+-----------+-----------------

                                                                                

In [23]:
path = '../data/curated/changed_data'
if (create_folder(path)):
    sdf_transformed.write.partitionBy('order_datetime').parquet(path, mode='append')
else:
    sdf_transformed.write.partitionBy('order_datetime').parquet(path, mode='overwrite')

|> The folder already exist!
|> Files already exist under this folder:
   []


                                                                                

- figure out that tag_vec

In [33]:
SAMPLE_SIZE = 0.005
sdf_transformed = sdf_transformed.sample(SAMPLE_SIZE, seed=0)

In [34]:
df_transformed = sdf_transformed.toPandas()
df_transformed.head()

                                                                                

Unnamed: 0,POA_CODE21,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,take_rate,type,postcode,...,month,total_retail,order_datetime,__index_level_0__,latitude,longitude,genderIndex,typeIndex,tagIndex,tag_vec
0,4823,6201,49891706470,35.254107,89483b1e-914a-42e3-9325-8f84e44f72f7,Non Vestibulum Industries,tent and awning shops,5.8,a,4823,...,2021-09-01,29756.7,2021-09-04,158204,141.061093,-20.424052,1.0,0.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2040,9389,18949610555,30.042452,4240ba87-07d5-4392-9074-710ab1aab8a0,Metus In LLP,shoe shops,6.47,a,2040,...,2021-09-01,29756.7,2021-09-23,592854,151.157593,-33.877226,0.0,0.0,11.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2214,20548,43162178066,82.802307,973205b9-20af-40eb-93c1-c4bbbbc6b8e7,Nascetur Ridiculus LLP,"cable, satellite, and other pay television and...",4.39,b,2214,...,2021-11-01,33345.0,2021-11-12,1785159,150.981488,-33.938837,1.0,1.0,4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,5556,21559,89726005175,24.131685,3adee43e-fff4-472e-bb3b-144fb5d8a69e,Est Nunc Consulting,tent and awning shops,6.01,a,5556,...,2021-12-01,31967.9,2021-12-13,2885738,137.667814,-33.893444,0.0,0.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2214,4547,82539239304,83.740507,4e6e1069-c6a3-49c0-8613-5eda04742ef9,Vitae Ltd,"opticians, optical goods, and eyeglasses",1.06,d,2214,...,2021-12-01,31967.9,2021-12-17,3015106,150.981488,-33.938837,1.0,3.0,6.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


- Figure out corresponding index

gender

In [27]:
gender_correspond = sdf_transformed.drop_duplicates(subset=[c for c in sdf_transformed.columns if c in ['gender','genderIndex']])

In [28]:
gender_correspond.drop('order_id', 'merchant_name', 'tags', 'type', 'name', 'address', 'state', 'population','month','__index_level_0__', 'typeIndex', 'tagIndex','tag_vec').show()

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.ServerSocket.implAccept(ServerSocket.java:571)
	at java.net.ServerSocket.accept(ServerSocket.java:534)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)

+----------+-------+------------+------------------+---------+--------+-----------+-----------+---------------+------------+--------------+-----------------+-------------------+-----------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|take_rate|postcode|consumer_id|     gender|New cases / day|total_retail|order_datetime|         latitude|          longitude|genderIndex|
+----------+-------+------------+------------------+---------+--------+-----------+-----------+---------------+------------+--------------+-----------------+-------------------+-----------+
|      2453|  16079| 89022158521|3036.6744813688742|     4.41|    2453|     984816|     Female|          1,122|     29261.9|    2021-08-28|152.5445987851557|-30.250197981223444|        1.0|
|      2214|  23103| 66186109508| 61.80280843534353|      4.8|    2214|    1313504|       Male|          1,122|     29261.9|    2021-08-28| 150.981487861616|  -33.9388368430949|        0.0|
|      6721|   6007| 17324645993|7.770519771414539

                                                                                

type

In [29]:
type_correspond = sdf_transformed.drop_duplicates(subset=[c for c in sdf_transformed.columns if c in ['type','typeIndex']])

In [30]:
type_correspond.drop('order_id', 'merchant_name', 'tags', 'gender', 'name', 'address', 'state','month','__index_level_0__', 'genderIndex', 'tagIndex','tag_vec').show()



+----------+-------+------------+------------------+---------+----+--------+-----------+-----------------+---------------+------------+--------------+------------------+-------------------+---------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|take_rate|type|postcode|consumer_id|       population|New cases / day|total_retail|order_datetime|          latitude|          longitude|typeIndex|
+----------+-------+------------+------------------+---------+----+--------+-----------+-----------------+---------------+------------+--------------+------------------+-------------------+---------+
|      6721|   6007| 17324645993|7.7705197714145395|     5.73|   a|    6721|     937112|4471.784851175305|          1,122|     29261.9|    2021-08-28| 118.8368457027438|-20.536596488642612|      0.0|
|      2453|  16079| 89022158521|3036.6744813688742|     4.41|   b|    2453|     984816|3676.038666295061|          1,122|     29261.9|    2021-08-28| 152.5445987851557|-30.250197981223444|      1.0|


                                                                                

tag

In [31]:
type_correspond = sdf_transformed.drop_duplicates(subset=[c for c in sdf_transformed.columns if c in ['tag','tagIndex', 'tag_vec']])

In [32]:
type_correspond.drop('order_id', 'merchant_name', 'type', 'gender', 'name', 'address', 'state','month','__index_level_0__', 'genderIndex', 'typeIndex').show()



+----------+-------+------------+------------------+--------------------+---------+--------+-----------+-----------------+---------------+------------+--------------+------------------+-------------------+--------+---------------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|                tags|take_rate|postcode|consumer_id|       population|New cases / day|total_retail|order_datetime|          latitude|          longitude|tagIndex|        tag_vec|
+----------+-------+------------+------------------+--------------------+---------+--------+-----------+-----------------+---------------+------------+--------------+------------------+-------------------+--------+---------------+
|      6721|   6007| 17324645993|7.7705197714145395|tent and awning s...|     5.73|    6721|     937112|4471.784851175305|          1,122|     29261.9|    2021-08-28| 118.8368457027438|-20.536596488642612|     0.0| (24,[0],[1.0])|
|      2453|  16175| 50866797623|  75.6266565291164|gift, card, novel...|   

                                                                                