Packages

In [1]:
import os
import numpy
import pandas as pd
import tqdm
import geopandas as gpd
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
import sys
sys.path.append('../scripts/')
from read_utils import read_file, create_folder, temp_record_query, temp_record_sdf

  shapely_geos_version, geos_capi_version_string


In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = (
    # Build (or reuse) the Spark session that runs every job in this notebook.
    SparkSession.builder.appName("Project 2")
    # Let a DataFrame render itself when it is the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory', '10g')
    .config('spark.driver.memory', '12g')
    # The key is `maxResultSize` (singular); the previous spelling
    # `maxResultsSize` is not a Spark setting and was silently ignored, leaving
    # the 1g default in force for collect()/toPandas(). Spark size strings use
    # the k/m/g/t suffixes, so the limit is written as '10g' rather than '10GiB'.
    .config('spark.driver.maxResultSize', '10g')
    # .config("spark.network.timeout", "3600s")
    # .master("local[6]")
    .getOrCreate()
)

22/10/09 10:44:29 WARN Utils: Your hostname, SukiXuudeMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 100.70.13.201 instead (on interface en0)
22/10/09 10:44:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/09 10:44:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/09 10:44:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Read 'filled.parquet' from the curated data folder with the project helper.
curated_dir = '../data/curated/'
sdf = read_file(spark, 'filled.parquet', curated_dir)

|> Loading File...


                                                                                

|> Loading Finished!


                                                                                

-RECORD 0-------------------------------------------------------------------------------------------
 user_id         | 10285                                                                            
 merchant_abn    | 10023283211                                                                      
 dollar_value    | 311.8728694054186                                                                
 order_id        | e41d1fb7-d7b9-4758-9c20-1d34337a312d                                             
 merchant_name   | Felis Limited                                                                    
 tags            | furniture, home furnishings and equipment shops, and manufacturers, except ap... 
 take_rate       | 0.18                                                                             
 type            | e                                                                                
 postcode        | 6479                                                                    

In [6]:
# Preview the first five rows of the loaded table.
sdf.show(n=5)

+-------+------------+------------------+--------------------+-------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+----------+------------------+------------------+-------------------+---------------+----------+-------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|merchant_name|                tags|take_rate|type|postcode|consumer_id|          name|             address|state|gender|POA_CODE21|         2021_popu|          latitude|          longitude|New cases / day|     month|  total|order_datetime|
+-------+------------+------------------+--------------------+-------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+----------+------------------+------------------+-------------------+---------------+----------+-------+--------------+
|  10285| 10023283211| 311.8728694054186|e41d1fb7-d7b9-475...|Felis Limited|furniture, home f.

In [7]:
# Remove the merchant name column before encoding; `sdf` is rebound to the
# narrower frame, so later cells no longer see `merchant_name`.
cols_to_drop = ['merchant_name']
sdf = sdf.drop(*cols_to_drop)

In [8]:
def _make_indexer(source_col, index_col):
    """Return a StringIndexer mapping `source_col` to `index_col`.

    Every indexer in this notebook uses handleInvalid="skip".
    """
    return StringIndexer(inputCol=source_col, outputCol=index_col, handleInvalid="skip")


# One indexer per categorical column.
gender_indexer = _make_indexer("gender", "genderIndex")
type_indexer = _make_indexer("type", "typeIndex")
tag_indexer = _make_indexer("tags", "tagIndex")

# Only the tag index is additionally one-hot encoded.
onehotencoder_tag_vector = OneHotEncoder(inputCol="tagIndex", outputCol="tag_vec")

# Assemble the stages in execution order: the encoder must follow tag_indexer
# because it consumes the `tagIndex` column that indexer produces.
encoding_stages = [
    gender_indexer,
    type_indexer,
    tag_indexer,
    onehotencoder_tag_vector,
]
pipeline = Pipeline(stages=encoding_stages)

In [9]:
# Preview the first five rows that will be fed into the encoding pipeline.
sdf.show(n=5)

+-------+------------+------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+----------+------------------+------------------+-------------------+---------------+----------+-------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|                tags|take_rate|type|postcode|consumer_id|          name|             address|state|gender|POA_CODE21|         2021_popu|          latitude|          longitude|New cases / day|     month|  total|order_datetime|
+-------+------------+------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+----------+------------------+------------------+-------------------+---------------+----------+-------+--------------+
|  10285| 10023283211| 311.8728694054186|e41d1fb7-d7b9-475...|furniture, home f...|     0.18|   e|    6479|     191378|Monica Johnson|  

In [10]:
# Fit the indexers/encoder on the full table, then apply them to that same table.
encoder_model = pipeline.fit(sdf)
sdf_transformed = encoder_model.transform(sdf)
sdf_transformed.show(n=5)



+-------+------------+------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+----------+------------------+------------------+-------------------+---------------+----------+-------+--------------+-----------+---------+--------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|                tags|take_rate|type|postcode|consumer_id|          name|             address|state|gender|POA_CODE21|         2021_popu|          latitude|          longitude|New cases / day|     month|  total|order_datetime|genderIndex|typeIndex|tagIndex|       tag_vec|
+-------+------------+------------------+--------------------+--------------------+---------+----+--------+-----------+--------------+--------------------+-----+------+----------+------------------+------------------+-------------------+---------------+----------+-------+--------------+-----------+---------+--------+--------------

                                                                                

In [11]:
# Give the two tersely named columns descriptive names.
column_renames = {
    "2021_popu": 'population',
    'total': 'total_retail',
}
for old_name, new_name in column_renames.items():
    sdf_transformed = sdf_transformed.withColumnRenamed(old_name, new_name)

In [12]:
# Check the renamed columns; 20 rows is show()'s default.
sdf_transformed.show(n=20)

+-------+------------+------------------+--------------------+--------------------+---------+----+--------+-----------+---------------+--------------------+-----+-----------+----------+------------------+------------------+-------------------+---------------+----------+------------+--------------+-----------+---------+--------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|                tags|take_rate|type|postcode|consumer_id|           name|             address|state|     gender|POA_CODE21|        population|          latitude|          longitude|New cases / day|     month|total_retail|order_datetime|genderIndex|typeIndex|tagIndex|       tag_vec|
+-------+------------+------------------+--------------------+--------------------+---------+----+--------+-----------+---------------+--------------------+-----+-----------+----------+------------------+------------------+-------------------+---------------+----------+------------+--------------+-----------+

In [13]:
# Persist the encoded table as parquet, one partition directory per
# order_datetime value, replacing any previous export at the same location.
path = '../data/curated/changed_data'
parquet_writer = sdf_transformed.write.mode('overwrite').partitionBy('order_datetime')
parquet_writer.parquet(path)

|> Create Successfully!


                                                                                

- Inspect a small sample to figure out what `tag_vec` contains

In [14]:
# Fraction of rows to keep (0.005 = 0.5%); the fixed seed makes the draw repeatable.
# NOTE: `sdf_transformed` is rebound to the sample, so re-running this cell
# without re-running the cells above samples the sample again.
SAMPLE_SIZE = 0.005
sdf_transformed = sdf_transformed.sample(fraction=SAMPLE_SIZE, seed=0)

In [16]:
# Collect the sampled rows to the driver as a pandas DataFrame and preview them.
df_transformed = sdf_transformed.toPandas()
df_transformed.head(5)

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:576)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:539)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)
                                                                                

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,tags,take_rate,type,postcode,consumer_id,name,...,latitude,longitude,New cases / day,month,total_retail,order_datetime,genderIndex,typeIndex,tagIndex,tag_vec
0,11244,10023283211,167.336882,3742e7b2-1fba-4874-b2af-1d64d0721395,"furniture, home furnishings and equipment shop...",0.18,e,3415,1436472,Katherine Morales,...,141.379244,-36.374078,7.0,2021-03-01,30725.9,2021-03-07,2.0,4.0,9.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,23249,10023283211,265.098473,32eabfa1-98c6-4dad-aef6-d509b7210015,"furniture, home furnishings and equipment shop...",0.18,e,4415,5807,Amber Williams,...,150.229142,-26.509155,14.0,2021-04-01,30960.0,2021-04-07,1.0,4.0,9.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,8752,10023283211,43.868484,73091389-498a-4e3d-8d65-68979d7fac8f,"furniture, home furnishings and equipment shop...",0.18,e,2315,719183,Anne Cook,...,152.148262,-32.737555,12.0,2021-06-01,30563.8,2021-06-01,1.0,4.0,9.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,23358,10023283211,130.199813,dcf53120-195f-42a0-9cc3-daeb702cc72f,"furniture, home furnishings and equipment shop...",0.18,e,2011,1145473,Sierra Parker,...,151.224515,-33.86995,173.0,2021-07-01,29761.4,2021-07-24,1.0,4.0,9.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,22440,10023283211,560.821514,08bc059b-a07b-49dd-9bc2-bbbdda31502c,"furniture, home furnishings and equipment shop...",0.18,e,5096,1251295,Jesus Ward,...,138.657568,-34.803244,222.0,2021-07-01,29761.4,2021-07-31,0.0,4.0,9.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


- Figure out which index value corresponds to each original category

gender

In [17]:
# Keep one row per distinct (gender, genderIndex) pair to reveal the label -> index mapping.
gender_key_names = ('gender', 'genderIndex')
gender_subset = [col_name for col_name in sdf_transformed.columns if col_name in gender_key_names]
gender_correspond = sdf_transformed.drop_duplicates(subset=gender_subset)

In [18]:
# Hide columns that are irrelevant to the gender mapping before printing.
# DataFrame.drop ignores names that are not present in the frame.
gender_hidden_cols = [
    'order_id', 'merchant_name', 'tags', 'type', 'name', 'address', 'state',
    'population', 'month', 'typeIndex', 'tagIndex', 'tag_vec',
]
gender_correspond.drop(*gender_hidden_cols).show()

22/10/09 10:52:38 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.




+-------+------------+-----------------+---------+--------+-----------+-----------+----------+------------------+-------------------+---------------+------------+--------------+-----------+
|user_id|merchant_abn|     dollar_value|take_rate|postcode|consumer_id|     gender|POA_CODE21|          latitude|          longitude|New cases / day|total_retail|order_datetime|genderIndex|
+-------+------------+-----------------+---------+--------+-----------+-----------+----------+------------------+-------------------+---------------+------------+--------------+-----------+
|  23249| 10023283211|265.0984728970495|     0.18|    4415|       5807|     Female|      4415|150.22914199078718|-26.509154619864724|           14.0|     30960.0|    2021-04-07|        1.0|
|  22440| 10023283211|560.8215137561789|     0.18|    5096|    1251295|       Male|      5096|138.65756827352445| -34.80324353225051|          222.0|     29761.4|    2021-07-31|        0.0|
|  11244| 10023283211|167.3368824231161|     0.18|

                                                                                

type

In [19]:
# Keep one row per distinct (type, typeIndex) pair to reveal the label -> index mapping.
type_key_names = ('type', 'typeIndex')
type_subset = [col_name for col_name in sdf_transformed.columns if col_name in type_key_names]
type_correspond = sdf_transformed.drop_duplicates(subset=type_subset)

In [20]:
# Hide columns that are irrelevant to the type mapping before printing.
# DataFrame.drop ignores names that are not present in the frame.
type_hidden_cols = [
    'order_id', 'merchant_name', 'tags', 'gender', 'name', 'address', 'state',
    'month', 'genderIndex', 'tagIndex', 'tag_vec',
]
type_correspond.drop(*type_hidden_cols).show()



+-------+------------+------------------+---------+----+--------+-----------+----------+------------------+------------------+-------------------+---------------+------------+--------------+---------+
|user_id|merchant_abn|      dollar_value|take_rate|type|postcode|consumer_id|POA_CODE21|        population|          latitude|          longitude|New cases / day|total_retail|order_datetime|typeIndex|
+-------+------------+------------------+---------+----+--------+-----------+----------+------------------+------------------+-------------------+---------------+------------+--------------+---------+
|  19997| 10192359162|217.46219992906006|     6.33|   a|    5245|      97642|      5245|1380.8899583589193|138.80813655091683|-35.033796527743796|          153.0|     29761.4|    2021-07-23|      0.0|
|  12889| 10142254217|125.75443454681056|     4.22|   b|    3273|    1396189|      3273| 476.2063975243831|142.67937957100315| -37.98311195587978|          107.0|     29761.4|    2021-07-14|      

                                                                                

tag

In [21]:
# Keep one row per distinct (tags, tagIndex, tag_vec) combination to reveal the
# tag -> index -> one-hot vector mapping.
# The source column is called 'tags'; the previous subset listed 'tag', which the
# membership filter silently discarded, so the tag text never took part in the
# de-duplication. Naming the columns explicitly makes such a typo fail loudly.
tag_correspond = sdf_transformed.drop_duplicates(subset=['tags', 'tagIndex', 'tag_vec'])
# Backward-compatible alias: the display cell that follows still reads this
# frame through the name `type_correspond`.
type_correspond = tag_correspond

In [22]:
# Hide columns that are irrelevant to the tag mapping before printing.
# DataFrame.drop ignores names that are not present in the frame.
tag_hidden_cols = [
    'order_id', 'merchant_name', 'type', 'gender', 'name', 'address', 'state',
    'month', 'genderIndex', 'typeIndex',
]
type_correspond.drop(*tag_hidden_cols).show()



+-------+------------+------------------+--------------------+---------+--------+-----------+----------+------------------+------------------+-------------------+---------------+------------+--------------+--------+---------------+
|user_id|merchant_abn|      dollar_value|                tags|take_rate|postcode|consumer_id|POA_CODE21|        population|          latitude|          longitude|New cases / day|total_retail|order_datetime|tagIndex|        tag_vec|
+-------+------------+------------------+--------------------+---------+--------+-----------+----------+------------------+------------------+-------------------+---------------+------------+--------------+--------+---------------+
|  22992| 11215815177| 715.6209589383707|tent and awning s...|     4.77|    4815|       3357|      4815|22702.718422412596| 146.6502599734605|-19.527645725574185|          164.0|     29761.4|    2021-07-26|     0.0| (24,[0],[1.0])|
|  23121| 10206519221|172.50414420078687|gift, card, novel...|     6.34|

                                                                                