In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import (
    StructType, StructField,
    StringType, LongType, DoubleType, TimestampType
)

# Create a Spark session
spark = SparkSession.builder.appName("PMBD - Data Preparation and Data Engineering") \
    .config("spark.sql.shuffle.partitions", "80") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.cores", "16") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()
    # .config("spark.driver.maxResultSize", "4g") \
    # .config("spark.sql.execution.arrow.enabled", "true") \

In [2]:
import sys
sys.path.append("../src")

In [3]:
from datajoining import datajoining

In [4]:
sample_sizes = [.001, .002, .005, .01, .02, .05, .1, .2, .5, 1.0]

In [None]:
datajoining_durations = datajoining(spark, "../data/", sample_sizes=sample_sizes)

Number of rows in total dataset file: 109950743
Starting 0.001 sample
Number of rows in 0.001 sample: 109950
Took 55.88 seconds for 0.001 sample.

Starting 0.002 sample
Number of rows in 0.002 sample: 219901
Took 54.90 seconds for 0.002 sample.

Starting 0.005 sample
Number of rows in 0.005 sample: 549753
Took 56.70 seconds for 0.005 sample.

Starting 0.01 sample
Number of rows in 0.01 sample: 1099507
Took 55.68 seconds for 0.01 sample.

Starting 0.02 sample
Number of rows in 0.02 sample: 2199014
Took 57.88 seconds for 0.02 sample.

Starting 0.05 sample
Number of rows in 0.05 sample: 5497537
Took 59.42 seconds for 0.05 sample.

Starting 0.1 sample
Number of rows in 0.1 sample: 10995074
Took 62.28 seconds for 0.1 sample.

Starting 0.2 sample
Number of rows in 0.2 sample: 21990148
Took 65.63 seconds for 0.2 sample.

Starting 0.5 sample
Number of rows in 0.5 sample: 54975371
Took 79.03 seconds for 0.5 sample.

Starting 1.0 sample
Number of rows in 1.0 sample: 109950743


In [None]:
import time

In [None]:
from eda import eda

eda_durations = []

for sample_size in sample_sizes:
    sample_data_dir = f"../data/raw/ec_{sample_size}.parquet"
    start = time.perf_counter()
    eda(spark, parquet_dir=sample_data_dir)
    # do the eda
    print(f"Took {(sample_duration:=time.perf_counter() - start):.2f} seconds for {sample_size} sample.", end="\n\n")
    eda_durations.append(sample_duration)

TypeError: eda() got an unexpected keyword argument 'parquet_dir'

In [None]:
from dataprep import dataprep

In [None]:
dataprep_durations = []
for sample_size in sample_sizes:
    sample_data_dir = f"../data/raw/ec_{sample_size}.parquet"
    start = time.perf_counter()
    dataprep(spark, "../data/", parquet_dir=sample_data_dir, sample_size=sample_size)
    print(f"Took {(sample_duration:=time.perf_counter() - start):.2f} seconds for {sample_size} sample.", end="\n\n")
    dataprep_durations.append(sample_duration)

+---------+-------------+-----+
|  user_id|average_price|views|
+---------+-------------+-----+
|554087876|       283.07|    1|
|520948483|       240.93|    1|
|558013583|        68.98|    1|
|516300614|       378.39|    1|
|520828467|       185.07|    1|
|512664291|       116.61|    1|
|567847952|      1029.34|    1|
|530778584|       566.27|    1|
|515694821|        44.79|    1|
|568478482|       437.57|    1|
|568479194|        136.4|    1|
|516288115|        58.02|    2|
|540767912|        23.13|    1|
|545834518|      1002.66|    1|
|514214547|        62.81|    1|
|558825339|      165.255|    2|
|538156566|       771.94|    1|
|527573632|        25.71|    1|
|528290951|       205.67|    1|
|512987188|      1645.93|    1|
+---------+-------------+-----+
only showing top 20 rows

+---------+-------------+-----------+-----------+----------+---------+-------+---------+----+------------+----+-----------+-----+
|  user_id|average_price|total_views|electronics|appliances|computers|appare

In [None]:
from clustering import clustering

In [None]:
clustering_durations = []
for sample_size in sample_sizes:
    sample_data_dir = f"../data/processed/user_df_{sample_size}.parquet"
    start = time.perf_counter()
    clustering(spark, "../data/", parquet_dir=sample_data_dir, sample_size=sample_size, k=4, chosen_model=0)
    print(f"Took {(sample_duration:=time.perf_counter() - start):.2f} seconds for {sample_size} sample.", end="\n\n")
    clustering_durations.append(sample_duration)

+---------+-------------+-----------+-----------+----------+---------+-------+---------+----+------------+----+-----------+-----+
|  user_id|average_price|total_views|electronics|appliances|computers|apparel|furniture|auto|construction|kids|accessories|sport|
+---------+-------------+-----------+-----------+----------+---------+-------+---------+----+------------+----+-----------+-----+
|229356564|        285.3|          1|          0|         1|        0|      0|        0|   0|           0|   0|          0|    0|
|293335683|         66.9|          1|          0|         1|        0|      0|        0|   0|           0|   0|          0|    0|
|313268536|       357.51|          1|          1|         0|        0|      0|        0|   0|           0|   0|          0|    0|
|367138781|       301.14|          1|          0|         0|        0|      0|        1|   0|           0|   0|          0|    0|
|370599083|       235.03|          1|          0|         1|        0|      0|        0|  

In [None]:
import pandas as pd

In [None]:
pd.options.display.float_format = '{:.2f}'.format

In [None]:
datajoining_durations = [1, 2, 3, 4]
eda_durations = [3, 4, 5, 6]
dataprep_durations = [5, 6, 7, 8]

In [None]:
durations_df = pd.DataFrame(
    [datajoining_durations, eda_durations, dataprep_durations, clustering_durations], 
    index=["datajoining", "eda", "dataprep", "clustering"], 
    columns=sample_sizes,
    )

In [None]:
durations_df

Unnamed: 0,0.00,0.00.1,0.01,0.01.1
datajoining,1.0,2.0,3.0,4.0
eda,3.0,4.0,5.0,6.0
dataprep,5.0,6.0,7.0,8.0
clustering,5.87,6.39,9.42,12.18


In [None]:
durations_df.to_parquet("durations_df.parquet")