In [1]:
import random
from pyspark.sql import SparkSession
from pyspark import broadcast, SparkConf
import time
import os

# Location of the RAPIDS Accelerator jar and the Spark master URL; both can be
# overridden through environment variables.
RAPIDS_JAR = os.getenv("RAPIDS_JAR", "/home/yuanli/work/jars/rapids.jar")
SPARK_MASTER = os.getenv("SPARK_MASTER_URL", "spark://ip:port")
print("RAPIDS_JAR: {}".format(RAPIDS_JAR))

# If a SparkContext named `sc` is already defined in this kernel, stop it so
# the configuration below is used to start a fresh one.
if "sc" in globals():
    sc.stop()

### Configure the parameters based on your dataproc cluster ###
conf = SparkConf().setAppName("Retail Analytics")
conf.setMaster(SPARK_MASTER)

# Make the RAPIDS jar visible to the driver and the executors.
conf.set("spark.driver.extraClassPath", RAPIDS_JAR)
conf.set("spark.executor.extraClassPath", RAPIDS_JAR)
conf.set("spark.jars", RAPIDS_JAR)

# Executor sizing and GPU scheduling.
conf.set("spark.executor.instances", "1")
conf.set("spark.executor.cores", "4")
conf.set("spark.task.resource.gpu.amount", "0.25")
conf.set("spark.rapids.sql.concurrentGpuTasks", "2")
conf.set("spark.executor.memory", "4g")
conf.set("spark.sql.files.maxPartitionBytes", "128m")
conf.set("spark.executor.resource.gpu.amount", "1")
conf.set("spark.rapids.memory.pinnedPool.size", "2048m")
conf.set("spark.executor.memoryOverhead", "4096m")
conf.set("spark.dynamicAllocation.enabled", "false")

# RAPIDS SQL feature toggles. Values are passed as the canonical lowercase
# strings rather than Python booleans (which would be stringified as
# "True"/"False").
conf.set("spark.rapids.sql.format.json.read.enabled", "true")
conf.set("spark.rapids.sql.castStringToTimestamp.enabled", "true")
conf.set("spark.rapids.sql.expression.PercentRank", "false")
conf.set("spark.rapids.sql.castDecimalToString.enabled", "true")
conf.set("spark.rapids.sql.hasExtendedYearValues", "false")
conf.set("spark.rapids.sql.enabled", "true")
conf.set("spark.plugins", "com.nvidia.spark.SQLPlugin")
conf.set("spark.rapids.sql.allowMultipleJars", "ALWAYS")

# Create the SparkSession once, from the configuration above. A second
# `builder.appName(...).getOrCreate()` call would only hand back this same
# session (its app name is ignored) and log an
# "Using an existing Spark session" warning.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

24/11/01 17:40:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/11/01 17:40:41 WARN RapidsPluginUtils: RAPIDS Accelerator 24.10.0 using cudf 24.10.0, private revision bd4e99e18e20234ee0c54f95f4b0bfce18a6255e
24/11/01 17:40:41 WARN RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.
24/11/01 17:40:41 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
24/11/01 17:40:41 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.
24/11/01 17:40:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
import os
# Root location of the input datasets. Point it at your own data, either by
# exporting DATA_ROOT or by editing the fallback value below.
dataRoot = os.environ.get("DATA_ROOT", 'file:/home/yuanli/work/example-repo-tests/datasets')

In [3]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Wall-clock start of the data-cleaning stage; the elapsed time is printed at
# the end of this cell.
start = time.time()

def clean_data(df):
    """Return ``df`` without missing-value rows and without duplicate rows.

    ``dropna()`` is applied first and ``dropDuplicates()`` is applied to its
    result; the input frame itself is not modified by this function.
    """
    return df.dropna().dropDuplicates()


def read_data(spark, format, file_path):
    """Load ``file_path`` through ``spark.read`` using the source ``format``.

    When ``format`` is ``"csv"`` the data is loaded with ``header=True``;
    any other format is loaded without extra options.
    """
    reader = spark.read.format(format)
    if format != "csv":
        return reader.load(file_path)
    return reader.load(file_path, header=True)

# ---- Load the raw inputs from their folders under dataRoot ----
sales_df = read_data(spark, "csv", dataRoot + "/sales/")
stock_df = read_data(spark, "json", dataRoot + "/stock/")
supplier_df = read_data(spark, "json", dataRoot + "/supplier/")
customer_df = read_data(spark, "csv", dataRoot + "/customer/")
market_df = read_data(spark, "csv", dataRoot + "/market/")
logistic_df = read_data(spark, "csv", dataRoot + "/logistic/")

# ---- Drop rows with missing values and duplicate rows ----
sales_df = clean_data(sales_df)
stock_df = clean_data(stock_df)
supplier_df = clean_data(supplier_df)
customer_df = clean_data(customer_df)
market_df = clean_data(market_df)
logistic_df = clean_data(logistic_df)

# ---- Convert date columns to date type ----
sales_df = sales_df.withColumn("date_of_sale", to_date(col("date_of_sale")))
stock_df = stock_df.withColumn("date_received", to_date(col("date_received")))
supplier_df = supplier_df.withColumn("date_ordered", to_date(col("date_ordered")))

# ---- Standardize string columns: upper-case, then strip surrounding whitespace ----
# `product_name` is the join key below, so it must be normalized the same way
# in every table (including stock_df) or matching rows will fail to join.
sales_df = sales_df.withColumn("product_name", trim(upper(col("product_name"))))
stock_df = stock_df.withColumn("product_name", trim(upper(col("product_name"))))
stock_df = stock_df.withColumn("location", trim(upper(col("location"))))
supplier_df = supplier_df.withColumn("product_name", trim(upper(col("product_name"))))
customer_df = customer_df.withColumn("customer_name", trim(upper(col("customer_name"))))
market_df = market_df.withColumn("product_name", trim(upper(col("product_name"))))
logistic_df = logistic_df.withColumn("product_name", trim(upper(col("product_name"))))

# ---- Keep only rows with valid values ----
sales_df = sales_df.filter(col("product_name").isNotNull())
stock_df = stock_df.filter(col("location").isNotNull())
customer_df = customer_df.filter(col("gender").isin("male","female"))
market_df = market_df.filter(col("product_name").isNotNull())
logistic_df = logistic_df.filter(col("product_name").isNotNull())

# ---- Drop the extra `price` columns before joining ----
market_df = market_df.drop("price")
supplier_df = supplier_df.drop("price")

# ---- Left-join everything onto the sales rows ----
data_int = (
    sales_df
    .join(stock_df, "product_name", "leftouter")
    .join(supplier_df, "product_name", "leftouter")
    .join(market_df, "product_name", "leftouter")
    .join(logistic_df, "product_name", "leftouter")
    .join(customer_df, "customer_id", "leftouter")
)

# ---- Write the cleaned data ----
# No os.makedirs() here: dataRoot may be a URI such as "file:/...", which the
# local-filesystem call would turn into a stray relative "file:" directory.
# The Spark writer creates the output directory on its own.
data_int.write.mode("overwrite").format("parquet").save(dataRoot+"/cleaned/")

end = time.time()

print("Time taken on GPU for Data Cleaning: ", end - start)



24/11/01 17:40:44 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
    !Exec <FileSourceScanExec> cannot run on GPU because unsupported file format: org.apache.spark.sql.execution.datasources.text.TextFileFormat



                                                                                

24/11/01 17:40:51 WARN GpuOverrides: 
! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec
  ! <Invoke> value#0.toString cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke
    @Expression <AttributeReference> value#0 could run on GPU
  !Expression <AttributeReference> obj#15 cannot run on GPU because expression AttributeReference obj#15 produces an unsupported type ObjectType(class java.lang.String)
  !Exec <FileSourceScanExec> cannot run on GPU because unsupported file format: org.apache.spark.sql.execution.datasources.text.TextFileFormat

24/11/01 17:40:51 WARN GpuOverrides: 
!Exec <FileSourceScanExec> cannot run on GPU because unsupported file format: org.apache.spark.sql.execution.datasources.text.TextFileFormat

24/11/01 17:40:51 WARN GpuOverrides: 
!Exec <Fil

                                                                                

Time taken on GPU for Data Cleaning:  13.701324701309204


In [4]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# DO VARIOUS RETAIL DATA ANALYTICS

start = time.time()

# Read the cleaned data written by the data-cleaning cell.
data = spark.read.format("parquet").load(dataRoot+"/cleaned/")

# Flag each row as perishable ("yes") when shelf_life <= 30, otherwise "no".
data = data.withColumn("perishable", when(col("shelf_life") <= 30, "yes").otherwise("no"))

# Flag each row's sales status: "good" when quantity_sold > 50, otherwise "bad".
data = data.withColumn("sales_status", when(col("quantity_sold") > 50, "good").otherwise("bad"))

# Window for time-series analysis: one partition per product, ordered by sale date.
window = Window.partitionBy("product_name").orderBy("date_of_sale")

# Average of quantity_sold over the window for each product. With an ordering
# and no explicit frame, Spark uses a growing frame (unbounded preceding up to
# the current row), so this is a running average up to each sale date.
time_series_df = data.withColumn("rolling_avg_sales", avg("quantity_sold").over(window))

# Previous / next value of the running average within each product, used for forecasting.
forecast_df = (
    time_series_df
    .withColumn("prev_sales", lag("rolling_avg_sales").over(window))
    .withColumn("next_sales", lead("rolling_avg_sales").over(window))
)

# Average price, grouped by supplier.
forecast_df.groupBy("sup_id").agg({"price": "avg"}).show()

# Total quantity in stock and summed price, grouped by supplier.
forecast_df.groupBy("sup_id").agg({"quantity_in_stock": "sum", "price": "sum"}).show()

# Number of perishable vs. non-perishable rows (grouped by the flag only).
forecast_df.groupBy("perishable").agg({"perishable": "count"}).show()

# Number of good vs. bad sales-status rows (grouped by the flag only).
forecast_df.groupBy("sales_status").agg({"sales_status": "count"}).show()

# Count the number of sales that contain a 10% off promotion.
countt = forecast_df.filter(forecast_df["contains_promotion"].contains("10% off")).count()
print(countt)

# Total price, total quantity sold and average quantity sold per product and
# location, highest total price first. `total_quantity_sold` sums
# quantity_sold (not quantity_ordered) so the column matches its name.
total_sales_by_product_location = (
    forecast_df
    .groupBy("product_name", "location")
    .agg(
        sum("price").alias("total_price"),
        sum("quantity_sold").alias("total_quantity_sold"),
        avg("quantity_sold").alias("avg_quantity_sold"),
    )
    .sort(desc("total_price"))
)

# Per-product aggregates: stock, average price, ordered / sold quantities,
# revenue (price * quantity_sold) and the summed prev/next running averages,
# highest revenue first.
grouped_df = forecast_df.groupBy("product_name")
aggregated_df = (
    grouped_df
    .agg(
        sum("quantity_in_stock").alias("total_quantity_in_stock"),
        avg("price").alias("average_price"),
        sum("quantity_ordered").alias("total_quantity_ordered"),
        sum("quantity_sold").alias("total_quantity_sold"),
        sum(col("price") * col("quantity_sold")).alias("total_sales"),
        sum("prev_sales").alias("total_prev_sales"),
        sum("next_sales").alias("total_next_sales"),
    )
    .sort(desc("total_sales"))
)

# WRITE THE AGGREGATES TO DISK
aggregated_df.write.mode("overwrite").format("parquet").save(dataRoot+"/app/data.parquet")
total_sales_by_product_location.write.mode("overwrite").format("parquet").save(dataRoot+"/app1/data.parquet")

end = time.time()

print("Time taken on GPU for Data Analysis: ", end - start)

24/11/01 17:40:56 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU

24/11/01 17:40:56 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU

24/11/01 17:40:56 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU bec

0
Time taken on GPU for Data Analysis:  3.733241081237793


In [5]:
# Stop the SparkSession now that the cleaning and analysis cells have run.
spark.stop()