In [None]:
import random
from pyspark.sql import SparkSession
from pyspark import broadcast, SparkConf
# Stop any SparkContext named `sc` that already exists in this kernel
# before a new session is built.
if "sc" in globals():
    sc.stop()

### Configure the parameters based on your dataproc cluster ###
# Settings are listed as (key, value) pairs and applied in this order.
spark_settings = [
    ("spark.executor.instances", "12"),
    ("spark.executor.cores", "5"),
    ("spark.driver.memory", "16g"),
    ("spark.executor.memory", "20g"),
    ("spark.sql.files.maxPartitionBytes", "512m"),
    ("spark.executor.memoryOverhead", "5G"),
    ("spark.sql.broadcastTimeout", "700"),
    # The RAPIDS SQL plugin is switched off for this data-generation job.
    ('spark.rapids.sql.enabled', 'false'),
]
conf = SparkConf().setAppName("data gen").setAll(spark_settings)

# Build the session (or reuse an active one) with the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()


In [None]:
import os
# Root location under which every generated dataset is written.
# You need to update this to your real path, either by setting the DATA_ROOT
# environment variable or by editing the placeholder default below.
# Trailing slashes are stripped because each writer in this notebook appends
# "/<table>/" to dataRoot; keeping them would build paths with a doubled
# slash such as "gs://path/to/raw//sales/".
dataRoot = os.getenv("DATA_ROOT", 'gs://path/to/raw/').rstrip("/")

In [None]:
from pyspark.sql.functions import rand

In [None]:
# generate sales data
# Every row is built on the driver as a plain tuple. The tuple fields are
# evaluated left to right, so the random draws happen in column order.
sales_columns = ["sales_id","product_name", "price", "quantity_sold", "date_of_sale","customer_id"]
sales_data = [
    (
        "s_{}".format(sale_no),                                   # sequential sale id
        "Product_{}".format(random.randint(1,100)),               # Product_1 .. Product_100
        random.uniform(1,100),                                    # float price between 1 and 100
        random.randint(1,100),                                    # units sold, 1 .. 100
        # month 1..12 and day 1..28, not zero-padded
        "2022-{}-{}".format(random.randint(1,12), random.randint(1,28)),
        "c_{}".format(random.randint(1,1000)),                    # c_1 .. c_1000
    )
    for sale_no in range(10000000)
]

# Turn the tuples into a DataFrame and write it as CSV with a header row.
sales_df = spark.createDataFrame(sales_data, sales_columns)
sales_path = dataRoot+"/sales/"
sales_df.write.format("csv").save(sales_path, header=True)

In [None]:
# generate stock data
# One row per product, Product_0 .. Product_9999, built on the driver.
stock_data = []
for i in range(10000):
    product_name = "Product_{}".format(i)
    shelf_life = random.randint(1,365)
    # Promotion text of the form "<0..10> % off".
    contains_promotion = "{} % off".format(random.randint(0,10))
    quantity_in_stock = random.randint(1,1000)
    location = "Location_{}".format(random.randint(1,100))
    # Month 1..12 and day 1..28, not zero-padded.
    date_received = "2022-{}-{}".format(random.randint(1,12), random.randint(1,28))
    stock_data.append((product_name,shelf_life,contains_promotion,quantity_in_stock, location, date_received))

stock_df = spark.createDataFrame(stock_data, ["product_name","shelf_life","contains_promotion","quantity_in_stock", "location", "date_received"])
# DataFrame.repartition returns a new DataFrame and leaves the original
# untouched, so the result must be assigned back; otherwise the call has no
# effect on how the data is written.
stock_df = stock_df.repartition(20)
# write the stock table as JSON
stock_df.write.format("json").save(dataRoot+"/stock/")

In [None]:
# generate supplier data
# One row per supplier order, built on the driver.
supplier_data = []
for i in range(100000):
    sup_id = "s_{}".format(i)
    # The stock, market and logistic tables in this notebook are generated
    # with range(10000), i.e. Product_0 .. Product_9999. Draw from that same
    # 0-based range so every order references a product that exists there
    # (randint(1, 10000) could yield Product_10000 and never Product_0).
    product_name = "Product_{}".format(random.randint(0,9999))
    quantity_ordered = random.randint(1,1000)
    price = random.uniform(1,100)
    # Month 1..12 and day 1..28, not zero-padded.
    date_ordered = "2022-{}-{}".format(random.randint(1,12), random.randint(1,28))
    supplier_data.append((sup_id,product_name, quantity_ordered, price, date_ordered))

supplier_df = spark.createDataFrame(supplier_data, ["sup_id","product_name", "quantity_ordered", "price", "date_ordered"])
# shuffle the dataframe to ensure it is evenly distributed
supplier_df = supplier_df.sort(rand())
# divide the dataframe into smaller partitions
supplier_df = supplier_df.repartition(100)
# write the supplier table as JSON
supplier_df.write.format("json").save(dataRoot+"/supplier/")

In [None]:
# generate customer data
# One tuple per customer. The id, name and e-mail all derive from the same
# index; age, gender and purchase history are random draws taken in that order.
customer_columns = ["customer_id","customer_name", "age", "gender", "purchase_history", "contact_info"]
customer_data = [
    (
        "c_{}".format(cust_no),
        "Customer_{}".format(cust_no),
        random.randint(20,70),                  # age, 20 .. 70
        random.choice(["male", "female"]),
        random.randint(1,100),                  # purchase history, 1 .. 100
        "email_{}@gmail.com".format(cust_no),
    )
    for cust_no in range(100000)
]

# Order the rows by a random key, then redistribute them into 100 partitions.
customer_df = (
    spark.createDataFrame(customer_data, customer_columns)
    .sort(rand())
    .repartition(100)
)

# Write the customer table as CSV with a header row.
customer_path = dataRoot+"/customer/"
customer_df.write.format("csv").save(customer_path, header=True)

In [None]:
# generate market data
# One tuple per product, Product_0 .. Product_9999. The random draws happen in
# column order: competitor price, sales trend, demand forecast.
market_columns = ["product_name", "competitor_price", "sales_trend", "demand_forecast"]
market_data = [
    (
        "Product_{}".format(product_no),
        random.uniform(1,100),      # float competitor price between 1 and 100
        random.randint(1,100),      # sales trend, 1 .. 100
        random.randint(1,100),      # demand forecast, 1 .. 100
    )
    for product_no in range(10000)
]

# Order the rows by a random key, then redistribute them into 100 partitions.
market_df = (
    spark.createDataFrame(market_data, market_columns)
    .sort(rand())
    .repartition(100)
)

# Write the market table as CSV with a header row.
market_path = dataRoot+"/market/"
market_df.write.format("csv").save(market_path, header=True)


In [None]:
# generate logistic data
# One tuple per product, Product_0 .. Product_9999, with three independent
# float costs drawn in column order: shipping, transportation, warehouse.
logistic_columns = ["product_name", "shipping_cost", "transportation_cost", "warehouse_cost"]
logistic_data = [
    (
        "Product_{}".format(product_no),
        random.uniform(1,100),
        random.uniform(1,100),
        random.uniform(1,100),
    )
    for product_no in range(10000)
]

# Order the rows by a random key, then redistribute them into 100 partitions.
logistic_df = (
    spark.createDataFrame(logistic_data, logistic_columns)
    .sort(rand())
    .repartition(100)
)

# Write the logistic table as CSV with a header row.
logistic_path = dataRoot+"/logistic/"
logistic_df.write.format("csv").save(logistic_path, header=True)


In [None]:
# Shut down the Spark session created in the first cell now that all
# datasets have been written.
spark.stop()