### Generating and Writing Data to GCS

In [None]:
import os
import pandas as pd
import multiprocessing as mp
import random

# Root location under which every generated dataset in this notebook is written.
# Set the DATA_ROOT environment variable to your real path; otherwise the
# placeholder bucket URL below is used.
# The value is joined to relative file names by plain string concatenation
# further down, so it should end with a trailing "/".
dataRoot = os.getenv("DATA_ROOT", 'gs://bucket-name/data/raw/')

In [2]:
def generate_data(i):
    """Build one synthetic sales record for row index ``i``.

    Args:
        i: Row index; embedded in the sales id and the product name.

    Returns:
        A tuple ``(sales_id, product_name, price, quantity_sold,
        date_of_sale, customer_id)`` where every field other than the id and
        product name is drawn from the ``random`` module.
    """
    sales_id = "s_{}".format(i)
    product_name = "Product_{}".format(i)
    price = random.uniform(1, 100)
    quantity_sold = random.randint(1, 100)
    # Zero-pad month and day so the value is a well-formed ISO-8601 date
    # ("2022-01-05" rather than "2022-1-5") that also sorts correctly as text.
    # The day is drawn from 1-28, which is valid for every month.
    date_of_sale = "2022-{:02d}-{:02d}".format(random.randint(1, 12), random.randint(1, 28))
    # NOTE(review): the customer cell in this notebook only generates ids
    # c_0..c_99999, so most ids drawn here have no matching customer row —
    # confirm that this is intended.
    customer_id = "c_{}".format(random.randint(1, 1000000))
    return (sales_id, product_name, price, quantity_sold, date_of_sale, customer_id)

# Generate the sales rows in parallel, one worker process per CPU core.
# Pool.map already returns a fully materialised list, so the former extra
# list(...) call only made a redundant copy of a 100M-entry list.
with mp.Pool(mp.cpu_count()) as p:
    sales_data = p.map(generate_data, range(100000000))

print("write to gcs started")
sales_df = pd.DataFrame(
    sales_data,
    columns=["sales_id", "product_name", "price", "quantity_sold", "date_of_sale", "customer_id"],
)
# The target is built by plain concatenation, so dataRoot must end with "/".
sales_df.to_csv(dataRoot+"sales/data.csv", index=False, header=True)
print("Write to gcs completed")

In [None]:
def generate_data(i):
    """Build one synthetic stock record for row index ``i``.

    Args:
        i: Row index; embedded in the product name.

    Returns:
        A tuple ``(product_name, shelf_life, contains_promotion,
        quantity_in_stock, location, date_received)`` where every field other
        than the product name is drawn from the ``random`` module.
    """
    product_name = "Product_{}".format(i)
    shelf_life = random.randint(1, 365)
    contains_promotion = "{} % off".format(random.randint(0, 10))
    quantity_in_stock = random.randint(1, 1000)
    location = "Location_{}".format(random.randint(1, 100))
    # Zero-pad month and day so the value is a well-formed ISO-8601 date
    # ("2022-01-05" rather than "2022-1-5") that also sorts correctly as text.
    # The day is drawn from 1-28, which is valid for every month.
    date_received = "2022-{:02d}-{:02d}".format(random.randint(1, 12), random.randint(1, 28))
    return (product_name, shelf_life, contains_promotion, quantity_in_stock, location, date_received)

# Generate the stock rows in parallel, one worker process per CPU core.
# Pool.map returns a list already, so no additional list(...) copy is made.
with mp.Pool(mp.cpu_count()) as p:
    stock_data = p.map(generate_data, range(5000000))

stock_df = pd.DataFrame(
    stock_data,
    columns=["product_name", "shelf_life", "contains_promotion", "quantity_in_stock", "location", "date_received"],
)
# orient='records' serialises the frame as a JSON array with one object per row.
# The target is built by plain concatenation, so dataRoot must end with "/".
stock_df.to_json(dataRoot+"stock/stock.json", orient='records')
print("Write to gcs completed")

In [None]:
def generate_data(i):
    """Build one synthetic supplier-order record for row index ``i``.

    Args:
        i: Row index; embedded in the supplier id and the product name.

    Returns:
        A tuple ``(sup_id, product_name, quantity_ordered, price,
        date_ordered)`` where quantity, price and date are drawn from the
        ``random`` module.
    """
    sup_id = "s_{}".format(i)
    product_name = "Product_{}".format(i)
    quantity_ordered = random.randint(1, 1000)
    price = random.uniform(1, 100)
    # Zero-pad month and day so the value is a well-formed ISO-8601 date
    # ("2022-01-05" rather than "2022-1-5") that also sorts correctly as text.
    # The day is drawn from 1-28, which is valid for every month.
    date_ordered = "2022-{:02d}-{:02d}".format(random.randint(1, 12), random.randint(1, 28))
    return (sup_id, product_name, quantity_ordered, price, date_ordered)

# Generate the supplier rows in parallel, one worker process per CPU core.
# Pool.map returns a list already, so no additional list(...) copy is made.
with mp.Pool(mp.cpu_count()) as p:
    supplier_data = p.map(generate_data, range(5000000))

supplier_df = pd.DataFrame(
    supplier_data,
    columns=["sup_id", "product_name", "quantity_ordered", "price", "date_ordered"],
)
# orient='records' serialises the frame as a JSON array with one object per row.
# The target is built by plain concatenation, so dataRoot must end with "/".
supplier_df.to_json(dataRoot+"supplier/supplier.json", orient='records')
print("Write to gcs completed")

In [None]:
def generate_data(i):
    """Return one synthetic customer record for row index ``i``.

    The id, display name and e-mail address are derived from ``i``; age,
    gender and purchase history are drawn from the ``random`` module.

    Returns:
        ``(customer_id, customer_name, age, gender, purchase_history,
        contact_info)``
    """
    # Draw the random fields first, in the same order as they appear in the
    # returned tuple, so seeded runs stay reproducible.
    years = random.randint(20, 70)
    sex = random.choice(["male", "female"])
    purchases = random.randint(1, 100)
    return (
        f"c_{i}",
        f"Customer_{i}",
        years,
        sex,
        purchases,
        f"email_{i}@gmail.com",
    )

# Generate the customer rows in parallel, one worker process per CPU core.
# Pool.map returns a list already, so no additional list(...) copy is made.
with mp.Pool(mp.cpu_count()) as p:
    customer_data = p.map(generate_data, range(100000))

customer_df = pd.DataFrame(
    customer_data,
    columns=["customer_id", "customer_name", "age", "gender", "purchase_history", "contact_info"],
)
# The target is built by plain concatenation, so dataRoot must end with "/".
customer_df.to_csv(dataRoot+"customer/customer.csv", index=False,header=True)
print("Write to gcs completed")

In [None]:
def generate_data(i):
    """Return one synthetic market record for row index ``i``.

    The product name is derived from ``i``; the competitor price is a random
    float in [1, 100] and the trend and forecast are random integers in
    [1, 100].

    Returns:
        ``(product_name, competitor_price, sales_trend, demand_forecast)``
    """
    label = f"Product_{i}"
    # Tuple elements are evaluated left to right, which keeps the order of the
    # random draws: price, then trend, then forecast.
    return (
        label,
        random.uniform(1, 100),
        random.randint(1, 100),
        random.randint(1, 100),
    )

# Generate the market rows in parallel, one worker process per CPU core.
# Pool.map already returns a fully materialised list, so the former extra
# list(...) call only made a redundant copy of a 50M-entry list.
with mp.Pool(mp.cpu_count()) as p:
    market_data = p.map(generate_data, range(50000000))

market_df = pd.DataFrame(
    market_data,
    columns=["product_name", "competitor_price", "sales_trend", "demand_forecast"],
)
# The target is built by plain concatenation, so dataRoot must end with "/".
market_df.to_csv(dataRoot+"market/market.csv", index=False,header=True)
print("Write to gcs completed")

In [None]:
def generate_data(i):
    """Return one synthetic logistics-cost record for row index ``i``.

    The product name is derived from ``i``; each of the three cost fields is
    an independent random float in [1, 100].

    Returns:
        ``(product_name, shipping_cost, transportation_cost, warehouse_cost)``
    """
    # Draw order matches the tuple order: shipping, transportation, warehouse.
    shipping, transportation, warehouse = (random.uniform(1, 100) for _ in range(3))
    return (f"Product_{i}", shipping, transportation, warehouse)

# Generate the logistics rows in parallel, one worker process per CPU core.
# Pool.map already returns a fully materialised list, so the former extra
# list(...) call only made a redundant copy of a 50M-entry list.
with mp.Pool(mp.cpu_count()) as p:
    logistic_data = p.map(generate_data, range(50000000))

logistic_df = pd.DataFrame(
    logistic_data,
    columns=["product_name", "shipping_cost", "transportation_cost", "warehouse_cost"],
)
# The target is built by plain concatenation, so dataRoot must end with "/".
logistic_df.to_csv(dataRoot+"logistic/logistic.csv", index=False,header=True)
print("Write to gcs completed")