In [1]:
%pip install pandas numpy faker

Collecting faker
  Downloading faker-37.4.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.2-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------- ----------------------- 0.8/1.9 MB 6.9 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 8.1 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.4.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import os
from datetime import datetime, timedelta

# Initialize Faker for generating realistic data
fake_data_generator = Faker()

# Create directory structure for raw data files
BASE_DIR = "raw"
CUST_MSTR_DIR = os.path.join(BASE_DIR, "CUST_MSTR")
MASTER_CHILD_DIR = os.path.join(BASE_DIR, "master_child_export")
H_ECOM_ORDER_DIR = os.path.join(BASE_DIR, "H_ECOM_ORDER")

# Create directories if they don't exist
os.makedirs(CUST_MSTR_DIR, exist_ok=True)
os.makedirs(MASTER_CHILD_DIR, exist_ok=True)
os.makedirs(H_ECOM_ORDER_DIR, exist_ok=True)

# Configuration
SAMPLE_DATES = ["20191112", "20191113"]
CUSTOMER_COUNT = 150
PRODUCT_COUNT = 120
ORDER_COUNT = 200

print("Starting data generation process...")

# ---------- Generate CUST_MSTR files ----------
print("Generating CUST_MSTR files...")
for file_date in SAMPLE_DATES:
    customer_master_data = {
        "customer_id": range(1000, 1000 + CUSTOMER_COUNT),
        "first_name": [fake_data_generator.first_name() for _ in range(CUSTOMER_COUNT)],
        "last_name": [fake_data_generator.last_name() for _ in range(CUSTOMER_COUNT)],
        "email": [fake_data_generator.email() for _ in range(CUSTOMER_COUNT)],
        "phone": [fake_data_generator.phone_number() for _ in range(CUSTOMER_COUNT)],
        "city": [fake_data_generator.city() for _ in range(CUSTOMER_COUNT)],
        "country": [fake_data_generator.country() for _ in range(CUSTOMER_COUNT)]
    }
    
    customer_dataframe = pd.DataFrame(customer_master_data)
    output_filename = f"CUST_MSTR_{file_date}.csv"
    output_path = os.path.join(CUST_MSTR_DIR, output_filename)
    customer_dataframe.to_csv(output_path, index=False)
    print(f"  Created: {output_filename}")

# ---------- Generate master_child_export files ----------
print("\nGenerating master_child_export files...")
for file_date in SAMPLE_DATES:
    product_master_data = {
        "product_id": range(2000, 2000 + PRODUCT_COUNT),
        "product_name": [fake_data_generator.word().capitalize() + " " + fake_data_generator.word().capitalize() for _ in range(PRODUCT_COUNT)],
        "category_id": np.random.randint(10, 50, PRODUCT_COUNT),
        "category_name": [fake_data_generator.word().capitalize() for _ in range(PRODUCT_COUNT)],
        "price": np.random.uniform(10.0, 999.99, PRODUCT_COUNT).round(2)
    }
    
    product_dataframe = pd.DataFrame(product_master_data)
    output_filename = f"master_child_export-{file_date}.csv"
    output_path = os.path.join(MASTER_CHILD_DIR, output_filename)
    product_dataframe.to_csv(output_path, index=False)
    print(f"  Created: {output_filename}")

# ---------- Generate H_ECOM_ORDER file ----------
print("\nGenerating H_ECOM_ORDER file...")
ecommerce_order_data = {
    "order_id": range(50000, 50000 + ORDER_COUNT),
    "customer_id": np.random.randint(1000, 1000 + CUSTOMER_COUNT, ORDER_COUNT),
    "product_id": np.random.randint(2000, 2000 + PRODUCT_COUNT, ORDER_COUNT),
    "order_date": [fake_data_generator.date_between(start_date='-1y', end_date='today') for _ in range(ORDER_COUNT)],
    "order_amount": np.random.uniform(50.0, 2000.0, ORDER_COUNT).round(2)
}

ecommerce_dataframe = pd.DataFrame(ecommerce_order_data)
order_output_path = os.path.join(H_ECOM_ORDER_DIR, "H_ECOM_ORDER.csv")
ecommerce_dataframe.to_csv(order_output_path, index=False)
print(f"  Created: H_ECOM_ORDER.csv")

# ---------- Generate summary report ----------
print("\n" + "="*50)
print("DATA GENERATION SUMMARY")
print("="*50)
print(f"Total CUST_MSTR files created: {len(SAMPLE_DATES)}")
print(f"Total master_child_export files created: {len(SAMPLE_DATES)}")
print(f"Total H_ECOM_ORDER files created: 1")
print(f"Customer records per file: {CUSTOMER_COUNT}")
print(f"Product records per file: {PRODUCT_COUNT}")
print(f"Order records: {ORDER_COUNT}")
print("\nFile structure:")
print(f"  {CUST_MSTR_DIR}/")
for file_date in SAMPLE_DATES:
    print(f"    └── CUST_MSTR_{file_date}.csv")
print(f"  {MASTER_CHILD_DIR}/")
for file_date in SAMPLE_DATES:
    print(f"    └── master_child_export-{file_date}.csv")
print(f"  {H_ECOM_ORDER_DIR}/")
print(f"    └── H_ECOM_ORDER.csv")
print("\nData generation completed successfully!")

# ---------- Optional: Display sample data ----------
print("\n" + "="*50)
print("SAMPLE DATA PREVIEW")
print("="*50)

# Preview CUST_MSTR data
print("\nCUST_MSTR Sample (first 5 rows):")
sample_cust_file = os.path.join(CUST_MSTR_DIR, f"CUST_MSTR_{SAMPLE_DATES[0]}.csv")
sample_cust_df = pd.read_csv(sample_cust_file)
print(sample_cust_df.head())

# Preview master_child_export data
print("\nmaster_child_export Sample (first 5 rows):")
sample_product_file = os.path.join(MASTER_CHILD_DIR, f"master_child_export-{SAMPLE_DATES[0]}.csv")
sample_product_df = pd.read_csv(sample_product_file)
print(sample_product_df.head())

# Preview H_ECOM_ORDER data
print("\nH_ECOM_ORDER Sample (first 5 rows):")
sample_order_df = pd.read_csv(order_output_path)
print(sample_order_df.head())

Starting data generation process...
Generating CUST_MSTR files...
  Created: CUST_MSTR_20191112.csv
  Created: CUST_MSTR_20191113.csv

Generating master_child_export files...
  Created: master_child_export-20191112.csv
  Created: master_child_export-20191113.csv

Generating H_ECOM_ORDER file...
  Created: H_ECOM_ORDER.csv

DATA GENERATION SUMMARY
Total CUST_MSTR files created: 2
Total master_child_export files created: 2
Total H_ECOM_ORDER files created: 1
Customer records per file: 150
Product records per file: 120
Order records: 200

File structure:
  raw\CUST_MSTR/
    └── CUST_MSTR_20191112.csv
    └── CUST_MSTR_20191113.csv
  raw\master_child_export/
    └── master_child_export-20191112.csv
    └── master_child_export-20191113.csv
  raw\H_ECOM_ORDER/
    └── H_ECOM_ORDER.csv

Data generation completed successfully!

SAMPLE DATA PREVIEW

CUST_MSTR Sample (first 5 rows):
   customer_id first_name last_name                       email  \
0         1000     Ronald   Ramirez       iwil