In [None]:
import getpass
import os
import random
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from faker import Faker
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col, count, sum as sum_, count_distinct, when
from snowflake.snowpark.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

# Connect to Snowflake.
# The password is never stored in the notebook: it is read from the
# SNOWFLAKE_PASSWORD environment variable, falling back to an interactive
# prompt when that variable is unset. Account and user can be overridden via
# SNOWFLAKE_ACCOUNT / SNOWFLAKE_USER and otherwise keep their previous values.
# NOTE: the credential that used to be hardcoded in this cell should be treated
# as leaked and rotated.
connection_parameters = {
    "account" : os.environ.get("SNOWFLAKE_ACCOUNT", "FVQCWWK-IJB71419"),
    "user" : os.environ.get("SNOWFLAKE_USER", "LALIT"),
    # The secret goes under "password"; "authenticator" selects the login
    # method and must not carry the password itself.
    "password" : os.environ.get("SNOWFLAKE_PASSWORD") or getpass.getpass("Snowflake password: "),
    "role" : "SNOWFLAKE_LEARNING_ROLE",
    "warehouse" : "SNOWFLAKE_LEARNING_WH",
    # Database and schema stay fixed: every query below is written against
    # SNOWFLAKE_LEARNING_DB.ODS explicitly.
    "database" : "SNOWFLAKE_LEARNING_DB",
    "schema" : "ODS"
}

session = Session.builder.configs(connection_parameters).create()

In [None]:

# Parameters
NUM_DEALERS = 5000
NUM_SKUS = 80
NUM_NEW_SALES = 5000
BASELINE_SALES = 10000
INCREMENTAL_START_DATE = datetime(2025, 5, 10)
INCREMENTAL_END_DATE = datetime(2025, 6, 10)
BASELINE_START_DATE = datetime(2024, 1, 1)
# Baseline window ends the day before the incremental window starts
BASELINE_END_DATE = INCREMENTAL_START_DATE - timedelta(days=1)
BATCH_SIZE = 10000  # Batch size for Snowflake writes
SEED = 42  # Single seed shared by every random source below

# Initialize Faker and set seeds for reproducibility
fake = Faker('en_IN')  # Use Indian locale for realistic names and addresses
Faker.seed(SEED)
np.random.seed(SEED)
# The stdlib `random` module is used for product attributes and sale dates,
# so it must be seeded too or runs are not reproducible.
random.seed(SEED)

In [None]:
# Check and Create Tables with Uppercase Column Names
def create_tables(session):
    """Create whichever of the five ODS tables are not present yet.

    Looks up the existing tables in INFORMATION_SCHEMA.TABLES for schema 'ODS',
    then runs the CREATE TABLE statement for each of DEALERS, PRODUCTS, SALES,
    DP_MAPPING and EC_CLUB that was not found. Progress is printed; any
    exception is printed and re-raised.

    Args:
        session: object exposing ``sql(query)`` whose result supports
            ``to_pandas()`` and ``collect()`` (a Snowpark session in this notebook).
    """
    began = time.time()
    try:
        # One lookup up front, reused for every table below
        existence_sql = """
            SELECT TABLE_NAME 
            FROM INFORMATION_SCHEMA.TABLES 
            WHERE TABLE_SCHEMA = 'ODS' 
            AND TABLE_NAME IN ('DEALERS', 'PRODUCTS', 'SALES', 'DP_MAPPING', 'EC_CLUB')
        """
        tables_exist = session.sql(existence_sql).to_pandas()['TABLE_NAME'].tolist()
        print(f"Existing tables in ODS schema: {tables_exist}")

        # Table name -> DDL, in the order the tables are created
        ddl_by_table = {
            'DEALERS': """
                CREATE TABLE IF NOT EXISTS SNOWFLAKE_LEARNING_DB.ODS.DEALERS (
                    DEALER_NO VARCHAR(50) PRIMARY KEY,
                    DEALER_NAME VARCHAR(100),
                    REGION VARCHAR(50),
                    ADDRESS VARCHAR(200),
                    ANNUAL_REVENUE DECIMAL(18,2),
                    DEALER_TYPE VARCHAR(20),
                    EC_CLUB INTEGER,
                    TOTAL_SALES DECIMAL(18,2) DEFAULT 0,
                    TOTAL_INVOICES INTEGER DEFAULT 0,
                    AVERAGE_BILL_VALUE DECIMAL(18,2) DEFAULT 0,
                    AVERAGE_SKUS_PER_INVOICE DECIMAL(18,2) DEFAULT 0
                )
            """,
            'PRODUCTS': """
                CREATE TABLE IF NOT EXISTS SNOWFLAKE_LEARNING_DB.ODS.PRODUCTS (
                    SKU VARCHAR(50) PRIMARY KEY,
                    PRODUCT_NAME VARCHAR(100),
                    PROD_CATEGORY VARCHAR(50),
                    PROD_RANGE VARCHAR(50),
                    PROD_SUBCATEGORY VARCHAR(50),
                    IS_TRADING INTEGER,
                    IS_FINISHED INTEGER
                )
            """,
            'SALES': """
                CREATE TABLE IF NOT EXISTS SNOWFLAKE_LEARNING_DB.ODS.SALES (
                    DEALER_NO VARCHAR(50),
                    SKU VARCHAR(50),
                    DATE DATE,
                    QUANTITY INTEGER,
                    AMOUNT DECIMAL(18,2),
                    PRIMARY KEY (DEALER_NO, SKU, DATE)
                )
            """,
            'DP_MAPPING': """
                CREATE TABLE IF NOT EXISTS SNOWFLAKE_LEARNING_DB.ODS.DP_MAPPING (
                    DN_NUMBER VARCHAR(50) PRIMARY KEY,
                    VERTICAL VARCHAR(50)
                )
            """,
            'EC_CLUB': """
                CREATE TABLE IF NOT EXISTS SNOWFLAKE_LEARNING_DB.ODS.EC_CLUB (
                    DEALER_NO VARCHAR(50) PRIMARY KEY,
                    EC_CLUB INTEGER
                )
            """,
        }

        # Create only the tables the lookup did not report
        for table_name, ddl in ddl_by_table.items():
            if table_name in tables_exist:
                continue
            session.sql(ddl).collect()
            print(f"Created table ODS.{table_name}")

        print(f"create_tables completed in {time.time() - began:.2f} seconds")
    except Exception as exc:
        print(f"Error creating tables: {str(exc)}")
        raise

In [None]:
# Generating baseline data if tables are empty.
def generate_baseline_data(session):
    """Generate and store baseline data, or load it when it already exists.

    Row counts of DEALERS, PRODUCTS and SALES are queried first. If any of the
    three is empty, synthetic dealers, products, sales, a dealer -> vertical
    mapping and an EC club table are generated and written to Snowflake
    (DEALERS, PRODUCTS, DP_MAPPING, EC_CLUB in one write each, SALES in
    batches of BATCH_SIZE). Otherwise the three tables are read back as-is.

    Args:
        session: Snowpark session (anything exposing ``sql``, ``table`` and
            ``create_dataframe`` the way they are used below).

    Returns:
        Tuple ``(dealer_df, products_df, sales_df)`` of pandas DataFrames.

    Raises:
        Exception: any error is printed and re-raised unchanged.
    """
    start_time = time.time()
    try:
        # Step 1: Check existing table data counts
        table_counts = {
            name: session.sql(
                f"SELECT COUNT(*) AS CNT FROM SNOWFLAKE_LEARNING_DB.ODS.{name}"
            ).collect()[0]["CNT"]
            for name in ('DEALERS', 'PRODUCTS', 'SALES')
        }
        print(f"Table counts: {table_counts}")

        # `row_count` rather than `count`, so the `count` function imported
        # from snowflake.snowpark.functions is not shadowed.
        if any(row_count == 0 for row_count in table_counts.values()):
            # Region & category setup
            regions = ['MAHARASHTRA', 'DELHI', 'KARNATAKA', 'TAMIL_NADU', 'GUJARAT', 'UTTAR_PRADESH', 'WEST_BENGAL', 'KERALA', 'RAJASTHAN', 'PUNJAB']
            dealer_types = ['RETAILER', 'WHOLESALER', 'DISTRIBUTOR']
            verticals = ['PLUMBING', 'WATER_TANK', 'AGRI', 'BATHROOM_PRODUCTS']
            product_categories = ['Sanitary', 'Plumbing', 'Bathroom', 'Agriculture', 'Tiling', 'Water Storage']
            product_subcategories = {
                'Sanitary': ['Taps', 'Showers', 'Faucets'],
                'Plumbing': ['Pipes', 'Fittings', 'Valves'],
                'Bathroom': ['Basins', 'Commodes', 'Urinals'],
                'Agriculture': ['Drip Irrigation', 'Sprinklers', 'PVC Pipes'],
                'Tiling': ['Wall Tiles', 'Floor Tiles', 'Ceramic'],
                'Water Storage': ['Tanks', 'Overhead Tanks', 'Underground Tanks']
            }
            product_ranges = ['Economy', 'Standard', 'Premium']

            # Generate dealers; the four aggregate columns start at 0 and are
            # filled in later by the dealer feature update.
            dealer_df = pd.DataFrame({
                'DEALER_NO': [f'D{str(i+1).zfill(6)}' for i in range(NUM_DEALERS)],
                'DEALER_NAME': [fake.company() for _ in range(NUM_DEALERS)],
                'REGION': np.random.choice(regions, NUM_DEALERS),
                'ADDRESS': [fake.address().replace('\n', ', ') for _ in range(NUM_DEALERS)],
                'ANNUAL_REVENUE': np.round(np.random.lognormal(12, 1, NUM_DEALERS), 2),
                'DEALER_TYPE': np.random.choice(dealer_types, NUM_DEALERS),
                'EC_CLUB': np.random.choice([1, 0], NUM_DEALERS, p=[0.2, 0.8]),
                'TOTAL_SALES': 0,
                'TOTAL_INVOICES': 0,
                'AVERAGE_BILL_VALUE': 0,
                'AVERAGE_SKUS_PER_INVOICE': 0
            })

            # Generate products: subcategory is drawn from the chosen category
            product_data = []
            for _ in range(NUM_SKUS):
                cat = random.choice(product_categories)
                subcat = random.choice(product_subcategories[cat])
                range_ = random.choice(product_ranges)
                product_data.append((cat, subcat, range_, f"{subcat} ({range_})"))
            products_df = pd.DataFrame(product_data, columns=['PROD_CATEGORY', 'PROD_SUBCATEGORY', 'PROD_RANGE', 'PRODUCT_NAME'])
            products_df.insert(0, 'SKU', [f'SKU{str(i+1).zfill(5)}' for i in range(NUM_SKUS)])
            # IS_TRADING and IS_FINISHED are complementary 0/1 flags
            flags = np.random.choice([0, 1], size=len(products_df))
            products_df['IS_TRADING'] = flags
            products_df['IS_FINISHED'] = 1 - flags

            # Generate sales
            dealer_list = dealer_df['DEALER_NO'].values
            sku_list = products_df['SKU'].values
            sales_columns = ['DEALER_NO', 'SKU', 'DATE', 'QUANTITY', 'AMOUNT']
            sales_data = []
            date_range = (BASELINE_END_DATE - BASELINE_START_DATE).days
            for i in range(0, BASELINE_SALES, BATCH_SIZE):
                batch_size = min(BATCH_SIZE, BASELINE_SALES - i)
                sales_batch = pd.DataFrame({
                    'DEALER_NO': np.random.choice(dealer_list, batch_size),
                    'SKU': np.random.choice(sku_list, batch_size),
                    'DATE': [BASELINE_START_DATE + timedelta(days=random.randint(0, date_range)) for _ in range(batch_size)],
                    'QUANTITY': np.random.randint(1, 51, batch_size),
                    # Holds the unit price at this point; turned into the line amount below
                    'AMOUNT': np.round(np.random.uniform(10, 500, batch_size), 2)
                })
                # Round again after multiplying: float multiplication can leave
                # more than two decimals, and generate_incremental_sales rounds
                # its amounts the same way.
                sales_batch['AMOUNT'] = (sales_batch['AMOUNT'] * sales_batch['QUANTITY']).round(2)
                sales_data.append(sales_batch)
            # pd.concat raises on an empty list, which happens when BASELINE_SALES is 0
            if sales_data:
                sales_df = pd.concat(sales_data, ignore_index=True)
            else:
                sales_df = pd.DataFrame(columns=sales_columns)

            # Additional mappings
            dp_mapping_df = pd.DataFrame({
                'DN_NUMBER': dealer_list,
                'VERTICAL': np.random.choice(verticals, NUM_DEALERS)
            })
            ec_club_df = dealer_df[['DEALER_NO', 'EC_CLUB']]

            # Write data to Snowflake
            # NOTE(review): mode("overwrite") presumably recreates each table from
            # the DataFrame schema, which would discard the column types and keys
            # declared by create_tables - confirm whether that is intended.
            for df, table in [
                (dealer_df, "SNOWFLAKE_LEARNING_DB.ODS.DEALERS"),
                (products_df, "SNOWFLAKE_LEARNING_DB.ODS.PRODUCTS"),
                (dp_mapping_df, "SNOWFLAKE_LEARNING_DB.ODS.DP_MAPPING"),
                (ec_club_df, "SNOWFLAKE_LEARNING_DB.ODS.EC_CLUB")
            ]:
                session.create_dataframe(df).write.mode("overwrite").save_as_table(table)
                print(f"Written {len(df)} rows to {table}")

            # First batch replaces the table, later batches are appended
            for i in range(0, len(sales_df), BATCH_SIZE):
                mode = "append" if i > 0 else "overwrite"
                session.create_dataframe(sales_df[i:i+BATCH_SIZE]).write.mode(mode).save_as_table("SNOWFLAKE_LEARNING_DB.ODS.SALES")
                print(f"Wrote sales batch {i//BATCH_SIZE + 1}")

            print(f"Baseline data generated in {time.time() - start_time:.2f} seconds")
            return dealer_df, products_df, sales_df

        else:
            # Load from Snowflake if data exists
            dealer_df = session.table("SNOWFLAKE_LEARNING_DB.ODS.DEALERS").to_pandas()
            products_df = session.table("SNOWFLAKE_LEARNING_DB.ODS.PRODUCTS").to_pandas()
            sales_df = session.table("SNOWFLAKE_LEARNING_DB.ODS.SALES").to_pandas()
            print(f"Loaded dealers: {len(dealer_df)} rows")
            print(f"Loaded products: {len(products_df)} rows")
            print(f"Loaded sales: {len(sales_df)} rows")
            return dealer_df, products_df, sales_df

    except Exception as e:
        print(f"Error in generate_baseline_data: {str(e)}")
        raise


In [None]:
# Generate Incremental Sales data
def generate_incremental_sales(session, dealers_df, products_df, sales_df):
    """Generate NUM_NEW_SALES new sales rows and append them to ODS.SALES.

    Dealers are sampled in proportion to how often they appear in ``sales_df``
    (dealers absent from it get weight 1; all dealers are equally likely when
    ``sales_df`` is empty). Dates fall between INCREMENTAL_START_DATE and
    INCREMENTAL_END_DATE inclusive. Rows are generated and written to Snowflake
    in batches of BATCH_SIZE.

    Args:
        session: Snowpark session used for the append writes.
        dealers_df: pandas DataFrame with a 'DEALER_NO' column.
        products_df: pandas DataFrame with a 'SKU' column.
        sales_df: pandas DataFrame of existing sales (may be empty).

    Returns:
        pandas DataFrame with the existing sales followed by the new rows. When
        NUM_NEW_SALES is 0 nothing is written and ``sales_df`` is returned as-is.

    Raises:
        KeyError: 'DEALER_NO' or 'SKU' column is missing.
        ValueError: there are no dealers or no SKUs to sample from.
    """
    start_time = time.time()
    try:
        if 'DEALER_NO' not in dealers_df.columns:
            raise KeyError(f"'DEALER_NO' missing in dealers_df. Columns: {list(dealers_df.columns)}")
        dealer_list = dealers_df['DEALER_NO'].values
        if len(dealer_list) == 0:
            raise ValueError("No dealers available in dealers_df")
        # Same explicit column check as for dealers, so the error names the frame
        if 'SKU' not in products_df.columns:
            raise KeyError(f"'SKU' missing in products_df. Columns: {list(products_df.columns)}")
        sku_list = products_df['SKU'].values
        if len(sku_list) == 0:
            raise ValueError("No SKUs available in products_df")

        # Dealer weights: historical row counts, normalised to probabilities
        if sales_df.empty:
            dealer_weights = np.ones(len(dealer_list))
        else:
            dealer_weights = sales_df['DEALER_NO'].value_counts().reindex(dealer_list, fill_value=1).values
        dealer_weights = dealer_weights / dealer_weights.sum()

        # Incremental sales (batch generation)
        date_range = (INCREMENTAL_END_DATE - INCREMENTAL_START_DATE).days
        sales_data = []
        for i in range(0, NUM_NEW_SALES, BATCH_SIZE):
            batch_size = min(BATCH_SIZE, NUM_NEW_SALES - i)
            dealer_nos = np.random.choice(dealer_list, batch_size, p=dealer_weights)
            skus = np.random.choice(sku_list, batch_size)
            quantities = np.random.randint(1, 51, batch_size)
            unit_prices = np.round(np.random.uniform(10, 500, batch_size), 2)
            amounts = np.round(quantities * unit_prices, 2)
            dates = [INCREMENTAL_START_DATE + timedelta(days=random.randint(0, date_range)) for _ in range(batch_size)]
            batch_df = pd.DataFrame({
                'DEALER_NO': dealer_nos,
                'SKU': skus,
                'DATE': dates,
                'QUANTITY': quantities,
                'AMOUNT': amounts
            })
            sales_data.append(batch_df)
            # Write batch to Snowflake
            session.create_dataframe(batch_df).write.mode("append").save_as_table("SNOWFLAKE_LEARNING_DB.ODS.SALES")
            print(f"Appended {len(batch_df)} incremental sales rows (batch {i//BATCH_SIZE + 1})")

        # pd.concat raises on an empty list (NUM_NEW_SALES == 0); in that case
        # there is nothing to add and the input frame is returned unchanged.
        if sales_data:
            new_sales_df = pd.concat(sales_data, ignore_index=True)
            sales_df = pd.concat([sales_df, new_sales_df], ignore_index=True)
        print(f"generate_incremental_sales completed in {time.time() - start_time:.2f} seconds")
        return sales_df
    except Exception as e:
        print(f"Error generating incremental sales: {str(e)}")
        raise

In [None]:
# Update Dealer Features
def update_dealer_features(session):
    """Refresh the aggregate columns on ODS.DEALERS from ODS.SALES.

    Runs a single MERGE: sales are grouped per DEALER_NO and, for every dealer
    that has a matching group, TOTAL_SALES, TOTAL_INVOICES, AVERAGE_BILL_VALUE
    and AVERAGE_SKUS_PER_INVOICE are overwritten. There is no NOT MATCHED
    clause, so dealers without any sales row are left untouched. Errors are
    printed and re-raised.

    Args:
        session: object exposing ``sql(query).collect()`` (a Snowpark session
            in this notebook).
    """
    began = time.time()
    # MERGE statement; the aggregation lives in the dealer_agg CTE
    dealer_merge_sql = """
        MERGE INTO SNOWFLAKE_LEARNING_DB.ODS.DEALERS D
        USING (
            WITH dealer_agg AS (
                SELECT 
                    DEALER_NO,
                    SUM(AMOUNT) AS TOTAL_SALES,
                    COUNT(*) AS TOTAL_INVOICES,
                    COUNT(DISTINCT SKU) AS UNIQUE_SKUS,
                    CASE WHEN COUNT(*) > 0 THEN SUM(AMOUNT) / COUNT(*) ELSE 0 END AS AVERAGE_BILL_VALUE,
                    CASE WHEN COUNT(*) > 0 THEN COUNT(DISTINCT SKU) / COUNT(*) ELSE 0 END AS AVERAGE_SKUS_PER_INVOICE
                FROM SNOWFLAKE_LEARNING_DB.ODS.SALES
                GROUP BY DEALER_NO
            )
            SELECT 
                DEALER_NO,
                TOTAL_SALES,
                TOTAL_INVOICES,
                AVERAGE_BILL_VALUE,
                AVERAGE_SKUS_PER_INVOICE
            FROM dealer_agg
        ) U
        ON D.DEALER_NO = U.DEALER_NO
        WHEN MATCHED THEN
            UPDATE SET
                TOTAL_SALES = U.TOTAL_SALES,
                TOTAL_INVOICES = U.TOTAL_INVOICES,
                AVERAGE_BILL_VALUE = U.AVERAGE_BILL_VALUE,
                AVERAGE_SKUS_PER_INVOICE = U.AVERAGE_SKUS_PER_INVOICE
        """
    try:
        print("Executing MERGE query for dealer features update")
        session.sql(dealer_merge_sql).collect()
        elapsed = time.time() - began
        print(f"update_dealer_features completed in {elapsed:.2f} seconds")
    except Exception as exc:
        print(f"Error updating dealer features: {str(exc)}")
        raise

In [None]:
# Validate Data
def validate_data(session):
    """Print sanity checks for the generated ODS tables.

    Reports, in order: row counts of all five tables, any (DEALER_NO, SKU, DATE)
    combinations that occur more than once in SALES, the column names of
    DEALERS and PRODUCTS, and a five-row sample of DEALERS, PRODUCTS and SALES.
    Nothing is returned; findings are only printed.

    Args:
        session: Snowpark session (anything exposing ``sql`` and ``table`` the
            way they are used below).

    Raises:
        KeyError: SHOW COLUMNS returned fewer than three columns.
        Exception: any other error is printed and re-raised unchanged.
    """
    start_time = time.time()
    try:
        # Check row counts.
        # Count the table itself: calling .count() on a "SELECT COUNT(*)" query
        # counts the rows of that one-row result and always reports 1.
        counts = {
            table: session.table(f"SNOWFLAKE_LEARNING_DB.ODS.{table}").count()
            for table in ('DEALERS', 'PRODUCTS', 'SALES', 'DP_MAPPING', 'EC_CLUB')
        }
        print(f"Table row counts: {counts}")

        # Check for duplicates in sales
        duplicates = session.sql("""
            SELECT DEALER_NO, SKU, DATE, COUNT(*)
            FROM SNOWFLAKE_LEARNING_DB.ODS.SALES
            GROUP BY DEALER_NO, SKU, DATE
            HAVING COUNT(*) > 1
        """).collect()
        if duplicates:
            print(f"Duplicates found in ODS.SALES: {duplicates}")
        else:
            print("No duplicates found in ODS.SALES.")

        # Verify column names
        column_names = {}
        for table in ('DEALERS', 'PRODUCTS'):
            qualified = f"SNOWFLAKE_LEARNING_DB.ODS.{table}"
            show_cols_df = session.sql(f"SHOW COLUMNS IN {qualified}").to_pandas()
            print(f"SHOW COLUMNS DataFrame columns: {list(show_cols_df.columns)}")
            if len(show_cols_df.columns) < 3:
                raise KeyError(f"Expected at least 3 columns in SHOW COLUMNS for {qualified}. Got: {list(show_cols_df.columns)}")
            # The third column of the SHOW COLUMNS output is read as the column
            # name; it is taken by position rather than by label.
            column_names[table] = show_cols_df.iloc[:, 2].tolist()
        print(f"DEALERS table columns: {column_names['DEALERS']}")
        print(f"PRODUCTS table columns: {column_names['PRODUCTS']}")

        # Sample data
        sample_dealers = session.table("SNOWFLAKE_LEARNING_DB.ODS.DEALERS").limit(5).to_pandas()
        sample_products = session.table("SNOWFLAKE_LEARNING_DB.ODS.PRODUCTS").limit(5).to_pandas()
        sample_sales = session.table("SNOWFLAKE_LEARNING_DB.ODS.SALES").limit(5).to_pandas()
        print("Sample of ODS.DEALERS:\n" + str(sample_dealers))
        print("Sample of ODS.PRODUCTS:\n" + str(sample_products))
        print("Sample of ODS.SALES:\n" + str(sample_sales))
        print(f"validate_data completed in {time.time() - start_time:.2f} seconds")
    except Exception as e:
        print(f"Error validating data: {str(e)}")
        raise

In [None]:
# Main Execution
try:
    start_time = time.time()
    print("Starting synthetic data generation...")

    # Find out which of the five required tables are already present
    tables_df = session.sql("""
        SELECT TABLE_NAME 
        FROM INFORMATION_SCHEMA.TABLES 
        WHERE TABLE_SCHEMA = 'ODS' 
        AND TABLE_NAME IN ('DEALERS', 'PRODUCTS', 'SALES', 'DP_MAPPING', 'EC_CLUB')
    """).to_pandas()
    tables_exist = tables_df['TABLE_NAME'].tolist()
    print(f"Tables found in ODS schema: {tables_exist}")

    # Only call create_tables when at least one of the five is missing
    if len(tables_exist) >= 5:
        print("All required tables exist, skipping table creation.")
    else:
        create_tables(session)

    # Pipeline: baseline (generate or load) -> incremental sales ->
    # dealer aggregates -> validation report
    dealers_df, products_df, sales_df = generate_baseline_data(session)
    sales_df = generate_incremental_sales(session, dealers_df, products_df, sales_df)
    update_dealer_features(session)
    validate_data(session)

    print(f"Synthetic data generation completed in {time.time() - start_time:.2f} seconds")
except Exception as e:
    # Report the failure, then let the original exception surface in the cell
    print(f"Data generation failed: {str(e)}")
    raise