**1. GPU加速数据生成 v1**

In [24]:
# -*- coding: utf-8 -*-
"""Tesla Simulated Sales Data Generator (GPU Version)

This script generates mock sales data for Tesla in a star schema, using cuDF
for GPU acceleration while strictly following the data format and breadth of
the original CPU Pandas script.
"""
# 请确保已安装 cuDF 库。
# 如果是在 Google Colab 上，可以使用以下命令：
# !pip install -U cudf-cu12 --extra-index-url=https://pypi.nvidia.com

import cudf
import cupy as cp
import numpy as np
import random
import datetime
import os
import copy
import time # 新增：用于计算执行时间

# 固定随机种子以便可复现
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# --------------------------
# Helper: 生成合理格式的邮编/邮政编码（尽量贴近各国常见格式）
# --------------------------
def generate_plausible_zip(country, state_province_abbr):
    """Generates a plausible zip code based on the country and state/province."""
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'

    if country == 'United States':
        # 5-digit zip with some simple state-based preferences
        if state_province_abbr.startswith(('C', 'I')):
            return f"9{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith(('T', 'L')):
            return f"7{random.randint(5,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('F'):
            return f"3{random.randint(2,4)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('N'):
            return f"1{random.randint(0,4)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('W'):
            return f"98{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('A'):
            return f"85{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('G'):
            return f"30{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('P'):
            return f"15{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        return f"{random.randint(10000, 99999)}"
    elif country == 'Canada':
        province_codes = {
            'ON': ['K','L','M','N','P'], 'QC': ['G','H','J'], 'BC': ['V'], 'AB': ['T'],
            'SK': ['S'], 'MB': ['R'], 'NB': ['E'], 'NS': ['B'], 'NL': ['A'], 'PE': ['C'],
            'YT': ['Y'], 'NT': ['X'], 'NU': ['X']
        }
        first_letter = random.choice(province_codes.get(state_province_abbr, ['A']))
        return f"{first_letter}{random.choice(digits)}{random.choice(letters)} {random.choice(digits)}{random.choice(letters)}{random.choice(digits)}"
    elif country == 'Mexico':
        return f"{random.randint(1000, 99999):05d}"
    elif country == 'United Kingdom':
        outward = f"{random.choice(letters)}{random.choice(letters)}{random.randint(1,9)}"
        inward = f"{random.choice(digits)}{random.choice(letters)}{random.choice(letters)}"
        return f"{outward} {inward}"
    elif country in ['China', 'Taiwan']:
        if country == 'China':
            if state_province_abbr.startswith('B'):
                return f"10{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
            if state_province_abbr.startswith('S'):
                return f"20{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
            if state_province_abbr.startswith('G'):
                return f"51{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
            return f"{random.randint(10000, 99999)}"
        else:
            return f"{random.randint(100, 999)}"
    elif country == 'Germany':
        return f"{random.randint(10000, 99999)}"
    elif country == 'Japan':
        return f"{random.randint(100, 999)}-{random.randint(1000, 9999)}"
    elif country == 'Australia':
        return f"{random.randint(1000, 9999)}"
    elif country == 'New Zealand':
        nz_ranges = {
            'AKL': (600, 2699), 'NTL': (100, 1099), 'WKO': (3200, 3799), 'BOP': (3000, 3199),
            'GIS': (4010, 4199), 'HKB': (4100, 4299), 'MWT': (4400, 4699), 'MBH': (7200, 7299),
            'NSN': (7010, 7099), 'OTA': (9000, 9799), 'STL': (9800, 9899), 'TKI': (4300, 4399),
            'TAS': (7100, 7199), 'WLG': (5010, 5799), 'WTC': (7800, 7999), 'CAN': (7000, 7999),
        }
        rng = nz_ranges.get(state_province_abbr)
        if rng:
            low, high = rng
            val = random.randint(low, high)
            return f"{val:04d}"
        else:
            return f"{random.randint(1000, 99999):05d}"
    elif country in ['France', 'Italy']:
        return f"{random.randint(10000, 99999)}"
    elif country == 'Spain':
        return f"{random.randint(10000, 52999)}"
    elif country == 'South Korea':
        return f"{random.randint(10000, 99999)}"
    elif country == 'Thailand':
        return f"{random.randint(10000, 99999)}"
    else:
        return f"{random.randint(10000, 99999)}"

# --------------------------
# 定义国家/省/州 字典
# --------------------------
tesla_countries = {
    'North America': {
        'United States': {'country_code': 'US', 'states': [
            {'abbr': 'CA', 'full': 'California', 'sales_weight': 10},
            {'abbr': 'TX', 'full': 'Texas', 'sales_weight': 9},
            {'abbr': 'FL', 'full': 'Florida', 'sales_weight': 8},
            {'abbr': 'NY', 'full': 'New York', 'sales_weight': 7},
            {'abbr': 'WA', 'full': 'Washington', 'sales_weight': 6},
            {'abbr': 'IL', 'full': 'Illinois', 'sales_weight': 5},
            {'abbr': 'MA', 'full': 'Massachusetts', 'sales_weight': 5},
            {'abbr': 'NJ', 'full': 'New Jersey', 'sales_weight': 5},
            {'abbr': 'NC', 'full': 'North Carolina', 'sales_weight': 4},
            {'abbr': 'GA', 'full': 'Georgia', 'sales_weight': 4},
            {'abbr': 'PA', 'full': 'Pennsylvania', 'sales_weight': 4},
            {'abbr': 'CO', 'full': 'Colorado', 'sales_weight': 4},
            {'abbr': 'AZ', 'full': 'Arizona', 'sales_weight': 4},
            {'abbr': 'OH', 'full': 'Ohio', 'sales_weight': 3},
            {'abbr': 'MI', 'full': 'Michigan', 'sales_weight': 3},
            {'abbr': 'VA', 'full': 'Virginia', 'sales_weight': 3},
            {'abbr': 'MD', 'full': 'Maryland', 'sales_weight': 3},
            {'abbr': 'OR', 'full': 'Oregon', 'sales_weight': 3},
            {'abbr': 'NV', 'full': 'Nevada', 'sales_weight': 2},
            {'abbr': 'MN', 'full': 'Minnesota', 'sales_weight': 2},
            {'abbr': 'UT', 'full': 'Utah', 'sales_weight': 2},
            {'abbr': 'DC', 'full': 'District of Columbia', 'sales_weight': 2},
            {'abbr': 'AL', 'full': 'Alabama', 'sales_weight': 1},
            {'abbr': 'AK', 'full': 'Alaska', 'sales_weight': 1},
            {'abbr': 'AR', 'full': 'Arkansas', 'sales_weight': 1},
            {'abbr': 'CT', 'full': 'Connecticut', 'sales_weight': 1},
            {'abbr': 'DE', 'full': 'Delaware', 'sales_weight': 1},
            {'abbr': 'HI', 'full': 'Hawaii', 'sales_weight': 1},
            {'abbr': 'ID', 'full': 'Idaho', 'sales_weight': 1},
            {'abbr': 'IN', 'full': 'Indiana', 'sales_weight': 1},
            {'abbr': 'IA', 'full': 'Iowa', 'sales_weight': 1},
            {'abbr': 'KS', 'full': 'Kansas', 'sales_weight': 1},
            {'abbr': 'KY', 'full': 'Kentucky', 'sales_weight': 1},
            {'abbr': 'LA', 'full': 'Louisiana', 'sales_weight': 1},
            {'abbr': 'ME', 'full': 'Maine', 'sales_weight': 1},
            {'abbr': 'MS', 'full': 'Mississippi', 'sales_weight': 1},
            {'abbr': 'MO', 'full': 'Missouri', 'sales_weight': 1},
            {'abbr': 'MT', 'full': 'Montana', 'sales_weight': 1},
            {'abbr': 'NE', 'full': 'Nebraska', 'sales_weight': 1},
            {'abbr': 'NH', 'full': 'New Hampshire', 'sales_weight': 1},
            {'abbr': 'NM', 'full': 'New Mexico', 'sales_weight': 1},
            {'abbr': 'ND', 'full': 'North Dakota', 'sales_weight': 1},
            {'abbr': 'OK', 'full': 'Oklahoma', 'sales_weight': 1},
            {'abbr': 'RI', 'full': 'Rhode Island', 'sales_weight': 1},
            {'abbr': 'SC', 'full': 'South Carolina', 'sales_weight': 1},
            {'abbr': 'SD', 'full': 'South Dakota', 'sales_weight': 1},
            {'abbr': 'TN', 'full': 'Tennessee', 'sales_weight': 1},
            {'abbr': 'VT', 'full': 'Vermont', 'sales_weight': 1},
            {'abbr': 'WV', 'full': 'West Virginia', 'sales_weight': 1},
            {'abbr': 'WI', 'full': 'Wisconsin', 'sales_weight': 1},
            {'abbr': 'WY', 'full': 'Wyoming', 'sales_weight': 1},
        ], 'zip_generator': generate_plausible_zip},
        'Canada': {'country_code': 'CA', 'states': [
            {'abbr': 'ON', 'full': 'Ontario', 'sales_weight': 8},
            {'abbr': 'QC', 'full': 'Quebec', 'sales_weight': 6},
            {'abbr': 'BC', 'full': 'British Columbia', 'sales_weight': 5},
            {'abbr': 'AB', 'full': 'Alberta', 'sales_weight': 4},
            {'abbr': 'MB', 'full': 'Manitoba', 'sales_weight': 2},
            {'abbr': 'SK', 'full': 'Saskatchewan', 'sales_weight': 1},
            {'abbr': 'NB', 'full': 'New Brunswick', 'sales_weight': 1},
            {'abbr': 'NL', 'full': 'Newfoundland and Labrador', 'sales_weight': 1},
            {'abbr': 'NS', 'full': 'Nova Scotia', 'sales_weight': 1},
            {'abbr': 'PE', 'full': 'Prince Edward Island', 'sales_weight': 1},
            {'abbr': 'NT', 'full': 'Northwest Territories', 'sales_weight': 1},
            {'abbr': 'NU', 'full': 'Nunavut', 'sales_weight': 1},
            {'abbr': 'YT', 'full': 'Yukon', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Mexico': {'country_code': 'MX', 'states': [
            {'abbr': 'CDMX', 'full': 'Mexico City', 'sales_weight': 10},
            {'abbr': 'MEX', 'full': 'Mexico State', 'sales_weight': 7},
            {'abbr': 'JAL', 'full': 'Jalisco', 'sales_weight': 6},
            {'abbr': 'NLE', 'full': 'Nuevo León', 'sales_weight': 5},
            {'abbr': 'PUE', 'full': 'Puebla', 'sales_weight': 4},
            {'abbr': 'BC', 'full': 'Baja California', 'sales_weight': 3},
            {'abbr': 'VER', 'full': 'Veracruz', 'sales_weight': 3},
            {'abbr': 'MICH', 'full': 'Michoacán', 'sales_weight': 2},
            {'abbr': 'CHIS', 'full': 'Chiapas', 'sales_weight': 2},
            {'abbr': 'QR', 'full': 'Quintana Roo', 'sales_weight': 2},
            {'abbr': 'GTO', 'full': 'Guanajuato', 'sales_weight': 1},
            {'abbr': 'GRO', 'full': 'Guerrero', 'sales_weight': 1},
            {'abbr': 'BCS', 'full': 'Baja California Sur', 'sales_weight': 1},
            {'abbr': 'CHIH', 'full': 'Chihuahua', 'sales_weight': 1},
            {'abbr': 'SIN', 'full': 'Sinaloa', 'sales_weight': 1},
            {'abbr': 'SON', 'full': 'Sonora', 'sales_weight': 1},
            {'abbr': 'YUC', 'full': 'Yucatán', 'sales_weight': 1},
            {'abbr': 'AGS', 'full': 'Aguascalientes', 'sales_weight': 1},
            {'abbr': 'CAMP', 'full': 'Campeche', 'sales_weight': 1},
            {'abbr': 'COAH', 'full': 'Coahuila', 'sales_weight': 1},
            {'abbr': 'COL', 'full': 'Colima', 'sales_weight': 1},
            {'abbr': 'DUR', 'full': 'Durango', 'sales_weight': 1},
            {'abbr': 'HGO', 'full': 'Hidalgo', 'sales_weight': 1},
            {'abbr': 'MOR', 'full': 'Morelos', 'sales_weight': 1},
            {'abbr': 'NAY', 'full': 'Nayarit', 'sales_weight': 1},
            {'abbr': 'OAX', 'full': 'Oaxaca', 'sales_weight': 1},
            {'abbr': 'QRO', 'full': 'Querétaro', 'sales_weight': 1},
            {'abbr': 'SLP', 'full': 'San Luis Potosí', 'sales_weight': 1},
            {'abbr': 'TAB', 'full': 'Tabasco', 'sales_weight': 1},
            {'abbr': 'TAM', 'full': 'Tamaulipas', 'sales_weight': 1},
            {'abbr': 'TLAX', 'full': 'Tlaxcala', 'sales_weight': 1},
            {'abbr': 'ZAC', 'full': 'Zacatecas', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip}
    },
    'Europe': {
        'Germany': {'country_code': 'DE', 'states': [
            {'abbr': 'NW', 'full': 'North Rhine-Westphalia', 'sales_weight': 8},
            {'abbr': 'BY', 'full': 'Bavaria', 'sales_weight': 7},
            {'abbr': 'BW', 'full': 'Baden-Württemberg', 'sales_weight': 6},
            {'abbr': 'HE', 'full': 'Hesse', 'sales_weight': 5},
            {'abbr': 'NI', 'full': 'Lower Saxony', 'sales_weight': 4},
            {'abbr': 'BE', 'full': 'Berlin', 'sales_weight': 3},
            {'abbr': 'HH', 'full': 'Hamburg', 'sales_weight': 3},
            {'abbr': 'SL', 'full': 'Saarland', 'sales_weight': 2},
            {'abbr': 'HB', 'full': 'Bremen', 'sales_weight': 2},
            {'abbr': 'RP', 'full': 'Rhineland-Palatinate', 'sales_weight': 2},
            {'abbr': 'SH', 'full': 'Schleswig-Holstein', 'sales_weight': 2},
            {'abbr': 'SN', 'full': 'Saxony', 'sales_weight': 1},
            {'abbr': 'TH', 'full': 'Thuringia', 'sales_weight': 1},
            {'abbr': 'BB', 'full': 'Brandenburg', 'sales_weight': 1},
            {'abbr': 'MV', 'full': 'Mecklenburg-Vorpommern', 'sales_weight': 1},
            {'abbr': 'ST', 'full': 'Saxony-Anhalt', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'United Kingdom': {'country_code': 'GB', 'states': [
            {'abbr': 'ENG', 'full': 'England', 'sales_weight': 10},
            {'abbr': 'SCT', 'full': 'Scotland', 'sales_weight': 4},
            {'abbr': 'WLS', 'full': 'Wales', 'sales_weight': 2},
            {'abbr': 'NIR', 'full': 'Northern Ireland', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Norway': {'country_code': 'NO', 'states': [
            {'abbr': 'VL', 'full': 'Viken', 'sales_weight': 5},
            {'abbr': 'OS', 'full': 'Oslo', 'sales_weight': 4},
            {'abbr': 'TR', 'full': 'Trøndelag', 'sales_weight': 3},
            {'abbr': 'VEST', 'full': 'Vestland', 'sales_weight': 3},
            {'abbr': 'RO', 'full': 'Rogaland', 'sales_weight': 2},
            {'abbr': 'MR', 'full': 'Møre og Romsdal', 'sales_weight': 2},
            {'abbr': 'INN', 'full': 'Innlandet', 'sales_weight': 2},
            {'abbr': 'TROM', 'full': 'Troms og Finnmark', 'sales_weight': 1},
            {'abbr': 'VESTF', 'full': 'Vestfold og Telemark', 'sales_weight': 1},
            {'abbr': 'AGD', 'full': 'Agder', 'sales_weight': 1},
            {'abbr': 'NORDL', 'full': 'Nordland', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'France': {'country_code': 'FR', 'states': [
            {'abbr': 'IDF', 'full': 'Île-de-France', 'sales_weight': 10},
            {'abbr': 'NAQ', 'full': 'Nouvelle-Aquitaine', 'sales_weight': 5},
            {'abbr': 'ARA', 'full': 'Auvergne-Rhône-Alpes', 'sales_weight': 4},
            {'abbr': 'PACA', 'full': 'Provence-Alpes-Côte d\'Azur', 'sales_weight': 4},
            {'abbr': 'OCC', 'full': 'Occitanie', 'sales_weight': 3},
            {'abbr': 'HDF', 'full': 'Hauts-de-France', 'sales_weight': 3},
            {'abbr': 'GES', 'full': 'Grand Est', 'sales_weight': 2},
            {'abbr': 'NOR', 'full': 'Normandy', 'sales_weight': 2},
            {'abbr': 'BRE', 'full': 'Brittany', 'sales_weight': 2},
            {'abbr': 'PDL', 'full': 'Pays de la Loire', 'sales_weight': 2},
            {'abbr': 'BFC', 'full': 'Bourgogne-Franche-Comté', 'sales_weight': 1},
            {'abbr': 'CVL', 'full': 'Centre-Val de Loire', 'sales_weight': 1},
            {'abbr': 'COR', 'full': 'Corsica', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Netherlands': {'country_code': 'NL', 'states': [
            {'abbr': 'NH', 'full': 'North Holland', 'sales_weight': 7},
            {'abbr': 'ZH', 'full': 'South Holland', 'sales_weight': 6},
            {'abbr': 'UT', 'full': 'Utrecht', 'sales_weight': 4},
            {'abbr': 'GE', 'full': 'Gelderland', 'sales_weight': 3},
            {'abbr': 'NB', 'full': 'North Brabant', 'sales_weight': 3},
            {'abbr': 'GR', 'full': 'Groningen', 'sales_weight': 2},
            {'abbr': 'OV', 'full': 'Overijssel', 'sales_weight': 2},
            {'abbr': 'FR', 'full': 'Friesland', 'sales_weight': 1},
            {'abbr': 'FL', 'full': 'Flevoland', 'sales_weight': 1},
            {'abbr': 'LB', 'full': 'Limburg', 'sales_weight': 1},
            {'abbr': 'DR', 'full': 'Drenthe', 'sales_weight': 1},
            {'abbr': 'ZE', 'full': 'Zeeland', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Sweden': {'country_code': 'SE', 'states': [
            {'abbr': 'AB', 'full': 'Stockholm County', 'sales_weight': 10},
            {'abbr': 'O', 'full': 'Västra Götaland County', 'sales_weight': 5},
            {'abbr': 'M', 'full': 'Skåne County', 'sales_weight': 4},
            {'abbr': 'D', 'full': 'Södermanland County', 'sales_weight': 2},
            {'abbr': 'E', 'full': 'Östergötland County', 'sales_weight': 2},
            {'abbr': 'U', 'full': 'Västmanland County', 'sales_weight': 2},
            {'abbr': 'I', 'full': 'Gotland County', 'sales_weight': 1},
            {'abbr': 'AC', 'full': 'Västerbotten County', 'sales_weight': 1},
            {'abbr': 'BD', 'full': 'Norrbotten County', 'sales_weight': 1},
            {'abbr': 'C', 'full': 'Uppsala County', 'sales_weight': 1},
            {'abbr': 'F', 'full': 'Jönköping County', 'sales_weight': 1},
            {'abbr': 'G', 'full': 'Kronoberg County', 'sales_weight': 1},
            {'abbr': 'H', 'full': 'Kalmar County', 'sales_weight': 1},
            {'abbr': 'K', 'full': 'Blekinge County', 'sales_weight': 1},
            {'abbr': 'N', 'full': 'Halland County', 'sales_weight': 1},
            {'abbr': 'S', 'full': 'Värmland County', 'sales_weight': 1},
            {'abbr': 'T', 'full': 'Örebro County', 'sales_weight': 1},
            {'abbr': 'W', 'full': 'Dalarna County', 'sales_weight': 1},
            {'abbr': 'X', 'full': 'Gävleborg County', 'sales_weight': 1},
            {'abbr': 'Y', 'full': 'Västernorrland County', 'sales_weight': 1},
            {'abbr': 'Z', 'full': 'Jämtland County', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Switzerland': {'country_code': 'CH', 'states': [
            {'abbr': 'ZH', 'full': 'Zurich', 'sales_weight': 6},
            {'abbr': 'GE', 'full': 'Geneva', 'sales_weight': 4},
            {'abbr': 'VD', 'full': 'Vaud', 'sales_weight': 3},
            {'abbr': 'BE', 'full': 'Bern', 'sales_weight': 3},
            {'abbr': 'BS', 'full': 'Basel-Stadt', 'sales_weight': 2},
            {'abbr': 'LU', 'full': 'Lucerne', 'sales_weight': 2},
            {'abbr': 'AG', 'full': 'Aargau', 'sales_weight': 2},
            {'abbr': 'VS', 'full': 'Valais', 'sales_weight': 2},
            {'abbr': 'ZG', 'full': 'Zug', 'sales_weight': 1},
            {'abbr': 'TI', 'full': 'Ticino', 'sales_weight': 1},
            {'abbr': 'SG', 'full': 'St. Gallen', 'sales_weight': 1},
            {'abbr': 'BL', 'full': 'Basel-Landschaft', 'sales_weight': 1},
            {'abbr': 'TG', 'full': 'Thurgau', 'sales_weight': 1},
            {'abbr': 'SO', 'full': 'Solothurn', 'sales_weight': 1},
            {'abbr': 'SZ', 'full': 'Schwyz', 'sales_weight': 1},
            {'abbr': 'AR', 'full': 'Appenzell Ausserrhoden', 'sales_weight': 1},
            {'abbr': 'AI', 'full': 'Appenzell Innerrhoden', 'sales_weight': 1},
            {'abbr': 'GL', 'full': 'Glarus', 'sales_weight': 1},
            {'abbr': 'JU', 'full': 'Jura', 'sales_weight': 1},
            {'abbr': 'NE', 'full': 'Neuchâtel', 'sales_weight': 1},
            {'abbr': 'OW', 'full': 'Obwalden', 'sales_weight': 1},
            {'abbr': 'NW', 'full': 'Nidwalden', 'sales_weight': 1},
            {'abbr': 'SH', 'full': 'Schaffhausen', 'sales_weight': 1},
            {'abbr': 'GR', 'full': 'Graubünden', 'sales_weight': 1},
            {'abbr': 'UR', 'full': 'Uri', 'sales_weight': 1},
            {'abbr': 'FR', 'full': 'Fribourg', 'sales_weight': 1},
        ], 'zip_generator': generate_plausible_zip},
        'Italy': {'country_code': 'IT', 'states': [
            {'abbr': 'LOM', 'full': 'Lombardy', 'sales_weight': 10},
            {'abbr': 'VEN', 'full': 'Veneto', 'sales_weight': 5},
            {'abbr': 'LAZ', 'full': 'Lazio', 'sales_weight': 4},
            {'abbr': 'PIE', 'full': 'Piedmont', 'sales_weight': 4},
            {'abbr': 'TOS', 'full': 'Tuscany', 'sales_weight': 3},
            {'abbr': 'EMR', 'full': 'Emilia-Romagna', 'sales_weight': 3},
            {'abbr': 'CAM', 'full': 'Campania', 'sales_weight': 2},
            {'abbr': 'SIC', 'full': 'Sicily', 'sales_weight': 2},
            {'abbr': 'PUG', 'full': 'Apulia', 'sales_weight': 2},
            {'abbr': 'FVG', 'full': 'Friuli-Venezia Giulia', 'sales_weight': 2},
            {'abbr': 'SAR', 'full': 'Sardinia', 'sales_weight': 1},
            {'abbr': 'ABR', 'full': 'Abruzzo', 'sales_weight': 1},
            {'abbr': 'BAS', 'full': 'Basilicata', 'sales_weight': 1},
            {'abbr': 'CAL', 'full': 'Calabria', 'sales_weight': 1},
            {'abbr': 'LIG', 'full': 'Liguria', 'sales_weight': 1},
            {'abbr': 'MAR', 'full': 'Marche', 'sales_weight': 1},
            {'abbr': 'MOL', 'full': 'Molise', 'sales_weight': 1},
            {'abbr': 'TAA', 'full': 'Trentino-South Tyrol', 'sales_weight': 1},
            {'abbr': 'UMB', 'full': 'Umbria', 'sales_weight': 1},
            {'abbr': 'VAO', 'full': 'Aosta Valley', 'sales_weight': 1},
        ], 'zip_generator': generate_plausible_zip},
        'Spain': {'country_code': 'ES', 'states': [
            {'abbr': 'MD', 'full': 'Madrid', 'sales_weight': 10},
            {'abbr': 'CT', 'full': 'Catalonia', 'sales_weight': 8},
            {'abbr': 'AN', 'full': 'Andalusia', 'sales_weight': 6},
            {'abbr': 'VC', 'full': 'Valencian Community', 'sales_weight': 5},
            {'abbr': 'PV', 'full': 'Basque Country', 'sales_weight': 4},
            {'abbr': 'CL', 'full': 'Castile and León', 'sales_weight': 3},
            {'abbr': 'GA', 'full': 'Galicia', 'sales_weight': 3},
            {'abbr': 'AR', 'full': 'Aragon', 'sales_weight': 2},
            {'abbr': 'IB', 'full': 'Balearic Islands', 'sales_weight': 2},
            {'abbr': 'CN', 'full': 'Canary Islands', 'sales_weight': 2},
            {'abbr': 'AS', 'full': 'Asturias', 'sales_weight': 1},
            {'abbr': 'CB', 'full': 'Cantabria', 'sales_weight': 1},
            {'abbr': 'CM', 'full': 'Castile-La Mancha', 'sales_weight': 1},
            {'abbr': 'EX', 'full': 'Extremadura', 'sales_weight': 1},
            {'abbr': 'RI', 'full': 'La Rioja', 'sales_weight': 1},
            {'abbr': 'MC', 'full': 'Murcia', 'sales_weight': 1},
            {'abbr': 'NC', 'full': 'Navarre', 'sales_weight': 1},
            {'abbr': 'CE', 'full': 'Ceuta', 'sales_weight': 1},
            {'abbr': 'ML', 'full': 'Melilla', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Belgium': {'country_code': 'BE', 'states': [
            {'abbr': 'VLG', 'full': 'Flanders', 'sales_weight': 7},
            {'abbr': 'WAL', 'full': 'Wallonia', 'sales_weight': 4},
            {'abbr': 'BRU', 'full': 'Brussels-Capital Region', 'sales_weight': 2}
        ], 'zip_generator': generate_plausible_zip},
        'Austria': {'country_code': 'AT', 'states': [
            {'abbr': 'WIE', 'full': 'Vienna', 'sales_weight': 5},
            {'abbr': 'OÖ', 'full': 'Upper Austria', 'sales_weight': 3},
            {'abbr': 'NOE', 'full': 'Lower Austria', 'sales_weight': 2},
            {'abbr': 'TIR', 'full': 'Tyrol', 'sales_weight': 2},
            {'abbr': 'STE', 'full': 'Styria', 'sales_weight': 2},
            {'abbr': 'SAL', 'full': 'Salzburg', 'sales_weight': 1},
            {'abbr': 'KNT', 'full': 'Carinthia', 'sales_weight': 1},
            {'abbr': 'BGL', 'full': 'Burgenland', 'sales_weight': 1},
            {'abbr': 'VOE', 'full': 'Vorarlberg', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Denmark': {'country_code': 'DK', 'states': [
            {'abbr': 'H', 'full': 'Capital Region of Denmark', 'sales_weight': 6},
            {'abbr': 'M', 'full': 'Central Denmark Region', 'sales_weight': 3},
            {'abbr': 'S', 'full': 'Region of Southern Denmark', 'sales_weight': 2},
            {'abbr': 'SJ', 'full': 'Zealand Region', 'sales_weight': 2},
            {'abbr': 'ND', 'full': 'North Denmark Region', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Finland': {'country_code': 'FI', 'states': [
            {'abbr': 'US', 'full': 'Uusimaa', 'sales_weight': 6},
            {'abbr': 'PS', 'full': 'Northern Savonia', 'sales_weight': 2},
            {'abbr': 'LA', 'full': 'Lapland', 'sales_weight': 2},
            {'abbr': 'OL', 'full': 'North Ostrobothnia', 'sales_weight': 2},
            {'abbr': 'HA', 'full': 'Central Ostrobothnia', 'sales_weight': 1},
            {'abbr': 'KA', 'full': 'Kainuu', 'sales_weight': 1},
            {'abbr': 'KE', 'full': 'Central Finland', 'sales_weight': 1},
            {'abbr': 'KM', 'full': 'Kymenlaakso', 'sales_weight': 1},
            {'abbr': 'KU', 'full': 'Pirkanmaa', 'sales_weight': 1},
            {'abbr': 'PH', 'full': 'North Karelia', 'sales_weight': 1},
            {'abbr': 'PV', 'full': 'South Karelia', 'sales_weight': 1},
            {'abbr': 'SA', 'full': 'Satakunta', 'sales_weight': 1},
            {'abbr': 'SS', 'full': 'Southern Savonia', 'sales_weight': 1},
            {'abbr': 'TA', 'full': 'Tavastia Proper', 'sales_weight': 1},
            {'abbr': 'ES', 'full': 'Southern Ostrobothnia', 'sales_weight': 1},
            {'abbr': 'VA', 'full': 'Vaasa', 'sales_weight': 1},
            {'abbr': 'AS', 'full': 'Åland Islands', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Greece': {'country_code': 'GR', 'states': [
            {'abbr': 'AT', 'full': 'Attica', 'sales_weight': 7},
            {'abbr': 'MK', 'full': 'Central Macedonia', 'sales_weight': 3},
            {'abbr': 'CR', 'full': 'Crete', 'sales_weight': 2},
            {'abbr': 'EM', 'full': 'East Macedonia and Thrace', 'sales_weight': 1},
            {'abbr': 'EP', 'full': 'Epirus', 'sales_weight': 1},
            {'abbr': 'IO', 'full': 'Ionian Islands', 'sales_weight': 1},
            {'abbr': 'NA', 'full': 'North Aegean', 'sales_weight': 1},
            {'abbr': 'PC', 'full': 'Peloponnese', 'sales_weight': 1},
            {'abbr': 'SM', 'full': 'South Aegean', 'sales_weight': 1},
            {'abbr': 'TH', 'full': 'Thessaly', 'sales_weight': 1},
            {'abbr': 'WM', 'full': 'West Macedonia', 'sales_weight': 1},
            {'abbr': 'WG', 'full': 'West Greece', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Iceland': {'country_code': 'IS', 'states': [
            {'abbr': 'HO', 'full': 'Capital Region', 'sales_weight': 10},
            {'abbr': 'NV', 'full': 'South Region', 'sales_weight': 2},
            {'abbr': 'SU', 'full': 'Westfjords', 'sales_weight': 1},
            {'abbr': 'V', 'full': 'Western Region', 'sales_weight': 1},
            {'abbr': 'VE', 'full': 'Southern Peninsula', 'sales_weight': 1},
            {'abbr': 'AU', 'full': 'East Region', 'sales_weight': 1},
            {'abbr': 'NO', 'full': 'Northeastern Region', 'sales_weight': 1},
            {'abbr': 'W', 'full': 'Northwestern Region', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Ireland': {'country_code': 'IE', 'states': [
            {'abbr': 'D', 'full': 'Dublin', 'sales_weight': 8},
            {'abbr': 'CO', 'full': 'Cork', 'sales_weight': 4},
            {'abbr': 'G', 'full': 'Galway', 'sales_weight': 3},
            {'abbr': 'L', 'full': 'Limerick', 'sales_weight': 2},
            {'abbr': 'W', 'full': 'Waterford', 'sales_weight': 2},
            {'abbr': 'DL', 'full': 'Donegal', 'sales_weight': 1},
            {'abbr': 'KY', 'full': 'Kerry', 'sales_weight': 1},
            {'abbr': 'KE', 'full': 'Kildare', 'sales_weight': 1},
            {'abbr': 'KK', 'full': 'Kilkenny', 'sales_weight': 1},
            {'abbr': 'LS', 'full': 'Laois', 'sales_weight': 1},
            {'abbr': 'LD', 'full': 'Longford', 'sales_weight': 1},
            {'abbr': 'LH', 'full': 'Louth', 'sales_weight': 1},
            {'abbr': 'MO', 'full': 'Mayo', 'sales_weight': 1},
            {'abbr': 'MH', 'full': 'Meath', 'sales_weight': 1},
            {'abbr': 'MN', 'full': 'Monaghan', 'sales_weight': 1},
            {'abbr': 'OY', 'full': 'Offaly', 'sales_weight': 1},
            {'abbr': 'RN', 'full': 'Roscommon', 'sales_weight': 1},
            {'abbr': 'SO', 'full': 'Sligo', 'sales_weight': 1},
            {'abbr': 'TA', 'full': 'Tipperary', 'sales_weight': 1},
            {'abbr': 'WM', 'full': 'Westmeath', 'sales_weight': 1},
            {'abbr': 'WX', 'full': 'Wexford', 'sales_weight': 1},
            {'abbr': 'WW', 'full': 'Wicklow', 'sales_weight': 1},
            {'abbr': 'CW', 'full': 'Carlow', 'sales_weight': 1},
            {'abbr': 'CN', 'full': 'Cavan', 'sales_weight': 1},
            {'abbr': 'CE', 'full': 'Clare', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Poland': {'country_code': 'PL', 'states': [
            {'abbr': 'MZ', 'full': 'Masovian Voivodeship', 'sales_weight': 10},
            {'abbr': 'SL', 'full': 'Silesian Voivodeship', 'sales_weight': 7},
            {'abbr': 'MA', 'full': 'Lesser Poland Voivodeship', 'sales_weight': 5},
            {'abbr': 'WP', 'full': 'Greater Poland Voivodeship', 'sales_weight': 4},
            {'abbr': 'DS', 'full': 'Lower Silesian Voivodeship', 'sales_weight': 3},
            {'abbr': 'LD', 'full': 'Łódź Voivodeship', 'sales_weight': 3},
            {'abbr': 'PM', 'full': 'Pomeranian Voivodeship', 'sales_weight': 2},
            {'abbr': 'KP', 'full': 'Kuyavian-Pomeranian Voivodeship', 'sales_weight': 1},
            {'abbr': 'LU', 'full': 'Lublin Voivodeship', 'sales_weight': 1},
            {'abbr': 'LB', 'full': 'Lubusz Voivodeship', 'sales_weight': 1},
            {'abbr': 'OP', 'full': 'Opole Voivodeship', 'sales_weight': 1},
            {'abbr': 'PK', 'full': 'Subcarpathian Voivodeship', 'sales_weight': 1},
            {'abbr': 'PD', 'full': 'Podlaskie Voivodeship', 'sales_weight': 1},
            {'abbr': 'PM', 'full': 'Pomeranian Voivodeship', 'sales_weight': 1},
            {'abbr': 'WM', 'full': 'Warmian-Masurian Voivodeship', 'sales_weight': 1},
            {'abbr': 'ZP', 'full': 'West Pomeranian Voivodeship', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Portugal': {'country_code': 'PT', 'states': [
            {'abbr': 'LIS', 'full': 'Lisbon', 'sales_weight': 8},
            {'abbr': 'PO', 'full': 'Porto', 'sales_weight': 5},
            {'abbr': 'SET', 'full': 'Setúbal', 'sales_weight': 3},
            {'abbr': 'BRG', 'full': 'Braga', 'sales_weight': 2},
            {'abbr': 'AVE', 'full': 'Aveiro', 'sales_weight': 2},
            {'abbr': 'COI', 'full': 'Coimbra', 'sales_weight': 2},
            {'abbr': 'LEI', 'full': 'Leiria', 'sales_weight': 1},
            {'abbr': 'FAR', 'full': 'Faro', 'sales_weight': 1},
            {'abbr': 'VIL', 'full': 'Vila Real', 'sales_weight': 1},
            {'abbr': 'POR', 'full': 'Portalegre', 'sales_weight': 1},
            {'abbr': 'VC', 'full': 'Viana do Castelo', 'sales_weight': 1},
            {'abbr': 'EVO', 'full': 'Évora', 'sales_weight': 1},
            {'abbr': 'BEJ', 'full': 'Beja', 'sales_weight': 1},
            {'abbr': 'GUA', 'full': 'Guarda', 'sales_weight': 1},
            {'abbr': 'BRA', 'full': 'Bragança', 'sales_weight': 1},
            {'abbr': 'VSE', 'full': 'Viseu', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'South Korea': {'country_code': 'KR', 'states': [
            {'abbr': 'SO', 'full': 'Seoul', 'sales_weight': 10},
            {'abbr': 'GI', 'full': 'Gyeonggi', 'sales_weight': 8},
            {'abbr': 'BS', 'full': 'Busan', 'sales_weight': 5},
            {'abbr': 'DG', 'full': 'Daegu', 'sales_weight': 3},
            {'abbr': 'IC', 'full': 'Incheon', 'sales_weight': 3},
            {'abbr': 'GJ', 'full': 'Gwangju', 'sales_weight': 2},
            {'abbr': 'DJ', 'full': 'Daejeon', 'sales_weight': 2},
            {'abbr': 'US', 'full': 'Ulsan', 'sales_weight': 2},
            {'abbr': 'GW', 'full': 'Gangwon', 'sales_weight': 2},
            {'abbr': 'CB', 'full': 'North Chungcheong', 'sales_weight': 2},
            {'abbr': 'CN', 'full': 'South Chungcheong', 'sales_weight': 2},
            {'abbr': 'GB', 'full': 'North Gyeongsang', 'sales_weight': 2},
            {'abbr': 'GN', 'full': 'South Gyeongsang', 'sales_weight': 2},
            {'abbr': 'JB', 'full': 'North Jeolla', 'sales_weight': 1},
            {'abbr': 'JN', 'full': 'South Jeolla', 'sales_weight': 1},
            {'abbr': 'JJ', 'full': 'Jeju', 'sales_weight': 1},
            {'abbr': 'SA', 'full': 'Sejong', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Thailand': {'country_code': 'TH', 'states': [
            {'abbr': 'BKK', 'full': 'Bangkok', 'sales_weight': 10},
            {'abbr': 'CT', 'full': 'Central Thailand', 'sales_weight': 5},
            {'abbr': 'NT', 'full': 'Northern Thailand', 'sales_weight': 4},
            {'abbr': 'ET', 'full': 'Eastern Thailand', 'sales_weight': 3},
            {'abbr': 'ST', 'full': 'Southern Thailand', 'sales_weight': 3},
            {'abbr': 'IST', 'full': 'Isan (Northeastern Thailand)', 'sales_weight': 2},
        ], 'zip_generator': generate_plausible_zip}
    },
    'Asia Pacific': {
        'China': {'country_code': 'CN', 'states': [
            {'abbr': 'BJ', 'full': 'Beijing', 'sales_weight': 10},
            {'abbr': 'SH', 'full': 'Shanghai', 'sales_weight': 10},
            {'abbr': 'GD', 'full': 'Guangdong', 'sales_weight': 8},
            {'abbr': 'JS', 'full': 'Jiangsu', 'sales_weight': 7},
            {'abbr': 'ZJ', 'full': 'Zhejiang', 'sales_weight': 6},
            {'abbr': 'SC', 'full': 'Sichuan', 'sales_weight': 5},
            {'abbr': 'CQ', 'full': 'Chongqing', 'sales_weight': 4},
            {'abbr': 'FJ', 'full': 'Fujian', 'sales_weight': 3},
            {'abbr': 'HB', 'full': 'Hubei', 'sales_weight': 3},
            {'abbr': 'SD', 'full': 'Shandong', 'sales_weight': 3},
            {'abbr': 'AH', 'full': 'Anhui', 'sales_weight': 2},
            {'abbr': 'HN', 'full': 'Hunan', 'sales_weight': 2},
            {'abbr': 'HA', 'full': 'Henan', 'sales_weight': 2},
            {'abbr': 'HE', 'full': 'Hebei', 'sales_weight': 2},
            {'abbr': 'LN', 'full': 'Liaoning', 'sales_weight': 2},
            {'abbr': 'JL', 'full': 'Jilin', 'sales_weight': 1},
            {'abbr': 'HL', 'full': 'Heilongjiang', 'sales_weight': 1},
            {'abbr': 'SN', 'full': 'Shaanxi', 'sales_weight': 1},
            {'abbr': 'SX', 'full': 'Shanxi', 'sales_weight': 1},
            {'abbr': 'JX', 'full': 'Jiangxi', 'sales_weight': 1},
            {'abbr': 'GS', 'full': 'Gansu', 'sales_weight': 1},
            {'abbr': 'QH', 'full': 'Qinghai', 'sales_weight': 1},
            {'abbr': 'NX', 'full': 'Ningxia', 'sales_weight': 1},
            {'abbr': 'XJ', 'full': 'Xinjiang', 'sales_weight': 1},
            {'abbr': 'NM', 'full': 'Inner Mongolia', 'sales_weight': 1},
            {'abbr': 'XZ', 'full': 'Tibet', 'sales_weight': 1},
            {'abbr': 'GZ', 'full': 'Guizhou', 'sales_weight': 1},
            {'abbr': 'YN', 'full': 'Yunnan', 'sales_weight': 1},
            {'abbr': 'GX', 'full': 'Guangxi', 'sales_weight': 1},
            {'abbr': 'HI', 'full': 'Hainan', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Japan': {'country_code': 'JP', 'states': [
            {'abbr': 'TKY', 'full': 'Tokyo', 'sales_weight': 10},
            {'abbr': 'OSK', 'full': 'Osaka', 'sales_weight': 8},
            {'abbr': 'AIC', 'full': 'Aichi', 'sales_weight': 7},
            {'abbr': 'KAN', 'full': 'Kanagawa', 'sales_weight': 6},
            {'abbr': 'SAI', 'full': 'Saitama', 'sales_weight': 5},
            {'abbr': 'CHI', 'full': 'Chiba', 'sales_weight': 4},
            {'abbr': 'FKO', 'full': 'Fukuoka', 'sales_weight': 3},
            {'abbr': 'HYO', 'full': 'Hyogo', 'sales_weight': 3},
            {'abbr': 'HKD', 'full': 'Hokkaido', 'sales_weight': 2},
            {'abbr': 'KGO', 'full': 'Kyoto', 'sales_weight': 2},
            {'abbr': 'MIE', 'full': 'Mie', 'sales_weight': 1},
            {'abbr': 'SHI', 'full': 'Shizuoka', 'sales_weight': 1},
            {'abbr': 'IBR', 'full': 'Ibaraki', 'sales_weight': 1},
            {'abbr': 'GUM', 'full': 'Gunma', 'sales_weight': 1},
            {'abbr': 'YAM', 'full': 'Yamanashi', 'sales_weight': 1},
            {'abbr': 'NAG', 'full': 'Nagano', 'sales_weight': 1},
            {'abbr': 'TOY', 'full': 'Toyama', 'sales_weight': 1},
            {'abbr': 'ISH', 'full': 'Ishikawa', 'sales_weight': 1},
            {'abbr': 'FUK', 'full': 'Fukui', 'sales_weight': 1},
            {'abbr': 'GIF', 'full': 'Gifu', 'sales_weight': 1},
            {'abbr': 'SHG', 'full': 'Shiga', 'sales_weight': 1},
            {'abbr': 'WAK', 'full': 'Wakayama', 'sales_weight': 1},
            {'abbr': 'TTO', 'full': 'Tottori', 'sales_weight': 1},
            {'abbr': 'SHM', 'full': 'Shimane', 'sales_weight': 1},
            {'abbr': 'OKA', 'full': 'Okayama', 'sales_weight': 1},
            {'abbr': 'HIR', 'full': 'Hiroshima', 'sales_weight': 1},
            {'abbr': 'YMG', 'full': 'Yamaguchi', 'sales_weight': 1},
            {'abbr': 'TOK', 'full': 'Tokushima', 'sales_weight': 1},
            {'abbr': 'KAG', 'full': 'Kagawa', 'sales_weight': 1},
            {'abbr': 'EHI', 'full': 'Ehime', 'sales_weight': 1},
            {'abbr': 'KOC', 'full': 'Kochi', 'sales_weight': 1},
            {'abbr': 'SAG', 'full': 'Saga', 'sales_weight': 1},
            {'abbr': 'NSG', 'full': 'Nagasaki', 'sales_weight': 1},
            {'abbr': 'KUM', 'full': 'Kumamoto', 'sales_weight': 1},
            {'abbr': 'OIT', 'full': 'Oita', 'sales_weight': 1},
            {'abbr': 'MIY', 'full': 'Miyazaki', 'sales_weight': 1},
            {'abbr': 'KGO', 'full': 'Kagoshima', 'sales_weight': 1},
            {'abbr': 'OKI', 'full': 'Okinawa', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Australia': {'country_code': 'AU', 'states': [
            {'abbr': 'VIC', 'full': 'Victoria', 'sales_weight': 8},
            {'abbr': 'NSW', 'full': 'New South Wales', 'sales_weight': 10},
            {'abbr': 'QLD', 'full': 'Queensland', 'sales_weight': 6},
            {'abbr': 'WA', 'full': 'Western Australia', 'sales_weight': 4},
            {'abbr': 'SA', 'full': 'South Australia', 'sales_weight': 2},
            {'abbr': 'TAS', 'full': 'Tasmania', 'sales_weight': 1},
            {'abbr': 'ACT', 'full': 'Australian Capital Territory', 'sales_weight': 1},
            {'abbr': 'NT', 'full': 'Northern Territory', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'New Zealand': {'country_code': 'NZ', 'states': [
            {'abbr': 'AKL', 'full': 'Auckland', 'sales_weight': 10},
            {'abbr': 'WLG', 'full': 'Wellington', 'sales_weight': 5},
            {'abbr': 'CAN', 'full': 'Canterbury', 'sales_weight': 4},
            {'abbr': 'WKO', 'full': 'Waikato', 'sales_weight': 3},
            {'abbr': 'BOP', 'full': 'Bay of Plenty', 'sales_weight': 2},
            {'abbr': 'MWT', 'full': 'Manawatū-Whanganui', 'sales_weight': 2},
            {'abbr': 'HKB', 'full': 'Hawke\'s Bay', 'sales_weight': 1},
            {'abbr': 'NTL', 'full': 'Northland', 'sales_weight': 1},
            {'abbr': 'OTA', 'full': 'Otago', 'sales_weight': 1},
            {'abbr': 'TKI', 'full': 'Taranaki', 'sales_weight': 1},
            {'abbr': 'TAS', 'full': 'Tasman', 'sales_weight': 1},
            {'abbr': 'NSN', 'full': 'Nelson', 'sales_weight': 1},
            {'abbr': 'GIS', 'full': 'Gisborne', 'sales_weight': 1},
            {'abbr': 'WTC', 'full': 'West Coast', 'sales_weight': 1},
            {'abbr': 'STL', 'full': 'Southland', 'sales_weight': 1},
            {'abbr': 'MBH', 'full': 'Marlborough', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'Taiwan': {'country_code': 'TW', 'states': [
            {'abbr': 'TP', 'full': 'Taipei City', 'sales_weight': 10},
            {'abbr': 'KC', 'full': 'Kaohsiung City', 'sales_weight': 7},
            {'abbr': 'NTC', 'full': 'New Taipei City', 'sales_weight': 8},
            {'abbr': 'TXC', 'full': 'Taichung City', 'sales_weight': 6},
            {'abbr': 'TYC', 'full': 'Taoyuan City', 'sales_weight': 5},
            {'abbr': 'TNC', 'full': 'Tainan City', 'sales_weight': 4},
            {'abbr': 'HSZ', 'full': 'Hsinchu City', 'sales_weight': 3},
            {'abbr': 'KEE', 'full': 'Keelung City', 'sales_weight': 2},
            {'abbr': 'HSC', 'full': 'Hsinchu County', 'sales_weight': 2},
            {'abbr': 'CHA', 'full': 'Changhua County', 'sales_weight': 2},
            {'abbr': 'YUN', 'full': 'Yunlin County', 'sales_weight': 1},
            {'abbr': 'JIA', 'full': 'Chiayi City', 'sales_weight': 1},
            {'abbr': 'JIC', 'full': 'Chiayi County', 'sales_weight': 1},
            {'abbr': 'NAN', 'full': 'Nantou County', 'sales_weight': 1},
            {'abbr': 'MIA', 'full': 'Miaoli County', 'sales_weight': 1},
            {'abbr': 'PIF', 'full': 'Pingtung County', 'sales_weight': 1},
            {'abbr': 'ILA', 'full': 'Yilan County', 'sales_weight': 1},
            {'abbr': 'HUA', 'full': 'Hualien County', 'sales_weight': 1},
            {'abbr': 'TAI', 'full': 'Taitung County', 'sales_weight': 1},
            {'abbr': 'PEN', 'full': 'Penghu County', 'sales_weight': 1},
            {'abbr': 'KIN', 'full': 'Kinmen County', 'sales_weight': 1},
            {'abbr': 'LIE', 'full': 'Lienchiang County', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip},
        'United Arab Emirates': {'country_code': 'AE', 'states': [
            {'abbr': 'AD', 'full': 'Abu Dhabi', 'sales_weight': 7},
            {'abbr': 'DU', 'full': 'Dubai', 'sales_weight': 10},
            {'abbr': 'SH', 'full': 'Sharjah', 'sales_weight': 3},
            {'abbr': 'AJ', 'full': 'Ajman', 'sales_weight': 1},
            {'abbr': 'UMQ', 'full': 'Umm Al-Quwain', 'sales_weight': 1},
            {'abbr': 'RAK', 'full': 'Ras Al-Khaimah', 'sales_weight': 1},
            {'abbr': 'FUJ', 'full': 'Fujairah', 'sales_weight': 1}
        ], 'zip_generator': generate_plausible_zip}
    }
}


# --------------------------
# 定义产品信息
# --------------------------
def generate_dim_product():
    """Generates and returns the Dim_Product DataFrame."""
    products = [
        {'Model_ID': 101, 'Model_Name': 'Model S Plaid', 'Body_Style': 'Sedan', 'Is_EV': True},
        {'Model_ID': 102, 'Model_Name': 'Model S', 'Body_Style': 'Sedan', 'Is_EV': True},
        {'Model_ID': 201, 'Model_Name': 'Model X Plaid', 'Body_Style': 'SUV', 'Is_EV': True},
        {'Model_ID': 202, 'Model_Name': 'Model X', 'Body_Style': 'SUV', 'Is_EV': True},
        {'Model_ID': 301, 'Model_Name': 'Model 3 Performance', 'Body_Style': 'Sedan', 'Is_EV': True},
        {'Model_ID': 302, 'Model_Name': 'Model 3 Long Range', 'Body_Style': 'Sedan', 'Is_EV': True},
        {'Model_ID': 303, 'Model_Name': 'Model 3 Rear-Wheel Drive', 'Body_Style': 'Sedan', 'Is_EV': True},
        {'Model_ID': 401, 'Model_Name': 'Model Y Performance', 'Body_Style': 'SUV', 'Is_EV': True},
        {'Model_ID': 402, 'Model_Name': 'Model Y Long Range', 'Body_Style': 'SUV', 'Is_EV': True},
        {'Model_ID': 403, 'Model_Name': 'Model Y Rear-Wheel Drive', 'Body_Style': 'SUV', 'Is_EV': True},
        {'Model_ID': 501, 'Model_Name': 'Cybertruck Tri-Motor AWD', 'Body_Style': 'Truck', 'Is_EV': True},
        {'Model_ID': 502, 'Model_Name': 'Cybertruck Dual-Motor AWD', 'Body_Style': 'Truck', 'Is_EV': True},
        {'Model_ID': 503, 'Model_Name': 'Cybertruck Rear-Wheel Drive', 'Body_Style': 'Truck', 'Is_EV': True},
        {'Model_ID': 601, 'Model_Name': 'Roadster', 'Body_Style': 'Sports Car', 'Is_EV': True},
        {'Model_ID': 701, 'Model_Name': 'Tesla Semi', 'Body_Style': 'Truck', 'Is_EV': True},
    ]
    return cudf.DataFrame(products)

# --------------------------
# 定义时间维度表
# --------------------------
def generate_dim_time(start_year=2021, end_year=2025):
    """Generates and returns the Dim_Time DataFrame."""
    dates = cudf.to_datetime(cudf.date_range(start=f'{start_year}-01-01', end=f'{end_year}-12-31', freq='D'))
    
    dim_time_df = cudf.DataFrame()
    dim_time_df['Full_Date'] = dates.strftime('%Y-%m-%d')
    dim_time_df['Date_ID'] = (dates - datetime.datetime(1970, 1, 1)).days
    dim_time_df['Day_of_Month'] = dates.day
    dim_time_df['Month_ID'] = dates.month
    dim_time_df['Month_Name'] = dates.strftime('%B')
    dim_time_df['Quarter_ID'] = dates.quarter
    dim_time_df['Year_ID'] = dates.year
    dim_time_df['Day_of_Week'] = dates.dayofweek
    dim_time_df['Day_Name'] = dates.strftime('%A')
    
    return dim_time_df

# --------------------------
# 定义客户维度表
# --------------------------
def generate_dim_customer(num_customers=500000):
    """Generates and returns the Dim_Customer DataFrame."""
    np.random.seed(RANDOM_SEED)
    
    customer_ids = cp.arange(1, num_customers + 1)
    
    first_names = ["John", "Jane", "Michael", "Emily", "Chris", "Sarah", "David", "Jessica"]
    last_names = ["Smith", "Johnson", "Williams", "Jones", "Brown", "Davis", "Miller", "Wilson"]
    
    # 修复了这里的代码
    first_names_indices = cp.random.randint(0, len(first_names), size=num_customers)
    last_names_indices = cp.random.randint(0, len(last_names), size=num_customers)
    
    first_names_arr = cudf.Series([first_names[i.item()] for i in first_names_indices])
    last_names_arr = cudf.Series([last_names[i.item()] for i in last_names_indices])
    
    full_names = first_names_arr + " " + last_names_arr
    
    emails = cudf.Series([f"customer{i}@email.com" for i in customer_ids])
    
    dim_customer_df = cudf.DataFrame({
        'Customer_ID': customer_ids,
        'Full_Name': full_names,
        'Email': emails,
    })
    
    return dim_customer_df

# --------------------------
# 定义地理位置、销售和价格维度表
# --------------------------
def generate_sales_and_geography(dim_product_df, dim_time_df, dim_customer_df):
    """Generates Fact_Sales, Dim_Geography, and Dim_Prices DataFrames."""
    all_geographies = []
    
    # 原始脚本的地理位置生成逻辑
    geo_id_counter = 1
    for continent, countries in tesla_countries.items():
        for country, data in countries.items():
            for state in data['states']:
                abbr = state['abbr']
                full = state['full']
                zip_code = data['zip_generator'](country, abbr)
                all_geographies.append({
                    'Geo_ID': geo_id_counter,
                    'State_Abbr': abbr,
                    'State_Name': full,
                    'Country_Name': country,
                    'Country_Code': data['country_code'],
                    'Continent': continent,
                    'Zip_Code': zip_code
                })
                geo_id_counter += 1
    
    dim_geography_df = cudf.DataFrame(all_geographies)

    # 生成 Fact_Sales 表
    num_sales_records = 1000000
    np.random.seed(RANDOM_SEED)

    # 使用cupy生成随机数据
    customer_ids = cp.random.choice(dim_customer_df['Customer_ID'].values, num_sales_records)
    product_ids = cp.random.choice(dim_product_df['Model_ID'].values, num_sales_records)
    
    # 获取所有有效的 Date_ID
    time_ids = dim_time_df['Date_ID'].values
    date_ids = cp.random.choice(time_ids, num_sales_records)
    
    # 根据销售权重分配地理位置
    geo_weights = [s['sales_weight'] for c in tesla_countries.values() for country_data in c.values() for s in country_data['states']]
    geo_population = dim_geography_df['Geo_ID'].values
    geo_ids = cp.random.choice(geo_population, num_sales_records, p=cp.array(geo_weights) / sum(geo_weights))

    # 生成价格
    base_price = 45000.0  # 基础价格
    discount_factor = 0.95 # 5% 折扣
    prices = cp.random.normal(loc=base_price, scale=base_price * 0.1, size=num_sales_records)
    final_prices = cp.clip(prices, base_price * 0.5, base_price * 1.5).astype(cp.float32)
    discounted_prices = final_prices * discount_factor
    
    fact_sales_data = {
        'Sales_ID': cp.arange(1, num_sales_records + 1),
        'Customer_ID': customer_ids,
        'Date_ID': date_ids,
        'Geo_ID': geo_ids,
        'Model_ID': product_ids,
        'Sale_Price_USD': final_prices,
        'Discounted_Price_USD': discounted_prices,
    }
    fact_sales_df = cudf.DataFrame(fact_sales_data)
    
    # 生成 Dim_Prices
    dim_prices_data = []
    
    # 原始脚本中的价格维度表生成逻辑
    product_prices = {
        'Model S Plaid': {'standard': 105000, 'discounted': 100000},
        'Model S': {'standard': 95000, 'discounted': 90000},
        'Model X Plaid': {'standard': 120000, 'discounted': 115000},
        'Model X': {'standard': 110000, 'discounted': 105000},
        'Model 3 Performance': {'standard': 55000, 'discounted': 52000},
        'Model 3 Long Range': {'standard': 48000, 'discounted': 45000},
        'Model 3 Rear-Wheel Drive': {'standard': 40000, 'discounted': 38000},
        'Model Y Performance': {'standard': 60000, 'discounted': 57000},
        'Model Y Long Range': {'standard': 53000, 'discounted': 50000},
        'Model Y Rear-Wheel Drive': {'standard': 45000, 'discounted': 42000},
        'Cybertruck Tri-Motor AWD': {'standard': 80000, 'discounted': 75000},
        'Cybertruck Dual-Motor AWD': {'standard': 60000, 'discounted': 58000},
        'Cybertruck Rear-Wheel Drive': {'standard': 50000, 'discounted': 48000},
        'Roadster': {'standard': 200000, 'discounted': 190000},
        'Tesla Semi': {'standard': 180000, 'discounted': 175000},
    }

    # 使用 for 循环逐一生成价格记录，确保与原始脚本格式一致
    for model_name, prices in product_prices.items():
        model_id = dim_product_df[dim_product_df['Model_Name'] == model_name]['Model_ID'].iloc[0]
        quarter_start_date = datetime.date(2021, 1, 1)
        while quarter_start_date.year <= 2025:
            quarter_start_date_str = quarter_start_date.strftime('%Y-%m-%d')
            dim_prices_data.append({
                'Model_ID': model_id,
                'Quarter_Start_Date': quarter_start_date_str,
                'Standard_Price_USD': prices['standard'],
                'Discounted_Price_USD': prices['discounted']
            })
            # 移动到下一个季度
            if quarter_start_date.month == 10:
                quarter_start_date = datetime.date(quarter_start_date.year + 1, 1, 1)
            else:
                quarter_start_date = datetime.date(quarter_start_date.year, quarter_start_date.month + 3, 1)
    
    dim_prices_df = cudf.DataFrame(dim_prices_data)

    return fact_sales_df, dim_geography_df, dim_prices_df

# 主函数，执行所有生成任务并保存文件
def main():
    start_time = time.time() # 开始计时
    
    print("正在生成数据表...")

    dim_product_df = generate_dim_product()
    dim_time_df = generate_dim_time()
    dim_customer_df = generate_dim_customer()

    fact_sales_df, dim_geography_df, dim_prices_df = generate_sales_and_geography(dim_product_df, dim_time_df, dim_customer_df)

    # 确保保存路径存在
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # 保存到CSV
    print("保存 Dim_Product.csv...")
    dim_product_df.to_csv(os.path.join(output_dir, 'Dim_Product.csv'), index=False, encoding='utf-8')
    print("保存 Dim_Time.csv...")
    dim_time_df.to_csv(os.path.join(output_dir, 'Dim_Time.csv'), index=False, encoding='utf-8')
    print("保存 Dim_Customer.csv...")
    dim_customer_df.to_csv(os.path.join(output_dir, 'Dim_Customer.csv'), index=False, encoding='utf-8')
    print("保存 Dim_Geography.csv...")
    dim_geography_df.to_csv(os.path.join(output_dir, 'Dim_Geography.csv'), index=False, encoding='utf-8')
    print("保存 Dim_Prices.csv...")
    dim_prices_df.to_csv(os.path.join(output_dir, 'Dim_Prices.csv'), index=False, encoding='utf-8')
    print("保存 Fact_Sales.csv...")
    fact_sales_df.to_csv(os.path.join(output_dir, 'Fact_Sales.csv'), index=False, encoding='utf-8')
    
    end_time = time.time() # 结束计时
    total_time = end_time - start_time
    
    print("所有数据表已生成并保存到 'output_data' 文件夹。")
    print(f"代码总执行时间: {total_time:.2f} 秒") # 打印总时间

if __name__ == "__main__":
    main()

正在生成数据表...
保存 Dim_Product.csv...
保存 Dim_Time.csv...
保存 Dim_Customer.csv...
保存 Dim_Geography.csv...
保存 Dim_Prices.csv...
保存 Fact_Sales.csv...
所有数据表已生成并保存到 'output_data' 文件夹。
代码总执行时间: 62.93 秒


**2. GPU加速数据生成 v2**

In [25]:
# -*- coding: utf-8 -*-
"""Tesla Simulated Sales Data Generator (GPU Version)

This script generates mock sales data for Tesla in a star schema, using cuDF
for GPU acceleration while strictly following the data format and breadth of
the original CPU Pandas script.
"""
# 请确保已安装 cuDF 和 pandas 库。
# 如果是在 Google Colab 上，可以使用以下命令：
# !pip install -U cudf-cu12 --extra-index-url=https://pypi.nvidia.com
# !pip install pandas

import cudf
import cupy as cp
import numpy as np
import random
import datetime
import os
import copy
import time
import pandas as pd
from cudf.core.dataframe import DataFrame as CuDFDataFrame

# Set a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# --------------------------
# Helper functions
# --------------------------

def generate_plausible_zip(country, state_province_abbr):
    """Generates a plausible zip code based on the country and state/province."""
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'

    if country == 'United States':
        if state_province_abbr.startswith(('C', 'I')):
            return f"9{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith(('T', 'L')):
            return f"7{random.randint(5,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('F'):
            return f"3{random.randint(2,4)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('N'):
            return f"1{random.randint(0,4)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('W'):
            return f"98{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('A'):
            return f"85{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('G'):
            return f"30{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        if state_province_abbr.startswith('P'):
            return f"15{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
        return f"{random.randint(10000, 99999)}"
    elif country == 'Canada':
        province_codes = {
            'ON': ['K','L','M','N','P'], 'QC': ['G','H','J'], 'BC': ['V'], 'AB': ['T'],
            'SK': ['S'], 'MB': ['R'], 'NB': ['E'], 'NS': ['B'], 'NL': ['A'], 'PE': ['C'],
            'YT': ['Y'], 'NT': ['X'], 'NU': ['X']
        }
        first_letter = random.choice(province_codes.get(state_province_abbr, ['A']))
        return f"{first_letter}{random.choice(digits)}{random.choice(letters)} {random.choice(digits)}{random.choice(letters)}{random.choice(digits)}"
    elif country == 'Mexico':
        return f"{random.randint(1000, 99999):05d}"
    elif country == 'United Kingdom':
        outward = f"{random.choice(letters)}{random.choice(letters)}{random.randint(1,9)}"
        inward = f"{random.choice(digits)}{random.choice(letters)}{random.choice(letters)}"
        return f"{outward} {inward}"
    elif country in ['China', 'Taiwan']:
        if country == 'China':
            if state_province_abbr.startswith('B'):
                return f"10{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
            if state_province_abbr.startswith('S'):
                return f"20{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
            if state_province_abbr.startswith('G'):
                return f"51{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}{random.randint(0,9)}"
            return f"{random.randint(10000, 99999)}"
        else:
            return f"{random.randint(100, 999)}"
    elif country == 'Germany':
        return f"{random.randint(10000, 99999)}"
    elif country == 'Japan':
        return f"{random.randint(100, 999)}-{random.randint(1000, 9999)}"
    elif country == 'Australia':
        return f"{random.randint(1000, 9999)}"
    elif country == 'New Zealand':
        nz_ranges = {
            'AKL': (600, 2699), 'NTL': (100, 1099), 'WKO': (3200, 3799), 'BOP': (3000, 3199),
            'GIS': (4010, 4199), 'HKB': (4100, 4299), 'MWT': (4400, 4699), 'MBH': (7200, 7299),
            'NSN': (7010, 7099), 'OTA': (9000, 9799), 'STL': (9800, 9899), 'TKI': (4300, 4399),
            'TAS': (7100, 7199), 'WLG': (5010, 5799), 'WTC': (7800, 7999), 'CAN': (7000, 7999),
        }
        rng = nz_ranges.get(state_province_abbr)
        if rng:
            low, high = rng
            val = random.randint(low, high)
            return f"{val:04d}"
        else:
            return f"{random.randint(1000, 99999):05d}"
    elif country in ['France', 'Italy']:
        return f"{random.randint(10000, 99999)}"
    elif country == 'Spain':
        return f"{random.randint(10000, 52999)}"
    elif country == 'South Korea':
        return f"{random.randint(10000, 99999)}"
    elif country == 'Thailand':
        return f"{random.randint(10000, 99999)}"
    else:
        return f"{random.randint(10000, 99999)}"

# --------------------------
# Data generation functions for each dimension table
# --------------------------

def generate_dim_product():
    """
    Generates the Dim_Product table.
    
    Fields: Model_ID, Model_Name, Model_Category, Model_Base_Price_USD, Model_Launch_Date
    """
    products = {
        'Model 3': {'category': 'Sedan', 'base_price': 40240, 'launch_date': '2017-07-28'},
        'Model Y': {'category': 'SUV', 'base_price': 43990, 'launch_date': '2020-03-13'},
        'Model S': {'category': 'Sedan', 'base_price': 74990, 'launch_date': '2012-06-22'},
        'Model X': {'category': 'SUV', 'base_price': 79990, 'launch_date': '2015-09-29'},
        'Cybertruck': {'category': 'Truck', 'base_price': 60990, 'launch_date': '2023-11-30'},
        'Roadster': {'category': 'Sport Car', 'base_price': 200000, 'launch_date': '2008-02-19'},
        'Semi': {'category': 'Truck', 'base_price': 150000, 'launch_date': '2022-12-01'}
    }
    product_data = []
    model_id = 1000
    for name, details in products.items():
        product_data.append({
            'Model_ID': model_id,
            'Model_Name': name,
            'Model_Category': details['category'],
            'Model_Base_Price_USD': details['base_price'],
            'Model_Launch_Date': details['launch_date']
        })
        model_id += 1

    # Use pandas to handle the mixed data types, then convert to cuDF
    dim_product_pd = pd.DataFrame(product_data)
    dim_product_pd['Model_Launch_Date'] = pd.to_datetime(dim_product_pd['Model_Launch_Date'])
    
    return cudf.from_pandas(dim_product_pd)


def generate_dim_time(start_date, end_date):
    """
    Generates the Dim_Time table.
    
    Fields: Time_ID, Full_Date, Year, Quarter, Month, Day, Week_of_Year, Day_of_Week, Day_Name
    """
    date_range = [start_date + datetime.timedelta(days=x) for x in range(0, (end_date-start_date).days + 1)]
    time_data = []
    time_id = 0
    for d in date_range:
        time_data.append({
            'Time_ID': time_id,
            'Full_Date': d,
            'Year': d.year,
            'Quarter': (d.month - 1) // 3 + 1,
            'Month': d.month,
            'Day': d.day,
            'Week_of_Year': d.isocalendar()[1],
            'Day_of_Week': d.weekday() + 1,
            'Day_Name': d.strftime('%A')
        })
        time_id += 1

    dim_time_pd = pd.DataFrame(time_data)
    # This is the fix: convert the 'Full_Date' column to a proper datetime type
    dim_time_pd['Full_Date'] = pd.to_datetime(dim_time_pd['Full_Date'])

    return cudf.from_pandas(dim_time_pd)


def generate_dim_customer(num_customers=50000):
    """
    Generates the Dim_Customer table.
    
    Fields: Customer_ID, Customer_Name, Gender, Age_Group, Income_Level
    """
    genders = ['Male', 'Female', 'Unknown']
    age_groups = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']
    income_levels = ['Low', 'Medium', 'High']
    customer_data = []
    
    customer_ids = cp.arange(10000, 10000 + num_customers)
    gender_choices = cudf.Series(genders)
    age_group_choices = cudf.Series(age_groups)
    income_level_choices = cudf.Series(income_levels)
    
    customer_data_dict = {
        'Customer_ID': customer_ids,
        'Customer_Name': ['Customer_' + str(i) for i in customer_ids.tolist()],
        'Gender': gender_choices.sample(n=num_customers, replace=True).reset_index(drop=True),
        'Age_Group': age_group_choices.sample(n=num_customers, replace=True).reset_index(drop=True),
        'Income_Level': income_level_choices.sample(n=num_customers, replace=True).reset_index(drop=True)
    }

    return cudf.DataFrame(customer_data_dict)


def generate_dim_geography():
    """
    Generates the Dim_Geography table based on key Tesla sales regions.
    
    Fields: Geo_ID, Continent, Country, Country_Code, State_Province, State_Province_Abbr, Zip_Code
    """
    tesla_countries = {
        'North America': {
            'United States': {'country_code': 'US', 'states': [
                {'abbr': 'CA', 'full': 'California', 'sales_weight': 10},
                {'abbr': 'TX', 'full': 'Texas', 'sales_weight': 9},
                {'abbr': 'FL', 'full': 'Florida', 'sales_weight': 8},
                {'abbr': 'NY', 'full': 'New York', 'sales_weight': 7},
                {'abbr': 'WA', 'full': 'Washington', 'sales_weight': 6},
                {'abbr': 'IL', 'full': 'Illinois', 'sales_weight': 5},
                {'abbr': 'MA', 'full': 'Massachusetts', 'sales_weight': 5},
                {'abbr': 'NJ', 'full': 'New Jersey', 'sales_weight': 5},
                {'abbr': 'NC', 'full': 'North Carolina', 'sales_weight': 4},
                {'abbr': 'GA', 'full': 'Georgia', 'sales_weight': 4},
                {'abbr': 'PA', 'full': 'Pennsylvania', 'sales_weight': 4},
                {'abbr': 'CO', 'full': 'Colorado', 'sales_weight': 4},
                {'abbr': 'AZ', 'full': 'Arizona', 'sales_weight': 4},
                {'abbr': 'OH', 'full': 'Ohio', 'sales_weight': 3},
                {'abbr': 'MI', 'full': 'Michigan', 'sales_weight': 3},
                {'abbr': 'VA', 'full': 'Virginia', 'sales_weight': 3},
                {'abbr': 'MD', 'full': 'Maryland', 'sales_weight': 3},
                {'abbr': 'OR', 'full': 'Oregon', 'sales_weight': 3},
                {'abbr': 'NV', 'full': 'Nevada', 'sales_weight': 2},
                {'abbr': 'MN', 'full': 'Minnesota', 'sales_weight': 2},
                {'abbr': 'UT', 'full': 'Utah', 'sales_weight': 2},
                {'abbr': 'DC', 'full': 'District of Columbia', 'sales_weight': 2},
                {'abbr': 'AL', 'full': 'Alabama', 'sales_weight': 1},
                {'abbr': 'AK', 'full': 'Alaska', 'sales_weight': 1},
                {'abbr': 'AR', 'full': 'Arkansas', 'sales_weight': 1},
                {'abbr': 'CT', 'full': 'Connecticut', 'sales_weight': 1},
                {'abbr': 'DE', 'full': 'Delaware', 'sales_weight': 1},
                {'abbr': 'HI', 'full': 'Hawaii', 'sales_weight': 1},
                {'abbr': 'ID', 'full': 'Idaho', 'sales_weight': 1},
                {'abbr': 'IN', 'full': 'Indiana', 'sales_weight': 1},
                {'abbr': 'IA', 'full': 'Iowa', 'sales_weight': 1},
                {'abbr': 'KS', 'full': 'Kansas', 'sales_weight': 1},
                {'abbr': 'KY', 'full': 'Kentucky', 'sales_weight': 1},
                {'abbr': 'LA', 'full': 'Louisiana', 'sales_weight': 1},
                {'abbr': 'ME', 'full': 'Maine', 'sales_weight': 1},
                {'abbr': 'MS', 'full': 'Mississippi', 'sales_weight': 1},
                {'abbr': 'MO', 'full': 'Missouri', 'sales_weight': 1},
                {'abbr': 'MT', 'full': 'Montana', 'sales_weight': 1},
                {'abbr': 'NE', 'full': 'Nebraska', 'sales_weight': 1},
                {'abbr': 'NH', 'full': 'New Hampshire', 'sales_weight': 1},
                {'abbr': 'NM', 'full': 'New Mexico', 'sales_weight': 1},
                {'abbr': 'ND', 'full': 'North Dakota', 'sales_weight': 1},
                {'abbr': 'OK', 'full': 'Oklahoma', 'sales_weight': 1},
                {'abbr': 'RI', 'full': 'Rhode Island', 'sales_weight': 1},
                {'abbr': 'SC', 'full': 'South Carolina', 'sales_weight': 1},
                {'abbr': 'SD', 'full': 'South Dakota', 'sales_weight': 1},
                {'abbr': 'TN', 'full': 'Tennessee', 'sales_weight': 1},
                {'abbr': 'VT', 'full': 'Vermont', 'sales_weight': 1},
                {'abbr': 'WV', 'full': 'West Virginia', 'sales_weight': 1},
                {'abbr': 'WI', 'full': 'Wisconsin', 'sales_weight': 1},
                {'abbr': 'WY', 'full': 'Wyoming', 'sales_weight': 1},
            ], 'zip_generator': generate_plausible_zip},
            'Canada': {'country_code': 'CA', 'states': [
                {'abbr': 'ON', 'full': 'Ontario', 'sales_weight': 8},
                {'abbr': 'QC', 'full': 'Quebec', 'sales_weight': 6},
                {'abbr': 'BC', 'full': 'British Columbia', 'sales_weight': 5},
                {'abbr': 'AB', 'full': 'Alberta', 'sales_weight': 4},
                {'abbr': 'MB', 'full': 'Manitoba', 'sales_weight': 2},
                {'abbr': 'SK', 'full': 'Saskatchewan', 'sales_weight': 1},
                {'abbr': 'NB', 'full': 'New Brunswick', 'sales_weight': 1},
                {'abbr': 'NL', 'full': 'Newfoundland and Labrador', 'sales_weight': 1},
                {'abbr': 'NS', 'full': 'Nova Scotia', 'sales_weight': 1},
                {'abbr': 'PE', 'full': 'Prince Edward Island', 'sales_weight': 1},
                {'abbr': 'NT', 'full': 'Northwest Territories', 'sales_weight': 1},
                {'abbr': 'NU', 'full': 'Nunavut', 'sales_weight': 1},
                {'abbr': 'YT', 'full': 'Yukon', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'Mexico': {'country_code': 'MX', 'states': [
                {'abbr': 'CDMX', 'full': 'Mexico City', 'sales_weight': 10},
                {'abbr': 'MEX', 'full': 'Mexico State', 'sales_weight': 7},
                {'abbr': 'JAL', 'full': 'Jalisco', 'sales_weight': 6},
                {'abbr': 'NLE', 'full': 'Nuevo León', 'sales_weight': 5},
                {'abbr': 'PUE', 'full': 'Puebla', 'sales_weight': 4},
                {'abbr': 'BC', 'full': 'Baja California', 'sales_weight': 3},
                {'abbr': 'VER', 'full': 'Veracruz', 'sales_weight': 3},
                {'abbr': 'MICH', 'full': 'Michoacán', 'sales_weight': 2},
                {'abbr': 'CHIS', 'full': 'Chiapas', 'sales_weight': 2},
                {'abbr': 'QR', 'full': 'Quintana Roo', 'sales_weight': 2},
                {'abbr': 'GTO', 'full': 'Guanajuato', 'sales_weight': 1},
                {'abbr': 'GRO', 'full': 'Guerrero', 'sales_weight': 1},
                {'abbr': 'BCS', 'full': 'Baja California Sur', 'sales_weight': 1},
                {'abbr': 'CHIH', 'full': 'Chihuahua', 'sales_weight': 1},
                {'abbr': 'SIN', 'full': 'Sinaloa', 'sales_weight': 1},
                {'abbr': 'SON', 'full': 'Sonora', 'sales_weight': 1},
                {'abbr': 'YUC', 'full': 'Yucatán', 'sales_weight': 1},
                {'abbr': 'AGS', 'full': 'Aguascalientes', 'sales_weight': 1},
                {'abbr': 'CAMP', 'full': 'Campeche', 'sales_weight': 1},
                {'abbr': 'COAH', 'full': 'Coahuila', 'sales_weight': 1},
                {'abbr': 'COL', 'full': 'Colima', 'sales_weight': 1},
                {'abbr': 'DUR', 'full': 'Durango', 'sales_weight': 1},
                {'abbr': 'HGO', 'full': 'Hidalgo', 'sales_weight': 1},
                {'abbr': 'MOR', 'full': 'Morelos', 'sales_weight': 1},
                {'abbr': 'NAY', 'full': 'Nayarit', 'sales_weight': 1},
                {'abbr': 'OAX', 'full': 'Oaxaca', 'sales_weight': 1},
                {'abbr': 'QRO', 'full': 'Querétaro', 'sales_weight': 1},
                {'abbr': 'SLP', 'full': 'San Luis Potosí', 'sales_weight': 1},
                {'abbr': 'TAB', 'full': 'Tabasco', 'sales_weight': 1},
                {'abbr': 'TAM', 'full': 'Tamaulipas', 'sales_weight': 1},
                {'abbr': 'TLAX', 'full': 'Tlaxcala', 'sales_weight': 1},
                {'abbr': 'ZAC', 'full': 'Zacatecas', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip}
        },
        'Europe': {
            'Germany': {'country_code': 'DE', 'states': [
                {'abbr': 'NW', 'full': 'North Rhine-Westphalia', 'sales_weight': 8},
                {'abbr': 'BY', 'full': 'Bavaria', 'sales_weight': 7},
                {'abbr': 'BW', 'full': 'Baden-Württemberg', 'sales_weight': 6},
                {'abbr': 'HE', 'full': 'Hesse', 'sales_weight': 5},
                {'abbr': 'NI', 'full': 'Lower Saxony', 'sales_weight': 4},
                {'abbr': 'BE', 'full': 'Berlin', 'sales_weight': 3},
                {'abbr': 'HH', 'full': 'Hamburg', 'sales_weight': 3},
                {'abbr': 'SL', 'full': 'Saarland', 'sales_weight': 2},
                {'abbr': 'HB', 'full': 'Bremen', 'sales_weight': 2},
                {'abbr': 'RP', 'full': 'Rhineland-Palatinate', 'sales_weight': 2},
                {'abbr': 'SH', 'full': 'Schleswig-Holstein', 'sales_weight': 2},
                {'abbr': 'SN', 'full': 'Saxony', 'sales_weight': 1},
                {'abbr': 'TH', 'full': 'Thuringia', 'sales_weight': 1},
                {'abbr': 'BB', 'full': 'Brandenburg', 'sales_weight': 1},
                {'abbr': 'MV', 'full': 'Mecklenburg-Vorpommern', 'sales_weight': 1},
                {'abbr': 'ST', 'full': 'Saxony-Anhalt', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'United Kingdom': {'country_code': 'GB', 'states': [
                {'abbr': 'ENG', 'full': 'England', 'sales_weight': 10},
                {'abbr': 'SCT', 'full': 'Scotland', 'sales_weight': 4},
                {'abbr': 'WLS', 'full': 'Wales', 'sales_weight': 2},
                {'abbr': 'NIR', 'full': 'Northern Ireland', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'Norway': {'country_code': 'NO', 'states': [
                {'abbr': 'VL', 'full': 'Viken', 'sales_weight': 5},
                {'abbr': 'OS', 'full': 'Oslo', 'sales_weight': 4},
                {'abbr': 'TR', 'full': 'Trøndelag', 'sales_weight': 3},
                {'abbr': 'VEST', 'full': 'Vestland', 'sales_weight': 3},
                {'abbr': 'RO', 'full': 'Rogaland', 'sales_weight': 2},
                {'abbr': 'MR', 'full': 'Møre og Romsdal', 'sales_weight': 2},
                {'abbr': 'INN', 'full': 'Innlandet', 'sales_weight': 2},
                {'abbr': 'TROM', 'full': 'Troms og Finnmark', 'sales_weight': 1},
                {'abbr': 'VESTF', 'full': 'Vestfold og Telemark', 'sales_weight': 1},
                {'abbr': 'AGD', 'full': 'Agder', 'sales_weight': 1},
                {'abbr': 'NORDL', 'full': 'Nordland', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'France': {'country_code': 'FR', 'states': [
                {'abbr': 'IDF', 'full': 'Île-de-France', 'sales_weight': 10},
                {'abbr': 'NAQ', 'full': 'Nouvelle-Aquitaine', 'sales_weight': 5},
                {'abbr': 'ARA', 'full': 'Auvergne-Rhône-Alpes', 'sales_weight': 4},
                {'abbr': 'HDF', 'full': 'Hauts-de-France', 'sales_weight': 3},
                {'abbr': 'OCC', 'full': 'Occitanie', 'sales_weight': 3},
                {'abbr': 'PAC', 'full': 'Provence-Alpes-Côte d\'Azur', 'sales_weight': 3},
                {'abbr': 'BRE', 'full': 'Brittany', 'sales_weight': 2},
                {'abbr': 'CVL', 'full': 'Centre-Val de Loire', 'sales_weight': 2},
                {'abbr': 'PDL', 'full': 'Pays de la Loire', 'sales_weight': 2},
                {'abbr': 'GE', 'full': 'Grand Est', 'sales_weight': 2},
                {'abbr': 'NOR', 'full': 'Normandy', 'sales_weight': 1},
                {'abbr': 'BFC', 'full': 'Bourgogne-Franche-Comté', 'sales_weight': 1},
                {'abbr': 'COR', 'full': 'Corsica', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'Netherlands': {'country_code': 'NL', 'states': [
                {'abbr': 'NH', 'full': 'North Holland', 'sales_weight': 8},
                {'abbr': 'ZH', 'full': 'South Holland', 'sales_weight': 7},
                {'abbr': 'UT', 'full': 'Utrecht', 'sales_weight': 6},
                {'abbr': 'GD', 'full': 'Gelderland', 'sales_weight': 5},
                {'abbr': 'NB', 'full': 'North Brabant', 'sales_weight': 4},
                {'abbr': 'LB', 'full': 'Limburg', 'sales_weight': 3},
                {'abbr': 'GR', 'full': 'Groningen', 'sales_weight': 2},
                {'abbr': 'OV', 'full': 'Overijssel', 'sales_weight': 2},
                {'abbr': 'FR', 'full': 'Friesland', 'sales_weight': 1},
                {'abbr': 'DR', 'full': 'Drenthe', 'sales_weight': 1},
                {'abbr': 'ZE', 'full': 'Zeeland', 'sales_weight': 1},
                {'abbr': 'FL', 'full': 'Flevoland', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'Sweden': {'country_code': 'SE', 'states': [
                {'abbr': 'Stockholm', 'full': 'Stockholm County', 'sales_weight': 10},
                {'abbr': 'Västra Götaland', 'full': 'Västra Götaland County', 'sales_weight': 6},
                {'abbr': 'Skåne', 'full': 'Skåne County', 'sales_weight': 5},
                {'abbr': 'Uppsala', 'full': 'Uppsala County', 'sales_weight': 3},
                {'abbr': 'Östergötland', 'full': 'Östergötland County', 'sales_weight': 2},
                {'abbr': 'Jönköping', 'full': 'Jönköping County', 'sales_weight': 2},
                {'abbr': 'Halland', 'full': 'Halland County', 'sales_weight': 1},
                {'abbr': 'Västmanland', 'full': 'Västmanland County', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'Switzerland': {'country_code': 'CH', 'states': [
                {'abbr': 'ZH', 'full': 'Zurich', 'sales_weight': 10},
                {'abbr': 'VD', 'full': 'Vaud', 'sales_weight': 5},
                {'abbr': 'GE', 'full': 'Geneva', 'sales_weight': 4},
                {'abbr': 'BE', 'full': 'Bern', 'sales_weight': 3},
                {'abbr': 'TI', 'full': 'Ticino', 'sales_weight': 2},
                {'abbr': 'AG', 'full': 'Aargau', 'sales_weight': 2},
                {'abbr': 'BS', 'full': 'Basel-Stadt', 'sales_weight': 1},
                {'abbr': 'LU', 'full': 'Lucerne', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'Italy': {'country_code': 'IT', 'states': [
                {'abbr': 'LOM', 'full': 'Lombardy', 'sales_weight': 10},
                {'abbr': 'LAZ', 'full': 'Lazio', 'sales_weight': 7},
                {'abbr': 'VEN', 'full': 'Veneto', 'sales_weight': 6},
                {'abbr': 'PIE', 'full': 'Piedmont', 'sales_weight': 5},
                {'abbr': 'EMI', 'full': 'Emilia-Romagna', 'sales_weight': 4},
                {'abbr': 'CAM', 'full': 'Campania', 'sales_weight': 3},
                {'abbr': 'TOS', 'full': 'Tuscany', 'sales_weight': 3},
                {'abbr': 'SIC', 'full': 'Sicily', 'sales_weight': 2},
                {'abbr': 'PUG', 'full': 'Apulia', 'sales_weight': 2},
                {'abbr': 'SAR', 'full': 'Sardinia', 'sales_weight': 1},
                {'abbr': 'CAL', 'full': 'Calabria', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'Spain': {'country_code': 'ES', 'states': [
                {'abbr': 'MD', 'full': 'Madrid', 'sales_weight': 10},
                {'abbr': 'CT', 'full': 'Catalonia', 'sales_weight': 8},
                {'abbr': 'AN', 'full': 'Andalusia', 'sales_weight': 6},
                {'abbr': 'VC', 'full': 'Valencian Community', 'sales_weight': 5},
                {'abbr': 'GA', 'full': 'Galicia', 'sales_weight': 3},
                {'abbr': 'PV', 'full': 'Basque Country', 'sales_weight': 3},
                {'abbr': 'CL', 'full': 'Castile and León', 'sales_weight': 2},
                {'abbr': 'AS', 'full': 'Asturias', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip}
        },
        'Asia': {
            'China': {'country_code': 'CN', 'states': [
                {'abbr': 'BJ', 'full': 'Beijing', 'sales_weight': 10},
                {'abbr': 'SH', 'full': 'Shanghai', 'sales_weight': 9},
                {'abbr': 'GD', 'full': 'Guangdong', 'sales_weight': 8},
                {'abbr': 'JS', 'full': 'Jiangsu', 'sales_weight': 7},
                {'abbr': 'ZJ', 'full': 'Zhejiang', 'sales_weight': 6},
                {'abbr': 'SC', 'full': 'Sichuan', 'sales_weight': 5},
                {'abbr': 'HB', 'full': 'Hubei', 'sales_weight': 4},
                {'abbr': 'FJ', 'full': 'Fujian', 'sales_weight': 3},
                {'abbr': 'SD', 'full': 'Shandong', 'sales_weight': 2},
                {'abbr': 'LN', 'full': 'Liaoning', 'sales_weight': 2}
            ], 'zip_generator': generate_plausible_zip},
            'Japan': {'country_code': 'JP', 'states': [
                {'abbr': 'TKY', 'full': 'Tokyo', 'sales_weight': 10},
                {'abbr': 'KAN', 'full': 'Kanagawa', 'sales_weight': 7},
                {'abbr': 'OSA', 'full': 'Osaka', 'sales_weight': 6},
                {'abbr': 'AIC', 'full': 'Aichi', 'sales_weight': 5},
                {'abbr': 'SAI', 'full': 'Saitama', 'sales_weight': 4},
                {'abbr': 'CHB', 'full': 'Chiba', 'sales_weight': 3},
                {'abbr': 'HKD', 'full': 'Hokkaido', 'sales_weight': 2},
                {'abbr': 'FKO', 'full': 'Fukuoka', 'sales_weight': 2}
            ], 'zip_generator': generate_plausible_zip},
            'South Korea': {'country_code': 'KR', 'states': [
                {'abbr': 'Seoul', 'full': 'Seoul', 'sales_weight': 10},
                {'abbr': 'Busan', 'full': 'Busan', 'sales_weight': 5},
                {'abbr': 'Incheon', 'full': 'Incheon', 'sales_weight': 4},
                {'abbr': 'Daegu', 'full': 'Daegu', 'sales_weight': 3},
                {'abbr': 'Daejeon', 'full': 'Daejeon', 'sales_weight': 2},
                {'abbr': 'Ulsan', 'full': 'Ulsan', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'Taiwan': {'country_code': 'TW', 'states': [
                {'abbr': 'TPE', 'full': 'Taipei', 'sales_weight': 10},
                {'abbr': 'TCH', 'full': 'Taichung', 'sales_weight': 7},
                {'abbr': 'KHH', 'full': 'Kaohsiung', 'sales_weight': 6},
                {'abbr': 'TNY', 'full': 'Tainan', 'sales_weight': 5},
                {'abbr': 'TXG', 'full': 'Taoyuan', 'sales_weight': 4},
                {'abbr': 'HSZ', 'full': 'Hsinchu', 'sales_weight': 3}
            ], 'zip_generator': generate_plausible_zip}
        },
        'Oceania': {
            'Australia': {'country_code': 'AU', 'states': [
                {'abbr': 'NSW', 'full': 'New South Wales', 'sales_weight': 10},
                {'abbr': 'VIC', 'full': 'Victoria', 'sales_weight': 8},
                {'abbr': 'QLD', 'full': 'Queensland', 'sales_weight': 6},
                {'abbr': 'WA', 'full': 'Western Australia', 'sales_weight': 4},
                {'abbr': 'SA', 'full': 'South Australia', 'sales_weight': 2},
                {'abbr': 'TAS', 'full': 'Tasmania', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip},
            'New Zealand': {'country_code': 'NZ', 'states': [
                {'abbr': 'AKL', 'full': 'Auckland', 'sales_weight': 10},
                {'abbr': 'WLG', 'full': 'Wellington', 'sales_weight': 5},
                {'abbr': 'CAN', 'full': 'Canterbury', 'sales_weight': 4},
                {'abbr': 'WKO', 'full': 'Waikato', 'sales_weight': 3},
                {'abbr': 'BOP', 'full': 'Bay of Plenty', 'sales_weight': 2}
            ], 'zip_generator': generate_plausible_zip}
        },
        'South America': {
            'Brazil': {'country_code': 'BR', 'states': [
                {'abbr': 'SP', 'full': 'São Paulo', 'sales_weight': 10},
                {'abbr': 'RJ', 'full': 'Rio de Janeiro', 'sales_weight': 7},
                {'abbr': 'MG', 'full': 'Minas Gerais', 'sales_weight': 5},
                {'abbr': 'RS', 'full': 'Rio Grande do Sul', 'sales_weight': 4},
                {'abbr': 'PR', 'full': 'Paraná', 'sales_weight': 3},
                {'abbr': 'SC', 'full': 'Santa Catarina', 'sales_weight': 2},
                {'abbr': 'DF', 'full': 'Federal District', 'sales_weight': 2},
                {'abbr': 'BA', 'full': 'Bahia', 'sales_weight': 1}
            ], 'zip_generator': generate_plausible_zip}
        }
    }
    geography_data = []
    geo_id = 100000
    for continent, countries in tesla_countries.items():
        for country, details in countries.items():
            for state in details['states']:
                zip_code = details['zip_generator'](country, state['abbr'])
                geography_data.append({
                    'Geo_ID': geo_id,
                    'Continent': continent,
                    'Country': country,
                    'Country_Code': details['country_code'],
                    'State_Province': state['full'],
                    'State_Province_Abbr': state['abbr'],
                    'Zip_Code': zip_code,
                    'Sales_Weight': state['sales_weight'] # Used for sampling, not part of the final table
                })
                geo_id += 1
    
    # We will return the full DataFrame, then drop the 'Sales_Weight' column
    geo_df_full = cudf.DataFrame(geography_data)
    return geo_df_full.drop(columns=['Sales_Weight'])


def generate_dim_prices(dim_product_df, dim_time_df):
    """
    Generates the Dim_Prices table.
    
    Fields: Model_ID, Quarter_Start_Date, Standard_Price_USD, Discounted_Price_USD
    """
    price_data = []
    
    # Get all unique model IDs and the start of each quarter
    model_ids = dim_product_df['Model_ID'].unique().to_pandas().tolist()
    
    time_df_pd = dim_time_df.to_pandas()
    quarter_start_dates = time_df_pd[time_df_pd['Day'] == 1].drop_duplicates(subset=['Year', 'Quarter'])['Full_Date'].tolist()
    
    for model_id in model_ids:
        # Fetch the base price from the product table
        base_price_series = dim_product_df[dim_product_df['Model_ID'] == model_id]['Model_Base_Price_USD']
        if base_price_series.empty:
            continue
        base_price = base_price_series.iloc[0]
        
        for q_date in quarter_start_dates:
            # Simulate price changes and discounts over time
            standard_price = int(base_price * (1 + random.uniform(-0.02, 0.05)))
            discounted_price = int(standard_price * (1 - random.uniform(0.05, 0.2)))
            
            price_data.append({
                'Model_ID': model_id,
                'Quarter_Start_Date': q_date,
                'Standard_Price_USD': standard_price,
                'Discounted_Price_USD': discounted_price
            })
            
    dim_prices_pd = pd.DataFrame(price_data)
    # This is the fix: convert the 'Quarter_Start_Date' column to a proper datetime type
    dim_prices_pd['Quarter_Start_Date'] = pd.to_datetime(dim_prices_pd['Quarter_Start_Date'])

    return cudf.from_pandas(dim_prices_pd)


def generate_fact_sales(dim_time_df, dim_geography_df, dim_product_df, dim_customer_df, dim_prices_df, num_records=2000000):
    """
    Generates the Fact_Sales table.
    
    Fields: Time_ID, Geo_ID, Model_ID, Customer_ID, Sales_Units, Is_Discounted_Sale, Revenue_USD
    
    This function leverages cuDF for large-scale data generation on the GPU.
    """
    
    print(f"Generating {num_records} sales records...")

    # Get the raw data from dimension tables and convert to pandas for sampling
    all_time_ids = dim_time_df['Time_ID'].to_pandas().tolist()
    
    dim_geo_pd = dim_geography_df.to_pandas()
    
    # Geographical sales distribution weights
    geo_weights_dict = {
        'United States': 10, 'Canada': 8, 'Mexico': 5,
        'Germany': 10, 'United Kingdom': 8, 'Norway': 10, 'France': 8, 'Netherlands': 7, 'Sweden': 6, 'Switzerland': 5, 'Italy': 4, 'Spain': 3,
        'China': 15, 'Japan': 7, 'South Korea': 5, 'Taiwan': 4,
        'Australia': 5, 'New Zealand': 2,
        'Brazil': 2
    }
    
    geo_ids_list = dim_geo_pd['Geo_ID'].tolist()
    geo_weights = [geo_weights_dict.get(row['Country'], 1) for _, row in dim_geo_pd.iterrows()]
    
    all_model_ids = dim_product_df['Model_ID'].to_pandas().tolist()
    all_customer_ids = dim_customer_df['Customer_ID'].to_pandas().tolist()

    # Generate random data on the GPU using cuPY
    cp.random.seed(RANDOM_SEED)
    time_id_indices = cp.random.randint(0, len(all_time_ids), size=num_records)
    geo_id_indices = cp.random.choice(len(geo_ids_list), size=num_records, p=cp.array(geo_weights) / sum(geo_weights))
    model_id_indices = cp.random.randint(0, len(all_model_ids), size=num_records)
    customer_id_indices = cp.random.randint(0, len(all_customer_ids), size=num_records)
    
    # Create cuDF Series from the randomly generated indices
    time_ids = cudf.Series(cp.array(all_time_ids)[time_id_indices])
    geo_ids = cudf.Series(cp.array(geo_ids_list)[geo_id_indices])
    model_ids = cudf.Series(cp.array(all_model_ids)[model_id_indices])
    customer_ids = cudf.Series(cp.array(all_customer_ids)[customer_id_indices])
    
    # Generate other columns with randomness
    sales_units = cudf.Series(cp.random.randint(1, 4, size=num_records))
    is_discounted_sale = cudf.Series(cp.random.choice([True, False], size=num_records, p=[0.2, 0.8]))

    # Combine into a temporary cuDF DataFrame
    fact_sales_temp = cudf.DataFrame({
        'Time_ID': time_ids,
        'Geo_ID': geo_ids,
        'Model_ID': model_ids,
        'Customer_ID': customer_ids,
        'Sales_Units': sales_units,
        'Is_Discounted_Sale': is_discounted_sale
    })
    
    # Join with prices table to get revenue.
    dim_time_pd = dim_time_df.to_pandas()
    dim_prices_pd = dim_prices_df.to_pandas()
    
    # Convert temporary cuDF DataFrame to pandas for the join logic
    fact_sales_temp_pd = fact_sales_temp.to_pandas()
    
    # Join with Dim_Time to get the quarter for each sale
    fact_sales_temp_pd = fact_sales_temp_pd.merge(
        dim_time_pd[['Time_ID', 'Full_Date']], on='Time_ID'
    )
    
    # Create a quarter-based key for joining with Dim_Prices
    fact_sales_temp_pd['Quarter_Key'] = pd.to_datetime(fact_sales_temp_pd['Full_Date']).dt.to_period('Q')
    dim_prices_pd['Quarter_Key'] = dim_prices_pd['Quarter_Start_Date'].dt.to_period('Q')
    
    # Join to get the prices
    fact_sales_temp_pd = fact_sales_temp_pd.merge(
        dim_prices_pd[['Model_ID', 'Quarter_Key', 'Standard_Price_USD', 'Discounted_Price_USD']],
        on=['Model_ID', 'Quarter_Key'],
        how='left'
    )
    
    # Calculate revenue based on price type
    fact_sales_temp_pd['Revenue_USD'] = fact_sales_temp_pd.apply(
        lambda row: row['Sales_Units'] * (row['Discounted_Price_USD'] if row['Is_Discounted_Sale'] else row['Standard_Price_USD']),
        axis=1
    )
    
    # Drop intermediate columns
    fact_sales_temp_pd = fact_sales_temp_pd.drop(
        columns=['Full_Date', 'Quarter_Key', 'Standard_Price_USD', 'Discounted_Price_USD']
    )
    
    return cudf.from_pandas(fact_sales_temp_pd)


def main():
    """Main function to generate and save all data tables."""
    start_time = time.time()
    
    print("Generating dimension tables...")
    
    # Generate dimension tables
    dim_product_df = generate_dim_product()
    dim_time_df = generate_dim_time(datetime.date(2018, 1, 1), datetime.date(2024, 12, 31))
    dim_customer_df = generate_dim_customer(num_customers=50000)
    dim_geography_df = generate_dim_geography()
    dim_prices_df = generate_dim_prices(dim_product_df, dim_time_df)
    
    print("Generating fact table...")
    
    # Generate fact table, capping at 2 million records
    fact_sales_df = generate_fact_sales(
        dim_time_df,
        dim_geography_df,
        dim_product_df,
        dim_customer_df,
        dim_prices_df,
        num_records=2000000
    )
    
    # Ensure output directory exists
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Save to CSV
    print("Saving Dim_Product.csv...")
    dim_product_df.to_csv(os.path.join(output_dir, 'Dim_Product.csv'), index=False, encoding='utf-8')
    print("Saving Dim_Time.csv...")
    dim_time_df.to_csv(os.path.join(output_dir, 'Dim_Time.csv'), index=False, encoding='utf-8')
    print("Saving Dim_Customer.csv...")
    dim_customer_df.to_csv(os.path.join(output_dir, 'Dim_Customer.csv'), index=False, encoding='utf-8')
    print("Saving Dim_Geography.csv...")
    dim_geography_df.to_csv(os.path.join(output_dir, 'Dim_Geography.csv'), index=False, encoding='utf-8')
    print("Saving Dim_Prices.csv...")
    dim_prices_df.to_csv(os.path.join(output_dir, 'Dim_Prices.csv'), index=False, encoding='utf-8')
    print("Saving Fact_Sales.csv...")
    fact_sales_df.to_csv(os.path.join(output_dir, 'Fact_Sales.csv'), index=False, encoding='utf-8')

    end_time = time.time()
    print(f"\nData generation completed in {end_time - start_time:.2f} seconds.")
    print(f"Total rows in Fact_Sales: {len(fact_sales_df)}")


if __name__ == '__main__':
    main()


Generating dimension tables...
Generating fact table...
Generating 2000000 sales records...
Saving Dim_Product.csv...
Saving Dim_Time.csv...
Saving Dim_Customer.csv...
Saving Dim_Geography.csv...
Saving Dim_Prices.csv...
Saving Fact_Sales.csv...

Data generation completed in 16.25 seconds.
Total rows in Fact_Sales: 2000000


***3. GPU代码生成数据 v3**

**1/6: 生成 Dim_Product 表 (简单静态数据)维度表更可能需要更新Update或追加Append。例如，新产品发布，此时就需要更新Update或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）**

In [61]:
# -*- coding: utf-8 -*-
"""Generate Dim_Product Table (CPU Version)"""

import pandas as pd
import os
import time

def generate_dim_product():
    """
    Generates a Dim_Product table with specific, fixed data to match the provided file.
    """
    data = {
        'Model_ID': [1, 2, 3, 4, 5],
        'Model_Name': ['Model 3', 'Model Y', 'Model S', 'Model X', 'Cybertruck'],
        'Model_Category': ['Sedan', 'SUV', 'Sedan', 'SUV', 'Truck'],
        'Model_Base_Price_USD': [46500.0, 55000.0, 82500.0, 95000.0, 70000.0],
        'Model_Launch_Date': ['2017-07-28', '2020-03-13', '2012-06-22', '2015-09-29', '2023-11-30']
    }
    
    dim_product_df = pd.DataFrame(data)
    
    # 确保日期列的数据类型为 datetime
    dim_product_df['Model_Launch_Date'] = pd.to_datetime(dim_product_df['Model_Launch_Date'])
    
    return dim_product_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("正在生成 Dim_Product 表...")
    dim_product_df = generate_dim_product()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("保存 Dim_Product.csv...")
    # 在保存前将日期列格式化为 YYYY-MM-DD 字符串，以确保 Power BI 兼容性
    dim_product_df['Model_Launch_Date'] = dim_product_df['Model_Launch_Date'].dt.strftime('%Y-%m-%d')
    dim_product_df.to_csv(os.path.join(output_dir, 'Dim_Product.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Product.csv 已成功生成，耗时 {end_time - start_time:.2f} 秒。")

正在生成 Dim_Product 表...
保存 Dim_Product.csv...
Dim_Product.csv 已成功生成，耗时 0.01 秒。


**2/6: 生成 Dim_Time 表 (单一向前数据)维度表只追加Append。每一个时间点、每一天、每一个月都是一个既定的、永恒不变的事实。你无法“更新”昨天或去年的日期，此时就需要追加Append表中的相应记录。没有复杂的版本控制机制（Slowly Changing Dimension, SCD）**

In [62]:
# -*- coding: utf-8 -*-
"""Generate Dim_Time Table (CPU Version)"""
import pandas as pd
import os
import time

def generate_dim_time(start_year=2017, end_year=2025):
    """
    Generates a comprehensive Dim_Time table.
    """
    date_range = pd.date_range(start=f'{start_year}-01-01', end=f'{end_year}-12-31', freq='D')
    dim_time_df = pd.DataFrame({'Full_Date': date_range})

    dim_time_df['Time_ID'] = dim_time_df['Full_Date'].dt.strftime('%Y%m%d').astype(int)
    dim_time_df['Year'] = dim_time_df['Full_Date'].dt.year
    dim_time_df['Quarter'] = 'Q' + dim_time_df['Full_Date'].dt.quarter.astype(str)
    dim_time_df['Month'] = dim_time_df['Full_Date'].dt.month
    dim_time_df['Day'] = dim_time_df['Full_Date'].dt.day
    dim_time_df['Week_of_Year'] = dim_time_df['Full_Date'].dt.isocalendar().week.astype(int)
    dim_time_df['Day_of_Week'] = dim_time_df['Full_Date'].dt.dayofweek + 1
    dim_time_df['Day_Name'] = dim_time_df['Full_Date'].dt.day_name()

    # 按照要求的顺序进行排列
    dim_time_df = dim_time_df[['Time_ID', 'Full_Date', 'Year', 'Quarter', 'Month', 'Day', 'Week_of_Year', 'Day_of_Week', 'Day_Name']]
    
    return dim_time_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("正在生成 Dim_Time 表...")
    dim_time_df = generate_dim_time()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("保存 Dim_Time.csv...")
    # 在保存前将日期列格式化为 YYYY-MM-DD 字符串，以确保 Power BI 兼容性
    dim_time_df['Full_Date'] = dim_time_df['Full_Date'].dt.strftime('%Y-%m-%d')
    dim_time_df.to_csv(os.path.join(output_dir, 'Dim_Time.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Time.csv 已成功生成，耗时 {end_time - start_time:.2f} 秒。")

正在生成 Dim_Time 表...
保存 Dim_Time.csv...
Dim_Time.csv 已成功生成，耗时 0.02 秒。


**3/6: 生成 Dim_Customer 表 (相对静态数据)维度表更可能需要更新Update或追加Append。例如，一个客户的收入水平或家庭住址可能会发生变化，此时就需要更新Update或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）**

In [64]:
# -*- coding: utf-8 -*-
"""Generate Dim_Customer Table (CPU Version)"""

import pandas as pd
import numpy as np
import os
import time

# 使用固定的随机种子，确保每次生成的数据都相同
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

def generate_dim_customer(num_customers=50000):
    """
    Generates a Dim_Customer table with mock customer data.
    """
    first_names = ['Jane', 'John', 'Emily', 'Daniel', 'Laura', 'Chris', 'Jessica', 'David', 'Sarah', 'Michael']
    last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Rodriguez', 'Martinez']
    
    # 按照实际分布生成数据
    genders = ['Male', 'Female', 'Other']
    gender_weights = [0.4, 0.4, 0.2]
    
    age_groups = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']
    age_group_weights = [0.15, 0.25, 0.2, 0.15, 0.15, 0.1]
    
    income_levels = ['Low', 'Medium', 'High']
    income_weights = [0.35, 0.35, 0.3]
    
    customer_ids = np.arange(1, num_customers + 1)
    
    # 随机生成客户信息
    customer_names = [f"{np.random.choice(first_names)} {np.random.choice(last_names)}" for _ in range(num_customers)]
    genders_list = np.random.choice(genders, size=num_customers, p=gender_weights)
    age_groups_list = np.random.choice(age_groups, size=num_customers, p=age_group_weights)
    income_levels_list = np.random.choice(income_levels, size=num_customers, p=income_weights)
    
    dim_customer_df = pd.DataFrame({
        'Customer_ID': customer_ids,
        'Customer_Name': customer_names,
        'Gender': genders_list,
        'Age_Group': age_groups_list,
        'Income_Level': income_levels_list
    })
    
    return dim_customer_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("正在生成 Dim_Customer 表...")
    dim_customer_df = generate_dim_customer()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("保存 Dim_Customer.csv...")
    dim_customer_df.to_csv(os.path.join(output_dir, 'Dim_Customer.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Customer.csv 已成功生成，耗时 {end_time - start_time:.2f} 秒。")

正在生成 Dim_Customer 表...
保存 Dim_Customer.csv...
Dim_Customer.csv 已成功生成，耗时 0.50 秒。


**4/6: 生成 Dim_Geography 表 (相对静态数据)维度表更可能需要更新Update或追加Append。例如，一个客户的地址可能会发生变化或更新到新的国家和城市，此时就需要更新Update或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）**

In [71]:
# -*- coding: utf-8 -*-
"""Generate Dim_Geography Table (CPU Version)"""

import pandas as pd
import numpy as np
import os
import time
import random

# 使用固定的随机种子，确保每次生成的数据都相同
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_plausible_zip(country, state_province_abbr):
    """Generates a plausible zip code based on the country and state/province."""
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'

    if country == 'United States':
        return f"{random.randint(10000, 99999)}"
    elif country == 'Canada':
        return f"{random.choice(letters)}{random.choice(digits)}{random.choice(letters)} {random.choice(digits)}{random.choice(letters)}{random.choice(digits)}"
    elif country == 'Mexico':
        return f"{random.randint(10000, 99999)}"
    elif country in ['Germany', 'Italy', 'Spain', 'Switzerland', 'Netherlands', 'Denmark', 'Norway', 'Sweden', 'Finland', 'Greece', 'Iceland', 'Ireland', 'Luxembourg', 'Monaco']:
        return f"{random.randint(10000, 99999)}"
    elif country == 'United Kingdom':
        part1 = ''.join(random.choices(letters, k=random.choice([1, 2]))) + ''.join(random.choices(digits, k=random.choice([1, 2])))
        part2 = f"{random.choice(digits)}{random.choice(letters)}{random.choice(letters)}"
        return f"{part1} {part2}"
    elif country == 'France':
        return f"{random.randint(1, 9)}{random.randint(0, 9)}{random.randint(0, 9)}{random.randint(0, 9)}{random.randint(0, 9)}"
    elif country == 'China':
        return f"{random.randint(100000, 999999)}"
    elif country in ['Japan', 'South Korea', 'Taiwan', 'Hong Kong', 'Macau']:
        return f"{random.randint(10000, 9999999)}"
    elif country == 'Australia':
        return f"{random.randint(1000, 9999)}"
    elif country == 'New Zealand':
        return f"{random.randint(1000, 9999)}"
    else:
        return ""

def generate_dim_geography():
    """
    Generates a Dim_Geography table with a specific list of countries and a more extensive list of sub-regions,
    using names compatible with Power BI's Shape Map visualization based on user's DAX.
    """
    geography_data = []
    geo_id = 1
    
    # North America
    north_america_countries = {
        'United States': {
            'code': 'US',
            'provinces': [
                'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
                'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
                'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
                'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
                'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
                'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia',
                'Wisconsin', 'Wyoming'
            ]
        },
        'Canada': {
            'code': 'CA',
            'provinces': [
                'Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador',
                'Nova Scotia', 'Ontario', 'Prince Edward Island', 'Québec', 'Saskatchewan',
                'Northwest Territories', 'Nunavut', 'Yukon'
            ]
        },
        'Mexico': {
            'code': 'MX',
            'provinces': [
                'Aguascalientes', 'Baja California', 'Baja California Sur', 'Campeche', 'Chiapas',
                'Chihuahua', 'Coahuila', 'Colima', 'Durango', 'Guanajuato', 'Guerrero', 'Hidalgo',
                'Jalisco', 'México', 'Distrito Federal', 'Michoacán', 'Morelos', 'Nayarit',
                'Nuevo León', 'Oaxaca', 'Puebla', 'Querétaro', 'Quintana Roo', 'San Luis Potosí',
                'Sinaloa', 'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala', 'Veracruz', 'Yucatán', 'Zacatecas'
            ]
        }
    }

    # Europe
    europe_countries = {
        'Germany': {
            'code': 'DE',
            'provinces': [
                'Baden-Württemberg', 'Bavaria', 'Berlin', 'Brandenburg', 'Bremen', 'Hamburg',
                'Hesse', 'Lower Saxony', 'Mecklenburg-Vorpommern', 'North Rhine-Westphalia',
                'Rhineland-Palatinate', 'Saarland', 'Saxony', 'Saxony-Anhalt',
                'Schleswig-Holstein', 'Thuringia'
            ]
        },
        'United Kingdom': {
            'code': 'GB',
            'provinces': [
                'England', 'Scotland', 'Wales', 'Northern Ireland'
            ]
        },
        'Norway': {'code': 'NO', 'provinces': ['Oslo', 'Viken', 'Innlandet', 'Vestfold og Telemark', 'Agder', 'Rogaland', 'Vestland', 'Møre og Romsdal', 'Trøndelag', 'Nordland', 'Troms og Finnmark']},
        'France': {'code': 'FR', 'provinces': ['Bretagne', 'Normandie', 'Île-de-France', 'Auvergne-Rhône-Alpes', 'Bourgogne-Franche-Comté', 'Centre-Val de Loire', 'Corsica', 'Grand Est', 'Hauts-de-France', 'Nouvelle-Aquitaine', 'Occitanie', 'Pays de la Loire', 'Provence-Alpes-Côte d\'Azur']},
        'Netherlands': {'code': 'NL', 'provinces': ['Drenthe', 'Flevoland', 'Friesland', 'Gelderland', 'Groningen', 'Limburg', 'North Brabant', 'North Holland', 'Overijssel', 'Utrecht', 'Zeeland', 'South Holland']},
        'Sweden': {'code': 'SE', 'provinces': ['Blekinge', 'Dalarna', 'Gotland', 'Gävleborg', 'Halland', 'Jämtland', 'Jönköping', 'Kalmar', 'Kronoberg', 'Norrbotten', 'Skåne', 'Stockholm', 'Södermanland', 'Uppsala', 'Värmland', 'Västerbotten', 'Västernorrland', 'Västmanland', 'Västra Götaland', 'Örebro', 'Östergötland']},
        'Switzerland': {'code': 'CH', 'provinces': ['Zurich', 'Bern', 'Lucerne', 'Uri', 'Schwyz', 'Obwalden', 'Nidwalden', 'Glarus', 'Zug', 'Fribourg', 'Solothurn', 'Basel-Stadt', 'Basel-Landschaft', 'Schaffhausen', 'Appenzell Ausserrhoden', 'Appenzell Innerrhoden', 'St. Gallen', 'Graubünden', 'Aargau', 'Thurgau', 'Ticino', 'Vaud', 'Valais', 'Neuchâtel', 'Geneva', 'Jura']},
        'Italy': {'code': 'IT', 'provinces': ['Abruzzo', 'Aosta Valley', 'Apulia', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Liguria', 'Lombardy', 'Marche', 'Molise', 'Piedmont', 'Sardinia', 'Sicily', 'Tuscany', 'Trentino-Alto Adige', 'Umbria', 'Veneto']},
        'Spain': {'code': 'ES', 'provinces': ['Andalusia', 'Aragon', 'Principality of Asturias', 'Balearic Islands', 'Basque Country', 'Canary Islands', 'Cantabria', 'Castile and León', 'Castile-La Mancha', 'Catalonia', 'Community of Madrid', 'Valencian Community', 'Extremadura', 'Galicia', 'La Rioja', 'Region of Murcia', 'Foral Community of Navarre']},
        'Denmark': {'code': 'DK', 'provinces': ['Capital Region of Denmark', 'Central Denmark Region', 'North Denmark Region', 'Region Zealand', 'Region of Southern Denmark']},
        'Finland': {'code': 'FI', 'provinces': ['Åland Islands', 'Central Finland', 'Central Ostrobothnia', 'Kainuu', 'Kymenlaakso', 'Lapland', 'North Karelia', 'North Ostrobothnia', 'Northern Savonia', 'Päijät-Häme', 'Pirkanmaa', 'Satakunta', 'South Karelia', 'Southern Ostrobothnia', 'Southern Savonia', 'Tavastia Proper', 'Uusimaa', 'Southwest Finland']},
        'Greece': {'code': 'GR', 'provinces': ['Attica', 'Central Greece', 'Central Macedonia', 'Crete', 'East Macedonia and Thrace', 'Epirus', 'Ionian Islands', 'North Aegean', 'Peloponnese', 'South Aegean', 'Thessaly', 'West Greece', 'West Macedonia']},
        'Iceland': {'code': 'IS', 'provinces': ['Capital Region', 'Southern Peninsula', 'Western Region', 'Westfjords', 'Northwest Region', 'Northeast Region', 'Eastern Region', 'Southern Region']},
        'Ireland': {'code': 'IE', 'provinces': ['Connacht', 'Leinster', 'Munster', 'Ulster']},
        'Luxembourg': {'code': 'LU', 'provinces': ['Diekirch', 'Grevenmacher', 'Luxembourg']},
        'Monaco': {'code': 'MC', 'provinces': ['Monaco']}
    }
    
    # Asia
    asia_countries = {
        'China': {
            'code': 'CN',
            'provinces': [
                'Anhui', 'Fujian', 'Gansu', 'Guangdong', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang',
                'Henan', 'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Liaoning', 'Qinghai',
                'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Yunnan', 'Zhejiang',
                'Guangxi', 'Nei Mongol', 'Ningxia Hui', 'Xinjiang Uygur', 'Xizang', 
                'Beijing', 'Chongqing', 'Shanghai', 'Tianjin'
            ]
        },
        'Hong Kong': {'code': 'HK', 'provinces': ['Hong Kong Island', 'Kowloon', 'New Territories']},
        'Macau': {'code': 'MO', 'provinces': ['Macau']},
        'Japan': {
            'code': 'JP',
            'provinces': [
                'Hokkaido', 'Aomori', 'Iwate', 'Miyagi', 'Akita', 'Yamagata', 'Fukushima',
                'Ibaraki', 'Tochigi', 'Gunma', 'Saitama', 'Chiba', 'Tokyo', 'Kanagawa',
                'Niigata', 'Toyama', 'Ishikawa', 'Fukui', 'Yamanashi', 'Nagano',
                'Gifu', 'Shizuoka', 'Aichi', 'Mie', 'Shiga', 'Kyoto', 'Osaka',
                'Hyōgo', 'Nara', 'Wakayama', 'Tottori', 'Shimane', 'Okayama',
                'Hiroshima', 'Yamaguchi', 'Tokushima', 'Kagawa', 'Ehime', 'Kochi',
                'Fukuoka', 'Saga', 'Naoasaki', 'Kumamoto', 'Oita', 'Miyazaki', 'Kagoshima', 'Okinawa'
            ]
        },
        'South Korea': {
            'code': 'KR',
            'provinces': [
                'Busan', 'Chungcheongbuk-do', 'Chungcheongnam-do', 'Daegu', 'Daejeon', 'Gangwon-do',
                'Gwangju', 'Gyeonggi-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Incheon', 'Jeollabuk-do',
                'Jeollanam-do', 'Sejong', 'Seoul', 'Ulsan', 'Jeju'
            ]
        },
        'Taiwan': {
            'code': 'TW',
            'provinces': [
                'Taipei', 'New Taipei', 'Taichung', 'Tainan', 'Kaohsiung', 'Taoyuan', 
                'Keelung', 'Hsinchu City', 'Chiayi City', 'Hsinchu County', 'Chiayi County',
                'Changhua', 'Nantou', 'Yulin', 'Miaoli', 'Pingtung', 'Yilan', 'Hualien',
                'Taitung', 'Penghu', 'Kinmen', 'Lienkiang'
            ]
        }
    }

    # Oceania
    oceania_countries = {
        'Australia': {
            'code': 'AU',
            'provinces': [
                'New South Wales', 'Victoria', 'Queensland', 'South Australia', 'Western Australia',
                'Tasmania', 'Australian Capital Territory', 'Northern Territory'
            ]
        },
        'New Zealand': {
            'code': 'NZ',
            'provinces': [
                'Auckland', 'Bay of Plenty', 'Canterbury', 'Gisborne', 'Hawke\'s Bay',
                'Manawatu-Wanganui', 'Marlborough', 'Nelson', 'Northland', 'Otago',
                'Southland', 'Taranaki', 'Tasman', 'Waikato', 'Wellington', 'West Coast'
            ]
        }
    }
    
    continents = {
        '北美': north_america_countries,
        '欧洲': europe_countries,
        '亚洲': asia_countries,
        '大洋洲': oceania_countries
    }

    for continent, countries in continents.items():
        for country, details in countries.items():
            for province in details['provinces']:
                state_abbr = province[:2].upper()
                
                geography_data.append([
                    geo_id,
                    continent,
                    country,
                    details['code'],
                    province,
                    state_abbr,
                    generate_plausible_zip(country, state_abbr)
                ])
                geo_id += 1

    dim_geography_df = pd.DataFrame(geography_data, columns=[
        'Geo_ID', 'Continent', 'Country', 'Country_Code', 'State_Province', 'State_Province_Abbr', 'Zip_Code'
    ])
    
    return dim_geography_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("正在生成 Dim_Geography 表...")
    dim_geography_df = generate_dim_geography()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("保存 Dim_Geography.csv...")
    dim_geography_df.to_csv(os.path.join(output_dir, 'Dim_Geography.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Geography.csv 已成功生成，共 {len(dim_geography_df)} 行，耗时 {end_time - start_time:.2f} 秒。")

正在生成 Dim_Geography 表...
保存 Dim_Geography.csv...
Dim_Geography.csv 已成功生成，共 432 行，耗时 0.01 秒。


**5/6: 生成 Dim_Prices 表 (相对静态数据)维度表更可能需要更新Update或追加Append。例如，新产品或不同时段价格可能会发生变化，此时就需要更新或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）**

In [74]:
# -*- coding: utf-8 -*-
"""Generate Dim_Geography Table (CPU Version)"""

import pandas as pd
import numpy as np
import os
import time
import random

# 使用固定的随机种子，确保每次生成的数据都相同
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_plausible_zip(country, state_province_abbr):
    """Generates a plausible zip code based on the country and state/province."""
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'

    if country == 'United States':
        return f"{random.randint(10000, 99999)}"
    elif country == 'Canada':
        return f"{random.choice(letters)}{random.choice(digits)}{random.choice(letters)} {random.choice(digits)}{random.choice(letters)}{random.choice(digits)}"
    elif country == 'Mexico':
        return f"{random.randint(10000, 99999)}"
    elif country in ['Germany', 'Italy', 'Spain', 'Switzerland', 'Netherlands', 'Denmark', 'Norway', 'Sweden', 'Finland', 'Greece', 'Iceland', 'Ireland', 'Luxembourg', 'Monaco']:
        return f"{random.randint(10000, 99999)}"
    elif country == 'United Kingdom':
        part1 = ''.join(random.choices(letters, k=random.choice([1, 2]))) + ''.join(random.choices(digits, k=random.choice([1, 2])))
        part2 = f"{random.choice(digits)}{random.choice(letters)}{random.choice(letters)}"
        return f"{part1} {part2}"
    elif country == 'France':
        return f"{random.randint(1, 9)}{random.randint(0, 9)}{random.randint(0, 9)}{random.randint(0, 9)}{random.randint(0, 9)}"
    elif country == 'China':
        return f"{random.randint(100000, 999999)}"
    elif country in ['Japan', 'South Korea', 'Taiwan', 'Hong Kong', 'Macau']:
        return f"{random.randint(10000, 9999999)}"
    elif country == 'Australia':
        return f"{random.randint(1000, 9999)}"
    elif country == 'New Zealand':
        return f"{random.randint(1000, 9999)}"
    else:
        return ""

def generate_dim_geography():
    """
    Generates a Dim_Geography table with a specific list of countries and a more extensive list of sub-regions,
    using names compatible with Power BI's Shape Map visualization based on user's DAX.
    """
    geography_data = []
    geo_id = 1
    
    # North America
    north_america_countries = {
        'United States': {
            'code': 'US',
            'provinces': [
                'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
                'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
                'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
                'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
                'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
                'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia',
                'Wisconsin', 'Wyoming'
            ]
        },
        'Canada': {
            'code': 'CA',
            'provinces': [
                'Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador',
                'Nova Scotia', 'Ontario', 'Prince Edward Island', 'Québec', 'Saskatchewan',
                'Northwest Territories', 'Nunavut', 'Yukon'
            ]
        },
        'Mexico': {
            'code': 'MX',
            'provinces': [
                'Aguascalientes', 'Baja California', 'Baja California Sur', 'Campeche', 'Chiapas',
                'Chihuahua', 'Coahuila', 'Colima', 'Durango', 'Guanajuato', 'Guerrero', 'Hidalgo',
                'Jalisco', 'México', 'Distrito Federal', 'Michoacán', 'Morelos', 'Nayarit',
                'Nuevo León', 'Oaxaca', 'Puebla', 'Querétaro', 'Quintana Roo', 'San Luis Potosí',
                'Sinaloa', 'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala', 'Veracruz', 'Yucatán', 'Zacatecas'
            ]
        }
    }

    # Europe
    europe_countries = {
        'Germany': {
            'code': 'DE',
            'provinces': [
                'Baden-Württemberg', 'Bavaria', 'Berlin', 'Brandenburg', 'Bremen', 'Hamburg',
                'Hesse', 'Lower Saxony', 'Mecklenburg-Vorpommern', 'North Rhine-Westphalia',
                'Rhineland-Palatinate', 'Saarland', 'Saxony', 'Saxony-Anhalt',
                'Schleswig-Holstein', 'Thuringia'
            ]
        },
        'United Kingdom': {
            'code': 'GB',
            'provinces': [
                'England', 'Scotland', 'Wales', 'Northern Ireland'
            ]
        },
        'Norway': {'code': 'NO', 'provinces': ['Oslo', 'Viken', 'Innlandet', 'Vestfold og Telemark', 'Agder', 'Rogaland', 'Vestland', 'Møre og Romsdal', 'Trøndelag', 'Nordland', 'Troms og Finnmark']},
        'France': {'code': 'FR', 'provinces': ['Bretagne', 'Normandie', 'Île-de-France', 'Auvergne-Rhône-Alpes', 'Bourgogne-Franche-Comté', 'Centre-Val de Loire', 'Corsica', 'Grand Est', 'Hauts-de-France', 'Nouvelle-Aquitaine', 'Occitanie', 'Pays de la Loire', 'Provence-Alpes-Côte d\'Azur']},
        'Netherlands': {'code': 'NL', 'provinces': ['Drenthe', 'Flevoland', 'Friesland', 'Gelderland', 'Groningen', 'Limburg', 'North Brabant', 'North Holland', 'Overijssel', 'Utrecht', 'Zeeland', 'South Holland']},
        'Sweden': {'code': 'SE', 'provinces': ['Blekinge', 'Dalarna', 'Gotland', 'Gävleborg', 'Halland', 'Jämtland', 'Jönköping', 'Kalmar', 'Kronoberg', 'Norrbotten', 'Skåne', 'Stockholm', 'Södermanland', 'Uppsala', 'Värmland', 'Västerbotten', 'Västernorrland', 'Västmanland', 'Västra Götaland', 'Örebro', 'Östergötland']},
        'Switzerland': {'code': 'CH', 'provinces': ['Zurich', 'Bern', 'Lucerne', 'Uri', 'Schwyz', 'Obwalden', 'Nidwalden', 'Glarus', 'Zug', 'Fribourg', 'Solothurn', 'Basel-Stadt', 'Basel-Landschaft', 'Schaffhausen', 'Appenzell Ausserrhoden', 'Appenzell Innerrhoden', 'St. Gallen', 'Graubünden', 'Aargau', 'Thurgau', 'Ticino', 'Vaud', 'Valais', 'Neuchâtel', 'Geneva', 'Jura']},
        'Italy': {'code': 'IT', 'provinces': ['Abruzzo', 'Aosta Valley', 'Apulia', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Liguria', 'Lombardy', 'Marche', 'Molise', 'Piedmont', 'Sardinia', 'Sicily', 'Tuscany', 'Trentino-Alto Adige', 'Umbria', 'Veneto']},
        'Spain': {'code': 'ES', 'provinces': ['Andalusia', 'Aragon', 'Principality of Asturias', 'Balearic Islands', 'Basque Country', 'Canary Islands', 'Cantabria', 'Castile and León', 'Castile-La Mancha', 'Catalonia', 'Community of Madrid', 'Valencian Community', 'Extremadura', 'Galicia', 'La Rioja', 'Region of Murcia', 'Foral Community of Navarre']},
        'Denmark': {'code': 'DK', 'provinces': ['Capital Region of Denmark', 'Central Denmark Region', 'North Denmark Region', 'Region Zealand', 'Region of Southern Denmark']},
        'Finland': {'code': 'FI', 'provinces': ['Åland Islands', 'Central Finland', 'Central Ostrobothnia', 'Kainuu', 'Kymenlaakso', 'Lapland', 'North Karelia', 'North Ostrobothnia', 'Northern Savonia', 'Päijät-Häme', 'Pirkanmaa', 'Satakunta', 'South Karelia', 'Southern Ostrobothnia', 'Southern Savonia', 'Tavastia Proper', 'Uusimaa', 'Southwest Finland']},
        'Greece': {'code': 'GR', 'provinces': ['Attica', 'Central Greece', 'Central Macedonia', 'Crete', 'East Macedonia and Thrace', 'Epirus', 'Ionian Islands', 'North Aegean', 'Peloponnese', 'South Aegean', 'Thessaly', 'West Greece', 'West Macedonia']},
        'Iceland': {'code': 'IS', 'provinces': ['Capital Region', 'Southern Peninsula', 'Western Region', 'Westfjords', 'Northwest Region', 'Northeast Region', 'Eastern Region', 'Southern Region']},
        'Ireland': {'code': 'IE', 'provinces': ['Connacht', 'Leinster', 'Munster', 'Ulster']},
        'Luxembourg': {'code': 'LU', 'provinces': ['Diekirch', 'Grevenmacher', 'Luxembourg']},
        'Monaco': {'code': 'MC', 'provinces': ['Monaco']}
    }
    
    # Asia
    asia_countries = {
        'China': {
            'code': 'CN',
            'provinces': [
                'Anhui', 'Fujian', 'Gansu', 'Guangdong', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang',
                'Henan', 'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Liaoning', 'Qinghai',
                'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Yunnan', 'Zhejiang',
                'Guangxi', 'Nei Mongol', 'Ningxia Hui', 'Xinjiang Uygur', 'Xizang', 
                'Beijing', 'Chongqing', 'Shanghai', 'Tianjin'
            ]
        },
        'Hong Kong': {'code': 'HK', 'provinces': ['Hong Kong Island', 'Kowloon', 'New Territories']},
        'Macau': {'code': 'MO', 'provinces': ['Macau']},
        'Japan': {
            'code': 'JP',
            'provinces': [
                'Hokkaido', 'Aomori', 'Iwate', 'Miyagi', 'Akita', 'Yamagata', 'Fukushima',
                'Ibaraki', 'Tochigi', 'Gunma', 'Saitama', 'Chiba', 'Tokyo', 'Kanagawa',
                'Niigata', 'Toyama', 'Ishikawa', 'Fukui', 'Yamanashi', 'Nagano',
                'Gifu', 'Shizuoka', 'Aichi', 'Mie', 'Shiga', 'Kyoto', 'Osaka',
                'Hyōgo', 'Nara', 'Wakayama', 'Tottori', 'Shimane', 'Okayama',
                'Hiroshima', 'Yamaguchi', 'Tokushima', 'Kagawa', 'Ehime', 'Kochi',
                'Fukuoka', 'Saga', 'Naoasaki', 'Kumamoto', 'Oita', 'Miyazaki', 'Kagoshima', 'Okinawa'
            ]
        },
        'South Korea': {
            'code': 'KR',
            'provinces': [
                'Busan', 'Chungcheongbuk-do', 'Chungcheongnam-do', 'Daegu', 'Daejeon', 'Gangwon-do',
                'Gwangju', 'Gyeonggi-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Incheon', 'Jeollabuk-do',
                'Jeollanam-do', 'Sejong', 'Seoul', 'Ulsan', 'Jeju'
            ]
        },
        'Taiwan': {
            'code': 'TW',
            'provinces': [
                'Taipei', 'New Taipei', 'Taichung', 'Tainan', 'Kaohsiung', 'Taoyuan', 
                'Keelung', 'Hsinchu City', 'Chiayi City', 'Hsinchu County', 'Chiayi County',
                'Changhua', 'Nantou', 'Yulin', 'Miaoli', 'Pingtung', 'Yilan', 'Hualien',
                'Taitung', 'Penghu', 'Kinmen', 'Lienkiang'
            ]
        }
    }

    # Oceania
    oceania_countries = {
        'Australia': {
            'code': 'AU',
            'provinces': [
                'New South Wales', 'Victoria', 'Queensland', 'South Australia', 'Western Australia',
                'Tasmania', 'Australian Capital Territory', 'Northern Territory'
            ]
        },
        'New Zealand': {
            'code': 'NZ',
            'provinces': [
                'Auckland', 'Bay of Plenty', 'Canterbury', 'Gisborne', 'Hawke\'s Bay',
                'Manawatu-Wanganui', 'Marlborough', 'Nelson', 'Northland', 'Otago',
                'Southland', 'Taranaki', 'Tasman', 'Waikato', 'Wellington', 'West Coast'
            ]
        }
    }
    
    continents = {
        'North America': north_america_countries,
        'Europe': europe_countries,
        'Asia': asia_countries,
        'Oceania': oceania_countries
    }

    for continent, countries in continents.items():
        for country, details in countries.items():
            for province in details['provinces']:
                state_abbr = province[:2].upper()
                
                geography_data.append([
                    geo_id,
                    continent,
                    country,
                    details['code'],
                    province,
                    state_abbr,
                    generate_plausible_zip(country, state_abbr)
                ])
                geo_id += 1

    dim_geography_df = pd.DataFrame(geography_data, columns=[
        'Geo_ID', 'Continent', 'Country', 'Country_Code', 'State_Province', 'State_Province_Abbr', 'Zip_Code'
    ])
    
    return dim_geography_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Geography table...")
    dim_geography_df = generate_dim_geography()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Geography.csv...")
    dim_geography_df.to_csv(os.path.join(output_dir, 'Dim_Geography.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Geography.csv has been successfully generated with {len(dim_geography_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Geography table...
Saving Dim_Geography.csv...
Dim_Geography.csv has been successfully generated with 432 rows in 0.00 seconds.


**6/6: 生成 Fact_Sales 表 (高度动态数据，最常被追加（append）的表) 只进不出”的设计哲学。每当一笔新的销售发生，就在 Fact_Sales 表中追加一行新的数据，而不会去修改之前已经存在的历史销售记录**

In [70]:
# -*- coding: utf-8 -*-
"""Generate Fact_Sales Table (CPU Version)"""

import pandas as pd
import numpy as np
import os
import time

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Includes Tesla-like sales distribution logic.
    """
    start_year = dim_time_df['Year'].min()
    end_year = dim_time_df['Year'].max()
    
    sales_data = []
    
    product_weights = {
        1: 0.40, 2: 0.45, 3: 0.05, 4: 0.05, 5: 0.05
    }
    
    continent_weights = {
        'North America': 0.50, 'Europe': 0.33, 'Asia-Oceania': 0.17
    }

    customer_ids = dim_customer_df['Customer_ID'].values
    geography_map = dim_geography_df.groupby('Continent')['Geo_ID'].apply(list).to_dict()

    # 将 dim_prices_df 与 dim_time_df 合并，创建有效的 Model-Time-Price 组合查找表
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time
    # 确保日期列的时区一致
    dim_prices_df['Quarter_Start_Date'] = dim_prices_df['Quarter_Start_Date'].dt.tz_localize(None)

    price_time_lookup = dim_prices_df.merge(
        dim_time_df, 
        on='Quarter_Start_Date', 
        how='left'
    ).drop(columns=['Full_Date', 'Quarter_Start_Date'])

    for year in range(start_year, end_year + 1):
        num_sales_for_year = 0
        
        if year == start_year:
            num_sales_for_year = 100000
        else:
            sales_growth_factor = 1.50
            num_sales_for_year = int(len(sales_data[-1]) * sales_growth_factor)
        
        print(f"正在为年份 {year} 生成 {num_sales_for_year} 条销售记录...")
        
        # 筛选出当前年份的价格-时间组合
        current_year_lookup = price_time_lookup[price_time_lookup['Year'] == year].copy()
        
        if current_year_lookup.empty:
            print(f"警告：年份 {year} 没有可用的价格数据，跳过该年份。")
            continue
        
        # 修正：将权重映射到每一行，然后进行归一化
        current_year_lookup['Weight'] = current_year_lookup['Model_ID'].map(product_weights)
        total_weight_sum = current_year_lookup['Weight'].sum()
        if total_weight_sum == 0:
            print(f"警告：年份 {year} 的可用车型权重为零，跳过该年份。")
            continue
        current_year_lookup['Probability'] = current_year_lookup['Weight'] / total_weight_sum
        
        # 从有效的价格-时间组合中按权重进行抽样
        sampled_rows = current_year_lookup.sample(n=num_sales_for_year, replace=True, weights='Probability', random_state=42).reset_index(drop=True)

        # 构建销售记录
        sampled_rows['Customer_ID'] = np.random.choice(customer_ids, size=num_sales_for_year, replace=True)

        continent_choices = np.random.choice(list(continent_weights.keys()), size=num_sales_for_year, p=list(continent_weights.values()))
        sampled_rows['Geo_ID'] = [np.random.choice(geography_map[continent]) for continent in continent_choices]

        # 计算 Is_Discounted_Sale 和 Revenue_USD
        sampled_rows['Sales_Units'] = 1
        sampled_rows['Is_Discounted_Sale'] = sampled_rows['Discounted_Price_USD'] < sampled_rows['Standard_Price_USD']
        sampled_rows['Revenue_USD'] = sampled_rows['Sales_Units'] * sampled_rows['Discounted_Price_USD']

        sales_data.append(sampled_rows)
        
    if not sales_data:
        print("所有年份均没有可用数据，无法生成 Fact_Sales 表。")
        return pd.DataFrame()
        
    fact_sales_df = pd.concat(sales_data, ignore_index=True)
    
    # 按照要求的顺序选择并排序列
    fact_sales_df = fact_sales_df[['Time_ID', 'Geo_ID', 'Model_ID', 'Customer_ID', 'Sales_Units', 'Is_Discounted_Sale', 'Revenue_USD']]

    return fact_sales_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("正在加载所有维度表...")
    dim_product_df = pd.read_csv(os.path.join('./output_data', 'Dim_Product.csv'))
    dim_time_df = pd.read_csv(os.path.join('./output_data', 'Dim_Time.csv'))
    dim_customer_df = pd.read_csv(os.path.join('./output_data', 'Dim_Customer.csv'))
    dim_geography_df = pd.read_csv(os.path.join('./output_data', 'Dim_Geography.csv'))
    dim_prices_df = pd.read_csv(os.path.join('./output_data', 'Dim_Prices.csv'))

    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    
    print("正在生成 Fact_Sales 表...")
    fact_sales_df = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df)

    if not fact_sales_df.empty:
        output_dir = './output_data'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        print("保存 Fact_Sales.csv...")
        fact_sales_df.to_csv(os.path.join(output_dir, 'Fact_Sales.csv'), index=False, encoding='utf-8')
        
        end_time = time.time()
        print(f"Fact_Sales.csv 已成功生成，耗时 {end_time - start_time:.2f} 秒。")
        print("数据生成完成！")
    else:
        print("数据生成失败。")

正在加载所有维度表...
正在生成 Fact_Sales 表...
正在为年份 2017 生成 100000 条销售记录...
正在为年份 2018 生成 150000 条销售记录...
正在为年份 2019 生成 225000 条销售记录...
正在为年份 2020 生成 337500 条销售记录...
正在为年份 2021 生成 506250 条销售记录...
正在为年份 2022 生成 759375 条销售记录...
正在为年份 2023 生成 1139062 条销售记录...
正在为年份 2024 生成 1708593 条销售记录...
正在为年份 2025 生成 2562889 条销售记录...
保存 Fact_Sales.csv...
Fact_Sales.csv 已成功生成，耗时 58.82 秒。
数据生成完成！


**第二版松耦合代码**

In [75]:
# -*- coding: utf-8 -*-
"""1/6: Generate Dim_Product Table"""

import pandas as pd
import os
import time
import numpy as np
import random

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_product():
    """Generates the Dim_Product table."""
    data = [
        [1, 'Model 3', 'Sedan', 46500.00, '2017-07-28'],
        [2, 'Model Y', 'SUV', 55000.00, '2020-03-13'],
        [3, 'Model S', 'Sedan', 82500.00, '2012-06-22'],
        [4, 'Model X', 'SUV', 95000.00, '2015-09-29'],
        [5, 'Cybertruck', 'Truck', 70000.00, '2023-11-30']
    ]
    df = pd.DataFrame(data, columns=['Model_ID', 'Model_Name', 'Model_Category', 'Model_Base_Price_USD', 'Model_Launch_Date'])
    df['Model_Launch_Date'] = pd.to_datetime(df['Model_Launch_Date'])
    return df

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Product table...")
    dim_product_df = generate_dim_product()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Product.csv...")
    dim_product_df.to_csv(os.path.join(output_dir, 'Dim_Product.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Product.csv has been successfully generated with {len(dim_product_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Product table...
Saving Dim_Product.csv...
Dim_Product.csv has been successfully generated with 5 rows in 0.00 seconds.


In [76]:
# -*- coding: utf-8 -*-
"""2/6: Generate Dim_Time Table"""

import pandas as pd
import os
import time
import numpy as np
import random
import datetime

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_time():
    """Generates the Dim_Time table."""
    start_date = datetime.date(2017, 1, 1)
    end_date = datetime.date(2025, 12, 31)
    date_range = [start_date + datetime.timedelta(days=x) for x in range(0, (end_date - start_date).days + 1)]

    data = []
    for date in date_range:
        data.append([
            int(date.strftime('%Y%m%d')),
            date,
            date.year,
            f"Q{((date.month - 1) // 3) + 1}",
            date.month,
            date.day,
            date.isocalendar()[1],
            date.isoweekday(),
            date.strftime('%A')
        ])
    
    return pd.DataFrame(data, columns=['Time_ID', 'Full_Date', 'Year', 'Quarter', 'Month', 'Day', 'Week_of_Year', 'Day_of_Week', 'Day_Name'])

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Time table...")
    dim_time_df = generate_dim_time()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Time.csv...")
    dim_time_df.to_csv(os.path.join(output_dir, 'Dim_Time.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Time.csv has been successfully generated with {len(dim_time_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Time table...
Saving Dim_Time.csv...
Dim_Time.csv has been successfully generated with 3287 rows in 0.02 seconds.


In [77]:
# -*- coding: utf-8 -*-
"""3/6: Generate Dim_Customer Table"""

import pandas as pd
import os
import time
import numpy as np
import random

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_customer(num_customers=50000):
    """Generates the Dim_Customer table."""
    genders = ['Male', 'Female', 'Other']
    age_groups = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']
    income_levels = ['Low', 'Medium', 'High']
    first_names = ['James', 'Mary', 'John', 'Patricia', 'Robert', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'David', 'Susan', 'Richard', 'Jessica', 'Joseph', 'Sarah', 'Thomas', 'Karen', 'Charles', 'Nancy', 'Christopher', 'Lisa', 'Daniel', 'Betty', 'Paul', 'Margaret', 'Mark', 'Sandra', 'Donald', 'Ashley', 'George', 'Kimberly', 'Kenneth', 'Donna', 'Steven', 'Emily', 'Edward', 'Carol', 'Brian', 'Michelle', 'Ronald', 'Amanda', 'Anthony', 'Melissa', 'Kevin', 'Deborah', 'Jason', 'Stephanie', 'Jeff', 'Maria', 'Gary', 'Heather', 'Timothy', 'Nicole', 'Jose', 'Denise', 'Larry', 'Megan', 'Jeffrey', 'Christina', 'Frank', 'Alexis', 'Scott', 'Tiffany', 'Eric', 'Lauren', 'Stephen', 'Rachel', 'Andrew', 'Crystal', 'Raymond', 'Kayla', 'Ryan', 'Danielle', 'Jacob', 'Brittany', 'Nicholas', 'Emma', 'Jonathan', 'Samantha', 'Laura', 'Alexis', 'Joshua', 'Brandon', 'Justin', 'Daniel', 'Daniel', 'Taylor']
    last_names = ['Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor', 'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia', 'Martinez', 'Robinson', 'Clark', 'Rodriguez', 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen', 'Young', 'Hernandez', 'King', 'Wright', 'Lopez', 'Hill', 'Scott', 'Green', 'Adams', 'Baker', 'Gonzalez', 'Nelson', 'Carter', 'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips', 'Campbell', 'Parker', 'Evans', 'Edwards', 'Collins', 'Stewart', 'Sanchez', 'Morris', 'Rogers', 'Reed', 'Cook', 'Morgan', 'Bell', 'Murphy', 'Bailey', 'Rivera', 'Cooper', 'Richardson', 'Cox', 'Howard', 'Ward', 'Torres', 'Peterson', 'Gray', 'Ramirez', 'James', 'Watson', 'Brooks', 'Kelly', 'Sanders', 'Price', 'Bennett', 'Wood', 'Barnes', 'Ross', 'Henderson', 'Coleman', 'Jenkins', 'Perry', 'Powell', 'Long', 'Patterson', 'Hughes', 'Flores', 'Washington', 'Butler', 'Simmons', 'Foster', 'Gonzales', 'Bryant', 'Alexander', 'Russell', 'Griffin', 'Diaz', 'Hayes', 'Myers', 'Ford', 'Hamilton', 'Graham', 'Sullivan', 'Wallace', 'Woods', 'Cole', 'West', 'Jordan', 'Owens', 'Reynolds', 'Fisher', 'Ellis', 'Harrison', 'Gibson', 'Mcdonald', 'Cruz', 'Marshall', 'Ortiz', 'Gomez', 'Murray', 'Freeman', 'Wells', 'Webb', 'Simpson', 'Stevens', 'Tucker', 'Porter', 'Hunter', 'Hicks', 'Crawford', 'Henry', 'Boyd', 'Mason', 'Kennedy', 'Warren', 'Dixon', 'Ramos', 'Reid', 'Carr', 'Chavez', 'Gibson']
    
    data = []
    for i in range(1, num_customers + 1):
        full_name = f"{random.choice(first_names)} {random.choice(last_names)}"
        gender = random.choice(genders)
        age_group = random.choice(age_groups)
        income_level = random.choice(income_levels)
        data.append([i, full_name, gender, age_group, income_level])
        
    return pd.DataFrame(data, columns=['Customer_ID', 'Customer_Name', 'Gender', 'Age_Group', 'Income_Level'])

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Customer table...")
    dim_customer_df = generate_dim_customer()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Customer.csv...")
    dim_customer_df.to_csv(os.path.join(output_dir, 'Dim_Customer.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Customer.csv has been successfully generated with {len(dim_customer_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Customer table...
Saving Dim_Customer.csv...
Dim_Customer.csv has been successfully generated with 50000 rows in 4.52 seconds.


In [78]:
# -*- coding: utf-8 -*-
"""4/6: Generate Dim_Geography Table"""

import pandas as pd
import os
import time
import numpy as np
import random

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_plausible_zip(country, state_province_abbr):
    """Generates a plausible zip code based on the country and state/province."""
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'

    if country == 'United States':
        return f"{random.randint(10000, 99999)}"
    elif country == 'Canada':
        return f"{random.choice(letters)}{random.choice(digits)}{random.choice(letters)} {random.choice(digits)}{random.choice(letters)}{random.choice(digits)}"
    elif country == 'Mexico':
        return f"{random.randint(10000, 99999)}"
    elif country in ['Germany', 'Italy', 'Spain', 'Switzerland', 'Netherlands', 'Denmark', 'Norway', 'Sweden', 'Finland', 'Greece', 'Iceland', 'Ireland', 'Luxembourg', 'Monaco']:
        return f"{random.randint(10000, 99999)}"
    elif country == 'United Kingdom':
        part1 = ''.join(random.choices(letters, k=random.choice([1, 2]))) + ''.join(random.choices(digits, k=random.choice([1, 2])))
        part2 = f"{random.choice(digits)}{random.choice(letters)}{random.choice(letters)}"
        return f"{part1} {part2}"
    elif country == 'France':
        return f"{random.randint(1, 9)}{random.randint(0, 9)}{random.randint(0, 9)}{random.randint(0, 9)}{random.randint(0, 9)}"
    elif country == 'China':
        return f"{random.randint(100000, 999999)}"
    elif country in ['Japan', 'South Korea', 'Taiwan', 'Hong Kong', 'Macau']:
        return f"{random.randint(10000, 9999999)}"
    elif country == 'Australia':
        return f"{random.randint(1000, 9999)}"
    elif country == 'New Zealand':
        return f"{random.randint(1000, 9999)}"
    else:
        return ""

def generate_dim_geography():
    """
    Generates a Dim_Geography table.
    """
    geography_data = []
    geo_id = 1
    
    # North America
    north_america_countries = {
        'United States': {
            'code': 'US',
            'provinces': [
                'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
                'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
                'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
                'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
                'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
                'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia',
                'Wisconsin', 'Wyoming'
            ]
        },
        'Canada': {
            'code': 'CA',
            'provinces': [
                'Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador',
                'Nova Scotia', 'Ontario', 'Prince Edward Island', 'Québec', 'Saskatchewan',
                'Northwest Territories', 'Nunavut', 'Yukon'
            ]
        },
        'Mexico': {
            'code': 'MX',
            'provinces': [
                'Aguascalientes', 'Baja California', 'Baja California Sur', 'Campeche', 'Chiapas',
                'Chihuahua', 'Coahuila', 'Colima', 'Durango', 'Guanajuato', 'Guerrero', 'Hidalgo',
                'Jalisco', 'México', 'Distrito Federal', 'Michoacán', 'Morelos', 'Nayarit',
                'Nuevo León', 'Oaxaca', 'Puebla', 'Querétaro', 'Quintana Roo', 'San Luis Potosí',
                'Sinaloa', 'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala', 'Veracruz', 'Yucatán', 'Zacatecas'
            ]
        }
    }

    # Europe
    europe_countries = {
        'Germany': {
            'code': 'DE',
            'provinces': [
                'Baden-Württemberg', 'Bavaria', 'Berlin', 'Brandenburg', 'Bremen', 'Hamburg',
                'Hesse', 'Lower Saxony', 'Mecklenburg-Vorpommern', 'North Rhine-Westphalia',
                'Rhineland-Palatinate', 'Saarland', 'Saxony', 'Saxony-Anhalt',
                'Schleswig-Holstein', 'Thuringia'
            ]
        },
        'United Kingdom': {
            'code': 'GB',
            'provinces': [
                'England', 'Scotland', 'Wales', 'Northern Ireland'
            ]
        },
        'Norway': {'code': 'NO', 'provinces': ['Oslo', 'Viken', 'Innlandet', 'Vestfold og Telemark', 'Agder', 'Rogaland', 'Vestland', 'Møre og Romsdal', 'Trøndelag', 'Nordland', 'Troms og Finnmark']},
        'France': {'code': 'FR', 'provinces': ['Bretagne', 'Normandie', 'Île-de-France', 'Auvergne-Rhône-Alpes', 'Bourgogne-Franche-Comté', 'Centre-Val de Loire', 'Corsica', 'Grand Est', 'Hauts-de-France', 'Nouvelle-Aquitaine', 'Occitanie', 'Pays de la Loire', 'Provence-Alpes-Côte d\'Azur']},
        'Netherlands': {'code': 'NL', 'provinces': ['Drenthe', 'Flevoland', 'Friesland', 'Gelderland', 'Groningen', 'Limburg', 'North Brabant', 'North Holland', 'Overijssel', 'Utrecht', 'Zeeland', 'South Holland']},
        'Sweden': {'code': 'SE', 'provinces': ['Blekinge', 'Dalarna', 'Gotland', 'Gävleborg', 'Halland', 'Jämtland', 'Jönköping', 'Kalmar', 'Kronoberg', 'Norrbotten', 'Skåne', 'Stockholm', 'Södermanland', 'Uppsala', 'Värmland', 'Västerbotten', 'Västernorrland', 'Västmanland', 'Västra Götaland', 'Örebro', 'Östergötland']},
        'Switzerland': {'code': 'CH', 'provinces': ['Zurich', 'Bern', 'Lucerne', 'Uri', 'Schwyz', 'Obwalden', 'Nidwalden', 'Glarus', 'Zug', 'Fribourg', 'Solothurn', 'Basel-Stadt', 'Basel-Landschaft', 'Schaffhausen', 'Appenzell Ausserrhoden', 'Appenzell Innerrhoden', 'St. Gallen', 'Graubünden', 'Aargau', 'Thurgau', 'Ticino', 'Vaud', 'Valais', 'Neuchâtel', 'Geneva', 'Jura']},
        'Italy': {'code': 'IT', 'provinces': ['Abruzzo', 'Aosta Valley', 'Apulia', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Liguria', 'Lombardy', 'Marche', 'Molise', 'Piedmont', 'Sardinia', 'Sicily', 'Tuscany', 'Trentino-Alto Adige', 'Umbria', 'Veneto']},
        'Spain': {'code': 'ES', 'provinces': ['Andalusia', 'Aragon', 'Principality of Asturias', 'Balearic Islands', 'Basque Country', 'Canary Islands', 'Cantabria', 'Castile and León', 'Castile-La Mancha', 'Catalonia', 'Community of Madrid', 'Valencian Community', 'Extremadura', 'Galicia', 'La Rioja', 'Region of Murcia', 'Foral Community of Navarre']},
        'Denmark': {'code': 'DK', 'provinces': ['Capital Region of Denmark', 'Central Denmark Region', 'North Denmark Region', 'Region Zealand', 'Region of Southern Denmark']},
        'Finland': {'code': 'FI', 'provinces': ['Åland Islands', 'Central Finland', 'Central Ostrobothnia', 'Kainuu', 'Kymenlaakso', 'Lapland', 'North Karelia', 'North Ostrobothnia', 'Northern Savonia', 'Päijät-Häme', 'Pirkanmaa', 'Satakunta', 'South Karelia', 'Southern Ostrobothnia', 'Southern Savonia', 'Tavastia Proper', 'Uusimaa', 'Southwest Finland']},
        'Greece': {'code': 'GR', 'provinces': ['Attica', 'Central Greece', 'Central Macedonia', 'Crete', 'East Macedonia and Thrace', 'Epirus', 'Ionian Islands', 'North Aegean', 'Peloponnese', 'South Aegean', 'Thessaly', 'West Greece', 'West Macedonia']},
        'Iceland': {'code': 'IS', 'provinces': ['Capital Region', 'Southern Peninsula', 'Western Region', 'Westfjords', 'Northwest Region', 'Northeast Region', 'Eastern Region', 'Southern Region']},
        'Ireland': {'code': 'IE', 'provinces': ['Connacht', 'Leinster', 'Munster', 'Ulster']},
        'Luxembourg': {'code': 'LU', 'provinces': ['Diekirch', 'Grevenmacher', 'Luxembourg']},
        'Monaco': {'code': 'MC', 'provinces': ['Monaco']}
    }
    
    # Asia
    asia_countries = {
        'China': {
            'code': 'CN',
            'provinces': [
                'Anhui', 'Fujian', 'Gansu', 'Guangdong', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang',
                'Henan', 'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Liaoning', 'Qinghai',
                'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Yunnan', 'Zhejiang',
                'Guangxi', 'Nei Mongol', 'Ningxia Hui', 'Xinjiang Uygur', 'Xizang', 
                'Beijing', 'Chongqing', 'Shanghai', 'Tianjin'
            ]
        },
        'Hong Kong': {'code': 'HK', 'provinces': ['Hong Kong Island', 'Kowloon', 'New Territories']},
        'Macau': {'code': 'MO', 'provinces': ['Macau']},
        'Japan': {
            'code': 'JP',
            'provinces': [
                'Hokkaido', 'Aomori', 'Iwate', 'Miyagi', 'Akita', 'Yamagata', 'Fukushima',
                'Ibaraki', 'Tochigi', 'Gunma', 'Saitama', 'Chiba', 'Tokyo', 'Kanagawa',
                'Niigata', 'Toyama', 'Ishikawa', 'Fukui', 'Yamanashi', 'Nagano',
                'Gifu', 'Shizuoka', 'Aichi', 'Mie', 'Shiga', 'Kyoto', 'Osaka',
                'Hyōgo', 'Nara', 'Wakayama', 'Tottori', 'Shimane', 'Okayama',
                'Hiroshima', 'Yamaguchi', 'Tokushima', 'Kagawa', 'Ehime', 'Kochi',
                'Fukuoka', 'Saga', 'Naoasaki', 'Kumamoto', 'Oita', 'Miyazaki', 'Kagoshima', 'Okinawa'
            ]
        },
        'South Korea': {
            'code': 'KR',
            'provinces': [
                'Busan', 'Chungcheongbuk-do', 'Chungcheongnam-do', 'Daegu', 'Daejeon', 'Gangwon-do',
                'Gwangju', 'Gyeonggi-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Incheon', 'Jeollabuk-do',
                'Jeollanam-do', 'Sejong', 'Seoul', 'Ulsan', 'Jeju'
            ]
        },
        'Taiwan': {
            'code': 'TW',
            'provinces': [
                'Taipei', 'New Taipei', 'Taichung', 'Tainan', 'Kaohsiung', 'Taoyuan', 
                'Keelung', 'Hsinchu City', 'Chiayi City', 'Hsinchu County', 'Chiayi County',
                'Changhua', 'Nantou', 'Yulin', 'Miaoli', 'Pingtung', 'Yilan', 'Hualien',
                'Taitung', 'Penghu', 'Kinmen', 'Lienkiang'
            ]
        }
    }

    # Oceania
    oceania_countries = {
        'Australia': {
            'code': 'AU',
            'provinces': [
                'New South Wales', 'Victoria', 'Queensland', 'South Australia', 'Western Australia',
                'Tasmania', 'Australian Capital Territory', 'Northern Territory'
            ]
        },
        'New Zealand': {
            'code': 'NZ',
            'provinces': [
                'Auckland', 'Bay of Plenty', 'Canterbury', 'Gisborne', 'Hawke\'s Bay',
                'Manawatu-Wanganui', 'Marlborough', 'Nelson', 'Northland', 'Otago',
                'Southland', 'Taranaki', 'Tasman', 'Waikato', 'Wellington', 'West Coast'
            ]
        }
    }
    
    continents = {
        'North America': north_america_countries,
        'Europe': europe_countries,
        'Asia': asia_countries,
        'Oceania': oceania_countries
    }

    for continent, countries in continents.items():
        for country, details in countries.items():
            for province in details['provinces']:
                state_abbr = province[:2].upper()
                
                geography_data.append([
                    geo_id,
                    continent,
                    country,
                    details['code'],
                    province,
                    state_abbr,
                    generate_plausible_zip(country, state_abbr)
                ])
                geo_id += 1

    dim_geography_df = pd.DataFrame(geography_data, columns=[
        'Geo_ID', 'Continent', 'Country', 'Country_Code', 'State_Province', 'State_Province_Abbr', 'Zip_Code'
    ])
    
    return dim_geography_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Geography table...")
    dim_geography_df = generate_dim_geography()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Geography.csv...")
    dim_geography_df.to_csv(os.path.join(output_dir, 'Dim_Geography.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Geography.csv has been successfully generated with {len(dim_geography_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Geography table...
Saving Dim_Geography.csv...
Dim_Geography.csv has been successfully generated with 432 rows in 0.01 seconds.


In [79]:
# -*- coding: utf-8 -*-
"""5/6: Generate Dim_Prices Table"""

import pandas as pd
import os
import time
import numpy as np
import random

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_prices(dim_product_df, dim_time_df):
    """Generates the Dim_Prices table."""
    quarter_start_dates = sorted(dim_time_df['Full_Date'].loc[dim_time_df['Month'].isin([1, 4, 7, 10])].unique())
    model_ids = dim_product_df['Model_ID'].unique()
    
    prices_data = []
    
    # Dynamic price generation with seasonal/random fluctuations
    for quarter_start_date in quarter_start_dates:
        for model_id in model_ids:
            # Base price from product table
            base_price = dim_product_df.loc[dim_product_df['Model_ID'] == model_id, 'Model_Base_Price_USD'].iloc[0]
            
            # Fluctuate prices randomly with a trend
            price_factor = 1 + random.uniform(-0.05, 0.05)
            
            # Apply launch date logic
            launch_date = dim_product_df.loc[dim_product_df['Model_ID'] == model_id, 'Model_Launch_Date'].iloc[0]
            if pd.to_datetime(quarter_start_date) < launch_date:
                # Car not launched yet
                continue
            
            standard_price = base_price * price_factor
            
            # Randomly apply a discount
            is_discounted = random.random() < 0.2  # 20% chance of a discount
            discount_price = standard_price
            if is_discounted:
                discount_percentage = random.uniform(0.02, 0.10) # 2-10% discount
                discount_price = standard_price * (1 - discount_percentage)
            
            prices_data.append([
                model_id,
                quarter_start_date,
                standard_price,
                discount_price
            ])
            
    return pd.DataFrame(prices_data, columns=['Model_ID', 'Quarter_Start_Date', 'Standard_Price_USD', 'Discounted_Price_USD'])

if __name__ == '__main__':
    start_time = time.time()
    
    print("Loading Dim_Product and Dim_Time for price generation...")
    # This script depends on the output of the two previous scripts
    try:
        dim_product_df = pd.read_csv('./output_data/Dim_Product.csv')
        dim_time_df = pd.read_csv('./output_data/Dim_Time.csv')
        dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
        dim_product_df['Model_Launch_Date'] = pd.to_datetime(dim_product_df['Model_Launch_Date'])
    except FileNotFoundError:
        print("Error: Could not find 'Dim_Product.csv' or 'Dim_Time.csv'. Please run the previous scripts first.")
        exit()
        
    print("Generating Dim_Prices table...")
    dim_prices_df = generate_dim_prices(dim_product_df, dim_time_df)

    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Prices.csv...")
    dim_prices_df.to_csv(os.path.join(output_dir, 'Dim_Prices.csv'), index=False, encoding='utf-8')

    end_time = time.time()
    print(f"Dim_Prices.csv has been successfully generated with {len(dim_prices_df)} rows in {end_time - start_time:.2f} seconds.")

Loading Dim_Product and Dim_Time for price generation...
Generating Dim_Prices table...
Saving Dim_Prices.csv...
Dim_Prices.csv has been successfully generated with 4186 rows in 1.14 seconds.


In [81]:
# -*- coding: utf-8 -*-
"""6/6: Generate Fact_Sales Table (Optimized for Distribution)"""

import pandas as pd
import os
import time
import numpy as np
import random
import copy

# 使用固定随机种子以便可复现
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_fact_sales(num_sales_records=1000000):
    """
    生成 Fact_Sales 表。
    该版本通过为不同地区分配权重，来模拟更真实的销售分布，同时确保所有地区都有销售数据。
    """
    
    # 加载所有维度表以获取有效的 ID
    try:
        dim_product_df = pd.read_csv('./output_data/Dim_Product.csv')
        dim_time_df = pd.read_csv('./output_data/Dim_Time.csv')
        dim_customer_df = pd.read_csv('./output_data/Dim_Customer.csv')
        dim_geography_df = pd.read_csv('./output_data/Dim_Geography.csv')
        dim_prices_df = pd.read_csv('./output_data/Dim_Prices.csv')
        
        # 转换日期列以正确处理
        dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
        dim_product_df['Model_Launch_Date'] = pd.to_datetime(dim_product_df['Model_Launch_Date'])
        dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    except FileNotFoundError:
        print("错误：缺少一个或多个必需的 CSV 文件。请先运行所有维度生成脚本（1-5）。")
        return pd.DataFrame() # 返回空 DataFrame

    # 获取维度表中的有效 ID
    model_ids = dim_product_df['Model_ID'].unique()
    time_ids = dim_time_df['Time_ID'].unique()
    customer_ids = dim_customer_df['Customer_ID'].unique()
    geo_ids = dim_geography_df['Geo_ID'].unique()

    sales_data = []
    
    # ----------------------------------------------------
    # 步骤 1: 强制生成每地区至少一条销售数据
    # 这是为了确保地图上所有州/府都有颜色，避免出现灰白区域。
    # ----------------------------------------------------
    print("步骤 1/2: 正在为每个地理区域生成至少一条销售记录以确保完整覆盖...")
    
    # 复制一份地理数据，以免修改原始 DataFrame
    temp_geography_df = dim_geography_df.copy()
    
    for geo_id in temp_geography_df['Geo_ID'].unique():
        # 随机选择一个日期、客户和产品
        random_time_id = np.random.choice(time_ids)
        random_customer_id = np.random.choice(customer_ids)
        random_model_id = np.random.choice(model_ids)
        
        # 获取该时间 ID 对应的季度开始日期，用于查找价格
        quarter_start_date = dim_time_df.loc[dim_time_df['Time_ID'] == random_time_id, 'Full_Date'].iloc[0].to_period('Q').start_time
        
        # 查找该季度该车型的有效价格
        price_row = dim_prices_df[(dim_prices_df['Model_ID'] == random_model_id) & 
                                  (dim_prices_df['Quarter_Start_Date'] == quarter_start_date)]
        
        if not price_row.empty:
            is_discounted = random.random() < 0.3 # 30% 几率打折
            unit_price = price_row['Discounted_Price_USD'].iloc[0] if is_discounted else price_row['Standard_Price_USD'].iloc[0]
            unit_cost = unit_price * random.uniform(0.70, 0.85) # 成本为价格的 70-85%
            
            # 确保数量至少为 1
            quantity = 1
            
            sales_data.append([
                random_time_id,
                random_customer_id,
                geo_id,  # 这是关键行：使用来自循环的 geo_id
                random_model_id,
                unit_price,
                unit_cost,
                quantity,
                unit_price * quantity
            ])

    # ----------------------------------------------------
    # 步骤 2: 生成剩余的随机销售数据，并应用权重
    # 这部分是核心优化，使得销售数据分布更集中在富裕和人口稠密地区。
    # ----------------------------------------------------
    print("步骤 2/2: 正在生成剩余的随机销售记录，并根据地区权重进行分布...")
    
    # 为每个地区分配权重
    # 高权重：美国加州/纽约、德国、中国北京/上海等
    # 中权重：其他美国主要州、日本、英国、澳大利亚、加拿大、法国等
    # 低权重：其他人口稀少或市场较小的地区
    
    temp_geography_df['Sales_Weight'] = 1.0 # 基础权重
    
    # 根据国家/地区调整权重
    temp_geography_df.loc[temp_geography_df['Country'] == 'United States', 'Sales_Weight'] *= 5.0
    temp_geography_df.loc[temp_geography_df['Country'] == 'Germany', 'Sales_Weight'] *= 4.5
    temp_geography_df.loc[temp_geography_df['Country'] == 'China', 'Sales_Weight'] *= 4.0
    temp_geography_df.loc[temp_geography_df['Country'].isin(['United Kingdom', 'Japan', 'Australia', 'Canada', 'France']), 'Sales_Weight'] *= 3.0
    
    # 针对特定人口稠密/富裕地区进一步提升权重
    high_weight_provinces = ['California', 'New York', 'Texas', 'Florida', 'Beijing', 'Shanghai', 'Bavaria', 'North Rhine-Westphalia', 'Ontario', 'New South Wales', 'Tokyo']
    temp_geography_df.loc[temp_geography_df['State_Province'].isin(high_weight_provinces), 'Sales_Weight'] *= 2.0
    
    # 确保总权重为 1
    total_weight = temp_geography_df['Sales_Weight'].sum()
    temp_geography_df['Sales_Probability'] = temp_geography_df['Sales_Weight'] / total_weight
    
    # 获取地区 ID 和其对应的概率
    weighted_geo_ids = temp_geography_df['Geo_ID'].values
    probabilities = temp_geography_df['Sales_Probability'].values
    
    # 计算需要生成的剩余销售记录数
    remaining_sales = num_sales_records - len(sales_data)
    
    if remaining_sales > 0:
        # 使用 np.random.choice 的 p 参数进行加权抽样
        random_geo_choices = np.random.choice(weighted_geo_ids, size=remaining_sales, p=probabilities)
        
        for random_geo_id in random_geo_choices:
            # 随机选择其他维度 ID
            random_time_id = np.random.choice(time_ids)
            random_customer_id = np.random.choice(customer_ids)
            random_model_id = np.random.choice(model_ids)
            
            # 查找价格
            quarter_start_date = dim_time_df.loc[dim_time_df['Time_ID'] == random_time_id, 'Full_Date'].iloc[0].to_period('Q').start_time
            price_row = dim_prices_df[(dim_prices_df['Model_ID'] == random_model_id) & 
                                      (dim_prices_df['Quarter_Start_Date'] == quarter_start_date)]
            
            if not price_row.empty:
                is_discounted = random.random() < 0.2
                unit_price = price_row['Discounted_Price_USD'].iloc[0] if is_discounted else price_row['Standard_Price_USD'].iloc[0]
                unit_cost = unit_price * random.uniform(0.70, 0.85)
                quantity = random.randint(1, 3)
                
                sales_data.append([
                    random_time_id,
                    random_customer_id,
                    random_geo_id,
                    random_model_id,
                    unit_price,
                    unit_cost,
                    quantity,
                    unit_price * quantity
                ])

    fact_sales_df = pd.DataFrame(sales_data, columns=[
        'Time_ID', 'Customer_ID', 'Geo_ID', 'Model_ID', 'Unit_Price_USD', 'Unit_Cost_USD', 'Quantity', 'Total_Revenue_USD'
    ])
    
    # 添加 Sales_Order_ID，并按时间排序
    fact_sales_df.sort_values(by='Time_ID', inplace=True)
    fact_sales_df.reset_index(drop=True, inplace=True)
    fact_sales_df['Sales_Order_ID'] = range(1, len(fact_sales_df) + 1)
    
    return fact_sales_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("开始生成 Fact_Sales 表...")
    
    fact_sales_df = generate_fact_sales()
    
    if not fact_sales_df.empty:
        output_dir = './output_data'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        print("正在保存 Fact_Sales.csv...")
        fact_sales_df.to_csv(os.path.join(output_dir, 'Fact_Sales.csv'), index=False, encoding='utf-8')
        
        end_time = time.time()
        print(f"Fact_Sales.csv 已成功生成 {len(fact_sales_df)} 行数据，用时 {end_time - start_time:.2f} 秒。")

开始生成 Fact_Sales 表...
步骤 1/2: 正在为每个地理区域生成至少一条销售记录以确保完整覆盖...
步骤 2/2: 正在生成剩余的随机销售记录，并根据地区权重进行分布...
正在保存 Fact_Sales.csv...
Fact_Sales.csv 已成功生成 756087 行数据，用时 392.72 秒。
