### **1/6: 生成 Dim_Product 表（覆盖绝大部分特斯拉产品；简单静态数据）。该维度表可能需要更新（Update）或追加（Append）：例如新产品发布时，就需要更新或追加表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）。** ###

In [45]:
# -*- coding: utf-8 -*-
"""1/6: Generate Dim_Product Table with detailed configurations."""

import pandas as pd
import os
import time
import numpy as np
import random
from itertools import product
from datetime import date

# Use a fixed random seed for reproducibility
# Both Python's `random` module and NumPy's global RNG are seeded with the
# same value. NOTE(review): generate_dim_product() in this cell does not draw
# any random numbers, so the seeding only matters if randomness is added later.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_product():
    """
    Build the Dim_Product dimension table.

    Each vehicle model is expanded into the full cross product of its
    variants with every paint, wheel and interior option. A fixed list of
    non-automotive products (leases, energy products, services, ...) is then
    added with NaN for the three option columns. Rows are ordered by the
    approximate release date of their model (non-automotive rows, tagged
    'N/A', sort first) and numbered in that order.

    Returns:
        pandas.DataFrame with the columns Product_ID, Product_Name,
        Product_Category, Product_Variant, Paint_Color, Wheel_Type and
        Interior_Type. Product_Model is used for ordering only and is not
        part of the returned frame.
    """
    nan = np.nan

    # Vehicle model -> product category (insertion order drives generation order).
    model_categories = dict.fromkeys(
        ['Model 3', 'Model Y', 'Model S', 'Model X', 'Cybertruck'],
        'Automotive',
    )

    # Variants per model. The values are price placeholders (NaN); only the
    # variant names are used below.
    trims_by_model = {
        'Model 3': dict.fromkeys(
            ['Rear-Wheel Drive', 'Long Range RWD', 'Long Range AWD', 'Performance'],
            nan,
        ),
        'Model Y': dict.fromkeys(
            ['Standard Range RWD', 'Long Range RWD', 'Long Range AWD',
             'Performance', 'Model Y L (3-Row)'],
            nan,
        ),
        'Model S': dict.fromkeys(['Long Range AWD', 'Plaid'], nan),
        'Model X': dict.fromkeys(['All-Wheel Drive', 'Plaid'], nan),
        'Cybertruck': dict.fromkeys(['Long Range', 'AWD', 'Cyberbeast'], nan),
    }

    # Configurable options. Again the NaN values are price placeholders and
    # only the option names are consumed.
    option_catalog = {
        'Paint_Color': dict.fromkeys(
            ['Solid Black', 'Deep Blue Metallic', 'Stealth Grey', 'Ultra Red',
             'Quicksilver', 'Pearl White Multi-Coat', 'None'],
            nan,
        ),
        'Wheel_Type': dict.fromkeys(
            ['Aero Wheels', '19" Nova Wheels', 'Crossflow 19"', 'Helix 20"',
             'Base Wheels', 'Performance Wheels', 'None'],
            nan,
        ),
        'Interior_Type': dict.fromkeys(
            ['Black Interior', 'Black and White Interior', 'Cream Interior', 'None'],
            nan,
        ),
    }

    # Approximate release dates, used only to order rows before numbering.
    fallback_launch = date(2010, 1, 1)
    launch_dates = {
        'Model S': date(2012, 6, 1),
        'Model X': date(2015, 9, 1),
        'Model 3': date(2017, 7, 1),
        'Model Y': date(2020, 1, 1),
        'Cybertruck': date(2023, 11, 1),
        'N/A': date(2010, 1, 1),  # generic early date for non-automotive products
    }

    paint_names = list(option_catalog['Paint_Color'])
    wheel_names = list(option_catalog['Wheel_Type'])
    interior_names = list(option_catalog['Interior_Type'])

    records = []

    # Automotive rows: one per (variant, paint, wheel, interior) combination.
    for model, category in model_categories.items():
        if model not in trims_by_model:
            continue
        for variant, paint, wheel, interior in product(
                trims_by_model[model], paint_names, wheel_names, interior_names):
            records.append({
                'Product_Name': f"{model} {variant} - {paint} - {wheel} - {interior}",
                'Product_Category': category,
                'Product_Model': model,
                'Product_Variant': variant,
                'Paint_Color': paint,
                'Wheel_Type': wheel,
                'Interior_Type': interior,
            })

    # Non-automotive rows as (name, category, variant); options stay NaN.
    other_products = [
        ('Model 3 LR RWD Lease (24mo)', 'Automotive Leasing', 'Lease'),
        ('Model 3 LR RWD Lease (36mo)', 'Automotive Leasing', 'Lease'),
        ('Model Y LR RWD Lease (24mo)', 'Automotive Leasing', 'Lease'),
        ('Model Y LR RWD Lease (36mo)', 'Automotive Leasing', 'Lease'),
        ('Solar Panels', 'Energy Generation & Storage', 'Solar Panel'),
        ('Solar Roof', 'Energy Generation & Storage', 'Solar Roof'),
        ('Powerwall 3', 'Energy Generation & Storage', 'Powerwall'),
        ('Megapack', 'Energy Generation & Storage', 'Megapack'),
        ('FSD (Full Self-Driving)', 'Automotive', 'Feature'),
        ('CyberCab (2026 Placeholder)', 'Automotive', 'Service'),
        ('Regulatory Credits', 'Financial & Regulatory', 'Credit'),
        ('Charging Equipment', 'Services & Other', 'Accessory'),
        ('Vehicle Accessories', 'Services & Other', 'Accessory'),
        ('Apparel', 'Services & Other', 'Apparel'),
        ('Lifestyle', 'Services & Other', 'Lifestyle'),
    ]
    records.extend(
        {
            'Product_Name': name,
            'Product_Category': category,
            'Product_Model': 'N/A',  # marker for non-automotive products
            'Product_Variant': variant,
            'Paint_Color': nan,
            'Wheel_Type': nan,
            'Interior_Type': nan,
        }
        for name, category, variant in other_products
    )

    # Stable sort: rows sharing a release date keep their generation order.
    ordered = sorted(
        records,
        key=lambda rec: launch_dates.get(rec['Product_Model'], fallback_launch),
    )

    # Sequential IDs. ':03d' is a minimum width, so IDs past 999 are longer
    # (e.g. 'PRO1000').
    for position, rec in enumerate(ordered, start=1):
        rec['Product_ID'] = f'PRO{position:03d}'

    # Output columns (no price column, no Product_Model).
    output_columns = [
        'Product_ID', 'Product_Name', 'Product_Category', 'Product_Variant',
        'Paint_Color', 'Wheel_Type', 'Interior_Type',
    ]
    return pd.DataFrame(ordered, columns=output_columns)

if __name__ == '__main__':
    # Time the whole run: generation plus the CSV write.
    start_time = time.time()

    print("Generating Dim_Product table...")
    dim_product_df = generate_dim_product()

    # Make sure the output folder exists before writing into it.
    output_dir = './output_data'
    csv_path = os.path.join(output_dir, 'Dim_Product.csv')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Product.csv...")
    # na_rep='NA' writes missing values as the literal text NA in the CSV.
    dim_product_df.to_csv(csv_path, index=False, encoding='utf-8', na_rep='NA')

    end_time = time.time()
    elapsed_seconds = end_time - start_time
    print(f"Dim_Product.csv has been successfully generated with {len(dim_product_df)} rows in {elapsed_seconds:.2f} seconds.")


Generating Dim_Product table...
Saving Dim_Product.csv...
Dim_Product.csv has been successfully generated with 3151 rows in 0.01 seconds.


### **2/6: 生成 Dim_Time 表（单一向前数据）。该维度表只追加（Append）：每一个时间点、每一天、每一个月都是既定的、永恒不变的事实，你无法“更新”昨天或去年的日期，只需在时间范围扩展时追加新的记录。因此不需要“缓慢变化维度”（Slowly Changing Dimension, SCD）这类复杂的版本控制机制。** ###

In [37]:
# -*- coding: utf-8 -*-
"""2/6: Generate Dim_Time Table"""

import pandas as pd
import os
import time
import numpy as np
import random
import datetime

# Use a fixed random seed for reproducibility
# Seeds both Python's `random` module and NumPy's global RNG.
# NOTE(review): generate_dim_time() below is fully deterministic and draws no
# random numbers, so this seeding has no effect on the Dim_Time output.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_time():
    """
    Build the Dim_Time table: one row per calendar day.

    The range runs from 2013-01-01 to 2025-12-31 inclusive. Each row carries
    a sequential Time_ID ('T' + 7 zero-padded digits, starting at T0000001),
    the date itself and its calendar breakdown.

    Returns:
        pandas.DataFrame with the columns Time_ID, Full_Date, Year, Quarter,
        Month, Day, Week_of_Year, Day_of_Week and Day_Name. Week_of_Year and
        Day_of_Week follow the ISO calendar (Monday = 1), so days around New
        Year can carry a week number that belongs to the adjacent ISO year.
    """
    # The range starts in 2013.
    first_day = datetime.date(2013, 1, 1)
    last_day = datetime.date(2025, 12, 31)
    total_days = (last_day - first_day).days + 1

    column_names = [
        'Time_ID', 'Full_Date', 'Year', 'Quarter', 'Month', 'Day',
        'Week_of_Year', 'Day_of_Week', 'Day_Name',
    ]

    rows = []
    for sequence, offset in enumerate(range(total_days), start=1):
        day = first_day + datetime.timedelta(days=offset)
        _iso_year, iso_week, iso_weekday = day.isocalendar()
        rows.append([
            f'T{sequence:07d}',            # Time_ID: T + 7 digits
            day,
            day.year,
            f"Q{(day.month + 2) // 3}",    # months 1-3 -> Q1, ..., 10-12 -> Q4
            day.month,
            day.day,
            iso_week,
            iso_weekday,
            day.strftime('%A'),
        ])

    return pd.DataFrame(rows, columns=column_names)

if __name__ == '__main__':
    # Time the whole run: generation plus the CSV write.
    start_time = time.time()

    print("Generating Dim_Time table...")
    dim_time_df = generate_dim_time()

    # Make sure the output folder exists before writing into it.
    output_dir = './output_data'
    csv_path = os.path.join(output_dir, 'Dim_Time.csv')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Time.csv...")
    dim_time_df.to_csv(csv_path, index=False, encoding='utf-8')

    end_time = time.time()
    elapsed_seconds = end_time - start_time
    print(f"Dim_Time.csv has been successfully generated with {len(dim_time_df):,} rows in {elapsed_seconds:.2f} seconds.")

Generating Dim_Time table...
Saving Dim_Time.csv...
Dim_Time.csv has been successfully generated with 4,748 rows in 0.02 seconds.


### **3/6: 生成 Dim_Customer 表（修改性别和年龄分布；相对静态数据）。该维度表可能需要更新（Update）或追加（Append）：例如，一个客户的收入水平或家庭住址可能会发生变化，此时就需要更新或追加表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）。** ###

In [8]:
# -*- coding: utf-8 -*-
"""3/6: Generate Dim_Customer Table"""

import pandas as pd
import os
import time
import numpy as np
import random
import string

# Use a fixed random seed for reproducibility
# generate_dim_customer() draws from both Python's `random` module (names and
# letter codes) and NumPy's global RNG (gender / age / income), so both
# generators are seeded with the same value.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_customer(num_customers=50000):
    """
    Build the Dim_Customer table.

    Generates `num_customers` random individual customers followed by a
    fixed list of 15 business customers (the buyers of regulatory credits).

    Individual Customer_IDs have the form C + gender code (M/W) + two random
    letters (country code) + two random letters (state code) + a 4-digit
    counter that starts at 0000 for each distinct prefix. The counter is
    taken modulo 10000. Individuals carry the literal string 'NA' for
    Country, State_Province and City; business rows carry 'NA' for Gender,
    Age_Group and Income_Level.

    Args:
        num_customers: Number of individual customers to generate.

    Returns:
        pandas.DataFrame with the columns Customer_ID, Customer_Name,
        Customer_Segment, Gender, Age_Group, Income_Level, Country,
        State_Province and City (individuals first, then businesses).
    """
    column_names = [
        'Customer_ID', 'Customer_Name', 'Customer_Segment', 'Gender',
        'Age_Group', 'Income_Level', 'Country', 'State_Province', 'City',
    ]

    # Gender distribution, skewed towards male customers.
    genders = ['Male', 'Female']
    gender_probs = [0.75, 0.25]

    # Age-group distribution, skewed towards the middle-aged groups.
    age_groups = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']
    age_probs = [0.05, 0.20, 0.35, 0.25, 0.10, 0.05]

    # Income distribution: 80% Medium, 10% each for Low and High.
    income_levels = ['Low', 'Medium', 'High']
    income_probs = [0.10, 0.80, 0.10]

    # Name pools. Repeated entries are kept as-is: they weight random.choice.
    first_names = [
        'James', 'Mary', 'John', 'Patricia', 'Robert', 'Jennifer', 'Michael', 'Linda',
        'William', 'Elizabeth', 'David', 'Susan', 'Richard', 'Jessica', 'Joseph', 'Sarah',
        'Thomas', 'Karen', 'Charles', 'Nancy', 'Christopher', 'Lisa', 'Daniel', 'Betty',
        'Paul', 'Margaret', 'Mark', 'Sandra', 'Donald', 'Ashley', 'George', 'Kimberly',
        'Kenneth', 'Donna', 'Steven', 'Emily', 'Edward', 'Carol', 'Brian', 'Michelle',
        'Ronald', 'Amanda', 'Anthony', 'Melissa', 'Kevin', 'Deborah', 'Jason', 'Stephanie',
        'Jeff', 'Maria', 'Gary', 'Heather', 'Timothy', 'Nicole', 'Jose', 'Denise',
        'Larry', 'Megan', 'Jeffrey', 'Christina', 'Frank', 'Alexis', 'Scott', 'Tiffany',
        'Eric', 'Lauren', 'Stephen', 'Rachel', 'Andrew', 'Crystal', 'Raymond', 'Kayla',
        'Ryan', 'Danielle', 'Jacob', 'Brittany', 'Nicholas', 'Emma', 'Jonathan', 'Samantha',
        'Laura', 'Alexis', 'Joshua', 'Brandon', 'Justin', 'Daniel', 'Daniel', 'Taylor',
    ]
    last_names = [
        'Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson',
        'Moore', 'Taylor', 'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin',
        'Thompson', 'Garcia', 'Martinez', 'Robinson', 'Clark', 'Rodriguez', 'Lewis', 'Lee',
        'Walker', 'Hall', 'Allen', 'Young', 'Hernandez', 'King', 'Wright', 'Lopez',
        'Hill', 'Scott', 'Green', 'Adams', 'Baker', 'Gonzalez', 'Nelson', 'Carter',
        'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips', 'Campbell', 'Parker', 'Evans',
        'Edwards', 'Collins', 'Stewart', 'Sanchez', 'Morris', 'Rogers', 'Reed', 'Cook',
        'Morgan', 'Bell', 'Murphy', 'Bailey', 'Rivera', 'Cooper', 'Richardson', 'Cox',
        'Howard', 'Ward', 'Torres', 'Peterson', 'Gray', 'Ramirez', 'James', 'Watson',
        'Brooks', 'Kelly', 'Sanders', 'Price', 'Bennett', 'Wood', 'Barnes', 'Ross',
        'Henderson', 'Coleman', 'Jenkins', 'Perry', 'Powell', 'Long', 'Patterson', 'Hughes',
        'Flores', 'Washington', 'Butler', 'Simmons', 'Foster', 'Gonzales', 'Bryant', 'Alexander',
        'Russell', 'Griffin', 'Diaz', 'Hayes', 'Myers', 'Ford', 'Hamilton', 'Graham',
        'Sullivan', 'Wallace', 'Woods', 'Cole', 'West', 'Jordan', 'Owens', 'Reynolds',
        'Fisher', 'Ellis', 'Harrison', 'Gibson', 'Mcdonald', 'Cruz', 'Marshall', 'Ortiz',
        'Gomez', 'Murray', 'Freeman', 'Wells', 'Webb', 'Simpson', 'Stevens', 'Tucker',
        'Porter', 'Hunter', 'Hicks', 'Crawford', 'Henry', 'Boyd', 'Mason', 'Kennedy',
        'Warren', 'Dixon', 'Ramos', 'Reid', 'Carr', 'Chavez', 'Gibson',
    ]

    # Last counter value handed out per ID prefix.
    last_sequence = {}
    individual_rows = []

    for _ in range(num_customers):
        # The order of the random draws below is significant for
        # reproducibility under a fixed seed.
        given_name = random.choice(first_names)
        family_name = random.choice(last_names)
        full_name = f"{given_name} {family_name}"

        # Weighted draws via np.random.choice.
        gender_raw = np.random.choice(genders, p=gender_probs)
        age_group = np.random.choice(age_groups, p=age_probs)
        income_level = np.random.choice(income_levels, p=income_probs)

        # Random two-letter country and state/province codes.
        country_code = ''.join(random.choices(string.ascii_uppercase, k=2))
        state_code = ''.join(random.choices(string.ascii_uppercase, k=2))

        gender_code = 'W' if gender_raw != 'Male' else 'M'
        prefix = 'C' + gender_code + country_code + state_code

        # First use of a prefix gets 0; afterwards the counter increments.
        sequence = last_sequence.get(prefix, -1) + 1
        last_sequence[prefix] = sequence

        customer_id = f"{prefix}{sequence % 10000:04d}"

        individual_rows.append([
            customer_id, full_name, 'Individual', gender_raw, age_group,
            income_level, 'NA', 'NA', 'NA',
        ])

    individual_df = pd.DataFrame(individual_rows, columns=column_names)

    # Business customers as (id, name, country, state/province, city).
    business_profiles = [
        ('B001', 'General Motors', 'United States', 'Michigan', 'Detroit'),
        ('B002', 'Ford Motor Company', 'United States', 'Michigan', 'Dearborn'),
        ('B003', 'Toyota', 'Japan', 'Aichi', 'Toyota'),
        ('B004', 'Volkswagen', 'Germany', 'Lower Saxony', 'Wolfsburg'),
        ('B005', 'Stellantis', 'Netherlands', 'North Holland', 'Amsterdam'),
        ('B006', 'Honda', 'Japan', 'Tokyo', 'Tokyo'),
        ('B007', 'Nissan', 'Japan', 'Kanagawa', 'Yokohama'),
        ('B008', 'Delta Air Lines', 'United States', 'Georgia', 'Atlanta'),
        ('B009', 'United Airlines', 'United States', 'Illinois', 'Chicago'),
        ('B010', 'American Airlines', 'United States', 'Texas', 'Fort Worth'),
        ('B011', 'Shell', 'Netherlands', 'South Holland', 'The Hague'),
        ('B012', 'ExxonMobil', 'United States', 'Texas', 'Irving'),
        ('B013', 'BP', 'United Kingdom', 'Greater London', 'London'),
        ('B014', 'Chevron', 'United States', 'California', 'San Ramon'),
        ('B015', 'TotalEnergies', 'France', 'Île-de-France', 'Courbevoie'),
    ]
    business_rows = [
        [biz_id, biz_name, 'Business', 'NA', 'NA', 'NA', country, region, city]
        for biz_id, biz_name, country, region, city in business_profiles
    ]
    business_df = pd.DataFrame(business_rows, columns=column_names)

    # Individuals first, then businesses, with a fresh 0..n-1 index.
    return pd.concat([individual_df, business_df], ignore_index=True)

if __name__ == '__main__':
    # Time the whole run: generation plus the CSV write.
    start_time = time.time()

    print("Generating Dim_Customer table...")
    dim_customer_df = generate_dim_customer()

    # Make sure the output folder exists before writing into it.
    output_dir = './output_data'
    csv_path = os.path.join(output_dir, 'Dim_Customer.csv')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Customer.csv...")
    dim_customer_df.to_csv(csv_path, index=False, encoding='utf-8')

    end_time = time.time()
    elapsed_seconds = end_time - start_time
    print(f"Dim_Customer.csv has been successfully generated with {len(dim_customer_df)} rows in {elapsed_seconds:.2f} seconds.")


Generating Dim_Customer table...
Saving Dim_Customer.csv...
Dim_Customer.csv has been successfully generated with 50015 rows in 0.97 seconds.


### **4/6: 生成 Dim_Geography 表（按大洲、国家编号；相对静态数据）。该维度表可能需要更新（Update）或追加（Append）：例如，一个客户的地址可能会发生变化，或更新到新的国家和城市，此时就需要更新或追加表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）。** ###

In [35]:
# -*- coding: utf-8 -*-
"""4/6: Generate Dim_Geography Table"""

import pandas as pd
import os
import time
import numpy as np
import random
import string

# Use a fixed random seed for reproducibility
# generate_plausible_zip() draws from Python's `random` module, so the
# generated zip codes repeat from run to run. NumPy's global RNG is seeded as
# well, although the visible code in this cell does not draw from it.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_plausible_zip(country):
    """
    Return a random postal code for `country` as an 8-character string.

    A raw code in the country's usual shape is generated first and then
    right-padded with '0' (or truncated) to exactly 8 characters. Countries
    without a known format get the constant "00000000" and consume no random
    numbers.

    Args:
        country: Country name, e.g. 'United States' or 'Japan'.

    Returns:
        An 8-character string.
    """
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'

    # Purely numeric formats: (countries, lowest value, highest value).
    numeric_formats = (
        # 5-digit codes whose first digit is never zero.
        (('United States', 'Mexico', 'France',
          'Germany', 'Italy', 'Spain', 'Switzerland', 'Netherlands', 'Denmark',
          'Norway', 'Sweden', 'Finland', 'Greece', 'Iceland', 'Ireland',
          'Luxembourg', 'Monaco'), 10000, 99999),
        # 6-digit codes.
        (('China',), 100000, 999999),
        # 7-digit codes (generated without the usual hyphen).
        (('Japan',), 1000000, 9999999),
        # 5 to 7 digits.
        (('South Korea', 'Taiwan', 'Hong Kong', 'Macau'), 10000, 9999999),
        # 4-digit codes.
        (('Australia', 'New Zealand'), 1000, 9999),
    )

    raw_code = None
    for names, lowest, highest in numeric_formats:
        if country in names:
            raw_code = str(random.randint(lowest, highest))
            break

    if raw_code is None:
        if country == 'Canada':
            # A1A1A1: alternating letter / digit, written without the space.
            raw_code = ''.join(
                random.choice(pool)
                for pool in (letters, digits, letters, digits, letters, digits)
            )
        elif country == 'United Kingdom':
            # 1-2 letters, 1-2 digits, then digit + two letters, no space.
            letter_count = random.choice([1, 2])
            area = ''.join(random.choices(letters, k=letter_count))
            digit_count = random.choice([1, 2])
            district = ''.join(random.choices(digits, k=digit_count))
            inward = random.choice(digits) + random.choice(letters) + random.choice(letters)
            raw_code = area + district + inward
        else:
            return "00000000"

    # Right-pad with zeros, then cut to exactly 8 characters.
    return (raw_code + '0' * 8)[:8]

def generate_dim_geography():
    """
    Build the Dim_Geography table: one row per state/province.

    Countries are grouped by continent (North America, Europe, Asia,
    Oceania). For every province the function derives:

    * State_Province_Abbr: the upper-cased initials of the province's
      space-separated words (words whose first character is not a letter are
      skipped), right-padded with 'X' and cut to two characters. The result
      is NOT an official abbreviation and is not unique on its own (e.g.
      every single-word province starting with 'A' yields 'AX'); initials may
      be non-ASCII letters (e.g. 'Å', 'Î', 'Ö').
    * Geo_ID: continent code + country code + abbreviation + a two-digit
      counter that restarts at 01 for each country. The counter is what makes
      the ID unique within a country.
    * Zip_Code: a single random code from generate_plausible_zip(country).

    Returns:
        pandas.DataFrame with the columns Geo_ID, Continent, Country,
        Country_Code, State_Province, State_Province_Abbr and Zip_Code.
    """
    geography_data = []

    # Two-letter continent codes used as the Geo_ID prefix.
    continent_codes = {
        'North America': 'NA',
        'Europe': 'EU',
        'Asia': 'AS',
        'Oceania': 'OC'
    }

    # North America
    north_america_countries = {
        'United States': {
            'code': 'US',
            'provinces': [
                'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
                'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
                'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
                'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
                'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
                'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia',
                'Wisconsin', 'Wyoming'
            ]
        },
        'Canada': {
            'code': 'CA',
            'provinces': [
                'Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador',
                'Nova Scotia', 'Ontario', 'Prince Edward Island', 'Québec', 'Saskatchewan',
                'Northwest Territories', 'Nunavut', 'Yukon'
            ]
        },
        'Mexico': {
            'code': 'MX',
            'provinces': [
                'Aguascalientes', 'Baja California', 'Baja California Sur', 'Campeche', 'Chiapas',
                'Chihuahua', 'Coahuila', 'Colima', 'Durango', 'Guanajuato', 'Guerrero', 'Hidalgo',
                'Jalisco', 'México', 'Distrito Federal', 'Michoacán', 'Morelos', 'Nayarit',
                'Nuevo León', 'Oaxaca', 'Puebla', 'Querétaro', 'Quintana Roo', 'San Luis Potosí',
                'Sinaloa', 'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala', 'Veracruz', 'Yucatán', 'Zacatecas'
            ]
        }
    }

    # Europe
    europe_countries = {
        'Germany': {
            'code': 'DE',
            'provinces': [
                'Baden-Württemberg', 'Bavaria', 'Berlin', 'Brandenburg', 'Bremen', 'Hamburg',
                'Hesse', 'Lower Saxony', 'Mecklenburg-Vorpommern', 'North Rhine-Westphalia',
                'Rhineland-Palatinate', 'Saarland', 'Saxony', 'Saxony-Anhalt',
                'Schleswig-Holstein', 'Thuringia'
            ]
        },
        'United Kingdom': {
            'code': 'GB',
            'provinces': [
                'England', 'Scotland', 'Wales', 'Northern Ireland'
            ]
        },
        'Norway': {'code': 'NO', 'provinces': ['Oslo', 'Viken', 'Innlandet', 'Vestfold og Telemark', 'Agder', 'Rogaland', 'Vestland', 'Møre og Romsdal', 'Trøndelag', 'Nordland', 'Troms og Finnmark']},
        'France': {'code': 'FR', 'provinces': ['Bretagne', 'Normandie', 'Île-de-France', 'Auvergne-Rhône-Alpes', 'Bourgogne-Franche-Comté', 'Centre-Val de Loire', 'Corsica', 'Grand Est', 'Hauts-de-France', 'Nouvelle-Aquitaine', 'Occitanie', 'Pays de la Loire', 'Provence-Alpes-Côte d\'Azur']},
        'Netherlands': {'code': 'NL', 'provinces': ['Drenthe', 'Flevoland', 'Friesland', 'Gelderland', 'Groningen', 'Limburg', 'North Brabant', 'North Holland', 'Overijssel', 'Utrecht', 'Zeeland', 'South Holland']},
        'Sweden': {'code': 'SE', 'provinces': ['Blekinge', 'Dalarna', 'Gotland', 'Gävleborg', 'Halland', 'Jämtland', 'Jönköping', 'Kalmar', 'Kronoberg', 'Norrbotten', 'Skåne', 'Stockholm', 'Södermanland', 'Uppsala', 'Värmland', 'Västerbotten', 'Västernorrland', 'Västmanland', 'Västra Götaland', 'Örebro', 'Östergötland']},
        'Switzerland': {'code': 'CH', 'provinces': ['Zurich', 'Bern', 'Lucerne', 'Uri', 'Schwyz', 'Obwalden', 'Nidwalden', 'Glarus', 'Zug', 'Fribourg', 'Solothurn', 'Basel-Stadt', 'Basel-Landschaft', 'Schaffhausen', 'Appenzell Ausserrhoden', 'Appenzell Innerrhoden', 'St. Gallen', 'Graubünden', 'Aargau', 'Thurgau', 'Ticino', 'Vaud', 'Valais', 'Neuchâtel', 'Geneva', 'Jura']},
        'Italy': {'code': 'IT', 'provinces': ['Abruzzo', 'Aosta Valley', 'Apulia', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Liguria', 'Lombardy', 'Marche', 'Molise', 'Piedmont', 'Sardinia', 'Sicily', 'Tuscany', 'Trentino-Alto Adige', 'Umbria', 'Veneto']},
        'Spain': {'code': 'ES', 'provinces': ['Andalusia', 'Aragon', 'Principality of Asturias', 'Balearic Islands', 'Basque Country', 'Canary Islands', 'Cantabria', 'Castile and León', 'Castile-La Mancha', 'Catalonia', 'Community of Madrid', 'Valencian Community', 'Extremadura', 'Galicia', 'La Rioja', 'Region of Murcia', 'Foral Community of Navarre']},
        'Denmark': {'code': 'DK', 'provinces': ['Capital Region of Denmark', 'Central Denmark Region', 'North Denmark Region', 'Region Zealand', 'Region of Southern Denmark']},
        'Finland': {'code': 'FI', 'provinces': ['Åland Islands', 'Central Finland', 'Central Ostrobothnia', 'Kainuu', 'Kymenlaakso', 'Lapland', 'North Karelia', 'North Ostrobothnia', 'Northern Savonia', 'Päijät-Häme', 'Pirkanmaa', 'Satakunta', 'South Karelia', 'Southern Ostrobothnia', 'Southern Savonia', 'Tavastia Proper', 'Uusimaa', 'Southwest Finland']},
        'Greece': {'code': 'GR', 'provinces': ['Attica', 'Central Greece', 'Central Macedonia', 'Crete', 'East Macedonia and Thrace', 'Epirus', 'Ionian Islands', 'North Aegean', 'Peloponnese', 'South Aegean', 'Thessaly', 'West Greece', 'West Macedonia']},
        'Iceland': {'code': 'IS', 'provinces': ['Capital Region', 'Southern Peninsula', 'Western Region', 'Westfjords', 'Northwest Region', 'Northeast Region', 'Eastern Region', 'Southern Region']},
        'Ireland': {'code': 'IE', 'provinces': ['Connacht', 'Leinster', 'Munster', 'Ulster']},
        'Luxembourg': {'code': 'LU', 'provinces': ['Diekirch', 'Grevenmacher', 'Luxembourg']},
        'Monaco': {'code': 'MC', 'provinces': ['Monaco']}
    }

    # Asia
    asia_countries = {
        'China': {
            'code': 'CN',
            'provinces': [
                'Anhui', 'Fujian', 'Gansu', 'Guangdong', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang',
                'Henan', 'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Liaoning', 'Qinghai',
                'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Yunnan', 'Zhejiang',
                'Guangxi', 'Nei Mongol', 'Ningxia Hui', 'Xinjiang Uygur', 'Xizang',
                'Beijing', 'Chongqing', 'Shanghai', 'Tianjin'
            ]
        },
        'Hong Kong': {'code': 'HK', 'provinces': ['Hong Kong Island', 'Kowloon', 'New Territories']},
        'Macau': {'code': 'MO', 'provinces': ['Macau']},
        'Japan': {
            'code': 'JP',
            'provinces': [
                'Hokkaido', 'Aomori', 'Iwate', 'Miyagi', 'Akita', 'Yamagata', 'Fukushima',
                'Ibaraki', 'Tochigi', 'Gunma', 'Saitama', 'Chiba', 'Tokyo', 'Kanagawa',
                'Niigata', 'Toyama', 'Ishikawa', 'Fukui', 'Yamanashi', 'Nagano',
                'Gifu', 'Shizuoka', 'Aichi', 'Mie', 'Shiga', 'Kyoto', 'Osaka',
                'Hyōgo', 'Nara', 'Wakayama', 'Tottori', 'Shimane', 'Okayama',
                'Hiroshima', 'Yamaguchi', 'Tokushima', 'Kagawa', 'Ehime', 'Kochi',
                'Fukuoka', 'Saga', 'Nagasaki', 'Kumamoto', 'Oita', 'Miyazaki', 'Kagoshima', 'Okinawa'
            ]
        },
        'South Korea': {
            'code': 'KR',
            'provinces': [
                'Busan', 'Chungcheongbuk-do', 'Chungcheongnam-do', 'Daegu', 'Daejeon', 'Gangwon-do',
                'Gwangju', 'Gyeonggi-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Incheon', 'Jeollabuk-do',
                'Jeollanam-do', 'Sejong', 'Seoul', 'Ulsan', 'Jeju'
            ]
        },
        'Taiwan': {
            'code': 'TW',
            'provinces': [
                'Taipei', 'New Taipei', 'Taichung', 'Tainan', 'Kaohsiung', 'Taoyuan',
                'Keelung', 'Hsinchu City', 'Chiayi City', 'Hsinchu County', 'Chiayi County',
                'Changhua', 'Nantou', 'Yunlin', 'Miaoli', 'Pingtung', 'Yilan', 'Hualien',
                'Taitung', 'Penghu', 'Kinmen', 'Lienkiang'
            ]
        }
    }

    # Oceania
    oceania_countries = {
        'Australia': {
            'code': 'AU',
            'provinces': [
                'New South Wales', 'Victoria', 'Queensland', 'South Australia', 'Western Australia',
                'Tasmania', 'Australian Capital Territory', 'Northern Territory'
            ]
        },
        'New Zealand': {
            'code': 'NZ',
            'provinces': [
                'Auckland', 'Bay of Plenty', 'Canterbury', 'Gisborne', 'Hawke\'s Bay',
                'Manawatu-Wanganui', 'Marlborough', 'Nelson', 'Northland', 'Otago',
                'Southland', 'Taranaki', 'Tasman', 'Waikato', 'Wellington', 'West Coast'
            ]
        }
    }

    continents = {
        'North America': north_america_countries,
        'Europe': europe_countries,
        'Asia': asia_countries,
        'Oceania': oceania_countries
    }

    for continent, countries in continents.items():
        for country, details in countries.items():
            # state_id restarts at 1 for every country.
            for state_id, province in enumerate(details['provinces'], start=1):
                # split() (rather than split(' ')) never yields empty words,
                # so word[0] is safe even if a name contains repeated spaces.
                initials = ''.join(
                    word[0].upper() for word in province.split() if word[0].isalpha()
                )

                # Pad single-initial names with 'X', then keep two characters.
                state_abbr = initials.ljust(2, 'X')[:2]

                # Continent (2) + country (2) + abbreviation (2) + counter (2).
                geo_id = f"{continent_codes[continent]}{details['code']}{state_abbr}{state_id:02d}"

                geography_data.append([
                    geo_id,
                    continent,
                    country,
                    details['code'],
                    province,
                    state_abbr,
                    generate_plausible_zip(country)
                ])

    dim_geography_df = pd.DataFrame(geography_data, columns=[
        'Geo_ID', 'Continent', 'Country', 'Country_Code', 'State_Province', 'State_Province_Abbr', 'Zip_Code'
    ])

    return dim_geography_df

if __name__ == '__main__':
    # Time the whole run: generation plus the CSV write.
    start_time = time.time()

    print("Generating Dim_Geography table...")
    dim_geography_df = generate_dim_geography()

    # Make sure the output folder exists before writing into it.
    output_dir = './output_data'
    csv_path = os.path.join(output_dir, 'Dim_Geography.csv')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Geography.csv...")
    dim_geography_df.to_csv(csv_path, index=False, encoding='utf-8')

    end_time = time.time()
    elapsed_seconds = end_time - start_time
    print(f"Dim_Geography.csv has been successfully generated with {len(dim_geography_df)} rows in {elapsed_seconds:.2f} seconds.")

Generating Dim_Geography table...
Saving Dim_Geography.csv...
Dim_Geography.csv has been successfully generated with 432 rows in 0.00 seconds.


### **5/6: 生成 Dim_Prices 表（相对静态数据）。该维度表可能需要更新（Update）或追加（Append）：例如，新产品发布或不同时段的价格发生变化时，就需要更新或追加表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）。** ###

In [43]:
# -*- coding: utf-8 -*-
"""Generate Dim_Prices Table"""

import pandas as pd
from datetime import datetime
import os
import random
import numpy as np

# Use a fixed random seed for reproducibility
# generate_dim_prices() below perturbs prices with np.random.randint, so the
# NumPy seed fixes those values across runs. Python's `random` module is
# seeded too — presumably for use further down; verify against the full cell.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_prices():
    """
    Build the Dim_Prices table: one row per product per quarter start date.

    Three pricing regimes are stitched together:
      * 2013: Model S only, fixed at 75,000 for all four quarters.
      * 2014-2018: base price plus an integer jitter drawn from ``np.random``;
        a model only appears from the first quarter on or after its launch
        cut-off. Standard and discounted prices are equal in this period.
      * 2019 through 2025-06-30: base price with a +/-5% fluctuation drawn
        from ``random``; roughly 20% of rows also get a 1%-15% discount.

    Returns:
        pandas.DataFrame with columns ``Price_ID`` (``PRI###``, numbered in
        generation order), ``Product_ID``, ``Quarter_Start_Date``
        (``datetime.date``), ``Standard_Price_USD`` and
        ``Discounted_Price_USD``.
    """
    last_day = datetime(2025, 6, 30)
    quarter_months = (1, 4, 7, 10)

    # Base prices per model (estimates), keyed by an internal model number.
    list_price = {
        1: 75000,   # Model S
        2: 80000,   # Model X
        3: 40000,   # Model 3
        4: 50000,   # Model Y
        5: 120000,  # Cybertruck
    }

    # Internal model number -> Product_ID as used by the product dimension.
    sku = {
        1: 'PRO001',
        2: 'PRO002',
        3: 'PRO003',
        4: 'PRO004',
        5: 'PRO005',
    }

    def make_row(quarter, model, standard, discounted):
        return {
            'Quarter_Start_Date': quarter,
            'Product_ID': sku[model],
            'Standard_Price_USD': standard,
            'Discounted_Price_USD': discounted,
        }

    # 2013: hand-entered Model S prices so early sales always find a price.
    rows = [make_row(datetime(2013, month, 1), 1, 75000, 75000) for month in quarter_months]

    # 2014-2018: (model, first date it may be priced, jitter half-width).
    # The tuple order (S, X, 3) fixes the order of the RNG draws per quarter.
    early_models = (
        (1, datetime(2014, 1, 1), 2000),   # Model S: priced in every quarter
        (2, datetime(2015, 9, 1), 2000),   # Model X: launched late 2015
        (3, datetime(2017, 7, 1), 1000),   # Model 3: launched mid 2017
    )
    for year in range(2014, 2019):
        for month in quarter_months:
            quarter = datetime(year, month, 1)
            for model, available_from, spread in early_models:
                if quarter < available_from:
                    continue
                jittered = list_price[model] + np.random.randint(-spread, spread)
                rows.append(make_row(quarter, model, jittered, jittered))

    # 2019 onwards: models with a launch gate are skipped until that date.
    launch_gate = {
        4: datetime(2020, 1, 1),    # Model Y
        5: datetime(2023, 11, 1),   # Cybertruck
    }
    quarter = datetime(2019, 1, 1)
    while quarter <= last_day:
        for model, base in list_price.items():
            gate = launch_gate.get(model)
            if gate is not None and quarter < gate:
                continue

            # Random fluctuation within 5% of the base price.
            standard = base * (1 + random.uniform(-0.05, 0.05))

            # About 20% of the rows carry a discount of 1%-15%.
            if random.random() < 0.20:
                discounted = standard * (1 - random.uniform(0.01, 0.15))
            else:
                discounted = standard

            rows.append(make_row(quarter, model, round(standard, 2), round(discounted, 2)))

        # Step to the first day of the next quarter.
        if quarter.month == 10:
            quarter = datetime(quarter.year + 1, 1, 1)
        else:
            quarter = datetime(quarter.year, quarter.month + 3, 1)

    frame = pd.DataFrame(rows)

    # Keep only the calendar date of each quarter start.
    frame['Quarter_Start_Date'] = pd.to_datetime(frame['Quarter_Start_Date']).dt.date

    # Sequential surrogate key in the form PRI###.
    frame['Price_ID'] = [f'PRI{number:03d}' for number in range(1, len(frame) + 1)]

    return frame[['Price_ID', 'Product_ID', 'Quarter_Start_Date', 'Standard_Price_USD', 'Discounted_Price_USD']]

if __name__ == '__main__':
    # Script entry point: build the price dimension and write it to CSV.
    print("正在生成 Dim_Prices 表...")
    dim_prices_df = generate_dim_prices()

    # exist_ok=True replaces the check-then-create pair, which could fail if
    # the directory appeared between the existence check and makedirs().
    output_dir = './output_data'
    os.makedirs(output_dir, exist_ok=True)

    print("保存 Dim_Prices.csv...")
    dim_prices_df.to_csv(os.path.join(output_dir, 'Dim_Prices.csv'), index=False, encoding='utf-8')
    print("Dim_Prices.csv 已成功生成！")


正在生成 Dim_Prices 表...
保存 Dim_Prices.csv...
Dim_Prices.csv 已成功生成！


### **6/6: 生成 Fact_Sales 表（没有空白营收行，追加到 2013 年）（高度动态数据，最常被追加（append）的表）：采用“只进不出”的设计哲学——每当一笔新的销售发生，就在 Fact_Sales 表中追加一行新的数据，而不会去修改之前已经存在的历史销售记录** ###

In [2]:
# -*- coding: utf-8 -*-
"""Generate Fact_Sales Table (CPU Version)"""

import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
from itertools import product
from collections import defaultdict

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    Generate the Fact_Sales table and append it to a CSV file quarter by quarter.

    For every quarter that has a hard-coded revenue target, the function draws
    ``target_units`` single-unit sales. Each sale gets a (product, geography)
    pair sampled by weight, a random Time_ID from that quarter, a random
    customer, and either the standard price (80%) or the discounted price
    (20%) of the product for that quarter. Per-row revenue is then rescaled so
    the quarter's rows sum to the quarterly revenue target.

    All rows of a quarter are generated with vectorised NumPy calls instead of
    one Python-level ``np.random.choice`` per field per row, and the input
    DataFrames are only read, never modified.

    Args:
        dim_product_df: product dimension; needs ``Product_ID`` and optionally
            ``Product_Name`` (derived from ``Product_ID`` when missing).
        dim_time_df: time dimension; needs ``Time_ID`` and ``Full_Date``.
        dim_customer_df: customer dimension; needs ``Customer_ID``.
        dim_geography_df: geography dimension; needs ``Geo_ID``, ``Country``
            and ``State_Province``.
        dim_prices_df: price dimension; needs ``Product_ID``,
            ``Quarter_Start_Date``, ``Standard_Price_USD`` and
            ``Discounted_Price_USD``.
        output_filepath: CSV path opened in append mode; the header is written
            with the first quarter only.

    Returns:
        int: total number of rows written (0 when there is nothing to sample).

    Raises:
        KeyError: if ``dim_prices_df`` lacks one of the two price columns.
    """
    end_date = datetime(2025, 6, 30)

    # Quarterly revenue targets, expressed in USD.
    quarterly_revenue = {
        # 2013 ($2.01B) - Model S only, split across quarters
        (2013, 1): 0.4e9, (2013, 2): 0.45e9, (2013, 3): 0.53e9, (2013, 4): 0.63e9,
        # 2014 ($3.2B)
        (2014, 1): 0.65e9, (2014, 2): 0.75e9, (2014, 3): 0.85e9, (2014, 4): 0.95e9,
        # 2015 ($4.05B)
        (2015, 1): 0.8e9, (2015, 2): 0.9e9, (2015, 3): 1.0e9, (2015, 4): 1.35e9,
        # 2016 ($7B)
        (2016, 1): 1.4e9, (2016, 2): 1.6e9, (2016, 3): 1.9e9, (2016, 4): 2.1e9,
        # 2017 ($11.76B)
        (2017, 1): 2.3e9, (2017, 2): 2.6e9, (2017, 3): 3.0e9, (2017, 4): 3.86e9,
        # 2018 ($21.46B)
        (2018, 1): 4.1e9, (2018, 2): 4.9e9, (2018, 3): 5.8e9, (2018, 4): 6.66e9,
        # 2019-2025 quarterly revenue
        (2019, 1): 4.541e9, (2019, 2): 6.350e9, (2019, 3): 6.303e9, (2019, 4): 7.384e9,
        (2020, 1): 5.985e9, (2020, 2): 6.036e9, (2020, 3): 8.771e9, (2020, 4): 10.744e9,
        (2021, 1): 10.389e9, (2021, 2): 11.958e9, (2021, 3): 13.757e9, (2021, 4): 17.719e9,
        (2022, 1): 18.756e9, (2022, 2): 16.934e9, (2022, 3): 21.454e9, (2022, 4): 24.318e9,
        (2023, 1): 23.329e9, (2023, 2): 24.927e9, (2023, 3): 23.350e9, (2023, 4): 25.167e9,
        (2024, 1): 21.301e9, (2024, 2): 25.500e9, (2024, 3): 25.182e9, (2024, 4): 25.707e9,
        (2025, 1): 19.335e9, (2025, 2): 22.496e9
    }

    # Annual delivery (unit) targets.
    unit_targets_by_year = {
        2013: 22442,
        2014: 31655,
        2015: 50517,
        2016: 76243,
        2017: 103091,
        2018: 245491,
        2019: 367656,
        2020: 499535,
        2021: 936222,
        2022: 1313851,
        2023: 1808581,
        2024: 1789226
    }
    unit_targets_by_year[2025] = 336681 + 384122

    # Hand-assigned quarterly unit splits for 2013-2018.
    quarterly_unit_splits = {
        2013: {1: 4750, 2: 5150, 3: 5800, 4: 6742},
        2014: {1: 6450, 2: 7570, 3: 8800, 4: 8835},
        2015: {1: 10045, 2: 11532, 3: 11584, 4: 17356},
        2016: {1: 14810, 2: 18345, 3: 24500, 4: 18588},
        2017: {1: 25418, 2: 22000, 3: 26135, 4: 29538},
        2018: {1: 29980, 2: 40740, 3: 83780, 4: 90991}
    }

    # Fail fast on a malformed price table before doing any work.
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv 文件中缺少必要的列。请检查文件是否包含 'Standard_Price_USD' 和 'Discounted_Price_USD'。")

    # --------------------------
    # Step 1: derive product names and continents (inputs stay untouched)
    # --------------------------
    if 'Product_Name' in dim_product_df.columns:
        product_names = dim_product_df['Product_Name']
    else:
        print("警告：Dim_Product.csv中缺少'Product_Name'列，正在根据Product_ID创建。")
        product_id_to_name = {
            'PRO001': 'Model S', 'PRO002': 'Model X', 'PRO003': 'Model 3', 'PRO004': 'Model Y', 'PRO005': 'Cybertruck'
        }
        product_names = dim_product_df['Product_ID'].map(product_id_to_name).fillna('Other')

    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']

    # Any country not listed below is treated as North America.
    continent_by_country = {}
    continent_by_country.update(dict.fromkeys(europe_countries, 'Europe'))
    continent_by_country.update(dict.fromkeys(oceania_countries, 'Oceania'))
    continent_by_country.update(dict.fromkeys(asia_countries, 'Asia'))

    # --------------------------
    # Step 2: weights
    # --------------------------
    product_weights_by_name = {
        'Model S': 0.45, 'Model X': 0.45, 'Model 3': 0.05, 'Model Y': 0.04, 'Cybertruck': 0.01
    }

    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }

    # Row weight = continent weight * country weight * state weight, with 0.01
    # as the default for unlisted countries/states. Rows with a missing country
    # or state, and any zero weight, fall back to 0.0001 so every Geo_ID stays
    # sampleable.
    countries = dim_geography_df['Country']
    states = dim_geography_df['State_Province']
    continent_w = countries.map(continent_by_country).fillna('North America').map(continent_weights)
    country_w = countries.map(country_weights).fillna(0.01)
    state_w = states.map(state_province_weights).fillna(0.01)
    geo_weight = continent_w * country_w * state_w
    geo_weight = geo_weight.where(countries.notna() & states.notna(), 0.0)
    geo_weight = geo_weight.replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].to_numpy()

    # --------------------------
    # Step 3: normalise the time dimension into local Series
    # --------------------------
    full_dates = pd.to_datetime(dim_time_df['Full_Date'])
    time_years = full_dates.dt.year
    time_quarters = full_dates.dt.quarter
    time_quarter_starts = full_dates.dt.to_period('Q').dt.start_time

    # --------------------------
    # Step 4: price lookup keyed by (quarter start, product id)
    # --------------------------
    price_quarter_starts = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    price_lookup = {
        (quarter_start, product_id): (standard, discounted)
        for quarter_start, product_id, standard, discounted in zip(
            price_quarter_starts,
            dim_prices_df['Product_ID'],
            dim_prices_df['Standard_Price_USD'],
            dim_prices_df['Discounted_Price_USD'],
        )
    }

    # Fallback price for a product without a price row in a given quarter.
    model_avg_prices = dim_prices_df.groupby('Product_ID')['Standard_Price_USD'].mean().to_dict()

    # Plain dicts de-duplicate repeated IDs (the last occurrence wins).
    product_weights_dict = {
        pid: product_weights_by_name.get(pname, 0.0001)
        for pid, pname in zip(dim_product_df['Product_ID'], product_names)
    }
    geo_weights_dict = dict(zip(dim_geography_df['Geo_ID'], geo_weight))

    all_product_ids = list(product_weights_dict)
    all_geo_ids = list(geo_weights_dict)
    n_geos = len(all_geo_ids)

    # Flattened (product, geography) weight grid. Row-major order means
    # combo index = product_index * n_geos + geo_index.
    product_weight_arr = np.array([product_weights_dict[pid] for pid in all_product_ids], dtype=float)
    geo_weight_arr = np.array([geo_weights_dict[gid] for gid in all_geo_ids], dtype=float)
    combo_weights = np.outer(product_weight_arr, geo_weight_arr).ravel()

    total_combo_weight = combo_weights.sum()
    if total_combo_weight == 0:
        print("警告：总组合权重为零，无法进行数据生成。")
        return 0

    combo_probabilities = combo_weights / total_combo_weight
    n_combos = len(combo_weights)

    # Object arrays keep the IDs as their original Python objects when indexed.
    product_id_arr = np.empty(len(all_product_ids), dtype=object)
    product_id_arr[:] = all_product_ids
    geo_id_arr = np.empty(n_geos, dtype=object)
    geo_id_arr[:] = all_geo_ids

    # Annual revenue totals, used to split annual unit targets by quarter.
    revenue_by_year = {}
    for (rev_year, _), amount in quarterly_revenue.items():
        revenue_by_year[rev_year] = revenue_by_year.get(rev_year, 0) + amount

    start_year = min(unit_targets_by_year)
    total_generated_rows = 0
    header_written = False

    for year in range(start_year, end_date.year + 1):
        for quarter in range(1, 5):
            # Only quarters with a revenue target are generated.
            if (year, quarter) not in quarterly_revenue:
                continue
            target_revenue = quarterly_revenue[(year, quarter)]

            if year < 2019:
                # Before 2019 the quarterly unit counts are given explicitly.
                if year not in quarterly_unit_splits or quarter not in quarterly_unit_splits[year]:
                    continue
                target_units = quarterly_unit_splits[year][quarter]
            else:
                # From 2019 the annual units are split in proportion to revenue.
                total_year_units = unit_targets_by_year.get(year, 0)
                if total_year_units == 0:
                    continue
                total_year_revenue = revenue_by_year.get(year, 0)
                if total_year_revenue == 0:
                    continue
                quarter_revenue_ratio = target_revenue / total_year_revenue
                target_units = int(total_year_units * quarter_revenue_ratio)

            if target_units <= 0:
                continue

            print(f"正在为年份 {year} 第 {quarter} 季度生成 {target_units:,} 条销售记录...")

            in_quarter = (time_years == year) & (time_quarters == quarter)
            quarter_time_ids = dim_time_df.loc[in_quarter, 'Time_ID'].to_numpy()
            if quarter_time_ids.size == 0:
                print(f"警告：年份 {year} 第 {quarter} 季度没有可用的时间组合，跳过生成。")
                continue
            quarter_start_date = time_quarter_starts[in_quarter].iloc[0]

            # Resolve this quarter's prices once per product, not once per row.
            standard_by_product = np.empty(len(all_product_ids), dtype=float)
            discounted_by_product = np.empty(len(all_product_ids), dtype=float)
            for p_idx, pid in enumerate(all_product_ids):
                prices = price_lookup.get((quarter_start_date, pid))
                if prices is not None:
                    standard_by_product[p_idx], discounted_by_product[p_idx] = prices
                else:
                    fallback = model_avg_prices.get(pid, 0)
                    standard_by_product[p_idx] = fallback
                    discounted_by_product[p_idx] = fallback

            # Draw every random field of the quarter in one vectorised call each.
            combo_idx = np.random.choice(n_combos, size=target_units, p=combo_probabilities)
            product_idx = combo_idx // n_geos
            geo_idx = combo_idx % n_geos

            # 20% of sales use the discounted price, 80% the standard price.
            is_discounted = np.random.random(target_units) < 0.2
            price_used = np.where(
                is_discounted,
                discounted_by_product[product_idx],
                standard_by_product[product_idx],
            )

            # Rescale so the quarter's rows sum to the revenue target.
            total_generated_revenue = price_used.sum()
            if total_generated_revenue > 0:
                scaling_factor = target_revenue / total_generated_revenue
            else:
                scaling_factor = 0

            fact_sales_df_temp = pd.DataFrame({
                'Time_ID': np.random.choice(quarter_time_ids, size=target_units),
                'Geo_ID': geo_id_arr[geo_idx],
                'Product_ID': product_id_arr[product_idx],
                'Customer_ID': np.random.choice(customer_ids, size=target_units),
                'Sales_Units': 1,
                'Is_Discounted_Sale': is_discounted,
                'Revenue_USD': price_used * scaling_factor,
            })

            # Write the header with the first chunk only, then append.
            fact_sales_df_temp.to_csv(output_filepath, mode='a', header=not header_written, index=False, encoding='utf-8')
            header_written = True
            total_generated_rows += len(fact_sales_df_temp)

    return total_generated_rows

if __name__ == '__main__':
    # Script entry point: load the five dimension CSVs, regenerate
    # Fact_Sales.csv from scratch and report the row count and elapsed time.
    start_time = time.time()

    print("正在加载所有维度表...")
    try:
        dim_product_df = pd.read_csv(os.path.join('./output_data', 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(os.path.join('./output_data', 'Dim_Time.csv'))
        dim_customer_df = pd.read_csv(os.path.join('./output_data', 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join('./output_data', 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join('./output_data', 'Dim_Prices.csv'))
    except FileNotFoundError as e:
        print(f"错误：缺少一个或多个必需的 CSV 文件。请先运行所有维度生成脚本（1-5）。\n{e}")
        # The bare exit() helper is injected by the `site` module for
        # interactive use and exits with status 0; raise SystemExit with a
        # non-zero status so the failure is visible to the caller.
        raise SystemExit(1)

    # exist_ok=True replaces the check-then-create pair, which could fail if
    # the directory appeared between the existence check and makedirs().
    output_dir = './output_data'
    os.makedirs(output_dir, exist_ok=True)
    output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')

    # generate_fact_sales() appends to the file, so remove any previous
    # output to start from a clean state.
    if os.path.exists(output_filepath):
        os.remove(output_filepath)
        print("已移除旧的 Fact_Sales.csv 文件。")

    print("正在生成 Fact_Sales 表...")
    total_rows = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)

    if total_rows > 0:
        end_time = time.time()
        print(f"Fact_Sales.csv 已成功生成 {total_rows:,} 行数据，耗时 {end_time - start_time:.2f} 秒。")
        print("数据生成完成！")
    else:
        print("数据生成失败。")


正在加载所有维度表...
已移除旧的 Fact_Sales.csv 文件。
正在生成 Fact_Sales 表...
正在为年份 2013 第 1 季度生成 4,750 条销售记录...
正在为年份 2013 第 2 季度生成 5,150 条销售记录...
正在为年份 2013 第 3 季度生成 5,800 条销售记录...
正在为年份 2013 第 4 季度生成 6,742 条销售记录...
正在为年份 2014 第 1 季度生成 6,450 条销售记录...
正在为年份 2014 第 2 季度生成 7,570 条销售记录...
正在为年份 2014 第 3 季度生成 8,800 条销售记录...
正在为年份 2014 第 4 季度生成 8,835 条销售记录...
正在为年份 2015 第 1 季度生成 10,045 条销售记录...
正在为年份 2015 第 2 季度生成 11,532 条销售记录...
正在为年份 2015 第 3 季度生成 11,584 条销售记录...
正在为年份 2015 第 4 季度生成 17,356 条销售记录...
正在为年份 2016 第 1 季度生成 14,810 条销售记录...
正在为年份 2016 第 2 季度生成 18,345 条销售记录...
正在为年份 2016 第 3 季度生成 24,500 条销售记录...
正在为年份 2016 第 4 季度生成 18,588 条销售记录...
正在为年份 2017 第 1 季度生成 25,418 条销售记录...
正在为年份 2017 第 2 季度生成 22,000 条销售记录...
正在为年份 2017 第 3 季度生成 26,135 条销售记录...
正在为年份 2017 第 4 季度生成 29,538 条销售记录...
正在为年份 2018 第 1 季度生成 29,980 条销售记录...
正在为年份 2018 第 2 季度生成 40,740 条销售记录...
正在为年份 2018 第 3 季度生成 83,780 条销售记录...
正在为年份 2018 第 4 季度生成 90,991 条销售记录...
正在为年份 2019 第 1 季度生成 67,927 条销售记录...
正在为年份 2019 第 2 季度生成 94,988 条销售记录...
正在为年份 2019 第 

In [11]:
# -*- coding: utf-8 -*-
"""Generate Fact_Sales Table (CPU Version)"""

import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
from itertools import product
from collections import defaultdict

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output.
    """
    end_date = datetime(2025, 6, 30)

    # 季度营收数据 (单位: 10亿 USD)，已根据财报数据更新
    quarterly_revenues_by_category = {
        # 2021Q1 - 2025Q2 财报数据
        (2021, 1): {'Automotive sales': 8.187e9, 'Automotive regulatory credits': 518e6, 'Automotive leasing': 297e6, 'Energy Generation & Storage': 494e6, 'Services & Other': 893e6},
        (2021, 2): {'Automotive sales': 9.520e9, 'Automotive regulatory credits': 354e6, 'Automotive leasing': 332e6, 'Energy Generation & Storage': 801e6, 'Services & Other': 951e6},
        (2021, 3): {'Automotive sales': 11.393e9, 'Automotive regulatory credits': 279e6, 'Automotive leasing': 385e6, 'Energy Generation & Storage': 806e6, 'Services & Other': 894e6},
        (2021, 4): {'Automotive sales': 15.025e9, 'Automotive regulatory credits': 314e6, 'Automotive leasing': 628e6, 'Energy Generation & Storage': 688e6, 'Services & Other': 1.064e9},
        (2022, 1): {'Automotive sales': 15.514e9, 'Automotive regulatory credits': 679e6, 'Automotive leasing': 668e6, 'Energy Generation & Storage': 616e6, 'Services & Other': 1.279e9},
        (2022, 2): {'Automotive sales': 13.670e9, 'Automotive regulatory credits': 344e6, 'Automotive leasing': 588e6, 'Energy Generation & Storage': 866e6, 'Services & Other': 1.466e9},
        (2022, 3): {'Automotive sales': 17.785e9, 'Automotive regulatory credits': 286e6, 'Automotive leasing': 621e6, 'Energy Generation & Storage': 1.117e9, 'Services & Other': 1.645e9},
        (2022, 4): {'Automotive sales': 20.241e9, 'Automotive regulatory credits': 467e6, 'Automotive leasing': 599e6, 'Energy Generation & Storage': 1.310e9, 'Services & Other': 1.701e9},
        (2023, 1): {'Automotive sales': 18.878e9, 'Automotive regulatory credits': 521e6, 'Automotive leasing': 564e6, 'Energy Generation & Storage': 1.529e9, 'Services & Other': 1.837e9},
        (2023, 2): {'Automotive sales': 20.419e9, 'Automotive regulatory credits': 282e6, 'Automotive leasing': 567e6, 'Energy Generation & Storage': 1.509e9, 'Services & Other': 2.150e9},
        (2023, 3): {'Automotive sales': 18.582e9, 'Automotive regulatory credits': 554e6, 'Automotive leasing': 489e6, 'Energy Generation & Storage': 1.559e9, 'Services & Other': 2.166e9},
        (2023, 4): {'Automotive sales': 20.630e9, 'Automotive regulatory credits': 433e6, 'Automotive leasing': 500e6, 'Energy Generation & Storage': 1.438e9, 'Services & Other': 2.166e9},
        (2024, 1): {'Automotive sales': 16.460e9, 'Automotive regulatory credits': 442e6, 'Automotive leasing': 476e6, 'Energy Generation & Storage': 1.635e9, 'Services & Other': 2.288e9},
        (2024, 2): {'Automotive sales': 18.530e9, 'Automotive regulatory credits': 890e6, 'Automotive leasing': 458e6, 'Energy Generation & Storage': 3.014e9, 'Services & Other': 2.608e9},
        (2024, 3): {'Automotive sales': 18.831e9, 'Automotive regulatory credits': 739e6, 'Automotive leasing': 446e6, 'Energy Generation & Storage': 2.376e9, 'Services & Other': 2.790e9},
        (2024, 4): {'Automotive sales': 18.659e9, 'Automotive regulatory credits': 692e6, 'Automotive leasing': 447e6, 'Energy Generation & Storage': 3.061e9, 'Services & Other': 2.848e9},
        (2025, 1): {'Automotive sales': 12.925e9, 'Automotive regulatory credits': 595e6, 'Automotive leasing': 447e6, 'Energy Generation & Storage': 2.730e9, 'Services & Other': 2.638e9},
        (2025, 2): {'Automotive sales': 15.787e9, 'Automotive regulatory credits': 439e6, 'Automotive leasing': 435e6, 'Energy Generation & Storage': 2.789e9, 'Services & Other': 3.046e9},
    }
    
    # 年度交付量数据
    unit_targets_by_year = {
        2013: 22442, 2014: 31655, 2015: 50517, 2016: 76243, 2017: 103091,
        2018: 245491, 2019: 367656, 2020: 499535, 2021: 936222, 2022: 1313851,
        2023: 1808581, 2024: 1789226, 2025: 336681 + 384122
    }
    
    # --------------------------
    # 步骤 1: 数据清洗和预处理
    # --------------------------
    dim_product_df.columns = dim_product_df.columns.str.strip()
    if 'Product_Name' not in dim_product_df.columns:
        raise KeyError("Dim_Product.csv中缺少'Product_Name'列，请检查文件。")
    if 'Product_Category' not in dim_product_df.columns:
        raise KeyError("Dim_Product.csv中缺少'Product_Category'列，请检查文件。")

    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        return 'North America'
        
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)

    # --------------------------
    # 步骤 2: 定义权重
    # --------------------------
    # 定义大陆、国家、省份的权重
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }

    # 预计算所有 Geo_ID 的权重并归一化
    dim_geography_df['Geo_Weight'] = 0.0
    for continent, c_weight in continent_weights.items():
        countries_in_continent = dim_geography_df[dim_geography_df['Continent'] == continent]['Country'].unique()
        for country in countries_in_continent:
            country_w = country_weights.get(country, 0.01)
            states = dim_geography_df[dim_geography_df['Country'] == country]['State_Province'].unique()
            for state in states:
                state_w = state_province_weights.get(state, 0.01)
                mask = (dim_geography_df['Country'] == country) & (dim_geography_df['State_Province'] == state)
                dim_geography_df.loc[mask, 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)
    # 归一化步骤，修复 ValueError
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'] / dim_geography_df['Geo_Weight'].sum()

    customer_ids = dim_customer_df['Customer_ID'].values
    regulatory_customer_names = ['Stellantis', 'Honda', 'Ford']
    regulatory_customer_ids = dim_customer_df[dim_customer_df['Customer_Name'].isin(regulatory_customer_names)]['Customer_ID'].values
    
    # --------------------------
    # 步骤 3: 确保时间数据类型一致
    # --------------------------
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    dim_time_df['Year_Int'] = dim_time_df['Full_Date'].dt.year
    dim_time_df['Quarter_Int'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time

    # --------------------------
    # 步骤 4: 创建价格查找字典
    # --------------------------
    price_lookup = {}
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv 文件中缺少必要的列。请检查文件是否包含 'Standard_Price_USD' 和 'Discounted_Price_USD'。")

    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    for _, row in dim_prices_df.iterrows():
        quarter_start_date = row['Quarter_Start_Date']
        product_id = row['Product_ID']
        price_lookup[(quarter_start_date, product_id)] = {
            'Standard_Price_USD': row['Standard_Price_USD'],
            'Discounted_Price_USD': row['Discounted_Price_USD']
        }

    model_avg_prices = dim_prices_df.groupby('Product_ID')['Standard_Price_USD'].mean().to_dict()
    rough_avg_price_per_car = np.mean(list(model_avg_prices.values()))

    # 建立产品类别到 Product_ID 的映射，并清洗类别名称
    product_category_map = defaultdict(list)
    for _, row in dim_product_df.iterrows():
        cleaned_category = row['Product_Category'].strip()
        product_category_map[cleaned_category].append(row['Product_ID'].strip())

    start_year = min(unit_targets_by_year.keys())
    
    total_generated_rows = 0
    header_written = False
    
    # 循环生成数据，从最早有交付量数据的年份开始
    for year in range(start_year, end_date.year + 1):
        for quarter in range(1, 5):
            if (year, quarter) > (end_date.year, end_date.month // 3 + (1 if end_date.month % 3 > 0 else 0)):
                continue

            print(f"正在生成 {year}年Q{quarter}季度的数据...")
            records = []
            quarter_time_ids = dim_time_df[
                (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
            ]['Time_ID'].tolist()
            
            if not quarter_time_ids:
                continue

            quarter_start_date = dim_time_df[
                (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
            ]['Quarter_Start_Date'].iloc[0]

            # ----------------------------------------------------
            # 步骤 5: 生成销售记录
            # ----------------------------------------------------
            
            if (year, quarter) in quarterly_revenues_by_category:
                # 使用详细财报数据生成销售记录 (2021Q1 onwards)
                quarter_revenue_data = quarterly_revenues_by_category[(year, quarter)]
                total_year_units = unit_targets_by_year.get(year, 0)
                total_year_revenue = sum(sum(d.values()) for y, d in quarterly_revenues_by_category.items() if y[0] == year)
                total_quarter_revenue = sum(quarter_revenue_data.values())
                quarter_revenue_ratio = total_quarter_revenue / total_year_revenue if total_year_revenue > 0 else 0
                car_units = int(total_year_units * quarter_revenue_ratio)

                car_product_ids = product_category_map.get('Automotive', [])
                if car_units > 0 and car_product_ids:
                    car_revenue_target = quarter_revenue_data.get('Automotive sales', 0)
                    
                    sampled_product_ids = np.random.choice(car_product_ids, size=car_units)
                    sampled_geo_ids = np.random.choice(dim_geography_df['Geo_ID'], size=car_units, p=dim_geography_df['Geo_Weight'].values)
                    
                    car_revenues = []
                    for i in range(car_units):
                        product_id = sampled_product_ids[i]
                        geo_id = sampled_geo_ids[i]
                        
                        prices = price_lookup.get((quarter_start_date, product_id))
                        price_used = prices['Discounted_Price_USD'] if prices and np.random.rand() < 0.2 else (prices['Standard_Price_USD'] if prices else model_avg_prices.get(product_id, 0))
                        
                        car_revenues.append(price_used)
                        
                        records.append({
                            'Time_ID': np.random.choice(quarter_time_ids),
                            'Geo_ID': geo_id,
                            'Product_ID': product_id,
                            'Customer_ID': np.random.choice(customer_ids),
                            'Sales_Units': 1,
                            'Is_Discounted_Sale': True if prices and np.random.rand() < 0.2 else False,
                            'Revenue_USD': 0 
                        })
                    
                    total_generated_car_revenue = sum(car_revenues)
                    if total_generated_car_revenue > 0:
                        scaling_factor = car_revenue_target / total_generated_car_revenue
                        for i, record in enumerate(records):
                            record['Revenue_USD'] = car_revenues[i] * scaling_factor
            
                # --- Automotive Regulatory Credits (监管积分) ---
                regulatory_revenue = quarter_revenue_data.get('Automotive regulatory credits', 0)
                if regulatory_revenue > 0:
                    credit_product_id = product_category_map.get('Financial & Regulatory', ['PRO011'])[0]
                    avg_deal_value = 10e6
                    num_deals = int(regulatory_revenue / avg_deal_value)
                    
                    for _ in range(num_deals):
                        records.append({
                            'Time_ID': np.random.choice(quarter_time_ids),
                            'Geo_ID': np.random.choice(dim_geography_df['Geo_ID'], p=dim_geography_df['Geo_Weight'].values),
                            'Product_ID': credit_product_id,
                            'Customer_ID': np.random.choice(regulatory_customer_ids),
                            'Sales_Units': 1,
                            'Is_Discounted_Sale': False,
                            'Revenue_USD': avg_deal_value
                        })

                # --- Automotive Leasing (车辆租赁) ---
                leasing_revenue = quarter_revenue_data.get('Automotive leasing', 0)
                if leasing_revenue > 0:
                    leasing_product_ids = product_category_map.get('Automotive Leasing', [])
                    if leasing_product_ids:
                        avg_lease_value = (10000 + 20000) / 2
                        num_leases = int(leasing_revenue / avg_lease_value)
                        
                        for _ in range(num_leases):
                                records.append({
                                    'Time_ID': np.random.choice(quarter_time_ids),
                                    'Geo_ID': np.random.choice(dim_geography_df['Geo_ID'], p=dim_geography_df['Geo_Weight'].values),
                                    'Product_ID': np.random.choice(leasing_product_ids),
                                    'Customer_ID': np.random.choice(customer_ids),
                                    'Sales_Units': 1,
                                    'Is_Discounted_Sale': False,
                                    'Revenue_USD': avg_lease_value
                                })

                # --- Energy Generation & Storage (能源) ---
                energy_revenue = quarter_revenue_data.get('Energy Generation & Storage', 0)
                if energy_revenue > 0:
                    energy_product_ids = product_category_map.get('Energy Generation & Storage', [])
                    if energy_product_ids:
                        avg_energy_value = (5000 + 150000) / 2
                        num_energy_sales = int(energy_revenue / avg_energy_value)
                        
                        for _ in range(num_energy_sales):
                            records.append({
                                'Time_ID': np.random.choice(quarter_time_ids),
                                'Geo_ID': np.random.choice(dim_geography_df['Geo_ID'], p=dim_geography_df['Geo_Weight'].values),
                                'Product_ID': np.random.choice(energy_product_ids),
                                'Customer_ID': np.random.choice(customer_ids),
                                'Sales_Units': 1,
                                'Is_Discounted_Sale': False,
                                'Revenue_USD': avg_energy_value
                            })

                # --- Services & Other (服务及其他) ---
                services_revenue = quarter_revenue_data.get('Services & Other', 0)
                if services_revenue > 0:
                    service_product_ids = product_category_map.get('Services & Other', [])
                    if service_product_ids:
                        avg_service_value = (50 + 2000) / 2
                        num_service_sales = int(services_revenue / avg_service_value)
                        
                        for _ in range(num_service_sales):
                            records.append({
                                'Time_ID': np.random.choice(quarter_time_ids),
                                'Geo_ID': np.random.choice(dim_geography_df['Geo_ID'], p=dim_geography_df['Geo_Weight'].values),
                                'Product_ID': np.random.choice(service_product_ids),
                                'Customer_ID': np.random.choice(customer_ids),
                                'Sales_Units': 1,
                                'Is_Discounted_Sale': False,
                                'Revenue_USD': avg_service_value
                            })

            else:
                # 新增: 为没有详细财报数据的年份生成粗略数据 (2013-2020)
                total_year_units = unit_targets_by_year.get(year, 0)
                if total_year_units > 0:
                    # 假设总营收 = 总交付量 * 粗略平均价格
                    total_year_revenue = total_year_units * rough_avg_price_per_car
                    # 将年度数据粗略平均分配到四个季度
                    quarter_units = int(total_year_units / 4)
                    quarter_revenue = total_year_revenue / 4
                    revenue_per_unit = quarter_revenue / quarter_units if quarter_units > 0 else 0
                    
                    car_product_ids = product_category_map.get('Automotive', [])
                    if car_product_ids:
                        for _ in range(quarter_units):
                            records.append({
                                'Time_ID': np.random.choice(quarter_time_ids),
                                'Geo_ID': np.random.choice(dim_geography_df['Geo_ID'], p=dim_geography_df['Geo_Weight'].values),
                                'Product_ID': np.random.choice(car_product_ids),
                                'Customer_ID': np.random.choice(customer_ids),
                                'Sales_Units': 1,
                                'Is_Discounted_Sale': False,
                                'Revenue_USD': revenue_per_unit
                            })
                
            if not records:
                continue

            fact_sales_df_temp = pd.DataFrame(records)

            # 首次写入时带上表头，后续写入则不带
            fact_sales_df_temp.to_csv(output_filepath, mode='a', header=not header_written, index=False, encoding='utf-8')
            header_written = True
            total_generated_rows += len(fact_sales_df_temp)
    
    return total_generated_rows

if __name__ == '__main__':
    # Script entry point: load the five dimension tables, rebuild
    # Fact_Sales.csv from scratch, then report the row count and elapsed time.
    start_time = time.time()

    # One definition of the data directory: the dimension CSVs are read from
    # it and the fact table is written back into it.
    output_dir = './output_data'

    print("正在加载所有维度表...")
    try:
        dim_product_df = pd.read_csv(os.path.join(output_dir, 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(os.path.join(output_dir, 'Dim_Time.csv'))
        dim_customer_df = pd.read_csv(os.path.join(output_dir, 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join(output_dir, 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join(output_dir, 'Dim_Prices.csv'))
    except FileNotFoundError as e:
        print(f"错误：缺少一个或多个必需的 CSV 文件。请先运行所有维度生成脚本（1-5）。\n{e}")
        # Raise SystemExit directly rather than calling the site-provided
        # exit() helper: that helper is meant for interactive shells, is not
        # defined when the site module is disabled, and would report a
        # success status (0) for what is a failed run.
        raise SystemExit(1) from e

    # exist_ok avoids a separate existence check before creating the directory.
    os.makedirs(output_dir, exist_ok=True)
    output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')

    # The generator appends to the output file one quarter at a time, so a
    # leftover file from a previous run must be removed before starting.
    if os.path.exists(output_filepath):
        os.remove(output_filepath)
        print("已移除旧的 Fact_Sales.csv 文件。")

    print("正在生成 Fact_Sales 表...")
    total_rows = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)

    # A zero row count means nothing was written and is reported as a failure.
    if total_rows > 0:
        end_time = time.time()
        print(f"Fact_Sales.csv 已成功生成 {total_rows:,} 行数据，耗时 {end_time - start_time:.2f} 秒。")
        print("数据生成完成！")
    else:
        print("数据生成失败。")


正在加载所有维度表...
正在生成 Fact_Sales 表...
正在生成 2013年Q1季度的数据...
正在生成 2013年Q2季度的数据...
正在生成 2013年Q3季度的数据...
正在生成 2013年Q4季度的数据...
正在生成 2014年Q1季度的数据...
正在生成 2014年Q2季度的数据...
正在生成 2014年Q3季度的数据...
正在生成 2014年Q4季度的数据...
正在生成 2015年Q1季度的数据...
正在生成 2015年Q2季度的数据...
正在生成 2015年Q3季度的数据...
正在生成 2015年Q4季度的数据...
正在生成 2016年Q1季度的数据...
正在生成 2016年Q2季度的数据...
正在生成 2016年Q3季度的数据...
正在生成 2016年Q4季度的数据...
正在生成 2017年Q1季度的数据...
正在生成 2017年Q2季度的数据...
正在生成 2017年Q3季度的数据...
正在生成 2017年Q4季度的数据...
正在生成 2018年Q1季度的数据...
正在生成 2018年Q2季度的数据...
正在生成 2018年Q3季度的数据...
正在生成 2018年Q4季度的数据...
正在生成 2019年Q1季度的数据...
正在生成 2019年Q2季度的数据...
正在生成 2019年Q3季度的数据...
正在生成 2019年Q4季度的数据...
正在生成 2020年Q1季度的数据...
正在生成 2020年Q2季度的数据...
正在生成 2020年Q3季度的数据...
正在生成 2020年Q4季度的数据...
正在生成 2021年Q1季度的数据...
正在生成 2021年Q2季度的数据...
正在生成 2021年Q3季度的数据...
正在生成 2021年Q4季度的数据...
正在生成 2022年Q1季度的数据...
正在生成 2022年Q2季度的数据...
正在生成 2022年Q3季度的数据...
正在生成 2022年Q4季度的数据...
正在生成 2023年Q1季度的数据...
正在生成 2023年Q2季度的数据...
正在生成 2023年Q3季度的数据...
正在生成 2023年Q4季度的数据...
正在生成 2024年Q1季度的数据...
正在生成 2024年Q2季度的数据...
