### **1/6: 生成 Dim_Product 表 （覆盖绝大部分特斯拉产品）(简单静态数据)维度表更可能需要更新Update或追加Append。例如，新产品发布，此时就需要更新Update或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）** ###

In [34]:
# -*- coding: utf-8 -*-
"""1/6: Generate Dim_Product Table with detailed configurations."""

import pandas as pd
import os
import time
import numpy as np
import random
from itertools import product
from datetime import date

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_product():
    """
    Generates the Dim_Product table with all product and configuration details.
    """
    # 基础产品信息 (Base products) - Modified
    base_products = {
        'Model 3': 'Automotive Sales',
        'Model Y': 'Automotive Sales',
        'Model S': 'Automotive Sales',
        'Model X': 'Automotive Sales',
        'Cybertruck': 'Automotive Sales'
    }

    # 汽车版本和价格 (Variants and their prices)
    variants = {
        'Model 3': {
            'Rear-Wheel Drive': np.nan,
            'Long Range RWD': np.nan,
            'Long Range AWD': np.nan,
            'Performance': np.nan
        },
        'Model Y': {
            'Standard Range RWD': np.nan,
            'Long Range RWD': np.nan,
            'Long Range AWD': np.nan,
            'Performance': np.nan,
            'Model Y L (3-Row)': np.nan
        },
        'Model S': {
            'Long Range AWD': np.nan,
            'Plaid': np.nan
        },
        'Model X': {
            'All-Wheel Drive': np.nan,
            'Plaid': np.nan
        },
        'Cybertruck': {
            'Long Range': np.nan,
            'AWD': np.nan,
            'Cyberbeast': np.nan
        }
    }

    # 可配置选项和价格 (Configurable options and their prices)
    options = {
        'Paint_Color': {
            'Solid Black': np.nan, 'Deep Blue Metallic': np.nan, 'Stealth Grey': np.nan,
            'Ultra Red': np.nan, 'Quicksilver': np.nan, 'Pearl White Multi-Coat': np.nan, 'None': np.nan
        },
        'Wheel_Type': {
            'Aero Wheels': np.nan, '19" Nova Wheels': np.nan, 'Crossflow 19"': np.nan,
            'Helix 20"': np.nan, 'Base Wheels': np.nan, 'Performance Wheels': np.nan, 'None': np.nan
        },
        'Interior_Type': {
            'Black Interior': np.nan, 'Black and White Interior': np.nan, 'Cream Interior': np.nan, 'None': np.nan
        }
    }
    
    # Define product release dates for ID ordering
    # Use a dictionary to map product models to their approximate release dates
    release_dates = {
        'Model S': date(2012, 6, 1),
        'Model X': date(2015, 9, 1),
        'Model 3': date(2017, 7, 1),
        'Model Y': date(2020, 1, 1),
        'Cybertruck': date(2023, 11, 1),
        'N/A': date(2010, 1, 1) # A generic early date for non-automotive products
    }

    all_products = []

    # Generate all automotive product configurations first
    for model, category in base_products.items():
        if model in variants:
            variant_names = list(variants[model].keys())
            paint_colors = list(options['Paint_Color'].keys())
            wheel_types = list(options['Wheel_Type'].keys())
            interior_types = list(options['Interior_Type'].keys())
            
            combinations = list(product(variant_names, paint_colors, wheel_types, interior_types))
            
            for combo in combinations:
                variant, paint, wheel, interior = combo
                product_name = f"{model} {variant} - {paint} - {wheel} - {interior}"
                all_products.append({
                    'Product_Name': product_name,
                    'Product_Category': category,
                    'Product_Model': model,
                    'Product_Variant': variant,
                    'Paint_Color': paint,
                    'Wheel_Type': wheel,
                    'Interior_Type': interior
                })

    # Add non-automotive products, including the missing ones
    # The categories have been changed to match the user's request
    non_automotive_products = [
        ('Model 3 LR RWD Lease (24mo)', 'Automotive Leasing', 'Lease', np.nan, np.nan, np.nan),
        ('Model 3 LR RWD Lease (36mo)', 'Automotive Leasing', 'Lease', np.nan, np.nan, np.nan),
        ('Model Y LR RWD Lease (24mo)', 'Automotive Leasing', 'Lease', np.nan, np.nan, np.nan),
        ('Model Y LR RWD Lease (36mo)', 'Automotive Leasing', 'Lease', np.nan, np.nan, np.nan),
        ('Solar Panels', 'Energy Generation & Storage', 'Solar Panel', np.nan, np.nan, np.nan),
        ('Solar Roof', 'Energy Generation & Storage', 'Solar Roof', np.nan, np.nan, np.nan),
        ('Powerwall 3', 'Energy Generation & Storage', 'Powerwall', np.nan, np.nan, np.nan),
        ('Megapack', 'Energy Generation & Storage', 'Megapack', np.nan, np.nan, np.nan),
        ('FSD (Full Self-Driving)', 'Automotive Sales', 'Feature', np.nan, np.nan, np.nan),
        ('CyberCab (2026 Placeholder)', 'Automotive Sales', 'Service', np.nan, np.nan, np.nan),
        ('Regulatory Credits', 'Automotive Regulatory Credits', 'Credit', np.nan, np.nan, np.nan),
        ('Charging Equipment', 'Services & Other', 'Accessory', np.nan, np.nan, np.nan),
        ('Vehicle Accessories', 'Services & Other', 'Accessory', np.nan, np.nan, np.nan),
        ('Apparel', 'Services & Other', 'Apparel', np.nan, np.nan, np.nan),
        ('Lifestyle', 'Services & Other', 'Lifestyle', np.nan, np.nan, np.nan),
    ]

    for prod_name, cat, variant, paint, wheel, interior in non_automotive_products:
        all_products.append({
            'Product_Name': prod_name,
            'Product_Category': cat,
            'Product_Model': 'N/A', # Use 'N/A' for non-automotive products
            'Product_Variant': variant,
            'Paint_Color': paint,
            'Wheel_Type': wheel,
            'Interior_Type': interior
        })

    # Sort products by their release date to ensure a logical ID sequence
    all_products.sort(key=lambda x: release_dates.get(x['Product_Model'], date(2010, 1, 1)))

    # Assign sequential Product_ID to the sorted list
    for i, prod in enumerate(all_products):
        prod['Product_ID'] = f'PRO{i+1:03d}'
    
    # Define columns with the price column removed
    columns = [
        'Product_ID', 'Product_Name', 'Product_Category', 'Product_Variant',
        'Paint_Color', 'Wheel_Type', 'Interior_Type'
    ]
    
    df = pd.DataFrame(all_products, columns=columns)
    
    return df

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Product table...")
    dim_product_df = generate_dim_product()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Product.csv...")
    # 修改了这一行，确保 NA 值被正确写入 CSV 文件
    dim_product_df.to_csv(os.path.join(output_dir, 'Dim_Product.csv'), index=False, encoding='utf-8', na_rep='NA')
    
    end_time = time.time()
    print(f"Dim_Product.csv has been successfully generated with {len(dim_product_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Product table...
Saving Dim_Product.csv...
Dim_Product.csv has been successfully generated with 3151 rows in 0.03 seconds.


### **2/6: 生成 Dim_Time 表 (单一向前数据)维度表只追加Append。每一个时间点、每一天、每一个月都是一个既定的、永恒不变的事实。你无法“更新”昨天或去年的日期，此时就需要追加Append表中的相应记录。没有复杂的版本控制机制（Slowly Changing Dimension, SCD）** ###

In [15]:
# -*- coding: utf-8 -*-
"""2/6: Generate Dim_Time Table"""

import pandas as pd
import os
import time
import numpy as np
import random
import datetime

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_time():
    """Generates the Dim_Time table."""
    # 修改起始年份为 2013
    start_date = datetime.date(2013, 1, 1)
    end_date = datetime.date(2025, 12, 31)
    date_range = [start_date + datetime.timedelta(days=x) for x in range(0, (end_date - start_date).days + 1)]

    data = []
    time_id_counter = 1
    for date in date_range:
        # Generate the new Time_ID format (T + 7 digits)
        time_id = f'T{time_id_counter:07d}'
        
        data.append([
            time_id,
            date,
            date.year,
            f"Q{((date.month - 1) // 3) + 1}",
            date.month,
            date.day,
            date.isocalendar()[1],
            date.isoweekday(),
            date.strftime('%A')
        ])
        time_id_counter += 1
    
    return pd.DataFrame(data, columns=['Time_ID', 'Full_Date', 'Year', 'Quarter', 'Month', 'Day', 'Week_of_Year', 'Day_of_Week', 'Day_Name'])

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Time table...")
    dim_time_df = generate_dim_time()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Time.csv...")
    dim_time_df.to_csv(os.path.join(output_dir, 'Dim_Time.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Time.csv has been successfully generated with {len(dim_time_df):,} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Time table...
Saving Dim_Time.csv...
Dim_Time.csv has been successfully generated with 4,748 rows in 0.02 seconds.


### **3/6: 生成 Dim_Customer 表 （修改性别和年龄分布）(相对静态数据)维度表更可能需要更新Update或追加Append。例如，一个客户的收入水平或家庭住址可能会发生变化，此时就需要更新Update或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）** ###

In [8]:
# -*- coding: utf-8 -*-
"""3/6: Generate Dim_Customer Table"""

import pandas as pd
import os
import time
import numpy as np
import random
import string

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_customer(num_customers=50000):
    """Generates the Dim_Customer table."""
    
    # 调整性别分布，偏向男性
    genders = ['Male', 'Female']
    gender_probs = [0.75, 0.25] # Male: 75%, Female: 25%

    # 调整年龄组分布，偏向中年群体
    age_groups = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']
    age_probs = [0.05, 0.20, 0.35, 0.25, 0.10, 0.05]
    
    # 调整收入水平分布，偏向中高收入
    income_levels = ['Low', 'Medium', 'High']
    income_probs = [0.10, 0.80, 0.10]
    
    first_names = ['James', 'Mary', 'John', 'Patricia', 'Robert', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'David', 'Susan', 'Richard', 'Jessica', 'Joseph', 'Sarah', 'Thomas', 'Karen', 'Charles', 'Nancy', 'Christopher', 'Lisa', 'Daniel', 'Betty', 'Paul', 'Margaret', 'Mark', 'Sandra', 'Donald', 'Ashley', 'George', 'Kimberly', 'Kenneth', 'Donna', 'Steven', 'Emily', 'Edward', 'Carol', 'Brian', 'Michelle', 'Ronald', 'Amanda', 'Anthony', 'Melissa', 'Kevin', 'Deborah', 'Jason', 'Stephanie', 'Jeff', 'Maria', 'Gary', 'Heather', 'Timothy', 'Nicole', 'Jose', 'Denise', 'Larry', 'Megan', 'Jeffrey', 'Christina', 'Frank', 'Alexis', 'Scott', 'Tiffany', 'Eric', 'Lauren', 'Stephen', 'Rachel', 'Andrew', 'Crystal', 'Raymond', 'Kayla', 'Ryan', 'Danielle', 'Jacob', 'Brittany', 'Nicholas', 'Emma', 'Jonathan', 'Samantha', 'Laura', 'Alexis', 'Joshua', 'Brandon', 'Justin', 'Daniel', 'Daniel', 'Taylor']
    last_names = ['Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor', 'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia', 'Martinez', 'Robinson', 'Clark', 'Rodriguez', 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen', 'Young', 'Hernandez', 'King', 'Wright', 'Lopez', 'Hill', 'Scott', 'Green', 'Adams', 'Baker', 'Gonzalez', 'Nelson', 'Carter', 'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips', 'Campbell', 'Parker', 'Evans', 'Edwards', 'Collins', 'Stewart', 'Sanchez', 'Morris', 'Rogers', 'Reed', 'Cook', 'Morgan', 'Bell', 'Murphy', 'Bailey', 'Rivera', 'Cooper', 'Richardson', 'Cox', 'Howard', 'Ward', 'Torres', 'Peterson', 'Gray', 'Ramirez', 'James', 'Watson', 'Brooks', 'Kelly', 'Sanders', 'Price', 'Bennett', 'Wood', 'Barnes', 'Ross', 'Henderson', 'Coleman', 'Jenkins', 'Perry', 'Powell', 'Long', 'Patterson', 'Hughes', 'Flores', 'Washington', 'Butler', 'Simmons', 'Foster', 'Gonzales', 'Bryant', 'Alexander', 'Russell', 'Griffin', 'Diaz', 'Hayes', 'Myers', 'Ford', 'Hamilton', 'Graham', 'Sullivan', 'Wallace', 'Woods', 'Cole', 'West', 'Jordan', 'Owens', 'Reynolds', 'Fisher', 'Ellis', 'Harrison', 'Gibson', 'Mcdonald', 'Cruz', 'Marshall', 'Ortiz', 'Gomez', 'Murray', 'Freeman', 'Wells', 'Webb', 'Simpson', 'Stevens', 'Tucker', 'Porter', 'Hunter', 'Hicks', 'Crawford', 'Henry', 'Boyd', 'Mason', 'Kennedy', 'Warren', 'Dixon', 'Ramos', 'Reid', 'Carr', 'Chavez', 'Gibson']
    
    data = []
    
    # 跟踪每个组合的序号，确保不重复
    combination_tracker = {}
    
    # 随机生成个人客户数据
    for _ in range(num_customers):
        full_name = f"{random.choice(first_names)} {random.choice(last_names)}"
        
        # 使用 np.random.choice 并指定概率
        gender_raw = np.random.choice(genders, p=gender_probs)
        age_group = np.random.choice(age_groups, p=age_probs)
        income_level = np.random.choice(income_levels, p=income_probs)
        
        # 随机生成两个字母的国家和省/州代码
        country_code = ''.join(random.choices(string.ascii_uppercase, k=2))
        state_code = ''.join(random.choices(string.ascii_uppercase, k=2))
        
        # 编码 Customer_ID
        gender_code = 'M' if gender_raw == 'Male' else 'W'
        
        # 构建组合前缀
        prefix = f'C{gender_code}{country_code}{state_code}'
        
        # 获取并递增序号
        if prefix not in combination_tracker:
            combination_tracker[prefix] = 0
        else:
            combination_tracker[prefix] += 1
            
        # 序号部分为四位数字，从0000开始
        sequential_id = f"{combination_tracker[prefix] % 10000:04d}"
        
        customer_id = f'{prefix}{sequential_id}'

        data.append([customer_id, full_name, 'Individual', gender_raw, age_group, income_level, 'NA', 'NA', 'NA'])

    individual_df = pd.DataFrame(data, columns=['Customer_ID', 'Customer_Name', 'Customer_Segment', 'Gender', 'Age_Group', 'Income_Level', 'Country', 'State_Province', 'City'])

    # 新增业务客户数据，用于购买监管积分
    business_customers = [
        ['B001', 'General Motors', 'Business', 'NA', 'NA', 'NA', 'United States', 'Michigan', 'Detroit'],
        ['B002', 'Ford Motor Company', 'Business', 'NA', 'NA', 'NA', 'United States', 'Michigan', 'Dearborn'],
        ['B003', 'Toyota', 'Business', 'NA', 'NA', 'NA', 'Japan', 'Aichi', 'Toyota'],
        ['B004', 'Volkswagen', 'Business', 'NA', 'NA', 'NA', 'Germany', 'Lower Saxony', 'Wolfsburg'],
        ['B005', 'Stellantis', 'Business', 'NA', 'NA', 'NA', 'Netherlands', 'North Holland', 'Amsterdam'],
        ['B006', 'Honda', 'Business', 'NA', 'NA', 'NA', 'Japan', 'Tokyo', 'Tokyo'],
        ['B007', 'Nissan', 'Business', 'NA', 'NA', 'NA', 'Japan', 'Kanagawa', 'Yokohama'],
        ['B008', 'Delta Air Lines', 'Business', 'NA', 'NA', 'NA', 'United States', 'Georgia', 'Atlanta'],
        ['B009', 'United Airlines', 'Business', 'NA', 'NA', 'NA', 'United States', 'Illinois', 'Chicago'],
        ['B010', 'American Airlines', 'Business', 'NA', 'NA', 'NA', 'United States', 'Texas', 'Fort Worth'],
        ['B011', 'Shell', 'Business', 'NA', 'NA', 'NA', 'Netherlands', 'South Holland', 'The Hague'],
        ['B012', 'ExxonMobil', 'Business', 'NA', 'NA', 'NA', 'United States', 'Texas', 'Irving'],
        ['B013', 'BP', 'Business', 'NA', 'NA', 'NA', 'United Kingdom', 'Greater London', 'London'],
        ['B014', 'Chevron', 'Business', 'NA', 'NA', 'NA', 'United States', 'California', 'San Ramon'],
        ['B015', 'TotalEnergies', 'Business', 'NA', 'NA', 'NA', 'France', 'Île-de-France', 'Courbevoie'],
    ]
    
    business_df = pd.DataFrame(business_customers, columns=['Customer_ID', 'Customer_Name', 'Customer_Segment', 'Gender', 'Age_Group', 'Income_Level', 'Country', 'State_Province', 'City'])

    # 合并个人和业务客户数据
    final_df = pd.concat([individual_df, business_df], ignore_index=True)
    
    return final_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Customer table...")
    dim_customer_df = generate_dim_customer()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Customer.csv...")
    dim_customer_df.to_csv(os.path.join(output_dir, 'Dim_Customer.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Customer.csv has been successfully generated with {len(dim_customer_df)} rows in {end_time - start_time:.2f} seconds.")


Generating Dim_Customer table...
Saving Dim_Customer.csv...
Dim_Customer.csv has been successfully generated with 50015 rows in 0.97 seconds.


### **4/6: 生成 Dim_Geography 表 （按大洲、国家编号）(相对静态数据)维度表更可能需要更新Update或追加Append。例如，一个客户的地址可能会发生变化或更新到新的国家和城市，此时就需要更新Update或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）** ###

In [3]:
# -*- coding: utf-8 -*-
"""4/6: Generate Dim_Geography Table"""

import pandas as pd
import os
import time
import numpy as np
import random
import string

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_plausible_zip(country):
    """Generates a plausible zip code based on the country, padded to 8 characters."""
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'
    
    # 辅助函数，用于填充到8位
    def pad_to_eight(s):
        return (s + '0' * 8)[:8]

    if country == 'United States':
        return pad_to_eight(f"{random.randint(10000, 99999)}")
    elif country == 'Canada':
        # A1A 1A1 format (6 characters + 1 space)
        code = f"{random.choice(letters)}{random.choice(digits)}{random.choice(letters)} {random.choice(digits)}{random.choice(letters)}{random.choice(digits)}"
        return pad_to_eight(code.replace(' ', '')) # Remove space for 8-char ID, or keep for Zip_Code column
    elif country == 'Mexico':
        return pad_to_eight(f"{random.randint(10000, 99999)}")
    elif country == 'United Kingdom':
        # AN NAA or ANN NAA format
        part1_letters = ''.join(random.choices(letters, k=random.choice([1, 2])))
        part1_digits = ''.join(random.choices(digits, k=random.choice([1, 2])))
        part2 = f"{random.choice(digits)}{random.choice(letters)}{random.choice(letters)}"
        code = f"{part1_letters}{part1_digits} {part2}"
        return pad_to_eight(code.replace(' ', ''))
    elif country == 'France':
        # 5 digits, but first is not zero
        return pad_to_eight(f"{random.randint(10000, 99999)}")
    elif country in ['Germany', 'Italy', 'Spain', 'Switzerland', 'Netherlands', 'Denmark', 'Norway', 'Sweden', 'Finland', 'Greece', 'Iceland', 'Ireland', 'Luxembourg', 'Monaco']:
        # Most of Europe uses 5-digit numbers
        return pad_to_eight(f"{random.randint(10000, 99999)}")
    elif country == 'China':
        # 6-digit numeric
        return pad_to_eight(f"{random.randint(100000, 999999)}")
    elif country == 'Japan':
        # 7-digit numeric, often with a hyphen, so we generate and pad
        return pad_to_eight(f"{random.randint(1000000, 9999999)}")
    elif country in ['South Korea', 'Taiwan', 'Hong Kong', 'Macau']:
        # 5-7 digit numeric
        return pad_to_eight(f"{random.randint(10000, 9999999)}")
    elif country == 'Australia':
        # 4-digit numeric
        return pad_to_eight(f"{random.randint(1000, 9999)}")
    elif country == 'New Zealand':
        # 4-digit numeric
        return pad_to_eight(f"{random.randint(1000, 9999)}")
    else:
        return "00000000"

def generate_dim_geography():
    """
    Generates a Dim_Geography table.
    """
    geography_data = []
    
    # Continent codes for the new Geo_ID
    continent_codes = {
        'North America': 'NA',
        'Europe': 'EU',
        'Asia': 'AS',
        'Oceania': 'OC'
    }
    
    # North America
    north_america_countries = {
        'United States': {
            'code': 'US',
            'provinces': [
                'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
                'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
                'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
                'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
                'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
                'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia',
                'Wisconsin', 'Wyoming'
            ]
        },
        'Canada': {
            'code': 'CA',
            'provinces': [
                'Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador',
                'Nova Scotia', 'Ontario', 'Prince Edward Island', 'Québec', 'Saskatchewan',
                'Northwest Territories', 'Nunavut', 'Yukon'
            ]
        },
        'Mexico': {
            'code': 'MX',
            'provinces': [
                'Aguascalientes', 'Baja California', 'Baja California Sur', 'Campeche', 'Chiapas',
                'Chihuahua', 'Coahuila', 'Colima', 'Durango', 'Guanajuato', 'Guerrero', 'Hidalgo',
                'Jalisco', 'México', 'Distrito Federal', 'Michoacán', 'Morelos', 'Nayarit',
                'Nuevo León', 'Oaxaca', 'Puebla', 'Querétaro', 'Quintana Roo', 'San Luis Potosí',
                'Sinaloa', 'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala', 'Veracruz', 'Yucatán', 'Zacatecas'
            ]
        }
    }

    # Europe
    europe_countries = {
        'Germany': {
            'code': 'DE',
            'provinces': [
                'Baden-Württemberg', 'Bavaria', 'Berlin', 'Brandenburg', 'Bremen', 'Hamburg',
                'Hesse', 'Lower Saxony', 'Mecklenburg-Vorpommern', 'North Rhine-Westphalia',
                'Rhineland-Palatinate', 'Saarland', 'Saxony', 'Saxony-Anhalt',
                'Schleswig-Holstein', 'Thuringia'
            ]
        },
        'United Kingdom': {
            'code': 'GB',
            'provinces': [
                'England', 'Scotland', 'Wales', 'Northern Ireland'
            ]
        },
        'Norway': {'code': 'NO', 'provinces': ['Oslo', 'Viken', 'Innlandet', 'Vestfold og Telemark', 'Agder', 'Rogaland', 'Vestland', 'Møre og Romsdal', 'Trøndelag', 'Nordland', 'Troms og Finnmark']},
        'France': {'code': 'FR', 'provinces': ['Bretagne', 'Normandie', 'Île-de-France', 'Auvergne-Rhône-Alpes', 'Bourgogne-Franche-Comté', 'Centre-Val de Loire', 'Corsica', 'Grand Est', 'Hauts-de-France', 'Nouvelle-Aquitaine', 'Occitanie', 'Pays de la Loire', 'Provence-Alpes-Côte d\'Azur']},
        'Netherlands': {'code': 'NL', 'provinces': ['Drenthe', 'Flevoland', 'Friesland', 'Gelderland', 'Groningen', 'Limburg', 'North Brabant', 'North Holland', 'Overijssel', 'Utrecht', 'Zeeland', 'South Holland']},
        'Sweden': {'code': 'SE', 'provinces': ['Blekinge', 'Dalarna', 'Gotland', 'Gävleborg', 'Halland', 'Jämtland', 'Jönköping', 'Kalmar', 'Kronoberg', 'Norrbotten', 'Skåne', 'Stockholm', 'Södermanland', 'Uppsala', 'Värmland', 'Västerbotten', 'Västernorrland', 'Västmanland', 'Västra Götaland', 'Örebro', 'Östergötland']},
        'Switzerland': {'code': 'CH', 'provinces': ['Zurich', 'Bern', 'Lucerne', 'Uri', 'Schwyz', 'Obwalden', 'Nidwalden', 'Glarus', 'Zug', 'Fribourg', 'Solothurn', 'Basel-Stadt', 'Basel-Landschaft', 'Schaffhausen', 'Appenzell Ausserrhoden', 'Appenzell Innerrhoden', 'St. Gallen', 'Graubünden', 'Aargau', 'Thurgau', 'Ticino', 'Vaud', 'Valais', 'Neuchâtel', 'Geneva', 'Jura']},
        'Italy': {'code': 'IT', 'provinces': ['Abruzzo', 'Aosta Valley', 'Apulia', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Liguria', 'Lombardy', 'Marche', 'Molise', 'Piedmont', 'Sardinia', 'Sicily', 'Tuscany', 'Trentino-Alto Adige', 'Umbria', 'Veneto']},
        'Spain': {'code': 'ES', 'provinces': ['Andalusia', 'Aragon', 'Principality of Asturias', 'Balearic Islands', 'Basque Country', 'Canary Islands', 'Cantabria', 'Castile and León', 'Castile-La Mancha', 'Catalonia', 'Community of Madrid', 'Valencian Community', 'Extremadura', 'Galicia', 'La Rioja', 'Region of Murcia', 'Foral Community of Navarre']},
        'Denmark': {'code': 'DK', 'provinces': ['Capital Region of Denmark', 'Central Denmark Region', 'North Denmark Region', 'Region Zealand', 'Region of Southern Denmark']},
        'Finland': {'code': 'FI', 'provinces': ['Åland Islands', 'Central Finland', 'Central Ostrobothnia', 'Kainuu', 'Kymenlaakso', 'Lapland', 'North Karelia', 'North Ostrobothnia', 'Northern Savonia', 'Päijät-Häme', 'Pirkanmaa', 'Satakunta', 'South Karelia', 'Southern Ostrobothnia', 'Southern Savonia', 'Tavastia Proper', 'Uusimaa', 'Southwest Finland']},
        'Greece': {'code': 'GR', 'provinces': ['Attica', 'Central Greece', 'Central Macedonia', 'Crete', 'East Macedonia and Thrace', 'Epirus', 'Ionian Islands', 'North Aegean', 'Peloponnese', 'South Aegean', 'Thessaly', 'West Greece', 'West Macedonia']},
        'Iceland': {'code': 'IS', 'provinces': ['Capital Region', 'Southern Peninsula', 'Western Region', 'Westfjords', 'Northwest Region', 'Northeast Region', 'Eastern Region', 'Southern Region']},
        'Ireland': {'code': 'IE', 'provinces': ['Connacht', 'Leinster', 'Munster', 'Ulster']},
        'Luxembourg': {'code': 'LU', 'provinces': ['Diekirch', 'Grevenmacher', 'Luxembourg']},
        'Monaco': {'code': 'MC', 'provinces': ['Monaco']}
    }
    
    # Asia
    asia_countries = {
        'China': {
            'code': 'CN',
            'provinces': [
                'Anhui', 'Fujian', 'Gansu', 'Guangdong', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang',
                'Henan', 'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Liaoning', 'Qinghai',
                'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Yunnan', 'Zhejiang',
                'Guangxi', 'Nei Mongol', 'Ningxia Hui', 'Xinjiang Uygur', 'Xizang',
                'Beijing', 'Chongqing', 'Shanghai', 'Tianjin'
            ]
        },
        'Hong Kong': {'code': 'HK', 'provinces': ['Hong Kong Island', 'Kowloon', 'New Territories']},
        'Macau': {'code': 'MO', 'provinces': ['Macau']},
        'Japan': {
            'code': 'JP',
            'provinces': [
                'Hokkaido', 'Aomori', 'Iwate', 'Miyagi', 'Akita', 'Yamagata', 'Fukushima',
                'Ibaraki', 'Tochigi', 'Gunma', 'Saitama', 'Chiba', 'Tokyo', 'Kanagawa',
                'Niigata', 'Toyama', 'Ishikawa', 'Fukui', 'Yamanashi', 'Nagano',
                'Gifu', 'Shizuoka', 'Aichi', 'Mie', 'Shiga', 'Kyoto', 'Osaka',
                'Hyōgo', 'Nara', 'Wakayama', 'Tottori', 'Shimane', 'Okayama',
                'Hiroshima', 'Yamaguchi', 'Tokushima', 'Kagawa', 'Ehime', 'Kochi',
                'Fukuoka', 'Saga', 'Naoasaki', 'Kumamoto', 'Oita', 'Miyazaki', 'Kagoshima', 'Okinawa'
            ]
        },
        'South Korea': {
            'code': 'KR',
            'provinces': [
                'Busan', 'Chungcheongbuk-do', 'Chungcheongnam-do', 'Daegu', 'Daejeon', 'Gangwon-do',
                'Gwangju', 'Gyeonggi-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Incheon', 'Jeollabuk-do',
                'Jeollanam-do', 'Sejong', 'Seoul', 'Ulsan', 'Jeju'
            ]
        },
        'Taiwan': {
            'code': 'TW',
            'provinces': [
                'Taipei', 'New Taipei', 'Taichung', 'Tainan', 'Kaohsiung', 'Taoyuan',
                'Keelung', 'Hsinchu City', 'Chiayi City', 'Hsinchu County', 'Chiayi County',
                'Changhua', 'Nantou', 'Yulin', 'Miaoli', 'Pingtung', 'Yilan', 'Hualien',
                'Taitung', 'Penghu', 'Kinmen', 'Lienkiang'
            ]
        }
    }

    # Oceania
    oceania_countries = {
        'Australia': {
            'code': 'AU',
            'provinces': [
                'New South Wales', 'Victoria', 'Queensland', 'South Australia', 'Western Australia',
                'Tasmania', 'Australian Capital Territory', 'Northern Territory'
            ]
        },
        'New Zealand': {
            'code': 'NZ',
            'provinces': [
                'Auckland', 'Bay of Plenty', 'Canterbury', 'Gisborne', 'Hawke\'s Bay',
                'Manawatu-Wanganui', 'Marlborough', 'Nelson', 'Northland', 'Otago',
                'Southland', 'Taranaki', 'Tasman', 'Waikato', 'Wellington', 'West Coast'
            ]
        }
    }
    
    continents = {
        'North America': north_america_countries,
        'Europe': europe_countries,
        'Asia': asia_countries,
        'Oceania': oceania_countries
    }

    for continent, countries in continents.items():
        for country, details in countries.items():
            state_id = 1
            for province in details['provinces']:
                # The state abbreviation logic needs to be robust for all cases.
                state_abbr_words = province.split(' ')
                state_abbr_raw = ''.join([word[0].upper() for word in state_abbr_words if word[0].isalpha()]).ljust(2, 'X')
                
                # Create a reliable 2-letter abbreviation
                state_abbr = state_abbr_raw[:2]

                # Create the new 8-character Geo_ID
                geo_id = f"{continent_codes[continent]}{details['code']}{state_abbr}{state_id:02d}"

                geography_data.append([
                    geo_id,
                    continent,
                    country,
                    details['code'],
                    province,
                    generate_plausible_zip(country)
                ])
                state_id += 1

    dim_geography_df = pd.DataFrame(geography_data, columns=[
        'Geo_ID', 'Continent', 'Country', 'Country_Code', 'State_Province', 'Zip_Code'
    ])
    
    return dim_geography_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Geography table...")
    dim_geography_df = generate_dim_geography()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Geography.csv...")
    # Using 'utf-8-sig' encoding to ensure proper display of non-English characters in Excel.
    # This adds a BOM (Byte Order Mark) to the file, which helps applications
    # correctly identify the encoding.
    dim_geography_df.to_csv(os.path.join(output_dir, 'Dim_Geography.csv'), index=False, encoding='utf-8-sig')
    
    end_time = time.time()
    print(f"Dim_Geography.csv has been successfully generated with {len(dim_geography_df)} rows in {end_time - start_time:.2f} seconds.")


Generating Dim_Geography table...
Saving Dim_Geography.csv...
Dim_Geography.csv has been successfully generated with 432 rows in 0.01 seconds.


### **5/6: 生成 Dim_Prices 表 (相对静态数据)维度表更可能需要更新Update或追加Append。例如，新产品或不同时段价格可能会发生变化，此时就需要更新或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）** ###

In [43]:
# -*- coding: utf-8 -*-
"""Generate Dim_Prices Table"""

import pandas as pd
from datetime import datetime
import os
import random
import numpy as np

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_prices():
    """
    Generates the Dim_Prices table with prices for different vehicle models and time periods.
    """
    start_date = datetime(2013, 1, 1)
    end_date = datetime(2025, 6, 30)

    price_data = []

    # 定义不同车型的基础价格
    # 价格基于公开数据和市场趋势估算
    base_prices = {
        1: 75000,  # Model S
        2: 80000,  # Model X
        3: 40000,  # Model 3
        4: 50000,  # Model Y
        5: 120000, # Cybertruck
    }
    
    # 定义产品ID的映射，使其与产品维度表格式一致
    product_id_mapping = {
        1: 'PRO001',
        2: 'PRO002',
        3: 'PRO003',
        4: 'PRO004',
        5: 'PRO005',
    }

    # 手动添加 2013-2018 年的价格数据以确保销售数据生成准确
    # Model S (Product_ID = 1)
    price_data.append({'Quarter_Start_Date': datetime(2013, 1, 1), 'Product_ID': product_id_mapping[1], 'Standard_Price_USD': 75000, 'Discounted_Price_USD': 75000})
    price_data.append({'Quarter_Start_Date': datetime(2013, 4, 1), 'Product_ID': product_id_mapping[1], 'Standard_Price_USD': 75000, 'Discounted_Price_USD': 75000})
    price_data.append({'Quarter_Start_Date': datetime(2013, 7, 1), 'Product_ID': product_id_mapping[1], 'Standard_Price_USD': 75000, 'Discounted_Price_USD': 75000})
    price_data.append({'Quarter_Start_Date': datetime(2013, 10, 1), 'Product_ID': product_id_mapping[1], 'Standard_Price_USD': 75000, 'Discounted_Price_USD': 75000})

    # Model X (Product_ID = 2) 在2015年末发布
    # Model 3 (Product_ID = 3) 在2017年中发布
    # Model Y (Product_ID = 4) 在2020年初发布
    # Cybertruck (Product_ID = 5) 在2023年末发布
    
    # 填充 2014-2018 年的价格
    for year in range(2014, 2019):
        for month in [1, 4, 7, 10]:
            quarter_start = datetime(year, month, 1)
            # Model S 价格小幅波动
            price_s = base_prices[1] + np.random.randint(-2000, 2000)
            price_data.append({'Quarter_Start_Date': quarter_start, 'Product_ID': product_id_mapping[1], 'Standard_Price_USD': price_s, 'Discounted_Price_USD': price_s})

            # Model X
            if year >= 2015 and quarter_start >= datetime(2015, 9, 1):
                price_x = base_prices[2] + np.random.randint(-2000, 2000)
                price_data.append({'Quarter_Start_Date': quarter_start, 'Product_ID': product_id_mapping[2], 'Standard_Price_USD': price_x, 'Discounted_Price_USD': price_x})

            # Model 3
            if year >= 2017 and quarter_start >= datetime(2017, 7, 1):
                price_3 = base_prices[3] + np.random.randint(-1000, 1000)
                price_data.append({'Quarter_Start_Date': quarter_start, 'Product_ID': product_id_mapping[3], 'Standard_Price_USD': price_3, 'Discounted_Price_USD': price_3})

    # 填充 2019 年至今的价格，并引入价格波动和折扣
    current_date = datetime(2019, 1, 1)
    while current_date <= end_date:
        for product_id, base_price in base_prices.items():
            if (product_id == 4 and current_date < datetime(2020, 1, 1)) or \
               (product_id == 5 and current_date < datetime(2023, 11, 1)):
                continue

            # 模拟价格波动（5%以内的随机波动）
            price_std = base_price * (1 + random.uniform(-0.05, 0.05))
            price_dis = price_std

            # 模拟折扣（约20%的记录有折扣）
            if random.random() < 0.20:
                discount_rate = random.uniform(0.01, 0.15)
                price_dis = price_std * (1 - discount_rate)
            
            price_data.append({
                'Quarter_Start_Date': current_date,
                'Product_ID': product_id_mapping[product_id],
                'Standard_Price_USD': round(price_std, 2),
                'Discounted_Price_USD': round(price_dis, 2)
            })

        # 移动到下一个季度
        if current_date.month == 10:
            current_date = current_date.replace(year=current_date.year + 1, month=1)
        else:
            current_date = current_date.replace(month=current_date.month + 3)

    dim_prices_df = pd.DataFrame(price_data)

    # 格式化日期列
    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date']).dt.date

    # 生成 Price_ID，格式为 PRI###
    price_ids = [f'PRI{i:03d}' for i in range(1, len(dim_prices_df) + 1)]
    dim_prices_df['Price_ID'] = price_ids
    
    # 重新排序列以匹配您的要求
    dim_prices_df = dim_prices_df[['Price_ID', 'Product_ID', 'Quarter_Start_Date', 'Standard_Price_USD', 'Discounted_Price_USD']]
    
    return dim_prices_df

if __name__ == '__main__':
    print("正在生成 Dim_Prices 表...")
    dim_prices_df = generate_dim_prices()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    print("保存 Dim_Prices.csv...")
    dim_prices_df.to_csv(os.path.join(output_dir, 'Dim_Prices.csv'), index=False, encoding='utf-8')
    print("Dim_Prices.csv 已成功生成！")


正在生成 Dim_Prices 表...
保存 Dim_Prices.csv...
Dim_Prices.csv 已成功生成！


### **6/6: 生成 Fact_Sales 表 （没有空白营收行 追加到2013）(高度动态数据，最常被追加（append）的表) 只进不出”的设计哲学。每当一笔新的销售发生，就在 Fact_Sales 表中追加一行新的数据，而不会去修改之前已经存在的历史销售记录** ###

In [9]:
# -*- coding: utf-8 -*-
"""Generate Fact_Sales Table (CPU Version)"""

import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
from itertools import product
from collections import defaultdict

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output.
    """
    end_date = datetime(2025, 6, 30)

    # 季度总营收数据 (单位: 10亿 USD)
    quarterly_total_revenue = {
        # 2013-2018年的数据根据提供的图片分配
        (2013, 1): 0.4e9, (2013, 2): 0.45e9, (2013, 3): 0.53e9, (2013, 4): 0.63e9,
        (2014, 1): 0.65e9, (2014, 2): 0.75e9, (2014, 3): 0.85e9, (2014, 4): 0.95e9,
        (2015, 1): 0.8e9, (2015, 2): 0.9e9, (2015, 3): 1.0e9, (2015, 4): 1.35e9,
        (2016, 1): 1.4e9, (2016, 2): 1.6e9, (2016, 3): 1.9e9, (2016, 4): 2.1e9,
        (2017, 1): 2.3e9, (2017, 2): 2.6e9, (2017, 3): 3.0e9, (2017, 4): 3.86e9,
        (2018, 1): 4.1e9, (2018, 2): 4.9e9, (2018, 3): 5.8e9, (2018, 4): 6.66e9,
        # 2019-2025年季度总营收，严格按照用户提供的财报数据
        (2019, 1): 4.541e9, (2019, 2): 6.350e9, (2019, 3): 6.303e9, (2019, 4): 7.384e9,
        (2020, 1): 5.985e9, (2020, 2): 6.036e9, (2020, 3): 8.771e9, (2020, 4): 10.744e9,
        (2021, 1): 10.389e9, (2021, 2): 11.958e9, (2021, 3): 13.757e9, (2021, 4): 17.719e9,
        (2022, 1): 18.756e9, (2022, 2): 16.934e9, (2022, 3): 21.454e9, (2022, 4): 24.318e9,
        (2023, 1): 23.329e9, (2023, 2): 24.927e9, (2023, 3): 23.350e9, (2023, 4): 25.167e9,
        (2024, 1): 21.301e9, (2024, 2): 25.500e9, (2024, 3): 25.182e9, (2024, 4): 25.707e9,
        (2025, 1): 19.335e9, (2025, 2): 22.496e9
    }
    
    # 季度汽车销售营收数据 (单位: 10亿 USD)，严格按照用户提供的财报数据
    quarterly_automotive_revenue = {
        (2019, 1): 4.541e9, (2019, 2): 6.350e9, (2019, 3): 6.303e9, (2019, 4): 7.384e9,
        (2020, 1): 5.985e9, (2020, 2): 4.911e9, (2020, 3): 7.346e9, (2020, 4): 9.034e9,
        (2021, 1): 8.187e9, (2021, 2): 9.520e9, (2021, 3): 11.393e9, (2021, 4): 15.025e9,
        (2022, 1): 15.514e9, (2022, 2): 13.670e9, (2022, 3): 17.785e9, (2022, 4): 20.241e9,
        (2023, 1): 18.878e9, (2023, 2): 20.419e9, (2023, 3): 18.582e9, (2023, 4): 20.630e9,
        (2024, 1): 16.460e9, (2024, 2): 18.530e9, (2024, 3): 18.831e9, (2024, 4): 18.659e9,
        (2025, 1): 12.925e9, (2025, 2): 15.787e9
    }

    # 年度交付量数据，已根据你提供的实际历史数据更新
    unit_targets_by_year = {
        2013: 22442, 2014: 31655, 2015: 50517, 2016: 76243, 2017: 103091,
        2018: 245491, 2019: 367656, 2020: 499535, 2021: 936222, 2022: 1313851,
        2023: 1808581, 2024: 1789226
    }
    unit_targets_by_year[2025] = 336681 + 384122
    
    # 2013-2018年季度交付量手动分配
    quarterly_unit_splits = {
        2013: {1: 4750, 2: 5150, 3: 5800, 4: 6742},
        2014: {1: 6450, 2: 7570, 3: 8800, 4: 8835},
        2015: {1: 10045, 2: 11532, 3: 11584, 4: 17356},
        2016: {1: 14810, 2: 18345, 3: 24500, 4: 18588},
        2017: {1: 25418, 2: 22000, 3: 26135, 4: 29538},
        2018: {1: 29980, 2: 40740, 3: 83780, 4: 90991}
    }

    # --------------------------
    # 步骤 1: 数据清洗和预处理
    # --------------------------
    if 'Product_Name' not in dim_product_df.columns:
        print("警告：Dim_Product.csv中缺少'Product_Name'列，正在根据Product_ID创建。")
        product_id_to_name = {
            'PRO001': 'Model S', 'PRO002': 'Model X', 'PRO003': 'Model 3', 'PRO004': 'Model Y', 'PRO005': 'Cybertruck'
        }
        dim_product_df['Product_Name'] = dim_product_df['Product_ID'].map(product_id_to_name).fillna('Other')
    
    # 补全国家列表，确保数据更真实
    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        return 'North America'
        
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)

    # --------------------------
    # 步骤 2: 定义权重
    # --------------------------
    # 调整产品权重，使其更符合实际销售情况
    product_weights_by_name = {
        'Model S': 0.05, 'Model X': 0.05, 'Model 3': 0.45, 'Model Y': 0.40, 'Cybertruck': 0.05
    }
    
    # 定义大陆、国家、省份的权重，并提供默认值
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }

    # 预计算所有 Geo_ID 的权重
    dim_geography_df['Geo_Weight'] = 0.0
    for continent, c_weight in continent_weights.items():
        countries_in_continent = dim_geography_df[dim_geography_df['Continent'] == continent]['Country'].unique()
        for country in countries_in_continent:
            country_w = country_weights.get(country, 0.01)
            states = dim_geography_df[dim_geography_df['Country'] == country]['State_Province'].unique()
            for state in states:
                state_w = state_province_weights.get(state, 0.01)
                mask = (dim_geography_df['Country'] == country) & (dim_geography_df['State_Province'] == state)
                dim_geography_df.loc[mask, 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values
    
    # --------------------------
    # 步骤 3: 确保时间数据类型一致
    # --------------------------
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    dim_time_df['Year_Int'] = dim_time_df['Full_Date'].dt.year
    dim_time_df['Quarter_Int'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time

    # --------------------------
    # 步骤 4: 创建价格查找字典
    # --------------------------
    price_lookup = {}
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv 文件中缺少必要的列。请检查文件是否包含 'Standard_Price_USD' 和 'Discounted_Price_USD'。")

    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    for _, row in dim_prices_df.iterrows():
        quarter_start_date = row['Quarter_Start_Date']
        product_id = row['Product_ID']
        price_lookup[(quarter_start_date, product_id)] = {
            'Standard_Price_USD': row['Standard_Price_USD'],
            'Discounted_Price_USD': row['Discounted_Price_USD']
        }

    model_avg_prices = dim_prices_df.groupby('Product_ID')['Standard_Price_USD'].mean().to_dict()

    product_weights_dict = {
        pid: product_weights_by_name.get(pname, 0.0001) for pid, pname in dim_product_df.set_index('Product_ID')['Product_Name'].to_dict().items()
    }
    geo_weights_dict = dim_geography_df.set_index('Geo_ID')['Geo_Weight'].to_dict()

    start_year = min(unit_targets_by_year.keys())
    
    total_generated_rows = 0
    header_written = False

    all_product_ids = list(product_weights_dict.keys())
    all_geo_ids = list(geo_weights_dict.keys())
    
    combo_list = []
    combo_weights_list = []
    
    for prod_id, geo_id in product(all_product_ids, all_geo_ids):
        prod_weight = product_weights_dict.get(prod_id, 0.0001)
        geo_weight = geo_weights_dict.get(geo_id, 0.0001)
        combo_list.append((prod_id, geo_id))
        combo_weights_list.append(prod_weight * geo_weight)

    total_combo_weight = sum(combo_weights_list)
    if total_combo_weight == 0:
        print("警告：总组合权重为零，无法进行数据生成。")
        return 0

    combo_probabilities = np.array(combo_weights_list) / total_combo_weight
    
    for year in range(start_year, end_date.year + 1):
        for quarter in range(1, 5):
            target_units = 0
            target_sales_revenue = 0

            # 严格根据提供的财报数据分配汽车销售额
            if (year, quarter) in quarterly_automotive_revenue:
                target_sales_revenue = quarterly_automotive_revenue[(year, quarter)]
            else:
                # 对于2013-2018年，根据总营收比例估算汽车销售额
                if (year, quarter) in quarterly_total_revenue and year < 2019:
                    # 假设汽车销售占总营收的90% (一个合理的假设)
                    target_sales_revenue = quarterly_total_revenue[(year, quarter)] * 0.9 
                else:
                    continue

            # 根据季度交付量来计算目标单位数
            if year < 2019:
                if year not in quarterly_unit_splits or quarter not in quarterly_unit_splits[year]:
                    continue
                target_units = quarterly_unit_splits[year][quarter]
            else:
                total_year_units = unit_targets_by_year.get(year, 0)
                if total_year_units == 0:
                    continue
                # 使用季度汽车销售额占年度总销售额的比例来分配单位数
                total_year_sales_revenue = sum(v for k, v in quarterly_automotive_revenue.items() if k[0] == year)
                if total_year_sales_revenue == 0:
                    # 2013-2018年的特殊处理
                    total_year_sales_revenue = sum(v for k, v in quarterly_total_revenue.items() if k[0] == year) * 0.9
                    
                if total_year_sales_revenue == 0:
                     continue

                quarter_revenue_ratio = target_sales_revenue / total_year_sales_revenue
                target_units = int(total_year_units * quarter_revenue_ratio)
            
            if target_units <= 0:
                continue
            
            print(f"正在为年份 {year} 第 {quarter} 季度生成 {target_units:,} 条销售记录...")
            
            sampled_combo_indices = np.random.choice(len(combo_list), size=target_units, p=combo_probabilities)
            
            records = []
            quarter_time_ids = dim_time_df[
                (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
            ]['Time_ID'].tolist()
            if not quarter_time_ids:
                print(f"警告：年份 {year} 第 {quarter} 季度没有可用的时间组合，跳过生成。")
                continue
            
            quarter_start_date = dim_time_df[
                (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
            ]['Quarter_Start_Date'].iloc[0]

            generated_revenues = []
            transaction_details = []

            for i in range(target_units):
                combo_index = sampled_combo_indices[i]
                product_id, geo_id = combo_list[combo_index]
                
                prices = price_lookup.get((quarter_start_date, product_id))
                if prices:
                    standard_price = prices['Standard_Price_USD']
                    discounted_price = prices['Discounted_Price_USD']
                else:
                    standard_price = model_avg_prices.get(product_id, 0)
                    discounted_price = standard_price
                
                is_discounted = np.random.choice([True, False], p=[0.2, 0.8])
                price_used = discounted_price if is_discounted else standard_price
                generated_revenues.append(price_used)

                time_id = np.random.choice(quarter_time_ids)
                customer_id = np.random.choice(customer_ids)
                
                transaction_details.append({
                    'Time_ID': time_id,
                    'Geo_ID': geo_id,
                    'Product_ID': product_id,
                    'Customer_ID': customer_id,
                    'Sales_Units': 1,
                    'Is_Discounted_Sale': is_discounted,
                    'Revenue_USD': 0 # 临时占位，稍后校准
                })

            if not generated_revenues:
                continue

            # 校准收入以匹配季度汽车销售额
            total_generated_revenue = sum(generated_revenues)
            if total_generated_revenue > 0:
                scaling_factor = target_sales_revenue / total_generated_revenue
            else:
                scaling_factor = 0
            
            for detail, revenue in zip(transaction_details, generated_revenues):
                detail['Revenue_USD'] = revenue * scaling_factor

            fact_sales_df_temp = pd.DataFrame(transaction_details)

            fact_sales_df_temp.to_csv(output_filepath, mode='a', header=not header_written, index=False, encoding='utf-8')
            header_written = True
            total_generated_rows += len(fact_sales_df_temp)
    
    return total_generated_rows

if __name__ == '__main__':
    start_time = time.time()
    
    print("正在加载所有维度表...")
    try:
        dim_product_df = pd.read_csv(os.path.join('./output_data', 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(os.path.join('./output_data', 'Dim_Time.csv'))
        dim_customer_df = pd.read_csv(os.path.join('./output_data', 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join('./output_data', 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join('./output_data', 'Dim_Prices.csv'))
    except FileNotFoundError as e:
        print(f"错误：缺少一个或多个必需的 CSV 文件。请先运行所有维度生成脚本（1-5）。\n{e}")
        exit()

    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')

    if os.path.exists(output_filepath):
        os.remove(output_filepath)
        print("已移除旧的 Fact_Sales.csv 文件。")

    print("正在生成 Fact_Sales 表...")
    total_rows = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)

    if total_rows > 0:
        end_time = time.time()
        print(f"Fact_Sales.csv 已成功生成 {total_rows:,} 行数据，耗时 {end_time - start_time:.2f} 秒。")
        print("数据生成完成！")
    else:
        print("数据生成失败。")

正在加载所有维度表...
已移除旧的 Fact_Sales.csv 文件。
正在生成 Fact_Sales 表...
正在为年份 2013 第 1 季度生成 4,750 条销售记录...
正在为年份 2013 第 2 季度生成 5,150 条销售记录...
正在为年份 2013 第 3 季度生成 5,800 条销售记录...
正在为年份 2013 第 4 季度生成 6,742 条销售记录...
正在为年份 2014 第 1 季度生成 6,450 条销售记录...
正在为年份 2014 第 2 季度生成 7,570 条销售记录...
正在为年份 2014 第 3 季度生成 8,800 条销售记录...
正在为年份 2014 第 4 季度生成 8,835 条销售记录...
正在为年份 2015 第 1 季度生成 10,045 条销售记录...
正在为年份 2015 第 2 季度生成 11,532 条销售记录...
正在为年份 2015 第 3 季度生成 11,584 条销售记录...
正在为年份 2015 第 4 季度生成 17,356 条销售记录...
正在为年份 2016 第 1 季度生成 14,810 条销售记录...
正在为年份 2016 第 2 季度生成 18,345 条销售记录...
正在为年份 2016 第 3 季度生成 24,500 条销售记录...
正在为年份 2016 第 4 季度生成 18,588 条销售记录...
正在为年份 2017 第 1 季度生成 25,418 条销售记录...
正在为年份 2017 第 2 季度生成 22,000 条销售记录...
正在为年份 2017 第 3 季度生成 26,135 条销售记录...
正在为年份 2017 第 4 季度生成 29,538 条销售记录...
正在为年份 2018 第 1 季度生成 29,980 条销售记录...
正在为年份 2018 第 2 季度生成 40,740 条销售记录...
正在为年份 2018 第 3 季度生成 83,780 条销售记录...
正在为年份 2018 第 4 季度生成 90,991 条销售记录...
正在为年份 2019 第 1 季度生成 67,927 条销售记录...
正在为年份 2019 第 2 季度生成 94,988 条销售记录...
正在为年份 2019 第 

## **所有经营项目都有数据了** ##

In [57]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
from itertools import product
from collections import defaultdict

# --------------------------
# 修正后的生成 Dim_Time 表的函数
# --------------------------
def generate_dim_time_table(start_date, end_date):
    """
    Generate a time dimension table (dim_time_df) with all necessary columns.
    """
    time_series = pd.date_range(start=start_date, end=end_date)
    dim_time_df = pd.DataFrame(time_series, columns=['Full_Date'])
    
    dim_time_df['Time_ID'] = dim_time_df['Full_Date'].apply(lambda x: int(x.strftime('%Y%m%d')))
    
    dim_time_df['Year'] = dim_time_df['Full_Date'].dt.year
    dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Month_of_Year'] = dim_time_df['Full_Date'].dt.month
    dim_time_df['Day_of_Month'] = dim_time_df['Full_Date'].dt.day
    dim_time_df['Day_of_Week'] = dim_time_df['Full_Date'].dt.dayofweek
    dim_time_df['Week_of_Year'] = dim_time_df['Full_Date'].dt.isocalendar().week.astype(int)
    
    dim_time_df['Day_Name'] = dim_time_df['Full_Date'].dt.day_name()
    dim_time_df['Month_Name'] = dim_time_df['Full_Date'].dt.month_name()
    
    dim_time_df['Is_Weekend'] = dim_time_df['Day_of_Week'] >= 5
    
    return dim_time_df


def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output
    """
    end_date = datetime(2025, 6, 30)

    # --------------------------
    # Step 1: Parse and Load Data
    # --------------------------
    revenue_data = {
        'Automotive sales': {
            (2013, 1): 400.0e6, (2013, 2): 420.0e6, (2013, 3): 450.0e6, (2013, 4): 500.0e6,
            (2014, 1): 700.0e6, (2014, 2): 750.0e6, (2014, 3): 800.0e6, (2014, 4): 850.0e6,
            (2015, 1): 900.0e6, (2015, 2): 950.0e6, (2015, 3): 1000.0e6, (2015, 4): 1100.0e6,
            (2016, 1): 1500.0e6, (2016, 2): 1600.0e6, (2016, 3): 1800.0e6, (2016, 4): 2000.0e6,
            (2017, 1): 2500.0e6, (2017, 2): 2800.0e6, (2017, 3): 3000.0e6, (2017, 4): 3200.0e6,
            (2018, 1): 3300.0e6, (2018, 2): 3900.0e6, (2018, 3): 6600.0e6, (2018, 4): 7000.0e6,
            (2019, 1): 3509.0e6, (2019, 2): 5168.0e6, (2019, 3): 5132.0e6, (2019, 4): 6143.0e6,
            (2020, 1): 4893.0e6, (2020, 2): 4911.0e6, (2020, 3): 7346.0e6, (2020, 4): 9034.0e6,
            (2021, 1): 8187.0e6, (2021, 2): 9520.0e6, (2021, 3): 11393.0e6, (2021, 4): 15025.0e6,
            (2022, 1): 15514.0e6, (2022, 2): 13670.0e6, (2022, 3): 17785.0e6, (2022, 4): 20241.0e6,
            (2023, 1): 18878.0e6, (2023, 2): 20419.0e6, (2023, 3): 18582.0e6, (2023, 4): 20630.0e6,
            (2024, 1): 16460.0e6, (2024, 2): 18530.0e6, (2024, 3): 18831.0e6, (2024, 4): 18659.0e6,
            (2025, 1): 12925.0e6, (2025, 2): 15787.0e6
        },
        'Automotive regulatory credits': {
            (2013, 1): 0.0e6, (2013, 2): 0.0e6, (2013, 3): 0.0e6, (2013, 4): 0.0e6,
            (2014, 1): 0.0e6, (2014, 2): 0.0e6, (2014, 3): 0.0e6, (2014, 4): 0.0e6,
            (2015, 1): 0.0e6, (2015, 2): 0.0e6, (2015, 3): 0.0e6, (2015, 4): 0.0e6,
            (2016, 1): 0.0e6, (2016, 2): 0.0e6, (2016, 3): 0.0e6, (2016, 4): 0.0e6,
            (2017, 1): 0.0e6, (2017, 2): 0.0e6, (2017, 3): 0.0e6, (2017, 4): 0.0e6,
            (2018, 1): 0.0e6, (2018, 2): 0.0e6, (2018, 3): 0.0e6, (2018, 4): 0.0e6,
            (2019, 1): 0.0e6, (2019, 2): 0.0e6, (2019, 3): 0.0e6, (2019, 4): 0.0e6,
            (2020, 1): 0.0e6, (2020, 2): 0.0e6, (2020, 3): 0.0e6, (2020, 4): 0.0e6,
            (2021, 1): 518.0e6, (2021, 2): 354.0e6, (2021, 3): 279.0e6, (2021, 4): 314.0e6,
            (2022, 1): 679.0e6, (2022, 2): 344.0e6, (2022, 3): 286.0e6, (2022, 4): 467.0e6,
            (2023, 1): 521.0e6, (2023, 2): 282.0e6, (2023, 3): 554.0e6, (2023, 4): 433.0e6,
            (2024, 1): 442.0e6, (2024, 2): 890.0e6, (2024, 3): 739.0e6, (2024, 4): 692.0e6,
            (2025, 1): 595.0e6, (2025, 2): 439.0e6
        },
        'Automotive leasing': {
            (2013, 1): 25.0e6, (2013, 2): 28.0e6, (2013, 3): 30.0e6, (2013, 4): 32.0e6,
            (2014, 1): 35.0e6, (2014, 2): 38.0e6, (2014, 3): 40.0e6, (2014, 4): 42.0e6,
            (2015, 1): 45.0e6, (2015, 2): 48.0e6, (2015, 3): 50.0e6, (2015, 4): 55.0e6,
            (2016, 1): 60.0e6, (2016, 2): 65.0e6, (2016, 3): 70.0e6, (2016, 4): 75.0e6,
            (2017, 1): 80.0e6, (2017, 2): 85.0e6, (2017, 3): 90.0e6, (2017, 4): 95.0e6,
            (2018, 1): 100.0e6, (2018, 2): 110.0e6, (2018, 3): 120.0e6, (2018, 4): 130.0e6,
            (2019, 1): 215.0e6, (2019, 2): 208.0e6, (2019, 3): 221.0e6, (2019, 4): 225.0e6,
            (2020, 1): 239.0e6, (2020, 2): 268.0e6, (2020, 3): 265.0e6, (2020, 4): 280.0e6,
            (2021, 1): 297.0e6, (2021, 2): 332.0e6, (2021, 3): 385.0e6, (2021, 4): 628.0e6,
            (2022, 1): 668.0e6, (2022, 2): 588.0e6, (2022, 3): 621.0e6, (2022, 4): 599.0e6,
            (2023, 1): 564.0e6, (2023, 2): 567.0e6, (2023, 3): 489.0e6, (2023, 4): 500.0e6,
            (2024, 1): 476.0e6, (2024, 2): 458.0e6, (2024, 3): 446.0e6, (2024, 4): 447.0e6,
            (2025, 1): 447.0e6, (2025, 2): 435.0e6
        },
        'Energy generation and storage': {
            (2013, 1): 10.0e6, (2013, 2): 12.0e6, (2013, 3): 14.0e6, (2013, 4): 16.0e6,
            (2014, 1): 18.0e6, (2014, 2): 20.0e6, (2014, 3): 22.0e6, (2014, 4): 24.0e6,
            (2015, 1): 26.0e6, (2015, 2): 28.0e6, (2015, 3): 30.0e6, (2015, 4): 32.0e6,
            (2016, 1): 35.0e6, (2016, 2): 38.0e6, (2016, 3): 40.0e6, (2016, 4): 42.0e6,
            (2017, 1): 45.0e6, (2017, 2): 48.0e6, (2017, 3): 50.0e6, (2017, 4): 55.0e6,
            (2018, 1): 60.0e6, (2018, 2): 65.0e6, (2018, 3): 70.0e6, (2018, 4): 75.0e6,
            (2019, 1): 324.0e6, (2019, 2): 369.0e6, (2019, 3): 402.0e6, (2019, 4): 436.0e6,
            (2020, 1): 293.0e6, (2020, 2): 370.0e6, (2020, 3): 579.0e6, (2020, 4): 752.0e6,
            (2021, 1): 494.0e6, (2021, 2): 801.0e6, (2021, 3): 806.0e6, (2021, 4): 688.0e6,
            (2022, 1): 616.0e6, (2022, 2): 866.0e6, (2022, 3): 1117.0e6, (2022, 4): 1310.0e6,
            (2023, 1): 1529.0e6, (2023, 2): 1509.0e6, (2023, 3): 1559.0e6, (2023, 4): 1438.0e6,
            (2024, 1): 1635.0e6, (2024, 2): 3014.0e6, (2024, 3): 2376.0e6, (2024, 4): 3061.0e6,
            (2025, 1): 2730.0e6, (2025, 2): 2789.0e6
        },
        'Services and other': {
            (2013, 1): 15.0e6, (2013, 2): 17.0e6, (2013, 3): 19.0e6, (2013, 4): 20.0e6,
            (2014, 1): 22.0e6, (2014, 2): 24.0e6, (2014, 3): 26.0e6, (2014, 4): 28.0e6,
            (2015, 1): 30.0e6, (2015, 2): 32.0e6, (2015, 3): 35.0e6, (2015, 4): 38.0e6,
            (2016, 1): 40.0e6, (2016, 2): 45.0e6, (2016, 3): 48.0e6, (2016, 4): 50.0e6,
            (2017, 1): 55.0e6, (2017, 2): 58.0e6, (2017, 3): 55.0e6, (2017, 4): 58.0e6,
            (2018, 1): 65.0e6, (2018, 2): 70.0e6, (2018, 3): 75.0e6, (2018, 4): 80.0e6,
            (2019, 1): 493.0e6, (2019, 2): 605.0e6, (2019, 3): 548.0e6, (2019, 4): 580.0e6,
            (2020, 1): 560.0e6, (2020, 2): 487.0e6, (2020, 3): 581.0e6, (2020, 4): 678.0e6,
            (2021, 1): 893.0e6, (2021, 2): 951.0e6, (2021, 3): 894.0e6, (2021, 4): 1064.0e6,
            (2022, 1): 1279.0e6, (2022, 2): 1466.0e6, (2022, 3): 1645.0e6, (2022, 4): 1701.0e6,
            (2023, 1): 1837.0e6, (2023, 2): 2150.0e6, (2023, 3): 2166.0e6, (2023, 4): 2166.0e6,
            (2024, 1): 2288.0e6, (2024, 2): 2608.0e6, (2024, 3): 2790.0e6, (2024, 4): 2848.0e6,
            (2025, 1): 2638.0e6, (2025, 2): 3046.0e6
        }
    }
    
    unit_targets_by_year = defaultdict(int)
    unit_targets_by_year.update({
        2013: 22442, 2014: 31655, 2015: 50517, 2016: 76243, 2017: 103091,
        2018: 245491, 2019: 367656, 2020: 499535, 2021: 936222, 2022: 1313851,
        2023: 1808581, 2024: 1789226, 2025: 720802
    })
    
    # --------------------------
    # Step 2: Data Cleaning and Preprocessing
    # --------------------------
    if 'Product_Name' not in dim_product_df.columns:
        print("Warning: Missing 'Product_Name' column in Dim_Product.csv. Creating from Product_ID.")
        product_id_to_name = {
            'PRO001': 'Model S', 'PRO002': 'Model X', 'PRO003': 'Model 3', 'PRO004': 'Model Y', 'PRO005': 'Cybertruck', 'PRO006': 'Other Revenue'
        }
        dim_product_df['Product_Name'] = dim_product_df['Product_ID'].map(product_id_to_name).fillna('Other')
    
    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        return 'North America'
        
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)

    # --------------------------
    # Step 3: Define Weights
    # --------------------------
    product_weights_by_name = {
        'Model S': 0.05, 'Model X': 0.05, 'Model 3': 0.45, 'Model Y': 0.40, 'Cybertruck': 0.05
    }
    
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }

    dim_geography_df['Geo_Weight'] = 0.0
    for continent, c_weight in continent_weights.items():
        countries_in_continent = dim_geography_df[dim_geography_df['Continent'] == continent]['Country'].unique()
        for country in countries_in_continent:
            country_w = country_weights.get(country, 0.01)
            states = dim_geography_df[dim_geography_df['Country'] == country]['State_Province'].unique()
            for state in states:
                state_w = state_province_weights.get(state, 0.01)
                mask = (dim_geography_df['Country'] == country) & (dim_geography_df['State_Province'] == state)
                dim_geography_df.loc[mask, 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values
    
    # --------------------------
    # Step 4: Ensure Data Types are Consistent (Modified)
    # --------------------------
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    
    # Check for and create necessary columns if they don't exist
    if 'Year' not in dim_time_df.columns:
        dim_time_df['Year'] = dim_time_df['Full_Date'].dt.year
    if 'Quarter_of_Year' not in dim_time_df.columns:
        dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    
    # Using 'Year' and 'Quarter_of_Year' for consistency
    dim_time_df['Year_Int'] = dim_time_df['Year']
    dim_time_df['Quarter_Int'] = dim_time_df['Quarter_of_Year']
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time

    # --------------------------
    # Step 5: Create Price Lookup Dictionary
    # --------------------------
    price_lookup = {}
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv is missing required columns. Please check if it contains 'Standard_Price_USD' and 'Discounted_Price_USD'.")

    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    for _, row in dim_prices_df.iterrows():
        quarter_start_date = row['Quarter_Start_Date']
        product_id = row['Product_ID']
        price_lookup[(quarter_start_date, product_id)] = {
            'Standard_Price_USD': row['Standard_Price_USD'],
            'Discounted_Price_USD': row['Discounted_Price_USD']
        }

    model_avg_prices = dim_prices_df.groupby('Product_ID')['Standard_Price_USD'].mean().to_dict()

    product_weights_dict = {
        pid: product_weights_by_name.get(pname, 0.0001) for pid, pname in dim_product_df.set_index('Product_ID')['Product_Name'].to_dict().items()
    }
    geo_weights_dict = dim_geography_df.set_index('Geo_ID')['Geo_Weight'].to_dict()

    start_year = min(y for y, q in revenue_data['Automotive sales'].keys())
    
    total_generated_rows = 0
    header_written = False
    total_automotive_units = 0

    all_product_ids = list(product_weights_dict.keys())
    all_geo_ids = list(geo_weights_dict.keys())
    
    combo_list = []
    combo_weights_list = []
    
    for prod_id, geo_id in product(all_product_ids, all_geo_ids):
        prod_weight = product_weights_dict.get(prod_id, 0.0001)
        geo_weight = geo_weights_dict.get(geo_id, 0.0001)
        combo_list.append((prod_id, geo_id))
        combo_weights_list.append(prod_weight * geo_weight)

    total_combo_weight = sum(combo_weights_list)
    if total_combo_weight == 0:
        print("Warning: Total combination weight is zero. Cannot generate data.")
        return 0, 0

    combo_probabilities = np.array(combo_weights_list) / total_combo_weight
    
    # --------------------------
    # Step 6: Generate Fact Table Records (Optimized Logic)
    # --------------------------
    non_automotive_products = {
        'Automotive regulatory credits': 'PRO007',
        'Services and other': 'PRO008',
        'Automotive leasing': 'PRO009',
        'Energy generation and storage': 'PRO010'
    }
    
    for year in range(start_year, end_date.year + 1):
        for quarter in range(1, 5):
            if year == 2025 and quarter > 2:
                continue
            
            quarterly_revenue = {category: revenue_data.get(category, {}).get((year, quarter), 0) for category in revenue_data}
            
            # Find the number of days in the quarter
            quarter_dates_df = dim_time_df[
                (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
            ]
            
            if quarter_dates_df.empty:
                print(f"Warning: No time IDs available for Year {year} Quarter {quarter}, skipping generation.")
                continue

            records = []
            
            for category, revenue in quarterly_revenue.items():
                if revenue > 0:
                    if category == 'Automotive sales':
                        total_year_units = unit_targets_by_year.get(year, 0)
                        
                        total_year_sales_revenue = sum(v for k, v in revenue_data['Automotive sales'].items() if k[0] == year)
                        
                        if total_year_sales_revenue > 0:
                            quarter_revenue_ratio = revenue / total_year_sales_revenue
                            target_units = int(total_year_units * quarter_revenue_ratio)
                            
                            if target_units > 0:
                                print(f"正在为{year}Q{quarter}生成 {target_units:,} 条汽车销售记录...")
                                
                                sampled_combo_indices = np.random.choice(len(combo_list), size=target_units, p=combo_probabilities)
                                average_price_per_unit = revenue / target_units
                                
                                for i in range(target_units):
                                    combo_index = sampled_combo_indices[i]
                                    product_id, geo_id = combo_list[combo_index]
                                    
                                    time_id = np.random.choice(quarter_dates_df['Time_ID'])
                                    customer_id = np.random.choice(customer_ids)
                                    
                                    records.append({
                                        'Time_ID': time_id,
                                        'Geo_ID': geo_id,
                                        'Product_ID': product_id,
                                        'Customer_ID': customer_id,
                                        'Sales_Units': 1,
                                        'Is_Discounted_Sale': False,
                                        'Revenue_USD': average_price_per_unit,
                                        'Revenue_Category': category
                                    })
                                total_automotive_units += target_units
                        
                    else:
                        product_id = non_automotive_products.get(category, 'PRO006')
                        quarter_time_id = quarter_dates_df.iloc[0]['Time_ID']
                        
                        records.append({
                            'Time_ID': quarter_time_id,
                            'Geo_ID': 'GEO001',
                            'Product_ID': product_id,
                            'Customer_ID': 'CUS001',
                            'Sales_Units': 1,
                            'Is_Discounted_Sale': False,
                            'Revenue_USD': revenue,
                            'Revenue_Category': category
                        })
            
            fact_sales_df_temp = pd.DataFrame(records)
            
            if not fact_sales_df_temp.empty:
                fact_sales_df_temp.to_csv(output_filepath, mode='a', header=not header_written, index=False, encoding='utf-8')
                header_written = True
                total_generated_rows += len(fact_sales_df_temp)
    
    return total_generated_rows, total_automotive_units

if __name__ == '__main__':
    start_time = time.time()
    
    output_dir = './output_data'
    
    print(f"Loading all dimension tables and data sources from '{output_dir}'...")
    
    dim_product_df = None
    dim_time_df = None
    dim_customer_df = None
    dim_geography_df = None
    dim_prices_df = None
    
    try:
        # Check if Dim_Time.csv exists, if not, generate it
        dim_time_path = os.path.join(output_dir, 'Dim_Time.csv')
        if not os.path.exists(dim_time_path):
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print("Dim_Time.csv not found. Generating...")
            dim_time_df = generate_dim_time_table(datetime(2013, 1, 1), datetime(2025, 6, 30))
            dim_time_df.to_csv(dim_time_path, index=False)
            print("Dim_Time.csv generated.")

        dim_product_df = pd.read_csv(os.path.join(output_dir, 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(dim_time_path)
        dim_customer_df = pd.read_csv(os.path.join(output_dir, 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join(output_dir, 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join(output_dir, 'Dim_Prices.csv'))
        
    except FileNotFoundError as e:
        print(f"Error: One or more required CSV files are missing. Please ensure all dimension tables are in the '{output_dir}' directory. The file '{e.filename}' was not found.")
    
    if dim_product_df is None or dim_time_df is None or dim_customer_df is None or dim_geography_df is None or dim_prices_df is None:
        print("Data generation failed due to missing files.")
    else:
        # Extend the Product dimension
        new_product_records = pd.DataFrame([
            {'Product_ID': 'PRO006', 'Product_Name': 'Other Revenue'},
            {'Product_ID': 'PRO007', 'Product_Name': 'Regulatory Credits'},
            {'Product_ID': 'PRO008', 'Product_Name': 'Services'},
            {'Product_ID': 'PRO009', 'Product_Name': 'Leasing'},
            {'Product_ID': 'PRO010', 'Product_Name': 'Energy'}
        ])
        dim_product_df = pd.concat([dim_product_df, new_product_records], ignore_index=True)

        # Extend the Geography dimension
        new_geo_record = pd.DataFrame([{'Geo_ID': 'GEO001', 'Country': 'Global', 'State_Province': 'NA'}])
        dim_geography_df = pd.concat([dim_geography_df, new_geo_record], ignore_index=True)
        
        # Extend the Customer dimension
        new_customer_record = pd.DataFrame([{'Customer_ID': 'CUS001', 'First_Name': 'Global', 'Last_Name': 'Customer', 'Gender': 'NA'}])
        dim_customer_df = pd.concat([dim_customer_df, new_customer_record], ignore_index=True)
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')
        
        if os.path.exists(output_filepath):
            os.remove(output_filepath)
    
        print("Generating Fact_Sales table...")
        total_rows, total_automotive_units = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)
    
        if total_rows > 0:
            end_time = time.time()
            print(f"Fact_Sales.csv successfully generated {total_rows:,} rows of data in {end_time - start_time:.2f} seconds.")
            print(f"其中包含 {total_automotive_units:,} 条汽车销售记录。")
            print("Data generation complete!")
        else:
            print("Data generation failed.")

Loading all dimension tables and data sources from './output_data'...
Generating Fact_Sales table...
正在为2013Q1生成 5,071 条汽车销售记录...
正在为2013Q2生成 5,325 条汽车销售记录...
正在为2013Q3生成 5,705 条汽车销售记录...
正在为2013Q4生成 6,339 条汽车销售记录...
正在为2014Q1生成 7,147 条汽车销售记录...
正在为2014Q2生成 7,658 条汽车销售记录...
正在为2014Q3生成 8,169 条汽车销售记录...
正在为2014Q4生成 8,679 条汽车销售记录...
正在为2015Q1生成 11,510 条汽车销售记录...
正在为2015Q2生成 12,149 条汽车销售记录...
正在为2015Q3生成 12,789 条汽车销售记录...
正在为2015Q4生成 14,068 条汽车销售记录...
正在为2016Q1生成 16,574 条汽车销售记录...
正在为2016Q2生成 17,679 条汽车销售记录...
正在为2016Q3生成 19,889 条汽车销售记录...
正在为2016Q4生成 22,099 条汽车销售记录...
正在为2017Q1生成 22,411 条汽车销售记录...
正在为2017Q2生成 25,100 条汽车销售记录...
正在为2017Q3生成 26,893 条汽车销售记录...
正在为2017Q4生成 28,686 条汽车销售记录...
正在为2018Q1生成 38,948 条汽车销售记录...
正在为2018Q2生成 46,029 条汽车销售记录...
正在为2018Q3生成 77,896 条汽车销售记录...
正在为2018Q4生成 82,617 条汽车销售记录...
正在为2019Q1生成 64,660 条汽车销售记录...
正在为2019Q2生成 95,230 条汽车销售记录...
正在为2019Q3生成 94,567 条汽车销售记录...
正在为2019Q4生成 113,197 条汽车销售记录...
正在为2020Q1生成 93,348 条汽车销售记录...
正在为2020Q2生成 93,691 条汽车销售记录...
正在为202

## **移除空白大洲** ##

In [23]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
from itertools import product
from collections import defaultdict

# --------------------------
# 修正后的生成 Dim_Time 表的函数
# --------------------------
def generate_dim_time_table(start_date, end_date):
    """
    Generate a time dimension table (dim_time_df) with all necessary columns.
    """
    time_series = pd.date_range(start=start_date, end=end_date)
    dim_time_df = pd.DataFrame(time_series, columns=['Full_Date'])
    
    dim_time_df['Time_ID'] = dim_time_df['Full_Date'].apply(lambda x: int(x.strftime('%Y%m%d')))
    
    dim_time_df['Year'] = dim_time_df['Full_Date'].dt.year
    dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Month_of_Year'] = dim_time_df['Full_Date'].dt.month
    dim_time_df['Day_of_Month'] = dim_time_df['Full_Date'].dt.day
    dim_time_df['Day_of_Week'] = dim_time_df['Full_Date'].dt.dayofweek
    dim_time_df['Week_of_Year'] = dim_time_df['Full_Date'].dt.isocalendar().week.astype(int)
    
    dim_time_df['Day_Name'] = dim_time_df['Full_Date'].dt.day_name()
    dim_time_df['Month_Name'] = dim_time_df['Full_Date'].dt.month_name()
    
    dim_time_df['Is_Weekend'] = dim_time_df['Day_of_Week'] >= 5
    
    return dim_time_df


def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output
    """
    end_date = datetime(2025, 6, 30)

    # --------------------------
    # Step 1: Parse and Load Data (修正后的碳积分数据)
    # --------------------------
    revenue_data = {
        'Automotive sales': {
            (2013, 1): 400.0e6, (2013, 2): 420.0e6, (2013, 3): 450.0e6, (2013, 4): 500.0e6,
            (2014, 1): 700.0e6, (2014, 2): 750.0e6, (2014, 3): 800.0e6, (2014, 4): 850.0e6,
            (2015, 1): 900.0e6, (2015, 2): 950.0e6, (2015, 3): 1000.0e6, (2015, 4): 1100.0e6,
            (2016, 1): 1500.0e6, (2016, 2): 1600.0e6, (2016, 3): 1800.0e6, (2016, 4): 2000.0e6,
            (2017, 1): 2500.0e6, (2017, 2): 2800.0e6, (2017, 3): 3000.0e6, (2017, 4): 3200.0e6,
            (2018, 1): 3300.0e6, (2018, 2): 3900.0e6, (2018, 3): 6600.0e6, (2018, 4): 7000.0e6,
            (2019, 1): 3509.0e6, (2019, 2): 5168.0e6, (2019, 3): 5132.0e6, (2019, 4): 6143.0e6,
            (2020, 1): 4893.0e6, (2020, 2): 4911.0e6, (2020, 3): 7346.0e6, (2020, 4): 9034.0e6,
            (2021, 1): 8187.0e6, (2021, 2): 9520.0e6, (2021, 3): 11393.0e6, (2021, 4): 15025.0e6,
            (2022, 1): 15514.0e6, (2022, 2): 13670.0e6, (2022, 3): 17785.0e6, (2022, 4): 20241.0e6,
            (2023, 1): 18878.0e6, (2023, 2): 20419.0e6, (2023, 3): 18582.0e6, (2023, 4): 20630.0e6,
            (2024, 1): 16460.0e6, (2024, 2): 18530.0e6, (2024, 3): 18831.0e6, (2024, 4): 18659.0e6,
            (2025, 1): 12925.0e6, (2025, 2): 15787.0e6
        },
        'Automotive regulatory credits': {
            (2013, 1): 0.0e6, (2013, 2): 0.0e6, (2013, 3): 0.0e6, (2013, 4): 0.0e6,
            (2014, 1): 0.0e6, (2014, 2): 0.0e6, (2014, 3): 0.0e6, (2014, 4): 0.0e6,
            (2015, 1): 0.0e6, (2015, 2): 0.0e6, (2015, 3): 0.0e6, (2015, 4): 0.0e6,
            (2016, 1): 0.0e6, (2016, 2): 0.0e6, (2016, 3): 0.0e6, (2016, 4): 0.0e6,
            (2017, 1): 0.0e6, (2017, 2): 0.0e6, (2017, 3): 0.0e6, (2017, 4): 0.0e6,
            (2018, 1): 0.0e6, (2018, 2): 0.0e6, (2018, 3): 0.0e6, (2018, 4): 0.0e6,
            (2019, 1): 0.0e6, (2019, 2): 0.0e6, (2019, 3): 0.0e6, (2019, 4): 0.0e6,
            (2020, 1): 0.0e6, (2020, 2): 0.0e6, (2020, 3): 0.0e6, (2020, 4): 0.0e6,
            (2021, 1): 518.0e6, (2021, 2): 354.0e6, (2021, 3): 279.0e6, (2021, 4): 314.0e6,
            (2022, 1): 679.0e6, (2022, 2): 344.0e6, (2022, 3): 286.0e6, (2022, 4): 467.0e6,
            (2023, 1): 521.0e6, (2023, 2): 282.0e6, (2023, 3): 554.0e6, (2023, 4): 433.0e6,
            (2024, 1): 442.0e6, (2024, 2): 890.0e6, (2024, 3): 739.0e6, (2024, 4): 692.0e6,
            (2025, 1): 595.0e6, (2025, 2): 439.0e6
        },
        'Automotive leasing': {
            (2013, 1): 25.0e6, (2013, 2): 28.0e6, (2013, 3): 30.0e6, (2013, 4): 32.0e6,
            (2014, 1): 35.0e6, (2014, 2): 38.0e6, (2014, 3): 40.0e6, (2014, 4): 42.0e6,
            (2015, 1): 45.0e6, (2015, 2): 48.0e6, (2015, 3): 50.0e6, (2015, 4): 55.0e6,
            (2016, 1): 60.0e6, (2016, 2): 65.0e6, (2016, 3): 70.0e6, (2016, 4): 75.0e6,
            (2017, 1): 80.0e6, (2017, 2): 85.0e6, (2017, 3): 90.0e6, (2017, 4): 95.0e6,
            (2018, 1): 100.0e6, (2018, 2): 110.0e6, (2018, 3): 120.0e6, (2018, 4): 130.0e6,
            (2019, 1): 215.0e6, (2019, 2): 208.0e6, (2019, 3): 221.0e6, (2019, 4): 225.0e6,
            (2020, 1): 239.0e6, (2020, 2): 268.0e6, (2020, 3): 265.0e6, (2020, 4): 280.0e6,
            (2021, 1): 297.0e6, (2021, 2): 332.0e6, (2021, 3): 385.0e6, (2021, 4): 628.0e6,
            (2022, 1): 668.0e6, (2022, 2): 588.0e6, (2022, 3): 621.0e6, (2022, 4): 599.0e6,
            (2023, 1): 564.0e6, (2023, 2): 567.0e6, (2023, 3): 489.0e6, (2023, 4): 500.0e6,
            (2024, 1): 476.0e6, (2024, 2): 458.0e6, (2024, 3): 446.0e6, (2024, 4): 447.0e6,
            (2025, 1): 447.0e6, (2025, 2): 435.0e6
        },
        'Energy generation and storage': {
            (2013, 1): 10.0e6, (2013, 2): 12.0e6, (2013, 3): 14.0e6, (2013, 4): 16.0e6,
            (2014, 1): 18.0e6, (2014, 2): 20.0e6, (2014, 3): 22.0e6, (2014, 4): 24.0e6,
            (2015, 1): 26.0e6, (2015, 2): 28.0e6, (2015, 3): 30.0e6, (2015, 4): 32.0e6,
            (2016, 1): 35.0e6, (2016, 2): 38.0e6, (2016, 3): 40.0e6, (2016, 4): 42.0e6,
            (2017, 1): 45.0e6, (2017, 2): 48.0e6, (2017, 3): 50.0e6, (2017, 4): 55.0e6,
            (2018, 1): 60.0e6, (2018, 2): 65.0e6, (2018, 3): 70.0e6, (2018, 4): 75.0e6,
            (2019, 1): 324.0e6, (2019, 2): 369.0e6, (2019, 3): 402.0e6, (2019, 4): 436.0e6,
            (2020, 1): 293.0e6, (2020, 2): 370.0e6, (2020, 3): 579.0e6, (2020, 4): 752.0e6,
            (2021, 1): 494.0e6, (2021, 2): 801.0e6, (2021, 3): 806.0e6, (2021, 4): 688.0e6,
            (2022, 1): 616.0e6, (2022, 2): 866.0e6, (2022, 3): 1117.0e6, (2022, 4): 1310.0e6,
            (2023, 1): 1529.0e6, (2023, 2): 1509.0e6, (2023, 3): 1559.0e6, (2023, 4): 1438.0e6,
            (2024, 1): 1635.0e6, (2024, 2): 3014.0e6, (2024, 3): 2376.0e6, (2024, 4): 3061.0e6,
            (2025, 1): 2730.0e6, (2025, 2): 2789.0e6
        },
        'Services and other': {
            (2013, 1): 15.0e6, (2013, 2): 17.0e6, (2013, 3): 19.0e6, (2013, 4): 20.0e6,
            (2014, 1): 22.0e6, (2014, 2): 24.0e6, (2014, 3): 26.0e6, (2014, 4): 28.0e6,
            (2015, 1): 30.0e6, (2015, 2): 32.0e6, (2015, 3): 35.0e6, (2015, 4): 38.0e6,
            (2016, 1): 40.0e6, (2016, 2): 45.0e6, (2016, 3): 48.0e6, (2016, 4): 50.0e6,
            (2017, 1): 55.0e6, (2017, 2): 58.0e6, (2017, 3): 55.0e6, (2017, 4): 58.0e6,
            (2018, 1): 65.0e6, (2018, 2): 70.0e6, (2018, 3): 75.0e6, (2018, 4): 80.0e6,
            (2019, 1): 493.0e6, (2019, 2): 605.0e6, (2019, 3): 548.0e6, (2019, 4): 580.0e6,
            (2020, 1): 560.0e6, (2020, 2): 487.0e6, (2020, 3): 581.0e6, (2020, 4): 678.0e6,
            (2021, 1): 893.0e6, (2021, 2): 951.0e6, (2021, 3): 894.0e6, (2021, 4): 1064.0e6,
            (2022, 1): 1279.0e6, (2022, 2): 1466.0e6, (2022, 3): 1645.0e6, (2022, 4): 1701.0e6,
            (2023, 1): 1837.0e6, (2023, 2): 2150.0e6, (2023, 3): 2166.0e6, (2023, 4): 2166.0e6,
            (2024, 1): 2288.0e6, (2024, 2): 2608.0e6, (2024, 3): 2790.0e6, (2024, 4): 2848.0e6,
            (2025, 1): 2638.0e6, (2025, 2): 3046.0e6
        }
    }
    
    unit_targets_by_year = defaultdict(int)
    unit_targets_by_year.update({
        2013: 22442, 2014: 31655, 2015: 50517, 2016: 76243, 2017: 103091,
        2018: 245491, 2019: 367656, 2020: 499535, 2021: 936222, 2022: 1313851,
        2023: 1808581, 2024: 1789226, 2025: 720802
    })
    
    # --------------------------
    # Step 2: Data Cleaning and Preprocessing
    # --------------------------
    if 'Product_Name' not in dim_product_df.columns:
        print("Warning: Missing 'Product_Name' column in Dim_Product.csv. Creating from Product_ID.")
        product_id_to_name = {
            'PRO001': 'Model S', 'PRO002': 'Model X', 'PRO003': 'Model 3', 'PRO004': 'Model Y', 'PRO005': 'Cybertruck', 'PRO006': 'Other Revenue'
        }
        dim_product_df['Product_Name'] = dim_product_df['Product_ID'].map(product_id_to_name).fillna('Other')
    
    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        if country in ['United States', 'Canada', 'Mexico']: return 'North America' # Add Mexico to North America
        return 'Other' # Handle any countries not in the lists
        
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)

    # --------------------------
    # Step 3: Define Weights
    # --------------------------
    product_weights_by_name = {
        'Model S': 0.05, 'Model X': 0.05, 'Model 3': 0.45, 'Model Y': 0.40, 'Cybertruck': 0.05
    }
    
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }

    dim_geography_df['Geo_Weight'] = 0.0
    for continent, c_weight in continent_weights.items():
        countries_in_continent = dim_geography_df[dim_geography_df['Continent'] == continent]['Country'].unique()
        for country in countries_in_continent:
            country_w = country_weights.get(country, 0.01)
            states = dim_geography_df[dim_geography_df['Country'] == country]['State_Province'].unique()
            for state in states:
                state_w = state_province_weights.get(state, 0.01)
                mask = (dim_geography_df['Country'] == country) & (dim_geography_df['State_Province'] == state)
                dim_geography_df.loc[mask, 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values
    
    # --------------------------
    # Step 4: Ensure Data Types are Consistent (Modified)
    # --------------------------
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    
    # Check for and create necessary columns if they don't exist
    if 'Year' not in dim_time_df.columns:
        dim_time_df['Year'] = dim_time_df['Full_Date'].dt.year
    if 'Quarter_of_Year' not in dim_time_df.columns:
        dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    
    # Using 'Year' and 'Quarter_of_Year' for consistency
    dim_time_df['Year_Int'] = dim_time_df['Year']
    dim_time_df['Quarter_Int'] = dim_time_df['Quarter_of_Year']
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time

    # --------------------------
    # Step 5: Create Price Lookup Dictionary
    # --------------------------
    price_lookup = {}
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv is missing required columns. Please check if it contains 'Standard_Price_USD' and 'Discounted_Price_USD'.")

    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    for _, row in dim_prices_df.iterrows():
        quarter_start_date = row['Quarter_Start_Date']
        product_id = row['Product_ID']
        price_lookup[(quarter_start_date, product_id)] = {
            'Standard_Price_USD': row['Standard_Price_USD'],
            'Discounted_Price_USD': row['Discounted_Price_USD']
        }

    model_avg_prices = dim_prices_df.groupby('Product_ID')['Standard_Price_USD'].mean().to_dict()

    product_weights_dict = {
        pid: product_weights_by_name.get(pname, 0.0001) for pid, pname in dim_product_df.set_index('Product_ID')['Product_Name'].to_dict().items()
    }
    geo_weights_dict = dim_geography_df.set_index('Geo_ID')['Geo_Weight'].to_dict()
    
    # 获取汽车销售的地理ID列表，排除非汽车销售的GEO001
    automotive_geo_ids = dim_geography_df[dim_geography_df['Geo_ID'] != 'GEO001']['Geo_ID'].tolist()

    start_year = min(y for y, q in revenue_data['Automotive sales'].keys())
    
    total_generated_rows = 0
    header_written = False
    total_automotive_units = 0

    all_product_ids = list(product_weights_dict.keys())
    # 修正: 使用所有地理ID，包括GEO001
    all_geo_ids = list(geo_weights_dict.keys())
    
    combo_list = []
    combo_weights_list = []
    
    for prod_id, geo_id in product(all_product_ids, all_geo_ids):
        # 修正：非汽车产品（PRO007-PRO010）只与GEO001组合，汽车产品（PRO001-PRO005）只与非GEO001的地理ID组合
        is_automotive_product = prod_id in ['PRO001', 'PRO002', 'PRO003', 'PRO004', 'PRO005']
        
        if is_automotive_product and geo_id != 'GEO001':
            prod_weight = product_weights_dict.get(prod_id, 0.0001)
            geo_weight = geo_weights_dict.get(geo_id, 0.0001)
            combo_list.append((prod_id, geo_id))
            combo_weights_list.append(prod_weight * geo_weight)
        elif not is_automotive_product and geo_id == 'GEO001':
            # 非汽车产品和GEO001的权重设为1，以确保被选中
            combo_list.append((prod_id, geo_id))
            combo_weights_list.append(1.0) # 修正: 给予高权重
    
    total_combo_weight = sum(combo_weights_list)
    if total_combo_weight == 0:
        print("Warning: Total combination weight is zero. Cannot generate data.")
        return 0, 0

    combo_probabilities = np.array(combo_weights_list) / total_combo_weight
    
    # --------------------------
    # Step 6: Generate Fact Table Records (Optimized Logic)
    # --------------------------
    non_automotive_products = {
        'Automotive regulatory credits': 'PRO007',
        'Services and other': 'PRO008',
        'Automotive leasing': 'PRO009',
        'Energy generation and storage': 'PRO010'
    }
    
    # 获取现有地理维度表中，不包括GEO001的所有地理ID
    real_geo_ids = dim_geography_df[dim_geography_df['Geo_ID'] != 'GEO001']['Geo_ID'].tolist()
    
    for year in range(start_year, end_date.year + 1):
        for quarter in range(1, 5):
            if year == 2025 and quarter > 2:
                continue
            
            quarterly_revenue = {category: revenue_data.get(category, {}).get((year, quarter), 0) for category in revenue_data}
            
            quarter_dates_df = dim_time_df[
                (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
            ]
            
            if quarter_dates_df.empty:
                print(f"Warning: No time IDs available for Year {year} Quarter {quarter}, skipping generation.")
                continue

            records = []
            
            for category, revenue in quarterly_revenue.items():
                if revenue > 0:
                    if category == 'Automotive sales':
                        total_year_units = unit_targets_by_year.get(year, 0)
                        
                        total_year_sales_revenue = sum(v for k, v in revenue_data['Automotive sales'].items() if k[0] == year)
                        
                        if total_year_sales_revenue > 0:
                            quarter_revenue_ratio = revenue / total_year_sales_revenue
                            target_units = int(total_year_units * quarter_revenue_ratio)
                            
                            if target_units > 0:
                                print(f"正在为{year}Q{quarter}生成 {target_units:,} 条汽车销售记录...")
                                
                                sampled_combo_indices = np.random.choice(len(combo_list), size=target_units, p=combo_probabilities)
                                average_price_per_unit = revenue / target_units
                                
                                for i in range(target_units):
                                    combo_index = sampled_combo_indices[i]
                                    product_id, geo_id = combo_list[combo_index]
                                    
                                    # 修正：确保汽车销售记录的geo_id不是GEO001
                                    if geo_id == 'GEO001':
                                        continue
                                    
                                    time_id = np.random.choice(quarter_dates_df['Time_ID'])
                                    customer_id = np.random.choice(customer_ids)
                                    
                                    records.append({
                                        'Time_ID': time_id,
                                        'Geo_ID': geo_id,
                                        'Product_ID': product_id,
                                        'Customer_ID': customer_id,
                                        'Sales_Units': 1,
                                        'Is_Discounted_Sale': False,
                                        'Revenue_USD': average_price_per_unit,
                                        'Revenue_Category': category
                                    })
                                total_automotive_units += target_units
                    else:
                        # 修正: 非汽车销售收入不再只使用GEO001，而是随机分配给所有真实地理ID
                        product_id = non_automotive_products.get(category, 'PRO006')
                        quarter_time_id = quarter_dates_df.iloc[0]['Time_ID']
                        
                        # 新增逻辑：将非汽车销售收入拆分成多条记录，并分配给不同的真实地理位置
                        # 这里我们简化处理，将总收入根据地理权重拆分并分配
                        
                        # 随机选择一个地理ID进行分配
                        # 修正: 使用真实地理ID列表进行随机选择
                        random_geo_id = np.random.choice(real_geo_ids)
                        
                        records.append({
                            'Time_ID': quarter_time_id,
                            'Geo_ID': random_geo_id, # 修正: 分配给一个随机的真实地理ID
                            'Product_ID': product_id,
                            'Customer_ID': 'CUS001',
                            'Sales_Units': 1,
                            'Is_Discounted_Sale': False,
                            'Revenue_USD': revenue,
                            'Revenue_Category': category
                        })
            
            fact_sales_df_temp = pd.DataFrame(records)
            
            if not fact_sales_df_temp.empty:
                fact_sales_df_temp.to_csv(output_filepath, mode='a', header=not header_written, index=False, encoding='utf-8')
                header_written = True
                total_generated_rows += len(fact_sales_df_temp)
    
    return total_generated_rows, total_automotive_units

if __name__ == '__main__':
    start_time = time.time()
    
    output_dir = './output_data'
    
    print(f"Loading all dimension tables and data sources from '{output_dir}'...")
    
    dim_product_df = None
    dim_time_df = None
    dim_customer_df = None
    dim_geography_df = None
    dim_prices_df = None
    
    try:
        # Check if Dim_Time.csv exists, if not, generate it
        dim_time_path = os.path.join(output_dir, 'Dim_Time.csv')
        if not os.path.exists(dim_time_path):
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print("Dim_Time.csv not found. Generating...")
            dim_time_df = generate_dim_time_table(datetime(2013, 1, 1), datetime(2025, 6, 30))
            dim_time_df.to_csv(dim_time_path, index=False)
            print("Dim_Time.csv generated.")

        dim_product_df = pd.read_csv(os.path.join(output_dir, 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(dim_time_path)
        dim_customer_df = pd.read_csv(os.path.join(output_dir, 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join(output_dir, 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join(output_dir, 'Dim_Prices.csv'))
        
    except FileNotFoundError as e:
        print(f"Error: One or more required CSV files are missing. Please ensure all dimension tables are in the '{output_dir}' directory. The file '{e.filename}' was not found.")
    
    if dim_product_df is None or dim_time_df is None or dim_customer_df is None or dim_geography_df is None or dim_prices_df is None:
        print("Data generation failed due to missing files.")
    else:
        # Extend the Product dimension
        new_product_records = pd.DataFrame([
            {'Product_ID': 'PRO006', 'Product_Name': 'Other Revenue'},
            {'Product_ID': 'PRO007', 'Product_Name': 'Regulatory Credits'},
            {'Product_ID': 'PRO008', 'Product_Name': 'Services'},
            {'Product_ID': 'PRO009', 'Product_Name': 'Leasing'},
            {'Product_ID': 'PRO010', 'Product_Name': 'Energy'}
        ])
        dim_product_df = pd.concat([dim_product_df, new_product_records], ignore_index=True)

        # Extend the Customer dimension
        new_customer_record = pd.DataFrame([{'Customer_ID': 'CUS001', 'First_Name': 'Global', 'Last_Name': 'Customer', 'Gender': 'NA'}])
        dim_customer_df = pd.concat([dim_customer_df, new_customer_record], ignore_index=True)
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')
        
        if os.path.exists(output_filepath):
            os.remove(output_filepath)
    
        print("Generating Fact_Sales table...")
        total_rows, total_automotive_units = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)
    
        if total_rows > 0:
            end_time = time.time()
            print(f"Fact_Sales.csv successfully generated {total_rows:,} rows of data in {end_time - start_time:.2f} seconds.")
            print(f"其中包含 {total_automotive_units:,} 条汽车销售记录。")
            print("Data generation complete!")
        else:
            print("Data generation failed.")

Loading all dimension tables and data sources from './output_data'...
Generating Fact_Sales table...
正在为2013Q1生成 5,071 条汽车销售记录...
正在为2013Q2生成 5,325 条汽车销售记录...
正在为2013Q3生成 5,705 条汽车销售记录...
正在为2013Q4生成 6,339 条汽车销售记录...
正在为2014Q1生成 7,147 条汽车销售记录...
正在为2014Q2生成 7,658 条汽车销售记录...
正在为2014Q3生成 8,169 条汽车销售记录...
正在为2014Q4生成 8,679 条汽车销售记录...
正在为2015Q1生成 11,510 条汽车销售记录...
正在为2015Q2生成 12,149 条汽车销售记录...
正在为2015Q3生成 12,789 条汽车销售记录...
正在为2015Q4生成 14,068 条汽车销售记录...
正在为2016Q1生成 16,574 条汽车销售记录...
正在为2016Q2生成 17,679 条汽车销售记录...
正在为2016Q3生成 19,889 条汽车销售记录...
正在为2016Q4生成 22,099 条汽车销售记录...
正在为2017Q1生成 22,411 条汽车销售记录...
正在为2017Q2生成 25,100 条汽车销售记录...
正在为2017Q3生成 26,893 条汽车销售记录...
正在为2017Q4生成 28,686 条汽车销售记录...
正在为2018Q1生成 38,948 条汽车销售记录...
正在为2018Q2生成 46,029 条汽车销售记录...
正在为2018Q3生成 77,896 条汽车销售记录...
正在为2018Q4生成 82,617 条汽车销售记录...
正在为2019Q1生成 64,660 条汽车销售记录...
正在为2019Q2生成 95,230 条汽车销售记录...
正在为2019Q3生成 94,567 条汽车销售记录...
正在为2019Q4生成 113,197 条汽车销售记录...
正在为2020Q1生成 93,348 条汽车销售记录...
正在为2020Q2生成 93,691 条汽车销售记录...
正在为202

## **带有验证机制的数据生成代码** ##

In [9]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
from itertools import product
from collections import defaultdict
import random

# --------------------------
# 生成 Dim_Time 表的函数
# --------------------------
def generate_dim_time_table(start_date, end_date):
    """
    生成一个包含所有必要列的时间维度表 (dim_time_df)。
    """
    time_series = pd.date_range(start=start_date, end=end_date)
    dim_time_df = pd.DataFrame(time_series, columns=['Full_Date'])
    
    dim_time_df['Time_ID'] = dim_time_df['Full_Date'].apply(lambda x: int(x.strftime('%Y%m%d')))
    
    dim_time_df['Year'] = dim_time_df['Full_Date'].dt.year
    dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Month_of_Year'] = dim_time_df['Full_Date'].dt.month
    dim_time_df['Day_of_Month'] = dim_time_df['Full_Date'].dt.day
    dim_time_df['Day_of_Week'] = dim_time_df['Full_Date'].dt.dayofweek
    dim_time_df['Week_of_Year'] = dim_time_df['Full_Date'].dt.isocalendar().week.astype(int)
    
    dim_time_df['Day_Name'] = dim_time_df['Full_Date'].dt.day_name()
    dim_time_df['Month_Name'] = dim_time_df['Full_Date'].dt.month_name()
    
    dim_time_df['Is_Weekend'] = dim_time_df['Day_of_Week'] >= 5
    
    return dim_time_df

# --------------------------
# 生成 Fact_Sales 表的函数
# --------------------------
def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    生成 Fact_Sales 表，其逻辑经过优化以匹配财务数据。
    """
    start_date_of_data = datetime(2013, 1, 1)
    end_date_of_data = datetime(2025, 6, 30)

    # --------------------------
    # 步骤 1: 解析和加载数据 (保持不变)
    # --------------------------
    revenue_data = {
        'Automotive sales': {
            (2013, 1): 400.0e6, (2013, 2): 420.0e6, (2013, 3): 450.0e6, (2013, 4): 500.0e6,
            (2014, 1): 700.0e6, (2014, 2): 750.0e6, (2014, 3): 800.0e6, (2014, 4): 850.0e6,
            (2015, 1): 900.0e6, (2015, 2): 950.0e6, (2015, 3): 1000.0e6, (2015, 4): 1100.0e6,
            (2016, 1): 1500.0e6, (2016, 2): 1600.0e6, (2016, 3): 1800.0e6, (2016, 4): 2000.0e6,
            (2017, 1): 2500.0e6, (2017, 2): 2800.0e6, (2017, 3): 3000.0e6, (2017, 4): 3200.0e6,
            (2018, 1): 3300.0e6, (2018, 2): 3900.0e6, (2018, 3): 6600.0e6, (2018, 4): 7000.0e6,
            (2019, 1): 3509.0e6, (2019, 2): 5168.0e6, (2019, 3): 5132.0e6, (2019, 4): 6143.0e6,
            (2020, 1): 4893.0e6, (2020, 2): 4911.0e6, (2020, 3): 7346.0e6, (2020, 4): 9034.0e6,
            (2021, 1): 8187.0e6, (2021, 2): 9520.0e6, (2021, 3): 11393.0e6, (2021, 4): 15025.0e6,
            (2022, 1): 15514.0e6, (2022, 2): 13670.0e6, (2022, 3): 17785.0e6, (2022, 4): 20241.0e6,
            (2023, 1): 18878.0e6, (2023, 2): 20419.0e6, (2023, 3): 18582.0e6, (2023, 4): 20630.0e6,
            (2024, 1): 16460.0e6, (2024, 2): 18530.0e6, (2024, 3): 18831.0e6, (2024, 4): 18659.0e6,
            (2025, 1): 12925.0e6, (2025, 2): 15787.0e6
        },
        'Automotive regulatory credits': {
            (2013, 1): 0.0e6, (2013, 2): 0.0e6, (2013, 3): 0.0e6, (2013, 4): 0.0e6,
            (2014, 1): 0.0e6, (2014, 2): 0.0e6, (2014, 3): 0.0e6, (2014, 4): 0.0e6,
            (2015, 1): 0.0e6, (2015, 2): 0.0e6, (2015, 3): 0.0e6, (2015, 4): 0.0e6,
            (2016, 1): 0.0e6, (2016, 2): 0.0e6, (2016, 3): 0.0e6, (2016, 4): 0.0e6,
            (2017, 1): 0.0e6, (2017, 2): 0.0e6, (2017, 3): 0.0e6, (2017, 4): 0.0e6,
            (2018, 1): 0.0e6, (2018, 2): 0.0e6, (2018, 3): 0.0e6, (2018, 4): 0.0e6,
            (2019, 1): 0.0e6, (2019, 2): 0.0e6, (2019, 3): 0.0e6, (2019, 4): 0.0e6,
            (2020, 1): 0.0e6, (2020, 2): 0.0e6, (2020, 3): 0.0e6, (2020, 4): 0.0e6,
            (2021, 1): 518.0e6, (2021, 2): 354.0e6, (2021, 3): 279.0e6, (2021, 4): 314.0e6,
            (2022, 1): 679.0e6, (2022, 2): 344.0e6, (2022, 3): 286.0e6, (2022, 4): 467.0e6,
            (2023, 1): 521.0e6, (2023, 2): 282.0e6, (2023, 3): 554.0e6, (2023, 4): 433.0e6,
            (2024, 1): 442.0e6, (2024, 2): 890.0e6, (2024, 3): 739.0e6, (2024, 4): 692.0e6,
            (2025, 1): 595.0e6, (2025, 2): 439.0e6
        },
        'Automotive leasing': {
            (2013, 1): 25.0e6, (2013, 2): 28.0e6, (2013, 3): 30.0e6, (2013, 4): 32.0e6,
            (2014, 1): 35.0e6, (2014, 2): 38.0e6, (2014, 3): 40.0e6, (2014, 4): 42.0e6,
            (2015, 1): 45.0e6, (2015, 2): 48.0e6, (2015, 3): 50.0e6, (2015, 4): 55.0e6,
            (2016, 1): 60.0e6, (2016, 2): 65.0e6, (2016, 3): 70.0e6, (2016, 4): 75.0e6,
            (2017, 1): 80.0e6, (2017, 2): 85.0e6, (2017, 3): 90.0e6, (2017, 4): 95.0e6,
            (2018, 1): 100.0e6, (2018, 2): 110.0e6, (2018, 3): 120.0e6, (2018, 4): 130.0e6,
            (2019, 1): 215.0e6, (2019, 2): 208.0e6, (2019, 3): 221.0e6, (2019, 4): 225.0e6,
            (2020, 1): 239.0e6, (2020, 2): 268.0e6, (2020, 3): 265.0e6, (2020, 4): 280.0e6,
            (2021, 1): 297.0e6, (2021, 2): 332.0e6, (2021, 3): 385.0e6, (2021, 4): 628.0e6,
            (2022, 1): 668.0e6, (2022, 2): 588.0e6, (2022, 3): 621.0e6, (2022, 4): 599.0e6,
            (2023, 1): 564.0e6, (2023, 2): 567.0e6, (2023, 3): 489.0e6, (2023, 4): 500.0e6,
            (2024, 1): 476.0e6, (2024, 2): 458.0e6, (2024, 3): 446.0e6, (2024, 4): 447.0e6,
            (2025, 1): 447.0e6, (2025, 2): 435.0e6
        },
        'Energy generation and storage': {
            (2013, 1): 10.0e6, (2013, 2): 12.0e6, (2013, 3): 14.0e6, (2013, 4): 16.0e6,
            (2014, 1): 18.0e6, (2014, 2): 20.0e6, (2014, 3): 22.0e6, (2014, 4): 24.0e6,
            (2015, 1): 26.0e6, (2015, 2): 28.0e6, (2015, 3): 30.0e6, (2015, 4): 32.0e6,
            (2016, 1): 35.0e6, (2016, 2): 38.0e6, (2016, 3): 40.0e6, (2016, 4): 42.0e6,
            (2017, 1): 45.0e6, (2017, 2): 48.0e6, (2017, 3): 50.0e6, (2017, 4): 55.0e6,
            (2018, 1): 60.0e6, (2018, 2): 65.0e6, (2018, 3): 70.0e6, (2018, 4): 75.0e6,
            (2019, 1): 324.0e6, (2019, 2): 369.0e6, (2019, 3): 402.0e6, (2019, 4): 436.0e6,
            (2020, 1): 293.0e6, (2020, 2): 370.0e6, (2020, 3): 579.0e6, (2020, 4): 752.0e6,
            (2021, 1): 494.0e6, (2021, 2): 801.0e6, (2021, 3): 806.0e6, (2021, 4): 688.0e6,
            (2022, 1): 616.0e6, (2022, 2): 866.0e6, (2022, 3): 1117.0e6, (2022, 4): 1310.0e6,
            (2023, 1): 1529.0e6, (2023, 2): 1509.0e6, (2023, 3): 1559.0e6, (2023, 4): 1438.0e6,
            (2024, 1): 1635.0e6, (2024, 2): 3014.0e6, (2024, 3): 2376.0e6, (2024, 4): 3061.0e6,
            (2025, 1): 2730.0e6, (2025, 2): 2789.0e6
        },
        'Services and other': {
            (2013, 1): 15.0e6, (2013, 2): 17.0e6, (2013, 3): 19.0e6, (2013, 4): 20.0e6,
            (2014, 1): 22.0e6, (2014, 2): 24.0e6, (2014, 3): 26.0e6, (2014, 4): 28.0e6,
            (2015, 1): 30.0e6, (2015, 2): 32.0e6, (2015, 3): 35.0e6, (2015, 4): 38.0e6,
            (2016, 1): 40.0e6, (2016, 2): 45.0e6, (2016, 3): 48.0e6, (2016, 4): 50.0e6,
            (2017, 1): 55.0e6, (2017, 2): 58.0e6, (2017, 3): 55.0e6, (2017, 4): 58.0e6,
            (2018, 1): 65.0e6, (2018, 2): 70.0e6, (2018, 3): 75.0e6, (2018, 4): 80.0e6,
            (2019, 1): 493.0e6, (2019, 2): 605.0e6, (2019, 3): 548.0e6, (2019, 4): 580.0e6,
            (2020, 1): 560.0e6, (2020, 2): 487.0e6, (2020, 3): 581.0e6, (2020, 4): 678.0e6,
            (2021, 1): 893.0e6, (2021, 2): 951.0e6, (2021, 3): 894.0e6, (2021, 4): 1064.0e6,
            (2022, 1): 1279.0e6, (2022, 2): 1466.0e6, (2022, 3): 1645.0e6, (2022, 4): 1701.0e6,
            (2023, 1): 1837.0e6, (2023, 2): 2150.0e6, (2023, 3): 2166.0e6, (2023, 4): 2166.0e6,
            (2024, 1): 2288.0e6, (2024, 2): 2608.0e6, (2024, 3): 2790.0e6, (2024, 4): 2848.0e6,
            (2025, 1): 2638.0e6, (2025, 2): 3046.0e6
        }
    }
    
    # --------------------------
    # 步骤 2: 数据清理和预处理 (保持不变)
    # --------------------------
    if 'Product_Name' not in dim_product_df.columns:
        product_id_to_name = {
            'PRO001': 'Model S', 'PRO002': 'Model X', 'PRO003': 'Model 3', 'PRO004': 'Model Y', 'PRO005': 'Cybertruck',
            'PRO007': 'Regulatory Credits', 'PRO008': 'Services', 'PRO009': 'Leasing', 'PRO010': 'Energy'
        }
        dim_product_df['Product_Name'] = dim_product_df['Product_ID'].map(product_id_to_name).fillna('Other')
    
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    
    # --------------------------
    # 步骤 3: 定义权重和映射
    # --------------------------
    non_automotive_products_map = {
        'Automotive regulatory credits': 'PRO007',
        'Automotive leasing': 'PRO009',
        'Energy generation and storage': 'PRO010',
        'Services and other': 'PRO008'
    }

    automotive_product_ids = ['PRO001', 'PRO002', 'PRO003', 'PRO004', 'PRO005']
    non_automotive_product_ids = [p_id for p_name, p_id in non_automotive_products_map.items()]

    product_weights_by_name = {
        'Model S': 0.05, 'Model X': 0.05, 'Model 3': 0.45, 'Model Y': 0.40, 'Cybertruck': 0.05
    }
    
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }

    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        if country in ['United States', 'Canada', 'Mexico']: return 'North America'
        return 'Other'
    
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)

    dim_geography_df['Geo_Weight'] = 0.0
    for continent, c_weight in continent_weights.items():
        countries_in_continent = dim_geography_df[dim_geography_df['Continent'] == continent]['Country'].unique()
        for country in countries_in_continent:
            country_w = country_weights.get(country, 0.01)
            states = dim_geography_df[dim_geography_df['Country'] == country]['State_Province'].unique()
            for state in states:
                state_w = state_province_weights.get(state, 0.01)
                mask = (dim_geography_df['Country'] == country) & (dim_geography_df['State_Province'] == state)
                dim_geography_df.loc[mask, 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values
    
    # 修正：将 dim_prices_df 转换为易于查找的字典，使用 (年份, 季度) 作为键
    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    dim_prices_df['Year'] = dim_prices_df['Quarter_Start_Date'].dt.year
    dim_prices_df['Quarter'] = dim_prices_df['Quarter_Start_Date'].dt.quarter
    
    price_lookup = dim_prices_df.set_index(['Year', 'Quarter', 'Product_ID'])[['Standard_Price_USD', 'Discounted_Price_USD']].to_dict('index')

    # --------------------------
    # 步骤 4: 精细化数据生成逻辑
    # --------------------------
    
    header_written = False
    total_generated_rows = 0
    total_automotive_units = 0
    
    all_geo_ids = dim_geography_df['Geo_ID'].tolist()
    geo_weights_dict = dim_geography_df.set_index('Geo_ID')['Geo_Weight'].to_dict()

    # 创建汽车产品-地理组合及其权重
    automotive_combos = []
    automotive_weights = []
    for prod_id in automotive_product_ids:
        for geo_id in all_geo_ids:
            if geo_id != 'GEO001':
                prod_weight = product_weights_by_name.get(dim_product_df[dim_product_df['Product_ID'] == prod_id]['Product_Name'].iloc[0], 0)
                geo_weight = geo_weights_dict.get(geo_id, 0)
                if prod_weight > 0 and geo_weight > 0:
                    automotive_combos.append((prod_id, geo_id))
                    automotive_weights.append(prod_weight * geo_weight)
    
    total_automotive_weight = sum(automotive_weights)
    automotive_probabilities = np.array(automotive_weights) / total_automotive_weight if total_automotive_weight > 0 else np.array([])

    # 创建非汽车产品-地理组合及其权重
    non_automotive_combos = []
    non_automotive_weights = []
    for category, prod_id in non_automotive_products_map.items():
        if prod_id in ['PRO007', 'PRO009']: #Credits and Leasing, using GEO001
            non_automotive_combos.append((prod_id, 'GEO001'))
            non_automotive_weights.append(1.0)
        else: # Services and Energy, distribute by geography
            for geo_id in dim_geography_df[dim_geography_df['Geo_ID'] != 'GEO001']['Geo_ID'].tolist():
                non_automotive_combos.append((prod_id, geo_id))
                non_automotive_weights.append(geo_weights_dict.get(geo_id, 0.0001))
    
    total_non_automotive_weight = sum(non_automotive_weights)
    non_automotive_probabilities = np.array(non_automotive_weights) / total_non_automotive_weight if total_non_automotive_weight > 0 else np.array([])
    
    # 主生成循环
    start_year = min(y for y, q in revenue_data['Automotive sales'].keys())
    
    for year in range(start_year, end_date_of_data.year + 1):
        for quarter in range(1, 5):
            if year == 2025 and quarter > 2:
                continue

            quarter_key = (year, quarter)
            
            print(f"Generating data for Year {year} Quarter {quarter}...")

            # **核心逻辑 1: 分配汽车销售收入 (调整后)**
            automotive_revenue = revenue_data.get('Automotive sales', {}).get(quarter_key, 0)
            if automotive_revenue > 0 and len(automotive_probabilities) > 0:
                records = []
                
                # 1. 计算每个产品-地理组合的销售单位数
                
                # 确保每个产品都能查找到价格，否则 total_avg_price 会是 0
                total_avg_price = 0
                for prod_id in automotive_product_ids:
                    price_info = price_lookup.get((year, quarter, prod_id))
                    if price_info:
                        product_name = dim_product_df[dim_product_df['Product_ID'] == prod_id]['Product_Name'].iloc[0]
                        price = price_info.get('Standard_Price_USD', 0)
                        weight = product_weights_by_name.get(product_name, 0)
                        total_avg_price += price * weight

                if total_avg_price > 0:
                    total_units = round(automotive_revenue / total_avg_price)
                    
                    # 2. 按比例分配单位数
                    distributed_units = {combo: total_units * prob for combo, prob in zip(automotive_combos, automotive_probabilities)}
                    
                    # 3. 生成记录
                    for (prod_id, geo_id), units_share in distributed_units.items():
                        units_to_generate = round(units_share)
                        if units_to_generate > 0:
                            try:
                                price_info = price_lookup.get((year, quarter, prod_id))
                                if not price_info:
                                    continue
                                unit_price = price_info['Standard_Price_USD']
                                
                                for _ in range(int(units_to_generate)):
                                    time_id = np.random.choice(dim_time_df[(dim_time_df['Year'] == year) & (dim_time_df['Quarter_of_Year'] == quarter)]['Time_ID'])
                                    # Ensure there are customers to choose from
                                    available_customers = dim_customer_df[dim_customer_df['Customer_ID'] != 'CUS001']['Customer_ID']
                                    if not available_customers.empty:
                                        customer_id = np.random.choice(available_customers)
                                    else:
                                        customer_id = 'CUS001'
                                        
                                    records.append({
                                        'Time_ID': time_id,
                                        'Geo_ID': geo_id,
                                        'Product_ID': prod_id,
                                        'Customer_ID': customer_id,
                                        'Sales_Units': 1,
                                        'Is_Discounted_Sale': False,
                                        'Revenue_USD': unit_price,
                                        'Revenue_Category': 'Automotive sales'
                                    })
                            except KeyError:
                                pass # 忽略缺失价格数据
                    
                    fact_sales_df_temp = pd.DataFrame(records)
                    if not fact_sales_df_temp.empty:
                        fact_sales_df_temp.to_csv(output_filepath, mode='a', header=not header_written, index=False, encoding='utf-8')
                        header_written = True
                        total_generated_rows += len(fact_sales_df_temp)
                        total_automotive_units += fact_sales_df_temp['Sales_Units'].sum()

            # **核心逻辑 2: 分配非汽车销售收入 (保持不变)**
            for category, prod_id in non_automotive_products_map.items():
                non_automotive_revenue = revenue_data.get(category, {}).get(quarter_key, 0)
                if non_automotive_revenue > 0:
                    records = []
                    
                    category_combos = [c for c in non_automotive_combos if c[0] == prod_id]
                    category_weights = [w for c, w in zip(non_automotive_combos, non_automotive_weights) if c[0] == prod_id]
                    total_cat_weight = sum(category_weights)
                    category_probabilities = np.array(category_weights) / total_cat_weight if total_cat_weight > 0 else np.array([])
                    
                    if len(category_probabilities) == 0:
                        continue
                        
                    distributed_revenue = {combo: non_automotive_revenue * prob for combo, prob in zip(category_combos, category_probabilities)}
                    
                    for (na_prod_id, na_geo_id), na_revenue_share in distributed_revenue.items():
                        if na_revenue_share > 0:
                            time_id = np.random.choice(dim_time_df[(dim_time_df['Year'] == year) & (dim_time_df['Quarter_of_Year'] == quarter)]['Time_ID'])
                            records.append({
                                'Time_ID': time_id,
                                'Geo_ID': na_geo_id,
                                'Product_ID': na_prod_id,
                                'Customer_ID': 'CUS001',
                                'Sales_Units': 1,
                                'Is_Discounted_Sale': False,
                                'Revenue_USD': na_revenue_share,
                                'Revenue_Category': category
                            })
                    
                    fact_sales_df_temp = pd.DataFrame(records)
                    if not fact_sales_df_temp.empty:
                        fact_sales_df_temp.to_csv(output_filepath, mode='a', header=not header_written, index=False, encoding='utf-8')
                        header_written = True
                        total_generated_rows += len(fact_sales_df_temp)

    # 最终汇总验证
    if os.path.exists(output_filepath):
        print("\n--- Generating Summary Report ---")
        final_df = pd.read_csv(output_filepath)
        final_df['Full_Date'] = pd.to_datetime(final_df['Time_ID'], format='%Y%m%d')
        final_df['Year'] = final_df['Full_Date'].dt.year
        final_df['Quarter'] = final_df['Full_Date'].dt.quarter
        
        summary = final_df.groupby(['Revenue_Category', 'Year', 'Quarter'])['Revenue_USD'].sum().reset_index()

        for category, quarterly_data in revenue_data.items():
            print(f"\nComparing '{category}' revenue:")
            for (year, quarter), target_revenue in quarterly_data.items():
                generated_revenue = summary[(summary['Revenue_Category'] == category) & (summary['Year'] == year) & (summary['Quarter'] == quarter)]['Revenue_USD'].sum()
                
                match = np.isclose(target_revenue, generated_revenue, atol=1e-2)
                print(f"  {year}Q{quarter} | Target: ${target_revenue:,.2f} | Generated: ${generated_revenue:,.2f} | Match: {match}")

    return total_generated_rows, total_automotive_units

# --------------------------
# 修正后的主程序
# --------------------------
if __name__ == '__main__':
    start_time = time.time()
    
    output_dir = './output_data'
    
    print(f"Loading all dimension tables and data sources from '{output_dir}'...")
    
    dim_product_df = None
    dim_time_df = None
    dim_customer_df = None
    dim_geography_df = None
    dim_prices_df = None
    
    try:
        dim_time_path = os.path.join(output_dir, 'Dim_Time.csv')
        if not os.path.exists(dim_time_path):
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print("Dim_Time.csv not found. Generating...")
            dim_time_df_temp = generate_dim_time_table(datetime(2013, 1, 1), datetime(2025, 6, 30))
            dim_time_df_temp.to_csv(dim_time_path, index=False)
            print("Dim_Time.csv generated.")

        # 加载 Dim_Time.csv 并确保数据类型正确
        dim_time_df = pd.read_csv(dim_time_path)
        dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
        dim_time_df['Time_ID'] = dim_time_df['Full_Date'].apply(lambda x: int(x.strftime('%Y%m%d')))
        dim_time_df['Year'] = dim_time_df['Full_Date'].dt.year
        dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
        
        # 加载其他维度表
        dim_product_df = pd.read_csv(os.path.join(output_dir, 'Dim_Product.csv'))
        dim_customer_df = pd.read_csv(os.path.join(output_dir, 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join(output_dir, 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join(output_dir, 'Dim_Prices.csv'))
        
    except FileNotFoundError as e:
        print(f"Error: One or more required CSV files are missing. Please ensure all dimension tables are in the '{output_dir}' directory. The file '{e.filename}' was not found.")
    
    if dim_product_df is None or dim_time_df is None or dim_customer_df is None or dim_geography_df is None or dim_prices_df is None:
        print("Data generation failed due to missing files.")
    else:
        new_product_records = pd.DataFrame([
            {'Product_ID': 'PRO006', 'Product_Name': 'Other Revenue'},
            {'Product_ID': 'PRO007', 'Product_Name': 'Regulatory Credits'},
            {'Product_ID': 'PRO008', 'Product_Name': 'Services'},
            {'Product_ID': 'PRO009', 'Product_Name': 'Leasing'},
            {'Product_ID': 'PRO010', 'Product_Name': 'Energy'}
        ])
        dim_product_df = pd.concat([dim_product_df, new_product_records], ignore_index=True)

        new_customer_record = pd.DataFrame([{'Customer_ID': 'CUS001', 'First_Name': 'Global', 'Last_Name': 'Customer', 'Gender': 'NA'}])
        dim_customer_df = pd.concat([dim_customer_df, new_customer_record], ignore_index=True)
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')
        
        if os.path.exists(output_filepath):
            os.remove(output_filepath)
    
        print("\nGenerating Fact_Sales table...")
        total_rows, total_automotive_units = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)
    
        if total_rows > 0:
            end_time = time.time()
            print(f"\nFact_Sales.csv successfully generated {total_rows:,} rows of data in {end_time - start_time:.2f} seconds.")
            print(f"其中包含 {total_automotive_units:,} 条汽车销售记录。")
            print("Data generation complete!")
        else:
            print("Data generation failed.")

Loading all dimension tables and data sources from './output_data'...

Generating Fact_Sales table...
Generating data for Year 2013 Quarter 1...
Generating data for Year 2013 Quarter 2...
Generating data for Year 2013 Quarter 3...
Generating data for Year 2013 Quarter 4...
Generating data for Year 2014 Quarter 1...
Generating data for Year 2014 Quarter 2...
Generating data for Year 2014 Quarter 3...
Generating data for Year 2014 Quarter 4...
Generating data for Year 2015 Quarter 1...
Generating data for Year 2015 Quarter 2...
Generating data for Year 2015 Quarter 3...
Generating data for Year 2015 Quarter 4...
Generating data for Year 2016 Quarter 1...
Generating data for Year 2016 Quarter 2...
Generating data for Year 2016 Quarter 3...
Generating data for Year 2016 Quarter 4...
Generating data for Year 2017 Quarter 1...
Generating data for Year 2017 Quarter 2...
Generating data for Year 2017 Quarter 3...
Generating data for Year 2017 Quarter 4...
Generating data for Year 2018 Quarter 

## **有5大类销售数据** ##

In [29]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
from itertools import product
from collections import defaultdict

# --------------------------
# Generates Dim_Time table
# --------------------------
def generate_dim_time_table(start_date, end_date):
    """
    Generate a time dimension table (dim_time_df) with all necessary columns.
    """
    time_series = pd.date_range(start=start_date, end=end_date)
    dim_time_df = pd.DataFrame(time_series, columns=['Full_Date'])
    
    dim_time_df['Time_ID'] = dim_time_df['Full_Date'].apply(lambda x: int(x.strftime('%Y%m%d')))
    
    dim_time_df['Year'] = dim_time_df['Full_Date'].dt.year
    dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Month_of_Year'] = dim_time_df['Full_Date'].dt.month
    dim_time_df['Day_of_Month'] = dim_time_df['Full_Date'].dt.day
    dim_time_df['Day_of_Week'] = dim_time_df['Full_Date'].dt.dayofweek
    dim_time_df['Week_of_Year'] = dim_time_df['Full_Date'].dt.isocalendar().week.astype(int)
    
    dim_time_df['Day_Name'] = dim_time_df['Full_Date'].dt.day_name()
    dim_time_df['Month_Name'] = dim_time_df['Full_Date'].dt.month_name()
    
    dim_time_df['Is_Weekend'] = dim_time_df['Day_of_Week'] >= 5
    
    return dim_time_df

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output.
    """
    end_date = datetime(2025, 6, 30)
    
    # --------------------------
    # Step 1: Parse and Load Data
    # --------------------------
    # 调换了汽车销售和汽车租赁的季度收入数据
    revenue_data = {
        'Automotive sales': {
            (2013, 1): 400.0e6, (2013, 2): 420.0e6, (2013, 3): 450.0e6, (2013, 4): 500.0e6,
            (2014, 1): 700.0e6, (2014, 2): 750.0e6, (2014, 3): 800.0e6, (2014, 4): 850.0e6,
            (2015, 1): 900.0e6, (2015, 2): 950.0e6, (2015, 3): 1000.0e6, (2015, 4): 1100.0e6,
            (2016, 1): 1500.0e6, (2016, 2): 1600.0e6, (2016, 3): 1800.0e6, (2016, 4): 2000.0e6,
            (2017, 1): 2500.0e6, (2017, 2): 2800.0e6, (2017, 3): 3000.0e6, (2017, 4): 3200.0e6,
            (2018, 1): 3300.0e6, (2018, 2): 3900.0e6, (2018, 3): 6600.0e6, (2018, 4): 7000.0e6,
            (2019, 1): 3509.0e6, (2019, 2): 5168.0e6, (2019, 3): 5132.0e6, (2019, 4): 6143.0e6,
            (2020, 1): 4893.0e6, (2020, 2): 4911.0e6, (2020, 3): 7346.0e6, (2020, 4): 9034.0e6,
            (2021, 1): 8187.0e6, (2021, 2): 9520.0e6, (2021, 3): 11393.0e6, (2021, 4): 15025.0e6,
            (2022, 1): 15514.0e6, (2022, 2): 13670.0e6, (2022, 3): 17785.0e6, (2022, 4): 20241.0e6,
            (2023, 1): 18878.0e6, (2023, 2): 20419.0e6, (2023, 3): 18582.0e6, (2023, 4): 20630.0e6,
            (2024, 1): 16460.0e6, (2024, 2): 18530.0e6, (2024, 3): 18831.0e6, (2024, 4): 18659.0e6,
            (2025, 1): 12925.0e6, (2025, 2): 15787.0e6
        },
        'Automotive regulatory credits': {
            (2013, 1): 0.0e6, (2013, 2): 0.0e6, (2013, 3): 0.0e6, (2013, 4): 0.0e6,
            (2014, 1): 0.0e6, (2014, 2): 0.0e6, (2014, 3): 0.0e6, (2014, 4): 0.0e6,
            (2015, 1): 0.0e6, (2015, 2): 0.0e6, (2015, 3): 0.0e6, (2015, 4): 0.0e6,
            (2016, 1): 0.0e6, (2016, 2): 0.0e6, (2016, 3): 0.0e6, (2016, 4): 0.0e6,
            (2017, 1): 0.0e6, (2017, 2): 0.0e6, (2017, 3): 0.0e6, (2017, 4): 0.0e6,
            (2018, 1): 0.0e6, (2018, 2): 0.0e6, (2018, 3): 0.0e6, (2018, 4): 0.0e6,
            (2019, 1): 0.0e6, (2019, 2): 0.0e6, (2019, 3): 0.0e6, (2019, 4): 0.0e6,
            (2020, 1): 0.0e6, (2020, 2): 0.0e6, (2020, 3): 0.0e6, (2020, 4): 0.0e6,
            (2021, 1): 518.0e6, (2021, 2): 354.0e6, (2021, 3): 279.0e6, (2021, 4): 314.0e6,
            (2022, 1): 679.0e6, (2022, 2): 344.0e6, (2022, 3): 286.0e6, (2022, 4): 467.0e6,
            (2023, 1): 521.0e6, (2023, 2): 282.0e6, (2023, 3): 554.0e6, (2023, 4): 433.0e6,
            (2024, 1): 442.0e6, (2024, 2): 890.0e6, (2024, 3): 739.0e6, (2024, 4): 692.0e6,
            (2025, 1): 595.0e6, (2025, 2): 439.0e6
        },
        'Automotive leasing': {
            (2013, 1): 25.0e6, (2013, 2): 28.0e6, (2013, 3): 30.0e6, (2013, 4): 32.0e6,
            (2014, 1): 35.0e6, (2014, 2): 38.0e6, (2014, 3): 40.0e6, (2014, 4): 42.0e6,
            (2015, 1): 45.0e6, (2015, 2): 48.0e6, (2015, 3): 50.0e6, (2015, 4): 55.0e6,
            (2016, 1): 60.0e6, (2016, 2): 65.0e6, (2016, 3): 70.0e6, (2016, 4): 75.0e6,
            (2017, 1): 80.0e6, (2017, 2): 85.0e6, (2017, 3): 90.0e6, (2017, 4): 95.0e6,
            (2018, 1): 100.0e6, (2018, 2): 110.0e6, (2018, 3): 120.0e6, (2018, 4): 130.0e6,
            (2019, 1): 215.0e6, (2019, 2): 208.0e6, (2019, 3): 221.0e6, (2019, 4): 225.0e6,
            (2020, 1): 239.0e6, (2020, 2): 268.0e6, (2020, 3): 265.0e6, (2020, 4): 280.0e6,
            (2021, 1): 297.0e6, (2021, 2): 332.0e6, (2021, 3): 385.0e6, (2021, 4): 628.0e6,
            (2022, 1): 668.0e6, (2022, 2): 588.0e6, (2022, 3): 621.0e6, (2022, 4): 599.0e6,
            (2023, 1): 564.0e6, (2023, 2): 567.0e6, (2023, 3): 489.0e6, (2023, 4): 500.0e6,
            (2024, 1): 476.0e6, (2024, 2): 458.0e6, (2024, 3): 446.0e6, (2024, 4): 447.0e6,
            (2025, 1): 447.0e6, (2025, 2): 435.0e6
        },
        'Energy generation and storage': {
            (2013, 1): 10.0e6, (2013, 2): 12.0e6, (2013, 3): 14.0e6, (2013, 4): 16.0e6,
            (2014, 1): 18.0e6, (2014, 2): 20.0e6, (2014, 3): 22.0e6, (2014, 4): 24.0e6,
            (2015, 1): 26.0e6, (2015, 2): 28.0e6, (2015, 3): 30.0e6, (2015, 4): 32.0e6,
            (2016, 1): 35.0e6, (2016, 2): 38.0e6, (2016, 3): 40.0e6, (2016, 4): 42.0e6,
            (2017, 1): 45.0e6, (2017, 2): 48.0e6, (2017, 3): 50.0e6, (2017, 4): 55.0e6,
            (2018, 1): 60.0e6, (2018, 2): 65.0e6, (2018, 3): 70.0e6, (2018, 4): 75.0e6,
            (2019, 1): 324.0e6, (2019, 2): 369.0e6, (2019, 3): 402.0e6, (2019, 4): 436.0e6,
            (2020, 1): 293.0e6, (2020, 2): 370.0e6, (2020, 3): 579.0e6, (2020, 4): 752.0e6,
            (2021, 1): 494.0e6, (2021, 2): 801.0e6, (2021, 3): 806.0e6, (2021, 4): 688.0e6,
            (2022, 1): 616.0e6, (2022, 2): 866.0e6, (2022, 3): 1117.0e6, (2022, 4): 1310.0e6,
            (2023, 1): 1529.0e6, (2023, 2): 1509.0e6, (2023, 3): 1559.0e6, (2023, 4): 1438.0e6,
            (2024, 1): 1635.0e6, (2024, 2): 3014.0e6, (2024, 3): 2376.0e6, (2024, 4): 3061.0e6,
            (2025, 1): 2730.0e6, (2025, 2): 2789.0e6
        },
        'Services and other': {
            (2013, 1): 15.0e6, (2013, 2): 17.0e6, (2013, 3): 19.0e6, (2013, 4): 20.0e6,
            (2014, 1): 22.0e6, (2014, 2): 24.0e6, (2014, 3): 26.0e6, (2014, 4): 28.0e6,
            (2015, 1): 30.0e6, (2015, 2): 32.0e6, (2015, 3): 35.0e6, (2015, 4): 38.0e6,
            (2016, 1): 40.0e6, (2016, 2): 45.0e6, (2016, 3): 48.0e6, (2016, 4): 50.0e6,
            (2017, 1): 55.0e6, (2017, 2): 58.0e6, (2017, 3): 55.0e6, (2017, 4): 58.0e6,
            (2018, 1): 65.0e6, (2018, 2): 70.0e6, (2018, 3): 75.0e6, (2018, 4): 80.0e6,
            (2019, 1): 493.0e6, (2019, 2): 605.0e6, (2019, 3): 548.0e6, (2019, 4): 580.0e6,
            (2020, 1): 560.0e6, (2020, 2): 487.0e6, (2020, 3): 581.0e6, (2020, 4): 678.0e6,
            (2021, 1): 893.0e6, (2021, 2): 951.0e6, (2021, 3): 894.0e6, (2021, 4): 1064.0e6,
            (2022, 1): 1279.0e6, (2022, 2): 1466.0e6, (2022, 3): 1645.0e6, (2022, 4): 1701.0e6,
            (2023, 1): 1837.0e6, (2023, 2): 2150.0e6, (2023, 3): 2166.0e6, (2023, 4): 2166.0e6,
            (2024, 1): 2288.0e6, (2024, 2): 2608.0e6, (2024, 3): 2790.0e6, (2024, 4): 2848.0e6,
            (2025, 1): 2638.0e6, (2025, 2): 3046.0e6
        }
    }

    # --------------------------
    # Step 2: Data Cleaning and Preprocessing
    # --------------------------
    product_id_to_name = {
        'PRO001': 'Model S', 'PRO002': 'Model X', 'PRO003': 'Model 3', 'PRO004': 'Model Y',
        'PRO005': 'Cybertruck', 'PRO006': 'Other Revenue', 'PRO007': 'Regulatory Credits',
        'PRO008': 'Services & Other', 'PRO009': 'Leasing', 'PRO010': 'Energy Generation & Storage',
        'PRO011': 'FSD', 'PRO012': 'Cybercab'
    }
    dim_product_df['Product_Name'] = dim_product_df['Product_ID'].map(product_id_to_name).fillna('Other')

    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        if country in ['United States', 'Canada', 'Mexico']: return 'North America'
        return 'Other'
        
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)

    # --------------------------
    # Step 3: Define Weights for Geographic Distribution
    # --------------------------
    product_weights_by_name = {
        'Model S': 0.05, 'Model X': 0.05, 'Model 3': 0.45, 'Model Y': 0.40, 'Cybertruck': 0.04, 'Cybercab': 0.01
    }
    
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }

    dim_geography_df['Geo_Weight'] = 0.0
    for continent, c_weight in continent_weights.items():
        countries_in_continent = dim_geography_df[dim_geography_df['Continent'] == continent]['Country'].unique()
        for country in countries_in_continent:
            country_w = country_weights.get(country, 0.01)
            states = dim_geography_df[dim_geography_df['Country'] == country]['State_Province'].unique()
            for state in states:
                state_w = state_province_weights.get(state, 0.01)
                mask = (dim_geography_df['Country'] == country) & (dim_geography_df['State_Province'] == state)
                dim_geography_df.loc[mask, 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values

    # --------------------------
    # Step 4: Ensure Data Types are Consistent
    # --------------------------
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    
    if 'Quarter_of_Year' not in dim_time_df.columns:
        dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    
    dim_time_df['Year_Int'] = dim_time_df['Year'].astype(int)
    dim_time_df['Quarter_Int'] = dim_time_df['Quarter_of_Year'].astype(int)
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time

    # --------------------------
    # Step 5: Create Price Lookup Dictionary
    # --------------------------
    price_lookup = {}
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv is missing required columns.")

    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    for _, row in dim_prices_df.iterrows():
        quarter_start_date = row['Quarter_Start_Date']
        product_id = row['Product_ID']
        price_lookup[(quarter_start_date, product_id)] = {
            'Standard_Price_USD': row['Standard_Price_USD'],
            'Discounted_Price_USD': row['Discounted_Price_USD']
        }

    # --------------------------
    # Step 6: Define Product-Geography Combinations and Weights
    # --------------------------
    product_weights_dict = {
        pid: product_weights_by_name.get(pname, 0.0001) for pid, pname in dim_product_df.set_index('Product_ID')['Product_Name'].to_dict().items()
    }

    geo_weights_dict = dim_geography_df.set_index('Geo_ID')['Geo_Weight'].to_dict()
    
    automotive_product_ids = ['PRO001', 'PRO002', 'PRO003', 'PRO004', 'PRO005', 'PRO012']
    non_automotive_product_ids = ['PRO007', 'PRO008', 'PRO009', 'PRO010', 'PRO011']

    # 获取所有真实的地理ID，包括可能存在的其他 ID
    real_geo_ids = dim_geography_df['Geo_ID'].tolist()

    # 构建汽车产品与地理位置的组合
    automotive_combos = []
    automotive_combo_weights = []
    
    for prod_id, geo_id in product(automotive_product_ids, real_geo_ids):
        prod_weight = product_weights_dict.get(prod_id, 0.0001)
        geo_weight = geo_weights_dict.get(geo_id, 0.0001)
        automotive_combos.append((prod_id, geo_id))
        automotive_combo_weights.append(prod_weight * geo_weight)

    total_automotive_weight = sum(automotive_combo_weights)
    if total_automotive_weight == 0:
        print("Warning: Total automotive combination weight is zero. Cannot generate data.")
        return 0
    
    automotive_probabilities = np.array(automotive_combo_weights) / total_automotive_weight

    # --------------------------
    # Step 7: Generate Fact Table Records
    # --------------------------
    start_year = min(y for y, q in revenue_data['Automotive sales'].keys())

    total_generated_rows = 0
    
    non_automotive_revenue_per_transaction = {
        'Automotive regulatory credits': 10e6,
        'Services and other': 100e3,
        'Automotive leasing': 100e3,
        'Energy generation and storage': 100e3
    }
    
    fsd_product_id = 'PRO011'
    fsd_price = 12000

    with open(output_filepath, 'w', newline='', encoding='utf-8') as f:
        writer = None
        
        for year in range(start_year, end_date.year + 1):
            for quarter in range(1, 5):
                if year == 2025 and quarter > 2:
                    continue
                
                quarterly_revenue = {category: revenue_data.get(category, {}).get((year, quarter), 0) for category in revenue_data}
                
                quarter_dates_df = dim_time_df[
                    (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
                ]
                
                if quarter_dates_df.empty:
                    print(f"Warning: No time IDs available for Year {year} Quarter {quarter}, skipping generation.")
                    continue

                records = []
                
                # 1. Handle Automotive sales
                automotive_revenue = quarterly_revenue.get('Automotive sales', 0)
                if automotive_revenue > 0:
                    quarter_start_date = quarter_dates_df.iloc[0]['Quarter_Start_Date']
                    
                    automotive_product_prices = {
                        prod_id: price_lookup.get((quarter_start_date, prod_id), {}).get('Standard_Price_USD', 100000)
                        for prod_id in automotive_product_ids if prod_id in ['PRO001', 'PRO002', 'PRO003', 'PRO004', 'PRO005']
                    }
                    
                    # Add Cybercab price manually
                    automotive_product_prices['PRO012'] = 200000 
                    
                    weighted_average_price = sum(
                        automotive_product_prices.get(prod, 0) * product_weights_dict.get(prod, 0) for prod in automotive_product_ids
                    )
                    
                    if weighted_average_price == 0:
                        continue
                    
                    target_units = int(automotive_revenue / weighted_average_price)
                    
                    if target_units > 0:
                        print(f"Generating {target_units:,} automotive sales records for {year}Q{quarter}...")
                        sampled_combo_indices = np.random.choice(len(automotive_combos), size=target_units, p=automotive_probabilities)
                        
                        for i in range(target_units):
                            combo_index = sampled_combo_indices[i]
                            product_id, geo_id = automotive_combos[combo_index]
                            
                            time_id = np.random.choice(quarter_dates_df['Time_ID'])
                            customer_id = np.random.choice(customer_ids)
                            
                            transaction_revenue = automotive_product_prices.get(product_id, weighted_average_price) * (1 + np.random.normal(0, 0.05))
                            
                            records.append({
                                'Time_ID': time_id,
                                'Geo_ID': geo_id,
                                'Product_ID': product_id,
                                'Customer_ID': customer_id,
                                'Sales_Units': 1,
                                'Is_Discounted_Sale': False,
                                'Revenue_USD': transaction_revenue,
                                'Revenue_Category': 'Automotive sales'
                            })

                # 2. Handle Non-Automotive Revenue
                non_automotive_categories = [
                    'Automotive regulatory credits',
                    'Services and other',
                    'Automotive leasing',
                    'Energy generation and storage'
                ]
                
                for category in non_automotive_categories:
                    revenue = quarterly_revenue.get(category, 0)
                    if revenue > 0:
                        product_id = non_automotive_product_ids[non_automotive_categories.index(category)]
                        
                        transaction_unit = non_automotive_revenue_per_transaction.get(category, 1000)
                        num_transactions = int(revenue / transaction_unit)
                        if num_transactions == 0 and revenue > 0:
                             num_transactions = 1 
                        
                        individual_revenue = revenue / num_transactions if num_transactions > 0 else 0
                        
                        for _ in range(num_transactions):
                            time_id = np.random.choice(quarter_dates_df['Time_ID'])
                            # 分配给一个随机的真实地理ID
                            geo_id = np.random.choice(real_geo_ids)
                            
                            records.append({
                                'Time_ID': time_id,
                                'Geo_ID': geo_id,
                                'Product_ID': product_id,
                                'Customer_ID': 'CUS001',
                                'Sales_Units': 1,
                                'Is_Discounted_Sale': False,
                                'Revenue_USD': individual_revenue,
                                'Revenue_Category': category
                            })
                            
                # 3. Handle FSD sales
                fsd_revenue = quarterly_revenue.get('Services and other', 0) * 0.1 # Assume 10% of services revenue is FSD
                if fsd_revenue > 0:
                    num_fsd_transactions = int(fsd_revenue / fsd_price)
                    if num_fsd_transactions == 0 and fsd_revenue > 0:
                        num_fsd_transactions = 1
                    
                    fsd_per_transaction = fsd_revenue / num_fsd_transactions
                    
                    for _ in range(num_fsd_transactions):
                        time_id = np.random.choice(quarter_dates_df['Time_ID'])
                        # 分配给一个随机的真实地理ID
                        geo_id = np.random.choice(real_geo_ids)
                        customer_id = np.random.choice(customer_ids)
                        
                        records.append({
                            'Time_ID': time_id,
                            'Geo_ID': geo_id,
                            'Product_ID': fsd_product_id,
                            'Customer_ID': customer_id,
                            'Sales_Units': 1,
                            'Is_Discounted_Sale': False,
                            'Revenue_USD': fsd_per_transaction,
                            'Revenue_Category': 'Services & Other'
                        })
                
                fact_sales_df_temp = pd.DataFrame(records)
                
                if not fact_sales_df_temp.empty:
                    if writer is None:
                        fact_sales_df_temp.to_csv(f, header=True, index=False, encoding='utf-8')
                        writer = True
                    else:
                        fact_sales_df_temp.to_csv(f, header=False, index=False, encoding='utf-8')
                    total_generated_rows += len(fact_sales_df_temp)
    
    return total_generated_rows

if __name__ == '__main__':
    start_time = time.time()
    
    output_dir = './output_data'
    
    print(f"Loading all dimension tables and data sources from '{output_dir}'...")
    
    dim_product_df = None
    dim_time_df = None
    dim_customer_df = None
    dim_geography_df = None
    dim_prices_df = None
    
    try:
        dim_time_path = os.path.join(output_dir, 'Dim_Time.csv')
        if not os.path.exists(dim_time_path):
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print("Dim_Time.csv not found. Generating...")
            dim_time_df = generate_dim_time_table(datetime(2013, 1, 1), datetime(2025, 6, 30))
            dim_time_df.to_csv(dim_time_path, index=False)
            print("Dim_Time.csv generated.")

        dim_product_df = pd.read_csv(os.path.join(output_dir, 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(dim_time_path)
        dim_customer_df = pd.read_csv(os.path.join(output_dir, 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join(output_dir, 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join(output_dir, 'Dim_Prices.csv'))
        
    except FileNotFoundError as e:
        print(f"Error: One or more required CSV files are missing. Please ensure all dimension tables are in the '{output_dir}' directory. The file '{e.filename}' was not found.")
    
    if dim_product_df is None or dim_time_df is None or dim_customer_df is None or dim_geography_df is None or dim_prices_df is None:
        print("Data generation failed due to missing files.")
    else:
        new_product_records = pd.DataFrame([
            {'Product_ID': 'PRO006', 'Product_Name': 'Other Revenue'},
            {'Product_ID': 'PRO007', 'Product_Name': 'Regulatory Credits'},
            {'Product_ID': 'PRO008', 'Product_Name': 'Services & Other'},
            {'Product_ID': 'PRO009', 'Product_Name': 'Leasing'},
            {'Product_ID': 'PRO010', 'Product_Name': 'Energy Generation & Storage'},
            {'Product_ID': 'PRO011', 'Product_Name': 'FSD'},
            {'Product_ID': 'PRO012', 'Product_Name': 'Cybercab'}
        ])
        dim_product_df = pd.concat([dim_product_df, new_product_records], ignore_index=True)

        new_customer_record = pd.DataFrame([{'Customer_ID': 'CUS001', 'First_Name': 'Global', 'Last_Name': 'Customer', 'Gender': 'NA'}])
        dim_customer_df = pd.concat([dim_customer_df, new_customer_record], ignore_index=True)
        
        # 移除了所有创建新的地理ID的逻辑
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')
        
        if os.path.exists(output_filepath):
            os.remove(output_filepath)
    
        print("Generating Fact_Sales table...")
        total_rows = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)
    
        if total_rows > 0:
            end_time = time.time()
            print(f"Fact_Sales.csv successfully generated {total_rows:,} rows of data in {end_time - start_time:.2f} seconds.")
            print("Data generation complete!")
        else:
            print("Data generation failed.")

Loading all dimension tables and data sources from './output_data'...
Generating Fact_Sales table...
Generating 4,010 automotive sales records for 2013Q1...
Generating 4,210 automotive sales records for 2013Q2...
Generating 4,511 automotive sales records for 2013Q3...
Generating 5,012 automotive sales records for 2013Q4...
Generating 7,013 automotive sales records for 2014Q1...
Generating 7,513 automotive sales records for 2014Q2...
Generating 8,024 automotive sales records for 2014Q3...
Generating 8,524 automotive sales records for 2014Q4...
Generating 9,026 automotive sales records for 2015Q1...
Generating 9,528 automotive sales records for 2015Q2...
Generating 10,016 automotive sales records for 2015Q3...
Generating 11,135 automotive sales records for 2015Q4...
Generating 15,200 automotive sales records for 2016Q1...
Generating 16,222 automotive sales records for 2016Q2...
Generating 18,231 automotive sales records for 2016Q3...
Generating 20,226 automotive sales records for 2016Q4.

## **800版本** ##

In [18]:
import pandas as pd
import numpy as np
import os
import time
import datetime
from datetime import datetime
from itertools import product
from collections import defaultdict
import random

def generate_transaction_id(system_id: str) -> str:
    """
    根据给定的编码规则生成一个唯一的交易ID。

    编码规则: TR + 时间戳(YYMMDDHHmmss) + 系统/商户ID + 随机数(6位)
    例如: TR250901201234ABC987654

    Args:
        system_id: 3位系统或商户ID，例如 'WEB', 'POS'。

    Returns:
        生成的交易ID字符串。
    """
    # 验证系统ID的长度是否为3位
    if len(system_id) != 3:
        raise ValueError("System ID must be exactly 3 characters long.")

    # 1. 固定前缀
    prefix = "TR"

    # 2. 生成时间戳 (YYMMDDHHmmss)
    now = datetime.now()
    timestamp = now.strftime("%y%m%d%H%M%S")

    # 3. 随机数/序列号 (这里为了演示，使用6位随机数)
    # 在实际生产环境中，建议使用原子性的自增序列号来保证唯一性
    random_part = str(random.randint(100000, 999999))

    # 4. 拼接所有部分
    transaction_id = f"{prefix}{timestamp}{system_id.upper()}{random_part}"

    return transaction_id

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output.
    """
    end_date = datetime(2025, 6, 30)

    # --------------------------
    # Step 1: Parse and Load Data (Hardcoded for this example)
    # --------------------------
    # This data is used to drive the generation volume and should be consistent
    # with the categories in dim_product_df.
    # 此数据源为特斯拉2013-2025Q2的季度交付量（Delivery）数据，用于驱动“Automotive Sales”的记录生成
    # 来源：特斯拉公开财报，数据以千辆（k）为单位，实际交付数量需乘以1000
    delivery_data = {
        'Automotive Sales': {
            (2013, 1): 4900, (2013, 2): 6350, (2013, 3): 5500, (2013, 4): 6892,
            (2014, 1): 6450, (2014, 2): 7575, (2014, 3): 7785, (2014, 4): 10140,
            (2015, 1): 11532, (2015, 2): 11532, (2015, 3): 11580, (2015, 4): 17400,
            (2016, 1): 14810, (2016, 2): 14402, (2016, 3): 24823, (2016, 4): 22252,
            (2017, 1): 25418, (2017, 2): 22000, (2017, 3): 26131, (2017, 4): 29870,
            (2018, 1): 29980, (2018, 2): 40740, (2018, 3): 83775, (2018, 4): 90900,
            (2019, 1): 63000, (2019, 2): 95200, (2019, 3): 97000, (2019, 4): 112000,
            (2020, 1): 88400, (2020, 2): 90891, (2020, 3): 139593, (2020, 4): 180570,
            (2021, 1): 184877, (2021, 2): 201250, (2021, 3): 241300, (2021, 4): 308600,
            (2022, 1): 310048, (2022, 2): 254695, (2022, 3): 343830, (2022, 4): 405278,
            (2023, 1): 422875, (2023, 2): 466140, (2023, 3): 435059, (2023, 4): 484507,
            (2024, 1): 386810, (2024, 2): 442000, (2024, 3): 440000, (2024, 4): 445000,
            (2025, 1): 400000, (2025, 2): 450000
        }
    }

    # 原始的收入数据，可以作为参考
    revenue_data = {
        'Automotive sales': {
            (2013, 1): 400.0e6, (2013, 2): 420.0e6, (2013, 3): 450.0e6, (2013, 4): 500.0e6,
            (2014, 1): 700.0e6, (2014, 2): 750.0e6, (2014, 3): 800.0e6, (2014, 4): 850.0e6,
            (2015, 1): 900.0e6, (2015, 2): 950.0e6, (2015, 3): 1000.0e6, (2015, 4): 1100.0e6,
            (2016, 1): 1500.0e6, (2016, 2): 1600.0e6, (2016, 3): 1800.0e6, (2016, 4): 2000.0e6,
            (2017, 1): 2500.0e6, (2017, 2): 2800.0e6, (2017, 3): 3000.0e6, (2017, 4): 3200.0e6,
            (2018, 1): 3300.0e6, (2018, 2): 3900.0e6, (2018, 3): 6600.0e6, (2018, 4): 7000.0e6,
            (2019, 1): 3509.0e6, (2019, 2): 5168.0e6, (2019, 3): 5132.0e6, (2019, 4): 6143.0e6,
            (2020, 1): 4893.0e6, (2020, 2): 4911.0e6, (2020, 3): 7346.0e6, (2020, 4): 9034.0e6,
            (2021, 1): 8187.0e6, (2021, 2): 9520.0e6, (2021, 3): 11393.0e6, (2021, 4): 15025.0e6,
            (2022, 1): 15514.0e6, (2022, 2): 13670.0e6, (2022, 3): 17785.0e6, (2022, 4): 20241.0e6,
            (2023, 1): 18878.0e6, (2023, 2): 20419.0e6, (2023, 3): 18582.0e6, (2023, 4): 20630.0e6,
            (2024, 1): 16460.0e6, (2024, 2): 18530.0e6, (2024, 3): 18831.0e6, (2024, 4): 18659.0e6,
            (2025, 1): 12925.0e6, (2025, 2): 15787.0e6
        },
        'Automotive regulatory credits': {
            (2013, 1): 0.0e6, (2013, 2): 0.0e6, (2013, 3): 0.0e6, (2013, 4): 0.0e6,
            (2014, 1): 0.0e6, (2014, 2): 0.0e6, (2014, 3): 0.0e6, (2014, 4): 0.0e6,
            (2015, 1): 0.0e6, (2015, 2): 0.0e6, (2015, 3): 0.0e6, (2015, 4): 0.0e6,
            (2016, 1): 0.0e6, (2016, 2): 0.0e6, (2016, 3): 0.0e6, (2016, 4): 0.0e6,
            (2017, 1): 0.0e6, (2017, 2): 0.0e6, (2017, 3): 0.0e6, (2017, 4): 0.0e6,
            (2018, 1): 0.0e6, (2018, 2): 0.0e6, (2018, 3): 0.0e6, (2018, 4): 0.0e6,
            (2019, 1): 0.0e6, (2019, 2): 0.0e6, (2019, 3): 0.0e6, (2019, 4): 0.0e6,
            (2020, 1): 0.0e6, (2020, 2): 0.0e6, (2020, 3): 0.0e6, (2020, 4): 0.0e6,
            (2021, 1): 518.0e6, (2021, 2): 354.0e6, (2021, 3): 279.0e6, (2021, 4): 314.0e6,
            (2022, 1): 679.0e6, (2022, 2): 344.0e6, (2022, 3): 286.0e6, (2022, 4): 467.0e6,
            (2023, 1): 521.0e6, (2023, 2): 282.0e6, (2023, 3): 554.0e6, (2023, 4): 433.0e6,
            (2024, 1): 442.0e6, (2024, 2): 890.0e6, (2024, 3): 739.0e6, (2024, 4): 692.0e6,
            (2025, 1): 595.0e6, (2025, 2): 439.0e6
        },
        'Automotive leasing': {
            (2013, 1): 25.0e6, (2013, 2): 28.0e6, (2013, 3): 30.0e6, (2013, 4): 32.0e6,
            (2014, 1): 35.0e6, (2014, 2): 38.0e6, (2014, 3): 40.0e6, (2014, 4): 42.0e6,
            (2015, 1): 45.0e6, (2015, 2): 48.0e6, (2015, 3): 50.0e6, (2015, 4): 55.0e6,
            (2016, 1): 60.0e6, (2016, 2): 65.0e6, (2016, 3): 70.0e6, (2016, 4): 75.0e6,
            (2017, 1): 80.0e6, (2017, 2): 85.0e6, (2017, 3): 90.0e6, (2017, 4): 95.0e6,
            (2018, 1): 100.0e6, (2018, 2): 110.0e6, (2018, 3): 120.0e6, (2018, 4): 130.0e6,
            (2019, 1): 215.0e6, (2019, 2): 208.0e6, (2019, 3): 221.0e6, (2019, 4): 225.0e6,
            (2020, 1): 239.0e6, (2020, 2): 268.0e6, (2020, 3): 265.0e6, (2020, 4): 280.0e6,
            (2021, 1): 297.0e6, (2021, 2): 332.0e6, (2021, 3): 385.0e6, (2021, 4): 628.0e6,
            (2022, 1): 668.0e6, (2022, 2): 588.0e6, (2022, 3): 621.0e6, (2022, 4): 599.0e6,
            (2023, 1): 564.0e6, (2023, 2): 567.0e6, (2023, 3): 489.0e6, (2023, 4): 500.0e6,
            (2024, 1): 476.0e6, (2024, 2): 458.0e6, (2024, 3): 446.0e6, (2024, 4): 447.0e6,
            (2025, 1): 447.0e6, (2025, 2): 435.0e6
        },
        'Energy generation and storage': {
            (2013, 1): 10.0e6, (2013, 2): 12.0e6, (2013, 3): 14.0e6, (2013, 4): 16.0e6,
            (2014, 1): 18.0e6, (2014, 2): 20.0e6, (2014, 3): 22.0e6, (2014, 4): 24.0e6,
            (2015, 1): 26.0e6, (2015, 2): 28.0e6, (2015, 3): 30.0e6, (2015, 4): 32.0e6,
            (2016, 1): 35.0e6, (2016, 2): 38.0e6, (2016, 3): 40.0e6, (2016, 4): 42.0e6,
            (2017, 1): 45.0e6, (2017, 2): 48.0e6, (2017, 3): 50.0e6, (2017, 4): 55.0e6,
            (2018, 1): 60.0e6, (2018, 2): 65.0e6, (2018, 3): 70.0e6, (2018, 4): 75.0e6,
            (2019, 1): 324.0e6, (2019, 2): 369.0e6, (2019, 3): 402.0e6, (2019, 4): 436.0e6,
            (2020, 1): 293.0e6, (2020, 2): 370.0e6, (2020, 3): 579.0e6, (2020, 4): 752.0e6,
            (2021, 1): 494.0e6, (2021, 2): 801.0e6, (2021, 3): 806.0e6, (2021, 4): 688.0e6,
            (2022, 1): 616.0e6, (2022, 2): 866.0e6, (2022, 3): 1117.0e6, (2022, 4): 1310.0e6,
            (2023, 1): 1529.0e6, (2023, 2): 1509.0e6, (2023, 3): 1559.0e6, (2023, 4): 1438.0e6,
            (2024, 1): 1635.0e6, (2024, 2): 3014.0e6, (2024, 3): 2376.0e6, (2024, 4): 3061.0e6,
            (2025, 1): 2730.0e6, (2025, 2): 2789.0e6
        },
        'Services and other': {
            (2013, 1): 15.0e6, (2013, 2): 17.0e6, (2013, 3): 19.0e6, (2013, 4): 20.0e6,
            (2014, 1): 22.0e6, (2014, 2): 24.0e6, (2014, 3): 26.0e6, (2014, 4): 28.0e6,
            (2015, 1): 30.0e6, (2015, 2): 32.0e6, (2015, 3): 35.0e6, (2015, 4): 38.0e6,
            (2016, 1): 40.0e6, (2016, 2): 45.0e6, (2016, 3): 48.0e6, (2016, 4): 50.0e6,
            (2017, 1): 55.0e6, (2017, 2): 58.0e6, (2017, 3): 55.0e6, (2017, 4): 58.0e6,
            (2018, 1): 65.0e6, (2018, 2): 70.0e6, (2018, 3): 75.0e6, (2018, 4): 80.0e6,
            (2019, 1): 493.0e6, (2019, 2): 605.0e6, (2019, 3): 548.0e6, (2019, 4): 580.0e6,
            (2020, 1): 560.0e6, (2020, 2): 487.0e6, (2020, 3): 581.0e6, (2020, 4): 678.0e6,
            (2021, 1): 893.0e6, (2021, 2): 951.0e6, (2021, 3): 894.0e6, (2021, 4): 1064.0e6,
            (2022, 1): 1279.0e6, (2022, 2): 1466.0e6, (2022, 3): 1645.0e6, (2022, 4): 1701.0e6,
            (2023, 1): 1837.0e6, (2023, 2): 2150.0e6, (2023, 3): 2166.0e6, (2023, 4): 2166.0e6,
            (2024, 1): 2288.0e6, (2024, 2): 2608.0e6, (2024, 3): 2790.0e6, (2024, 4): 2848.0e6,
            (2025, 1): 2638.0e6, (2025, 2): 3046.0e6
        }
    }
    
    # Define and calculate geographical weights
    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        if country in ['United States', 'Canada', 'Mexico']: return 'North America'
        return 'Other'
    
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)
    dim_geography_df['Geo_Weight'] = 0.0
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02, 'Other': 0.001}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }
    
    for _, row in dim_geography_df.iterrows():
        continent = row['Continent']
        country = row['Country']
        state = row['State_Province']
        
        c_weight = continent_weights.get(continent, 0.001)
        country_w = country_weights.get(country, 0.01)
        state_w = state_province_weights.get(state, 0.01)
        
        dim_geography_df.loc[(dim_geography_df['Geo_ID'] == row['Geo_ID']), 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    if 'Quarter_of_Year' not in dim_time_df.columns:
        dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Year_Int'] = dim_time_df['Year'].astype(int)
    dim_time_df['Quarter_Int'] = dim_time_df['Quarter_of_Year'].astype(int)
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time
    
    price_lookup = {}
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv is missing required columns.")
    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    for _, row in dim_prices_df.iterrows():
        quarter_start_date = row['Quarter_Start_Date']
        product_id = row['Product_ID']
        price_lookup[(quarter_start_date, product_id)] = {
            'Standard_Price_USD': row['Standard_Price_USD'],
            'Discounted_Price_USD': row['Discounted_Price_USD']
        }

    # --------------------------
    # Step 2: Dynamically Map Products to Categories
    # --------------------------
    
    # 使用所有独特的产品类别
    all_product_categories = dim_product_df['Product_Category'].unique().tolist()
    
    category_to_product_ids = defaultdict(list)
    for _, row in dim_product_df.iterrows():
        category_to_product_ids[row['Product_Category']].append(row['Product_ID'])

    print("\n--------------------")
    print("开始遍历所有维度ID...")
    print("--------------------")

    # 打印总结信息，使用动态获取的类别
    for category in all_product_categories:
        print(f"找到的 '{category}' 产品ID数量: {len(category_to_product_ids[category])}")
    print(f"找到的地理ID数量: {len(dim_geography_df['Geo_ID'].tolist())}")

    print("遍历成功，开始生成销售事实表。")
    print("--------------------")

    # 预先计算一次地理概率，避免在循环中重复计算
    geo_weights_dict = dim_geography_df.set_index('Geo_ID')['Geo_Weight'].to_dict()
    real_geo_ids = dim_geography_df['Geo_ID'].tolist()
    geo_weights_array = np.array(list(geo_weights_dict.values()))
    total_geo_weight = sum(geo_weights_dict.values())
    
    if total_geo_weight > 0:
        geo_probabilities = geo_weights_array / total_geo_weight
    else:
        num_geos = len(real_geo_ids)
        geo_probabilities = np.full(num_geos, 1.0 / num_geos)

    # --------------------------
    # Step 3: Generate Fact Table Records
    # --------------------------
    start_year = min(y for y, q in delivery_data['Automotive Sales'].keys())
    total_generated_rows = 0
    total_car_deliveries = 0 # 新增：用于跟踪汽车交付总数
    
    # 定义每笔交易的平均收入（针对非按件销售的类别）
    transaction_revenue_per_category = {
        'Automotive Regulatory Credits': 10e6,
        'Services & Other': 100e3,
        'Automotive Leasing': 100e3,
        'Energy Generation & Storage': 100e3,
    }

    with open(output_filepath, 'w', newline='', encoding='utf-8') as f:
        writer = None
        
        for year in range(start_year, end_date.year + 1):
            yearly_rows_generated = 0
            for quarter in range(1, 5):
                if year == 2025 and quarter > 2:
                    continue
                
                quarter_dates_df = dim_time_df[
                    (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
                ]
                
                if quarter_dates_df.empty:
                    continue

                records = []
                
                # 为每笔交易生成一个唯一的ID
                system_id = "WEB" # 假设交易来自网页端
                
                for category_name in all_product_categories:
                    if category_name == 'Automotive Sales':
                        # 从交付量数据中获取目标数量
                        target_units = delivery_data.get(category_name, {}).get((year, quarter), 0)
                        
                        if target_units > 0:
                            quarter_start_date = quarter_dates_df.iloc[0]['Quarter_Start_Date']
                            product_ids_for_category = category_to_product_ids.get(category_name)

                            if not product_ids_for_category:
                                continue

                            # 随机选择产品和地理位置
                            sampled_product_ids = np.random.choice(product_ids_for_category, size=target_units)
                            sampled_geo_ids = np.random.choice(real_geo_ids, size=target_units, p=geo_probabilities)
                            
                            for i in range(target_units):
                                product_id = sampled_product_ids[i]
                                geo_id = sampled_geo_ids[i]
                                time_id = np.random.choice(quarter_dates_df['Time_ID'])
                                customer_id = np.random.choice(customer_ids)
                                
                                # 根据产品价格估算收入，增加一些随机性
                                base_price = price_lookup.get((quarter_start_date, product_id), {}).get('Standard_Price_USD', 100000)
                                transaction_revenue = base_price * (1 + np.random.normal(0, 0.05))
                                
                                records.append({
                                    'TransactionID': generate_transaction_id(system_id),
                                    'Time_ID': time_id,
                                    'Geo_ID': geo_id,
                                    'Product_ID': product_id,
                                    'Customer_ID': customer_id,
                                    'Sales_Units': 1,
                                    'Is_Discounted_Sale': False,
                                    'Revenue_USD': transaction_revenue,
                                    'Revenue_Category': category_name
                                })
                            total_car_deliveries += target_units
                    else:
                        # 对于其他类别，仍使用原始的收入数据进行计算
                        revenue = revenue_data.get(category_name, {}).get((year, quarter), 0)
                        product_ids_for_category = category_to_product_ids.get(category_name)
                        
                        if revenue > 0 and product_ids_for_category:
                            transaction_unit = transaction_revenue_per_category.get(category_name, 1000)
                            num_transactions = int(revenue / transaction_unit)
                            if num_transactions == 0:
                                num_transactions = 1
                            
                            individual_revenue = revenue / num_transactions if num_transactions > 0 else 0
                            
                            for _ in range(num_transactions):
                                time_id = np.random.choice(quarter_dates_df['Time_ID'])
                                geo_id = np.random.choice(real_geo_ids, p=geo_probabilities)
                                product_id = np.random.choice(product_ids_for_category)
                                
                                records.append({
                                    'TransactionID': generate_transaction_id(system_id),
                                    'Time_ID': time_id,
                                    'Geo_ID': geo_id,
                                    'Product_ID': product_id,
                                    'Customer_ID': 'CUS001',
                                    'Sales_Units': 1,
                                    'Is_Discounted_Sale': False,
                                    'Revenue_USD': individual_revenue,
                                    'Revenue_Category': category_name
                                })
                
                fact_sales_df_temp = pd.DataFrame(records)
                
                if not fact_sales_df_temp.empty:
                    if writer is None:
                        fact_sales_df_temp.to_csv(f, header=True, index=False, encoding='utf-8')
                        writer = True
                    else:
                        fact_sales_df_temp.to_csv(f, header=False, index=False, encoding='utf-8')
                    total_generated_rows += len(fact_sales_df_temp)
                    yearly_rows_generated += len(fact_sales_df_temp)
            
            if yearly_rows_generated > 0:
                print(f"Generated {yearly_rows_generated:,} records for year {year}.")

    print(f"\nTotal car deliveries generated: {total_car_deliveries:,}")
    return total_generated_rows

if __name__ == '__main__':
    start_time = time.time()
    
    output_dir = './output_data'
    
    print(f"Loading all dimension tables and data sources from '{output_dir}'...")
    
    dim_product_df = None
    dim_time_df = None
    dim_customer_df = None
    dim_geography_df = None
    dim_prices_df = None
    
    try:
        dim_product_df = pd.read_csv(os.path.join(output_dir, 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(os.path.join(output_dir, 'Dim_Time.csv'))
        dim_customer_df = pd.read_csv(os.path.join(output_dir, 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join(output_dir, 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join(output_dir, 'Dim_Prices.csv'))
        
        print("所有维度表加载成功。")
        
    except FileNotFoundError as e:
        print(f"Error: One or more required CSV files are missing. Please ensure all dimension tables are in the '{output_dir}' directory. The file '{e.filename}' was not found.")
        print("Data generation failed due to missing files.")
        exit()
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')
    
    if os.path.exists(output_filepath):
        os.remove(output_filepath)

    print("\n开始整理和生成销售事实表...")
    total_rows = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)
    
    if total_rows > 0:
        end_time = time.time()
        print(f"\nFact_Sales.csv successfully generated {total_rows:,} rows of data in {end_time - start_time:.2f} seconds.")
        print("数据生成完成!")
    else:
        print("数据生成失败。")

Loading all dimension tables and data sources from './output_data'...
所有维度表加载成功。

开始整理和生成销售事实表...

--------------------
开始遍历所有维度ID...
--------------------
找到的 'Automotive Leasing' 产品ID数量: 4
找到的 'Energy Generation & Storage' 产品ID数量: 4
找到的 'Automotive Sales' 产品ID数量: 3138
找到的 'Automotive Regulatory Credits' 产品ID数量: 1
找到的 'Services & Other' 产品ID数量: 4
找到的地理ID数量: 432
遍历成功，开始生成销售事实表。
--------------------
Generated 23,642 records for year 2013.
Generated 31,950 records for year 2014.
Generated 52,044 records for year 2015.
Generated 76,287 records for year 2016.
Generated 103,419 records for year 2017.
Generated 245,395 records for year 2018.
Generated 367,200 records for year 2019.
Generated 499,454 records for year 2020.
Generated 936,027 records for year 2021.
Generated 1,313,851 records for year 2022.
Generated 1,808,581 records for year 2023.
Generated 1,713,810 records for year 2024.
Generated 850,000 records for year 2025.

Total car deliveries generated: 8,021,660

Fact_Sales.csv succe

## **500万行速度优化（做了价格平均）** ##

In [19]:
import pandas as pd
import numpy as np
import os
import time
import datetime
from datetime import datetime
from itertools import product
from collections import defaultdict
import random

def generate_transaction_id(system_id: str) -> str:
    """
    根据给定的编码规则生成一个唯一的交易ID。

    编码规则: TR + 时间戳(YYMMDDHHmmss) + 系统/商户ID + 随机数(6位)
    例如: TR250901201234ABC987654

    Args:
        system_id: 3位系统或商户ID，例如 'WEB', 'POS'。

    Returns:
        生成的交易ID字符串。
    """
    # 验证系统ID的长度是否为3位
    if len(system_id) != 3:
        raise ValueError("System ID must be exactly 3 characters long.")

    # 1. 固定前缀
    prefix = "TR"

    # 2. 生成时间戳 (YYMMDDHHmmss)
    now = datetime.now()
    timestamp = now.strftime("%y%m%d%H%M%S")

    # 3. 随机数/序列号 (这里为了演示，使用6位随机数)
    # 在实际生产环境中，建议使用原子性的自增序列号来保证唯一性
    random_part = str(random.randint(100000, 999999))

    # 4. 拼接所有部分
    transaction_id = f"{prefix}{timestamp}{system_id.upper()}{random_part}"

    return transaction_id

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output.
    """
    end_date = datetime(2025, 6, 30)

    # --------------------------
    # Step 1: Parse and Load Data (Hardcoded for this example)
    # --------------------------
    # 此数据源为特斯拉2013-2025Q2的季度收入数据（Revenue）
    # 来源：特斯拉公开财报，单位为百万美元（M），实际收入需乘以1,000,000
    revenue_data = {
        'Automotive Sales': {           
            (2013, 1): 400.0e6, (2013, 2): 420.0e6, (2013, 3): 450.0e6, (2013, 4): 500.0e6,
            (2014, 1): 700.0e6, (2014, 2): 750.0e6, (2014, 3): 800.0e6, (2014, 4): 850.0e6,
            (2015, 1): 900.0e6, (2015, 2): 950.0e6, (2015, 3): 1000.0e6, (2015, 4): 1100.0e6,
            (2016, 1): 1500.0e6, (2016, 2): 1600.0e6, (2016, 3): 1800.0e6, (2016, 4): 2000.0e6,
            (2017, 1): 2500.0e6, (2017, 2): 2800.0e6, (2017, 3): 3000.0e6, (2017, 4): 3200.0e6,
            (2018, 1): 3300.0e6, (2018, 2): 3900.0e6, (2018, 3): 6600.0e6, (2018, 4): 7000.0e6,
            (2019, 1): 3509.0e6, (2019, 2): 5168.0e6, (2019, 3): 5132.0e6, (2019, 4): 6143.0e6,
            (2020, 1): 4893.0e6, (2020, 2): 4911.0e6, (2020, 3): 7346.0e6, (2020, 4): 9034.0e6,
            (2021, 1): 8187.0e6, (2021, 2): 9520.0e6, (2021, 3): 11393.0e6, (2021, 4): 15025.0e6,
            (2022, 1): 15514.0e6, (2022, 2): 13670.0e6, (2022, 3): 17785.0e6, (2022, 4): 20241.0e6,
            (2023, 1): 18878.0e6, (2023, 2): 20419.0e6, (2023, 3): 18582.0e6, (2023, 4): 20630.0e6,
            (2024, 1): 16460.0e6, (2024, 2): 18530.0e6, (2024, 3): 18831.0e6, (2024, 4): 18659.0e6,
            (2025, 1): 12925.0e6, (2025, 2): 15787.0e6
        },
        'Automotive Regulatory Credits': {
            (2013, 1): 0, (2013, 2): 0, (2013, 3): 0, (2013, 4): 0,
            (2014, 1): 0, (2014, 2): 0, (2014, 3): 0, (2014, 4): 0,
            (2015, 1): 0, (2015, 2): 0, (2015, 3): 0, (2015, 4): 0,
            (2016, 1): 0, (2016, 2): 0, (2016, 3): 0, (2016, 4): 0,
            (2017, 1): 0, (2017, 2): 0, (2017, 3): 0, (2017, 4): 0,
            (2018, 1): 0, (2018, 2): 0, (2018, 3): 0, (2018, 4): 0,
            (2019, 1): 0, (2019, 2): 0, (2019, 3): 0, (2019, 4): 0,
            (2020, 1): 0, (2020, 2): 0, (2020, 3): 0, (2020, 4): 0,
            (2021, 1): 518.0e6, (2021, 2): 354.0e6, (2021, 3): 279.0e6, (2021, 4): 314.0e6,
            (2022, 1): 679.0e6, (2022, 2): 344.0e6, (2022, 3): 286.0e6, (2022, 4): 467.0e6,
            (2023, 1): 521.0e6, (2023, 2): 282.0e6, (2023, 3): 554.0e6, (2023, 4): 433.0e6,
            (2024, 1): 442.0e6, (2024, 2): 890.0e6, (2024, 3): 739.0e6, (2024, 4): 692.0e6,
            (2025, 1): 595.0e6, (2025, 2): 439.0e6
        },
        'Automotive Leasing': {
            (2013, 1): 25.0e6, (2013, 2): 28.0e6, (2013, 3): 30.0e6, (2013, 4): 32.0e6,
            (2014, 1): 35.0e6, (2014, 2): 38.0e6, (2014, 3): 40.0e6, (2014, 4): 42.0e6,
            (2015, 1): 45.0e6, (2015, 2): 48.0e6, (2015, 3): 50.0e6, (2015, 4): 55.0e6,
            (2016, 1): 60.0e6, (2016, 2): 65.0e6, (2016, 3): 70.0e6, (2016, 4): 75.0e6,
            (2017, 1): 80.0e6, (2017, 2): 85.0e6, (2017, 3): 90.0e6, (2017, 4): 95.0e6,
            (2018, 1): 100.0e6, (2018, 2): 110.0e6, (2018, 3): 120.0e6, (2018, 4): 130.0e6,
            (2019, 1): 215.0e6, (2019, 2): 208.0e6, (2019, 3): 221.0e6, (2019, 4): 225.0e6,
            (2020, 1): 239.0e6, (2020, 2): 268.0e6, (2020, 3): 265.0e6, (2020, 4): 280.0e6,
            (2021, 1): 297.0e6, (2021, 2): 332.0e6, (2021, 3): 385.0e6, (2021, 4): 628.0e6,
            (2022, 1): 668.0e6, (2022, 2): 588.0e6, (2022, 3): 621.0e6, (2022, 4): 599.0e6,
            (2023, 1): 564.0e6, (2023, 2): 567.0e6, (2023, 3): 489.0e6, (2023, 4): 500.0e6,
            (2024, 1): 476.0e6, (2024, 2): 458.0e6, (2024, 3): 446.0e6, (2024, 4): 447.0e6,
            (2025, 1): 447.0e6, (2025, 2): 435.0e6
        },
        'Energy Generation & Storage': {
            (2013, 1): 10.0e6, (2013, 2): 12.0e6, (2013, 3): 14.0e6, (2013, 4): 16.0e6,
            (2014, 1): 18.0e6, (2014, 2): 20.0e6, (2014, 3): 22.0e6, (2014, 4): 24.0e6,
            (2015, 1): 26.0e6, (2015, 2): 28.0e6, (2015, 3): 30.0e6, (2015, 4): 32.0e6,
            (2016, 1): 35.0e6, (2016, 2): 38.0e6, (2016, 3): 40.0e6, (2016, 4): 42.0e6,
            (2017, 1): 45.0e6, (2017, 2): 48.0e6, (2017, 3): 50.0e6, (2017, 4): 55.0e6,
            (2018, 1): 60.0e6, (2018, 2): 65.0e6, (2018, 3): 70.0e6, (2018, 4): 75.0e6,
            (2019, 1): 324.0e6, (2019, 2): 369.0e6, (2019, 3): 402.0e6, (2019, 4): 436.0e6,
            (2020, 1): 293.0e6, (2020, 2): 370.0e6, (2020, 3): 579.0e6, (2020, 4): 752.0e6,
            (2021, 1): 494.0e6, (2021, 2): 801.0e6, (2021, 3): 806.0e6, (2021, 4): 688.0e6,
            (2022, 1): 616.0e6, (2022, 2): 866.0e6, (2022, 3): 1117.0e6, (2022, 4): 1310.0e6,
            (2023, 1): 1529.0e6, (2023, 2): 1509.0e6, (2023, 3): 1559.0e6, (2023, 4): 1438.0e6,
            (2024, 1): 1635.0e6, (2024, 2): 3014.0e6, (2024, 3): 2376.0e6, (2024, 4): 3061.0e6,
            (2025, 1): 2730.0e6, (2025, 2): 2789.0e6
        },
        'Services & Other': {
            (2013, 1): 15.0e6, (2013, 2): 17.0e6, (2013, 3): 19.0e6, (2013, 4): 20.0e6,
            (2014, 1): 22.0e6, (2014, 2): 24.0e6, (2014, 3): 26.0e6, (2014, 4): 28.0e6,
            (2015, 1): 30.0e6, (2015, 2): 32.0e6, (2015, 3): 35.0e6, (2015, 4): 38.0e6,
            (2016, 1): 40.0e6, (2016, 2): 45.0e6, (2016, 3): 48.0e6, (2016, 4): 50.0e6,
            (2017, 1): 55.0e6, (2017, 2): 58.0e6, (2017, 3): 55.0e6, (2017, 4): 58.0e6,
            (2018, 1): 65.0e6, (2018, 2): 70.0e6, (2018, 3): 75.0e6, (2018, 4): 80.0e6,
            (2019, 1): 493.0e6, (2019, 2): 605.0e6, (2019, 3): 548.0e6, (2019, 4): 580.0e6,
            (2020, 1): 560.0e6, (2020, 2): 487.0e6, (2020, 3): 581.0e6, (2020, 4): 678.0e6,
            (2021, 1): 893.0e6, (2021, 2): 951.0e6, (2021, 3): 894.0e6, (2021, 4): 1064.0e6,
            (2022, 1): 1279.0e6, (2022, 2): 1466.0e6, (2022, 3): 1645.0e6, (2022, 4): 1701.0e6,
            (2023, 1): 1837.0e6, (2023, 2): 2150.0e6, (2023, 3): 2166.0e6, (2023, 4): 2166.0e6,
            (2024, 1): 2288.0e6, (2024, 2): 2608.0e6, (2024, 3): 2790.0e6, (2024, 4): 2848.0e6,
            (2025, 1): 2638.0e6, (2025, 2): 3046.0e6
        }
    }
    
    # 原始的交付量数据，可以作为参考
    delivery_data = {
        'Automotive Sales': {           
            (2013, 1): 400.0, (2013, 2): 420.0, (2013, 3): 450.0, (2013, 4): 500.0,
            (2014, 1): 700.0, (2014, 2): 750.0, (2014, 3): 800.0, (2014, 4): 850.0,
            (2015, 1): 900.0, (2015, 2): 950.0, (2015, 3): 1000.0, (2015, 4): 1100.0,
            (2016, 1): 1500.0, (2016, 2): 1600.0, (2016, 3): 1800.0, (2016, 4): 2000.0,
            (2017, 1): 2500.0, (2017, 2): 2800.0, (2017, 3): 3000.0, (2017, 4): 3200.0,
            (2018, 1): 3300.0, (2018, 2): 3900.0, (2018, 3): 6600.0, (2018, 4): 7000.0,
            (2019, 1): 3509.0, (2019, 2): 5168.0, (2019, 3): 5132.0, (2019, 4): 6143.0,
            (2020, 1): 4893.0, (2020, 2): 4911.0, (2020, 3): 7346.0, (2020, 4): 9034.0,
            (2021, 1): 8187.0, (2021, 2): 9520.0, (2021, 3): 11393.0, (2021, 4): 15025.0,
            (2022, 1): 15514.0, (2022, 2): 13670.0, (2022, 3): 17785.0, (2022, 4): 20241.0,
            (2023, 1): 18878.0, (2023, 2): 20419.0, (2023, 3): 18582.0, (2023, 4): 20630.0,
            (2024, 1): 16460.0, (2024, 2): 18530.0, (2024, 3): 18831.0, (2024, 4): 18659.0,
            (2025, 1): 12925.0, (2025, 2): 15787.0
        }
    }

    # Define and calculate geographical weights
    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        if country in ['United States', 'Canada', 'Mexico']: return 'North America'
        return 'Other'
    
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)
    dim_geography_df['Geo_Weight'] = 0.0
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02, 'Other': 0.001}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }
    
    for _, row in dim_geography_df.iterrows():
        continent = row['Continent']
        country = row['Country']
        state = row['State_Province']
        
        c_weight = continent_weights.get(continent, 0.001)
        country_w = country_weights.get(country, 0.01)
        state_w = state_province_weights.get(state, 0.01)
        
        dim_geography_df.loc[(dim_geography_df['Geo_ID'] == row['Geo_ID']), 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    if 'Quarter_of_Year' not in dim_time_df.columns:
        dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Year_Int'] = dim_time_df['Year'].astype(int)
    dim_time_df['Quarter_Int'] = dim_time_df['Quarter_of_Year'].astype(int)
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time
    
    price_lookup = {}
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv is missing required columns.")
    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    for _, row in dim_prices_df.iterrows():
        quarter_start_date = row['Quarter_Start_Date']
        product_id = row['Product_ID']
        price_lookup[(quarter_start_date, product_id)] = {
            'Standard_Price_USD': row['Standard_Price_USD'],
            'Discounted_Price_USD': row['Discounted_Price_USD']
        }

    # --------------------------
    # Step 2: Dynamically Map Products to Categories
    # --------------------------
    
    # 使用所有独特的产品类别
    all_product_categories = dim_product_df['Product_Category'].unique().tolist()
    
    category_to_product_ids = defaultdict(list)
    for _, row in dim_product_df.iterrows():
        category_to_product_ids[row['Product_Category']].append(row['Product_ID'])

    print("\n--------------------")
    print("开始遍历所有维度ID...")
    print("--------------------")

    # 打印总结信息，使用动态获取的类别
    for category in all_product_categories:
        print(f"找到的 '{category}' 产品ID数量: {len(category_to_product_ids[category])}")
    print(f"找到的地理ID数量: {len(dim_geography_df['Geo_ID'].tolist())}")

    print("遍历成功，开始生成销售事实表。")
    print("--------------------")

    # 预先计算一次地理概率，避免在循环中重复计算
    geo_weights_dict = dim_geography_df.set_index('Geo_ID')['Geo_Weight'].to_dict()
    real_geo_ids = dim_geography_df['Geo_ID'].tolist()
    geo_weights_array = np.array(list(geo_weights_dict.values()))
    total_geo_weight = sum(geo_weights_dict.values())
    
    if total_geo_weight > 0:
        geo_probabilities = geo_weights_array / total_geo_weight
    else:
        num_geos = len(real_geo_ids)
        geo_probabilities = np.full(num_geos, 1.0 / num_geos)

    # --------------------------
    # Step 3: Generate Fact Table Records
    # --------------------------
    start_year = min(y for y, q in revenue_data['Automotive Sales'].keys())
    total_generated_rows = 0
    
    # 定义每笔交易的平均收入（针对非按件销售的类别）
    transaction_revenue_per_category = {
        'Automotive Regulatory Credits': 10e6,
        'Services & Other': 100e3,
        'Automotive Leasing': 100e3,
        'Energy Generation & Storage': 100e3,
    }
    
    # 针对'Automotive Sales'，定义一个平均交易金额，用于从总收入估算交易笔数
    avg_car_price = 80000  # 假设一辆汽车的平均价格为80,000美元
    
    with open(output_filepath, 'w', newline='', encoding='utf-8') as f:
        writer = None
        
        for year in range(start_year, end_date.year + 1):
            yearly_rows_generated = 0
            for quarter in range(1, 5):
                if year == 2025 and quarter > 2:
                    continue
                
                quarter_dates_df = dim_time_df[
                    (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
                ]
                
                if quarter_dates_df.empty:
                    continue

                records = []
                
                # 为每笔交易生成一个唯一的ID
                system_id = "WEB" # 假设交易来自网页端
                
                for category_name in all_product_categories:
                    revenue = revenue_data.get(category_name, {}).get((year, quarter), 0)
                    product_ids_for_category = category_to_product_ids.get(category_name)
                    
                    if not product_ids_for_category:
                        continue

                    if revenue > 0:
                        if category_name == 'Automotive Sales':
                            # 使用收入数据除以平均价格来估算交易笔数
                            num_transactions = int(revenue / avg_car_price)
                            if num_transactions == 0:
                                num_transactions = 1
                            individual_revenue = revenue / num_transactions
                        else:
                            # 对于其他类别，使用已定义的平均交易收入来估算交易笔数
                            transaction_unit = transaction_revenue_per_category.get(category_name, 1000)
                            num_transactions = int(revenue / transaction_unit)
                            if num_transactions == 0:
                                num_transactions = 1
                            individual_revenue = revenue / num_transactions
                        
                        sampled_product_ids = np.random.choice(product_ids_for_category, size=num_transactions)
                        sampled_geo_ids = np.random.choice(real_geo_ids, size=num_transactions, p=geo_probabilities)
                        sampled_customer_ids = np.random.choice(customer_ids, size=num_transactions)
                        sampled_time_ids = np.random.choice(quarter_dates_df['Time_ID'].values, size=num_transactions)
                        
                        # 确保 Revenue_USD 的总和接近于原始收入
                        # 在这里使用一个简单的随机分配，使总和接近
                        revenue_per_record = individual_revenue
                        
                        for i in range(num_transactions):
                            records.append({
                                'TransactionID': generate_transaction_id(system_id),
                                'Time_ID': sampled_time_ids[i],
                                'Geo_ID': sampled_geo_ids[i],
                                'Product_ID': sampled_product_ids[i],
                                'Customer_ID': sampled_customer_ids[i],
                                'Sales_Units': 1,
                                'Is_Discounted_Sale': False,
                                'Revenue_USD': revenue_per_record * (1 + np.random.normal(0, 0.05)),
                                'Revenue_Category': category_name
                            })
                
                fact_sales_df_temp = pd.DataFrame(records)
                
                if not fact_sales_df_temp.empty:
                    if writer is None:
                        fact_sales_df_temp.to_csv(f, header=True, index=False, encoding='utf-8')
                        writer = True
                    else:
                        fact_sales_df_temp.to_csv(f, header=False, index=False, encoding='utf-8')
                    total_generated_rows += len(fact_sales_df_temp)
                    yearly_rows_generated += len(fact_sales_df_temp)
            
            if yearly_rows_generated > 0:
                print(f"Generated {yearly_rows_generated:,} records for year {year}.")

    print(f"\nTotal generated records: {total_generated_rows:,}")
    return total_generated_rows

if __name__ == '__main__':
    start_time = time.time()
    
    output_dir = './output_data'
    
    print(f"Loading all dimension tables and data sources from '{output_dir}'...")
    
    dim_product_df = None
    dim_time_df = None
    dim_customer_df = None
    dim_geography_df = None
    dim_prices_df = None
    
    try:
        dim_product_df = pd.read_csv(os.path.join(output_dir, 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(os.path.join(output_dir, 'Dim_Time.csv'))
        dim_customer_df = pd.read_csv(os.path.join(output_dir, 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join(output_dir, 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join(output_dir, 'Dim_Prices.csv'))
        
        print("所有维度表加载成功。")
        
    except FileNotFoundError as e:
        print(f"Error: One or more required CSV files are missing. Please ensure all dimension tables are in the '{output_dir}' directory. The file '{e.filename}' was not found.")
        print("Data generation failed due to missing files.")
        exit()
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')
    
    if os.path.exists(output_filepath):
        os.remove(output_filepath)

    print("\n开始整理和生成销售事实表...")
    total_rows = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)
    
    if total_rows > 0:
        end_time = time.time()
        print(f"\nFact_Sales.csv successfully generated {total_rows:,} rows of data in {end_time - start_time:.2f} seconds.")
        print("数据生成完成!")
    else:
        print("数据生成失败。")

Loading all dimension tables and data sources from './output_data'...
所有维度表加载成功。

开始整理和生成销售事实表...

--------------------
开始遍历所有维度ID...
--------------------
找到的 'Automotive Leasing' 产品ID数量: 4
找到的 'Energy Generation & Storage' 产品ID数量: 4
找到的 'Automotive Sales' 产品ID数量: 3138
找到的 'Automotive Regulatory Credits' 产品ID数量: 1
找到的 'Services & Other' 产品ID数量: 4
找到的地理ID数量: 432
遍历成功，开始生成销售事实表。
--------------------
Generated 24,505 records for year 2013.
Generated 42,140 records for year 2014.
Generated 53,865 records for year 2015.
Generated 92,330 records for year 2016.
Generated 151,490 records for year 2017.
Generated 270,200 records for year 2018.
Generated 295,659 records for year 2019.
Generated 380,819 records for year 2020.
Generated 634,035 records for year 2021.
Generated 965,059 records for year 2022.
Generated 1,146,280 records for year 2023.
Generated 1,130,744 records for year 2024.
Generated 479,851 records for year 2025.

Total generated records: 5,666,977

Fact_Sales.csv successfully g

## **800万行速度优化 修改车辆数（做了价格平均）** ##

In [20]:
import pandas as pd
import numpy as np
import os
import time
import datetime
from datetime import datetime
from itertools import product
from collections import defaultdict
import random

def generate_transaction_id(system_id: str) -> str:
    """
    根据给定的编码规则生成一个唯一的交易ID。
    编码规则: TR + 时间戳(YYMMDDHHmmss) + 系统/商户ID + 随机数(6位)
    例如: TR250901201234ABC987654

    Args:
        system_id: 3位系统或商户ID，例如 'WEB', 'POS'。

    Returns:
        生成的交易ID字符串。
    """
    if len(system_id) != 3:
        raise ValueError("System ID must be exactly 3 characters long.")
    
    prefix = "TR"
    now = datetime.now()
    timestamp = now.strftime("%y%m%d%H%M%S")
    random_part = str(random.randint(100000, 999999))
    
    transaction_id = f"{prefix}{timestamp}{system_id.upper()}{random_part}"
    
    return transaction_id

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output.
    """
    end_date = datetime(2025, 6, 30)

    # --------------------------
    # Step 1: Parse and Load Data (Hardcoded for this example)
    # --------------------------
    # 特斯拉2013-2025Q2的季度收入数据（Revenue）
    revenue_data = {
        'Automotive Sales': {        
            (2013, 1): 400.0e6, (2013, 2): 420.0e6, (2013, 3): 450.0e6, (2013, 4): 500.0e6,
            (2014, 1): 700.0e6, (2014, 2): 750.0e6, (2014, 3): 800.0e6, (2014, 4): 850.0e6,
            (2015, 1): 900.0e6, (2015, 2): 950.0e6, (2015, 3): 1000.0e6, (2015, 4): 1100.0e6,
            (2016, 1): 1500.0e6, (2016, 2): 1600.0e6, (2016, 3): 1800.0e6, (2016, 4): 2000.0e6,
            (2017, 1): 2500.0e6, (2017, 2): 2800.0e6, (2017, 3): 3000.0e6, (2017, 4): 3200.0e6,
            (2018, 1): 3300.0e6, (2018, 2): 3900.0e6, (2018, 3): 6600.0e6, (2018, 4): 7000.0e6,
            (2019, 1): 3509.0e6, (2019, 2): 5168.0e6, (2019, 3): 5132.0e6, (2019, 4): 6143.0e6,
            (2020, 1): 4893.0e6, (2020, 2): 4911.0e6, (2020, 3): 7346.0e6, (2020, 4): 9034.0e6,
            (2021, 1): 8187.0e6, (2021, 2): 9520.0e6, (2021, 3): 11393.0e6, (2021, 4): 15025.0e6,
            (2022, 1): 15514.0e6, (2022, 2): 13670.0e6, (2022, 3): 17785.0e6, (2022, 4): 20241.0e6,
            (2023, 1): 18878.0e6, (2023, 2): 20419.0e6, (2023, 3): 18582.0e6, (2023, 4): 20630.0e6,
            (2024, 1): 16460.0e6, (2024, 2): 18530.0e6, (2024, 3): 18831.0e6, (2024, 4): 18659.0e6,
            (2025, 1): 12925.0e6, (2025, 2): 15787.0e6
        },
        'Automotive Regulatory Credits': {
            (2013, 1): 0, (2013, 2): 0, (2013, 3): 0, (2013, 4): 0,
            (2014, 1): 0, (2014, 2): 0, (2014, 3): 0, (2014, 4): 0,
            (2015, 1): 0, (2015, 2): 0, (2015, 3): 0, (2015, 4): 0,
            (2016, 1): 0, (2016, 2): 0, (2016, 3): 0, (2016, 4): 0,
            (2017, 1): 0, (2017, 2): 0, (2017, 3): 0, (2017, 4): 0,
            (2018, 1): 0, (2018, 2): 0, (2018, 3): 0, (2018, 4): 0,
            (2019, 1): 0, (2019, 2): 0, (2019, 3): 0, (2019, 4): 0,
            (2020, 1): 0, (2020, 2): 0, (2020, 3): 0, (2020, 4): 0,
            (2021, 1): 518.0e6, (2021, 2): 354.0e6, (2021, 3): 279.0e6, (2021, 4): 314.0e6,
            (2022, 1): 679.0e6, (2022, 2): 344.0e6, (2022, 3): 286.0e6, (2022, 4): 467.0e6,
            (2023, 1): 521.0e6, (2023, 2): 282.0e6, (2023, 3): 554.0e6, (2023, 4): 433.0e6,
            (2024, 1): 442.0e6, (2024, 2): 890.0e6, (2024, 3): 739.0e6, (2024, 4): 692.0e6,
            (2025, 1): 595.0e6, (2025, 2): 439.0e6
        },
        'Automotive Leasing': {
            (2013, 1): 25.0e6, (2013, 2): 28.0e6, (2013, 3): 30.0e6, (2013, 4): 32.0e6,
            (2014, 1): 35.0e6, (2014, 2): 38.0e6, (2014, 3): 40.0e6, (2014, 4): 42.0e6,
            (2015, 1): 45.0e6, (2015, 2): 48.0e6, (2015, 3): 50.0e6, (2015, 4): 55.0e6,
            (2016, 1): 60.0e6, (2016, 2): 65.0e6, (2016, 3): 70.0e6, (2016, 4): 75.0e6,
            (2017, 1): 80.0e6, (2017, 2): 85.0e6, (2017, 3): 90.0e6, (2017, 4): 95.0e6,
            (2018, 1): 100.0e6, (2018, 2): 110.0e6, (2018, 3): 120.0e6, (2018, 4): 130.0e6,
            (2019, 1): 215.0e6, (2019, 2): 208.0e6, (2019, 3): 221.0e6, (2019, 4): 225.0e6,
            (2020, 1): 239.0e6, (2020, 2): 268.0e6, (2020, 3): 265.0e6, (2020, 4): 280.0e6,
            (2021, 1): 297.0e6, (2021, 2): 332.0e6, (2021, 3): 385.0e6, (2021, 4): 628.0e6,
            (2022, 1): 668.0e6, (2022, 2): 588.0e6, (2022, 3): 621.0e6, (2022, 4): 599.0e6,
            (2023, 1): 564.0e6, (2023, 2): 567.0e6, (2023, 3): 489.0e6, (2023, 4): 500.0e6,
            (2024, 1): 476.0e6, (2024, 2): 458.0e6, (2024, 3): 446.0e6, (2024, 4): 447.0e6,
            (2025, 1): 447.0e6, (2025, 2): 435.0e6
        },
        'Energy Generation & Storage': {
            (2013, 1): 10.0e6, (2013, 2): 12.0e6, (2013, 3): 14.0e6, (2013, 4): 16.0e6,
            (2014, 1): 18.0e6, (2014, 2): 20.0e6, (2014, 3): 22.0e6, (2014, 4): 24.0e6,
            (2015, 1): 26.0e6, (2015, 2): 28.0e6, (2015, 3): 30.0e6, (2015, 4): 32.0e6,
            (2016, 1): 35.0e6, (2016, 2): 38.0e6, (2016, 3): 40.0e6, (2016, 4): 42.0e6,
            (2017, 1): 45.0e6, (2017, 2): 48.0e6, (2017, 3): 50.0e6, (2017, 4): 55.0e6,
            (2018, 1): 60.0e6, (2018, 2): 65.0e6, (2018, 3): 70.0e6, (2018, 4): 75.0e6,
            (2019, 1): 324.0e6, (2019, 2): 369.0e6, (2019, 3): 402.0e6, (2019, 4): 436.0e6,
            (2020, 1): 293.0e6, (2020, 2): 370.0e6, (2020, 3): 579.0e6, (2020, 4): 752.0e6,
            (2021, 1): 494.0e6, (2021, 2): 801.0e6, (2021, 3): 806.0e6, (2021, 4): 688.0e6,
            (2022, 1): 616.0e6, (2022, 2): 866.0e6, (2022, 3): 1117.0e6, (2022, 4): 1310.0e6,
            (2023, 1): 1529.0e6, (2023, 2): 1509.0e6, (2023, 3): 1559.0e6, (2023, 4): 1438.0e6,
            (2024, 1): 1635.0e6, (2024, 2): 3014.0e6, (2024, 3): 2376.0e6, (2024, 4): 3061.0e6,
            (2025, 1): 2730.0e6, (2025, 2): 2789.0e6
        },
        'Services & Other': {
            (2013, 1): 15.0e6, (2013, 2): 17.0e6, (2013, 3): 19.0e6, (2013, 4): 20.0e6,
            (2014, 1): 22.0e6, (2014, 2): 24.0e6, (2014, 3): 26.0e6, (2014, 4): 28.0e6,
            (2015, 1): 30.0e6, (2015, 2): 32.0e6, (2015, 3): 35.0e6, (2015, 4): 38.0e6,
            (2016, 1): 40.0e6, (2016, 2): 45.0e6, (2016, 3): 48.0e6, (2016, 4): 50.0e6,
            (2017, 1): 55.0e6, (2017, 2): 58.0e6, (2017, 3): 55.0e6, (2017, 4): 58.0e6,
            (2018, 1): 65.0e6, (2018, 2): 70.0e6, (2018, 3): 75.0e6, (2018, 4): 80.0e6,
            (2019, 1): 493.0e6, (2019, 2): 605.0e6, (2019, 3): 548.0e6, (2019, 4): 580.0e6,
            (2020, 1): 560.0e6, (2020, 2): 487.0e6, (2020, 3): 581.0e6, (2020, 4): 678.0e6,
            (2021, 1): 893.0e6, (2021, 2): 951.0e6, (2021, 3): 894.0e6, (2021, 4): 1064.0e6,
            (2022, 1): 1279.0e6, (2022, 2): 1466.0e6, (2022, 3): 1645.0e6, (2022, 4): 1701.0e6,
            (2023, 1): 1837.0e6, (2023, 2): 2150.0e6, (2023, 3): 2166.0e6, (2023, 4): 2166.0e6,
            (2024, 1): 2288.0e6, (2024, 2): 2608.0e6, (2024, 3): 2790.0e6, (2024, 4): 2848.0e6,
            (2025, 1): 2638.0e6, (2025, 2): 3046.0e6
        }
    }
    
    # 优化点1：引入真实的季度交付量数据
    delivery_data_official = {
        'Automotive Sales': {
            (2013, 1): 4900, (2013, 2): 5150, (2013, 3): 5500, (2013, 4): 6892,
            (2014, 1): 7535, (2014, 2): 8763, (2014, 3): 9834, (2014, 4): 11627,
            (2015, 1): 10045, (2015, 2): 11532, (2015, 3): 11603, (2015, 4): 17478,
            (2016, 1): 14815, (2016, 2): 14402, (2016, 3): 24882, (2016, 4): 22252,
            (2017, 1): 25418, (2017, 2): 22000, (2017, 3): 26150, (2017, 4): 29870,
            (2018, 1): 29980, (2018, 2): 40740, (2018, 3): 83500, (2018, 4): 90700,
            (2019, 1): 63000, (2019, 2): 95200, (2019, 3): 97000, (2019, 4): 112000,
            (2020, 1): 88400, (2020, 2): 90891, (2020, 3): 139593, (2020, 4): 180570,
            (2021, 1): 184800, (2021, 2): 201304, (2021, 3): 241300, (2021, 4): 308600,
            (2022, 1): 310048, (2022, 2): 254695, (2022, 3): 343830, (2022, 4): 405278,
            (2023, 1): 422875, (2023, 2): 466140, (2023, 3): 435059, (2023, 4): 484507,
            (2024, 1): 386810, (2024, 2): 442000, (2024, 3): 450000, (2024, 4): 440000,
            (2025, 1): 300000, (2025, 2): 350000
        }
    }

    # Define and calculate geographical weights
    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        if country in ['United States', 'Canada', 'Mexico']: return 'North America'
        return 'Other'
    
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)
    dim_geography_df['Geo_Weight'] = 0.0
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02, 'Other': 0.001}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }
    
    for _, row in dim_geography_df.iterrows():
        continent = row['Continent']
        country = row['Country']
        state = row['State_Province']
        
        c_weight = continent_weights.get(continent, 0.001)
        country_w = country_weights.get(country, 0.01)
        state_w = state_province_weights.get(state, 0.01)
        
        dim_geography_df.loc[(dim_geography_df['Geo_ID'] == row['Geo_ID']), 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    if 'Quarter_of_Year' not in dim_time_df.columns:
        dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Year_Int'] = dim_time_df['Year'].astype(int)
    dim_time_df['Quarter_Int'] = dim_time_df['Quarter_of_Year'].astype(int)
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time
    
    price_lookup = {}
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv is missing required columns.")
    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    for _, row in dim_prices_df.iterrows():
        quarter_start_date = row['Quarter_Start_Date']
        product_id = row['Product_ID']
        price_lookup[(quarter_start_date, product_id)] = {
            'Standard_Price_USD': row['Standard_Price_USD'],
            'Discounted_Price_USD': row['Discounted_Price_USD']
        }

    # --------------------------
    # Step 2: Dynamically Map Products to Categories
    # --------------------------
    
    all_product_categories = dim_product_df['Product_Category'].unique().tolist()
    
    category_to_product_ids = defaultdict(list)
    for _, row in dim_product_df.iterrows():
        category_to_product_ids[row['Product_Category']].append(row['Product_ID'])

    print("\n--------------------")
    print("开始遍历所有维度ID...")
    print("--------------------")

    for category in all_product_categories:
        print(f"找到的 '{category}' 产品ID数量: {len(category_to_product_ids[category])}")
    print(f"找到的地理ID数量: {len(dim_geography_df['Geo_ID'].tolist())}")

    print("遍历成功，开始生成销售事实表。")
    print("--------------------")

    geo_weights_dict = dim_geography_df.set_index('Geo_ID')['Geo_Weight'].to_dict()
    real_geo_ids = dim_geography_df['Geo_ID'].tolist()
    geo_weights_array = np.array(list(geo_weights_dict.values()))
    total_geo_weight = sum(geo_weights_dict.values())
    
    if total_geo_weight > 0:
        geo_probabilities = geo_weights_array / total_geo_weight
    else:
        num_geos = len(real_geo_ids)
        geo_probabilities = np.full(num_geos, 1.0 / num_geos)

    # --------------------------
    # Step 3: Generate Fact Table Records
    # --------------------------
    start_year = min(y for y, q in revenue_data['Automotive Sales'].keys())
    total_generated_rows = 0
    
    # 定义每笔交易的平均收入（针对非按件销售的类别）
    transaction_revenue_per_category = {
        'Automotive Regulatory Credits': 10e6,
        'Services & Other': 100e3,
        'Automotive Leasing': 100e3,
        'Energy Generation & Storage': 100e3,
    }
    
    with open(output_filepath, 'w', newline='', encoding='utf-8') as f:
        writer = None
        
        for year in range(start_year, end_date.year + 1):
            yearly_rows_generated = 0
            for quarter in range(1, 5):
                if year == 2025 and quarter > 2:
                    continue
                
                quarter_dates_df = dim_time_df[
                    (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
                ]
                
                if quarter_dates_df.empty:
                    continue

                records = []
                system_id = "WEB"
                
                for category_name in all_product_categories:
                    revenue = revenue_data.get(category_name, {}).get((year, quarter), 0)
                    product_ids_for_category = category_to_product_ids.get(category_name)
                    
                    if not product_ids_for_category:
                        continue

                    if revenue > 0:
                        if category_name == 'Automotive Sales':
                            # 优化点2：使用实际交付量作为交易笔数
                            num_transactions = delivery_data_official['Automotive Sales'].get((year, quarter), 0)
                            if num_transactions == 0:
                                continue
                            
                            individual_revenue = revenue / num_transactions
                        else:
                            transaction_unit = transaction_revenue_per_category.get(category_name, 1000)
                            num_transactions = int(revenue / transaction_unit)
                            if num_transactions == 0:
                                num_transactions = 1
                            individual_revenue = revenue / num_transactions
                        
                        sampled_product_ids = np.random.choice(product_ids_for_category, size=num_transactions)
                        sampled_geo_ids = np.random.choice(real_geo_ids, size=num_transactions, p=geo_probabilities)
                        sampled_customer_ids = np.random.choice(customer_ids, size=num_transactions)
                        sampled_time_ids = np.random.choice(quarter_dates_df['Time_ID'].values, size=num_transactions)
                        
                        revenue_per_record = individual_revenue
                        
                        for i in range(num_transactions):
                            records.append({
                                'TransactionID': generate_transaction_id(system_id),
                                'Time_ID': sampled_time_ids[i],
                                'Geo_ID': sampled_geo_ids[i],
                                'Product_ID': sampled_product_ids[i],
                                'Customer_ID': sampled_customer_ids[i],
                                'Sales_Units': 1 if category_name == 'Automotive Sales' else 1,
                                'Is_Discounted_Sale': False,
                                'Revenue_USD': revenue_per_record * (1 + np.random.normal(0, 0.05)),
                                'Revenue_Category': category_name
                            })
                
                fact_sales_df_temp = pd.DataFrame(records)
                
                if not fact_sales_df_temp.empty:
                    if writer is None:
                        fact_sales_df_temp.to_csv(f, header=True, index=False, encoding='utf-8')
                        writer = True
                    else:
                        fact_sales_df_temp.to_csv(f, header=False, index=False, encoding='utf-8')
                    total_generated_rows += len(fact_sales_df_temp)
                    yearly_rows_generated += len(fact_sales_df_temp)
            
            if yearly_rows_generated > 0:
                print(f"Generated {yearly_rows_generated:,} records for year {year}.")

    print(f"\nTotal generated records: {total_generated_rows:,}")
    return total_generated_rows

if __name__ == '__main__':
    start_time = time.time()
    
    output_dir = './output_data'
    
    print(f"Loading all dimension tables and data sources from '{output_dir}'...")
    
    dim_product_df = None
    dim_time_df = None
    dim_customer_df = None
    dim_geography_df = None
    dim_prices_df = None
    
    try:
        dim_product_df = pd.read_csv(os.path.join(output_dir, 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(os.path.join(output_dir, 'Dim_Time.csv'))
        dim_customer_df = pd.read_csv(os.path.join(output_dir, 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join(output_dir, 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join(output_dir, 'Dim_Prices.csv'))
        
        print("所有维度表加载成功。")
        
    except FileNotFoundError as e:
        print(f"Error: One or more required CSV files are missing. Please ensure all dimension tables are in the '{output_dir}' directory. The file '{e.filename}' was not found.")
        print("Data generation failed due to missing files.")
        exit()
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')
    
    if os.path.exists(output_filepath):
        os.remove(output_filepath)

    print("\n开始整理和生成销售事实表...")
    total_rows = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath)
    
    if total_rows > 0:
        end_time = time.time()
        print(f"\nFact_Sales.csv successfully generated {total_rows:,} rows of data in {end_time - start_time:.2f} seconds.")
        print("数据生成完成!")
    else:
        print("数据生成失败。")

Loading all dimension tables and data sources from './output_data'...
所有维度表加载成功。

开始整理和生成销售事实表...

--------------------
开始遍历所有维度ID...
--------------------
找到的 'Automotive Leasing' 产品ID数量: 4
找到的 'Energy Generation & Storage' 产品ID数量: 4
找到的 'Automotive Sales' 产品ID数量: 3138
找到的 'Automotive Regulatory Credits' 产品ID数量: 1
找到的 'Services & Other' 产品ID数量: 4
找到的地理ID数量: 432
遍历成功，开始生成销售事实表。
--------------------
Generated 24,822 records for year 2013.
Generated 41,149 records for year 2014.
Generated 55,148 records for year 2015.
Generated 82,431 records for year 2016.
Generated 111,178 records for year 2017.
Generated 255,120 records for year 2018.
Generated 413,460 records for year 2019.
Generated 552,974 records for year 2020.
Generated 1,018,478 records for year 2021.
Generated 1,438,786 records for year 2022.
Generated 1,973,499 records for year 2023.
Generated 1,943,555 records for year 2024.
Generated 770,952 records for year 2025.

Total generated records: 8,681,552

Fact_Sales.csv successful

## **遍历、加载维度表的数据计算和生成事实表** ##

In [16]:
import pandas as pd
import numpy as np
import os
import time
import datetime
from datetime import datetime
from itertools import product
from collections import defaultdict
import random

def generate_transaction_id(system_id: str) -> str:
    """
    根据给定的编码规则生成一个唯一的交易ID。

    编码规则: TR + 时间戳(YYMMDDHHmmss) + 系统/商户ID + 随机数(6位)
    例如: TR250901201234ABC987654

    Args:
        system_id: 3位系统或商户ID，例如 'WEB', 'POS'。

    Returns:
        生成的交易ID字符串。
    """
    # 验证系统ID的长度是否为3位
    if len(system_id) != 3:
        raise ValueError("System ID must be exactly 3 characters long.")

    # 1. 固定前缀
    prefix = "TR"

    # 2. 生成时间戳 (YYMMDDHHmmss)
    now = datetime.now()
    timestamp = now.strftime("%y%m%d%H%M%S")

    # 3. 随机数/序列号 (这里为了演示，使用6位随机数)
    # 在实际生产环境中，建议使用原子性的自增序列号来保证唯一性
    random_part = str(random.randint(100000, 999999))

    # 4. 拼接所有部分
    transaction_id = f"{prefix}{timestamp}{system_id.upper()}{random_part}"

    return transaction_id

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df, output_filepath, scale_factor=1):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output.
    
    Args:
        scale_factor (int): A multiplier to scale the total number of records.
    """
    end_date = datetime(2025, 6, 30)

    # --------------------------
    # Step 1: Parse and Load Data (Hardcoded for this example)
    # --------------------------
    # 此数据源为特斯拉2013-2025Q2的季度收入数据（Revenue）
    # 来源：特斯拉公开财报，单位为百万美元（M），实际收入需乘以1,000,000
    revenue_data = {
        'Automotive Sales': {      
            (2013, 1): 400.0e6, (2013, 2): 420.0e6, (2013, 3): 450.0e6, (2013, 4): 500.0e6,
            (2014, 1): 700.0e6, (2014, 2): 750.0e6, (2014, 3): 800.0e6, (2014, 4): 850.0e6,
            (2015, 1): 900.0e6, (2015, 2): 950.0e6, (2015, 3): 1000.0e6, (2015, 4): 1100.0e6,
            (2016, 1): 1500.0e6, (2016, 2): 1600.0e6, (2016, 3): 1800.0e6, (2016, 4): 2000.0e6,
            (2017, 1): 2500.0e6, (2017, 2): 2800.0e6, (2017, 3): 3000.0e6, (2017, 4): 3200.0e6,
            (2018, 1): 3300.0e6, (2018, 2): 3900.0e6, (2018, 3): 6600.0e6, (2018, 4): 7000.0e6,
            (2019, 1): 3509.0e6, (2019, 2): 5168.0e6, (2019, 3): 5132.0e6, (2019, 4): 6143.0e6,
            (2020, 1): 4893.0e6, (2020, 2): 4911.0e6, (2020, 3): 7346.0e6, (2020, 4): 9034.0e6,
            (2021, 1): 8187.0e6, (2021, 2): 9520.0e6, (2021, 3): 11393.0e6, (2021, 4): 15025.0e6,
            (2022, 1): 15514.0e6, (2022, 2): 13670.0e6, (2022, 3): 17785.0e6, (2022, 4): 20241.0e6,
            (2023, 1): 18878.0e6, (2023, 2): 20419.0e6, (2023, 3): 18582.0e6, (2023, 4): 20630.0e6,
            (2024, 1): 16460.0e6, (2024, 2): 18530.0e6, (2024, 3): 18831.0e6, (2024, 4): 18659.0e6,
            (2025, 1): 12925.0e6, (2025, 2): 15787.0e6
        },
        'Automotive Regulatory Credits': {
            (2013, 1): 0, (2013, 2): 0, (2013, 3): 0, (2013, 4): 0,
            (2014, 1): 0, (2014, 2): 0, (2014, 3): 0, (2014, 4): 0,
            (2015, 1): 0, (2015, 2): 0, (2015, 3): 0, (2015, 4): 0,
            (2016, 1): 0, (2016, 2): 0, (2016, 3): 0, (2016, 4): 0,
            (2017, 1): 0, (2017, 2): 0, (2017, 3): 0, (2017, 4): 0,
            (2018, 1): 0, (2018, 2): 0, (2018, 3): 0, (2018, 4): 0,
            (2019, 1): 0, (2019, 2): 0, (2019, 3): 0, (2019, 4): 0,
            (2020, 1): 0, (2020, 2): 0, (2020, 3): 0, (2020, 4): 0,
            (2021, 1): 518.0e6, (2021, 2): 354.0e6, (2021, 3): 279.0e6, (2021, 4): 314.0e6,
            (2022, 1): 679.0e6, (2022, 2): 344.0e6, (2022, 3): 286.0e6, (2022, 4): 467.0e6,
            (2023, 1): 521.0e6, (2023, 2): 282.0e6, (2023, 3): 554.0e6, (2023, 4): 433.0e6,
            (2024, 1): 442.0e6, (2024, 2): 890.0e6, (2024, 3): 739.0e6, (2024, 4): 692.0e6,
            (2025, 1): 595.0e6, (2025, 2): 439.0e6
        },
        'Automotive Leasing': {
            (2013, 1): 25.0e6, (2013, 2): 28.0e6, (2013, 3): 30.0e6, (2013, 4): 32.0e6,
            (2014, 1): 35.0e6, (2014, 2): 38.0e6, (2014, 3): 40.0e6, (2014, 4): 42.0e6,
            (2015, 1): 45.0e6, (2015, 2): 48.0e6, (2015, 3): 50.0e6, (2015, 4): 55.0e6,
            (2016, 1): 60.0e6, (2016, 2): 65.0e6, (2016, 3): 70.0e6, (2016, 4): 75.0e6,
            (2017, 1): 80.0e6, (2017, 2): 85.0e6, (2017, 3): 90.0e6, (2017, 4): 95.0e6,
            (2018, 1): 100.0e6, (2018, 2): 110.0e6, (2018, 3): 120.0e6, (2018, 4): 130.0e6,
            (2019, 1): 215.0e6, (2019, 2): 208.0e6, (2019, 3): 221.0e6, (2019, 4): 225.0e6,
            (2020, 1): 239.0e6, (2020, 2): 268.0e6, (2020, 3): 265.0e6, (2020, 4): 280.0e6,
            (2021, 1): 297.0e6, (2021, 2): 332.0e6, (2021, 3): 385.0e6, (2021, 4): 628.0e6,
            (2022, 1): 668.0e6, (2022, 2): 588.0e6, (2022, 3): 621.0e6, (2022, 4): 599.0e6,
            (2023, 1): 564.0e6, (2023, 2): 567.0e6, (2023, 3): 489.0e6, (2023, 4): 500.0e6,
            (2024, 1): 476.0e6, (2024, 2): 458.0e6, (2024, 3): 446.0e6, (2024, 4): 447.0e6,
            (2025, 1): 447.0e6, (2025, 2): 435.0e6
        },
        'Energy Generation & Storage': {
            (2013, 1): 10.0e6, (2013, 2): 12.0e6, (2013, 3): 14.0e6, (2013, 4): 16.0e6,
            (2104, 1): 18.0e6, (2014, 2): 20.0e6, (2014, 3): 22.0e6, (2014, 4): 24.0e6,
            (2015, 1): 26.0e6, (2015, 2): 28.0e6, (2015, 3): 30.0e6, (2015, 4): 32.0e6,
            (2016, 1): 35.0e6, (2016, 2): 38.0e6, (2016, 3): 40.0e6, (2016, 4): 42.0e6,
            (2017, 1): 45.0e6, (2017, 2): 48.0e6, (2017, 3): 50.0e6, (2017, 4): 55.0e6,
            (2018, 1): 60.0e6, (2018, 2): 65.0e6, (2018, 3): 70.0e6, (2018, 4): 75.0e6,
            (2019, 1): 324.0e6, (2019, 2): 369.0e6, (2019, 3): 402.0e6, (2019, 4): 436.0e6,
            (2020, 1): 293.0e6, (2020, 2): 370.0e6, (2020, 3): 579.0e6, (2020, 4): 752.0e6,
            (2021, 1): 494.0e6, (2021, 2): 801.0e6, (2021, 3): 806.0e6, (2021, 4): 688.0e6,
            (2022, 1): 616.0e6, (2022, 2): 866.0e6, (2022, 3): 1117.0e6, (2022, 4): 1310.0e6,
            (2023, 1): 1529.0e6, (2023, 2): 1509.0e6, (2023, 3): 1559.0e6, (2023, 4): 1438.0e6,
            (2024, 1): 1635.0e6, (2024, 2): 3014.0e6, (2024, 3): 2376.0e6, (2024, 4): 3061.0e6,
            (2025, 1): 2730.0e6, (2025, 2): 2789.0e6
        },
        'Services & Other': {
            (2013, 1): 15.0e6, (2013, 2): 17.0e6, (2013, 3): 19.0e6, (2013, 4): 20.0e6,
            (2014, 1): 22.0e6, (2014, 2): 24.0e6, (2014, 3): 26.0e6, (2014, 4): 28.0e6,
            (2015, 1): 30.0e6, (2015, 2): 32.0e6, (2015, 3): 35.0e6, (2015, 4): 38.0e6,
            (2016, 1): 40.0e6, (2016, 2): 45.0e6, (2016, 3): 48.0e6, (2016, 4): 50.0e6,
            (2017, 1): 55.0e6, (2017, 2): 58.0e6, (2017, 3): 55.0e6, (2017, 4): 58.0e6,
            (2018, 1): 65.0e6, (2018, 2): 70.0e6, (2018, 3): 75.0e6, (2018, 4): 80.0e6,
            (2019, 1): 493.0e6, (2019, 2): 605.0e6, (2019, 3): 548.0e6, (2019, 4): 580.0e6,
            (2020, 1): 560.0e6, (2020, 2): 487.0e6, (2020, 3): 581.0e6, (2020, 4): 678.0e6,
            (2021, 1): 893.0e6, (2021, 2): 951.0e6, (2021, 3): 894.0e6, (2021, 4): 1064.0e6,
            (2022, 1): 1279.0e6, (2022, 2): 1466.0e6, (2022, 3): 1645.0e6, (2022, 4): 1701.0e6,
            (2023, 1): 1837.0e6, (2023, 2): 2150.0e6, (2023, 3): 2166.0e6, (2023, 4): 2166.0e6,
            (2024, 1): 2288.0e6, (2024, 2): 2608.0e6, (2024, 3): 2790.0e6, (2024, 4): 2848.0e6,
            (2025, 1): 2638.0e6, (2025, 2): 3046.0e6
        }
    }
    
    # 定义每笔交易的平均收入（针对非按件销售的类别）
    transaction_revenue_per_category = {
        'Automotive Regulatory Credits': 10e6,
        'Services & Other': 100e3,
        'Automotive Leasing': 100e3,
        'Energy Generation & Storage': 100e3,
    }

    # --------------------------
    # Step 2: Define Product Distribution Weights
    # --------------------------
    # 模拟不同时期各个汽车型号的市场份额变化
    # 注意：这些权重是模拟数据，旨在使生成的数据更具现实感
    automotive_product_distribution = {
        'Model S': {
            '2013-2016': 0.8,
            '2017-2018': 0.3,
            '2019-2022': 0.1,
            '2023-2025': 0.05,
        },
        'Model X': {
            '2015-2018': 0.2,
            '2019-2022': 0.1,
            '2023-2025': 0.05,
        },
        'Model 3': {
            '2017-2018': 0.5,
            '2019-2022': 0.6,
            '2023-2025': 0.4,
        },
        'Model Y': {
            '2020-2022': 0.2,
            '2023-2025': 0.4,
        },
        'Cybertruck': {
            '2023-2025': 0.1,
        }
    }

    # Define and calculate geographical weights
    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    def get_continent(country):
        if country in asia_countries: return 'Asia'
        if country in oceania_countries: return 'Oceania'
        if country in europe_countries: return 'Europe'
        if country in ['United States', 'Canada', 'Mexico']: return 'North America'
        return 'Other'
    
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)
    dim_geography_df['Geo_Weight'] = 0.0
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02, 'Other': 0.001}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.70, 'Japan': 0.18, 'South Korea': 0.13, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.75, 'New Zealand': 0.25, 'Taiwan': 0.1}
    state_province_weights = {
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10,
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15,
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10,
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05,
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10,
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10,
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10,
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05,
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05,
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05,
    }
    
    for _, row in dim_geography_df.iterrows():
        continent = row['Continent']
        country = row['Country']
        state = row['State_Province']
        
        c_weight = continent_weights.get(continent, 0.001)
        country_w = country_weights.get(country, 0.01)
        state_w = state_province_weights.get(state, 0.01)
        
        dim_geography_df.loc[(dim_geography_df['Geo_ID'] == row['Geo_ID']), 'Geo_Weight'] = c_weight * country_w * state_w
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].replace(0.0, 0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values
    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    if 'Quarter_of_Year' not in dim_time_df.columns:
        dim_time_df['Quarter_of_Year'] = dim_time_df['Full_Date'].dt.quarter
    dim_time_df['Year_Int'] = dim_time_df['Year'].astype(int)
    dim_time_df['Quarter_Int'] = dim_time_df['Quarter_of_Year'].astype(int)
    dim_time_df['Quarter_Start_Date'] = dim_time_df['Full_Date'].dt.to_period('Q').dt.start_time
    
    price_lookup = {}
    if 'Standard_Price_USD' not in dim_prices_df.columns or 'Discounted_Price_USD' not in dim_prices_df.columns:
        raise KeyError("Dim_Prices.csv is missing required columns.")
    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    for _, row in dim_prices_df.iterrows():
        quarter_start_date = row['Quarter_Start_Date']
        product_id = row['Product_ID']
        price_lookup[(quarter_start_date, product_id)] = {
            'Standard_Price_USD': row['Standard_Price_USD'],
            'Discounted_Price_USD': row['Discounted_Price_USD']
        }

    # --------------------------
    # Step 3: Dynamically Map Products to Categories
    # --------------------------
    
    # 使用所有独特的产品类别
    all_product_categories = dim_product_df['Product_Category'].unique().tolist()
    
    category_to_product_ids = defaultdict(list)
    product_id_to_name = dim_product_df.set_index('Product_ID')['Product_Name'].to_dict()
    
    for _, row in dim_product_df.iterrows():
        category_to_product_ids[row['Product_Category']].append(row['Product_ID'])

    print("\n--------------------")
    print("开始遍历所有维度ID...")
    print("--------------------")

    # 打印总结信息，使用动态获取的类别
    for category in all_product_categories:
        print(f"找到的 '{category}' 产品ID数量: {len(category_to_product_ids[category])}")
    print(f"找到的地理ID数量: {len(dim_geography_df['Geo_ID'].tolist())}")

    print("遍历成功，开始生成销售事实表。")
    print("--------------------")

    # 预先计算一次地理概率，避免在循环中重复计算
    geo_weights_dict = dim_geography_df.set_index('Geo_ID')['Geo_Weight'].to_dict()
    real_geo_ids = dim_geography_df['Geo_ID'].tolist()
    geo_weights_array = np.array(list(geo_weights_dict.values()))
    total_geo_weight = sum(geo_weights_dict.values())
    
    if total_geo_weight > 0:
        geo_probabilities = geo_weights_array / total_geo_weight
    else:
        num_geos = len(real_geo_ids)
        geo_probabilities = np.full(num_geos, 1.0 / num_geos)

    # --------------------------
    # Step 4: Generate Fact Table Records
    # --------------------------
    start_year = min(y for y, q in revenue_data['Automotive Sales'].keys())
    total_generated_rows = 0
    
    with open(output_filepath, 'w', newline='', encoding='utf-8') as f:
        writer = None
        
        for year in range(start_year, end_date.year + 1):
            yearly_rows_generated = 0
            for quarter in range(1, 5):
                if year == 2025 and quarter > 2:
                    continue
                
                quarter_dates_df = dim_time_df[
                    (dim_time_df['Year_Int'] == year) & (dim_time_df['Quarter_Int'] == quarter)
                ]
                
                if quarter_dates_df.empty:
                    continue

                records = []
                
                # 为每笔交易生成一个唯一的ID
                system_id = "WEB" # 假设交易来自网页端
                
                for category_name in all_product_categories:
                    revenue = revenue_data.get(category_name, {}).get((year, quarter), 0)
                    
                    if revenue > 0:
                        if category_name == 'Automotive Sales':
                            # 1. 动态获取 'Automotive Sales' 类别下的所有产品
                            automotive_product_ids = category_to_product_ids.get(category_name, [])
                            
                            # 2. 如果该类别下没有产品，则跳过
                            if not automotive_product_ids:
                                continue
                            
                            # 3. 为每个产品分配权重，优先使用硬编码的权重，未列出的产品使用默认权重
                            current_product_weights = {}
                            total_assigned_weight = 0
                            
                            # 将 hardcoded_distribution 转换为 product_id 到 weight 的映射
                            hardcoded_product_id_weights = {}
                            for p_name, time_weights in automotive_product_distribution.items():
                                matching_product_df = dim_product_df[dim_product_df['Product_Name'] == p_name]
                                if not matching_product_df.empty:
                                    product_id = matching_product_df['Product_ID'].iloc[0]
                                    for time_range, weight in time_weights.items():
                                        start_y, end_y = map(int, time_range.split('-'))
                                        if start_y <= year <= end_y:
                                            hardcoded_product_id_weights[product_id] = weight
                                            total_assigned_weight += weight
                                else:
                                    # 这部分不再打印警告，因为我们现在会处理所有产品，而不仅仅是硬编码的
                                    pass

                            # 计算剩余的未分配权重
                            remaining_weight = max(0, 1.0 - total_assigned_weight)
                            unassigned_product_ids = [pid for pid in automotive_product_ids if pid not in hardcoded_product_id_weights]
                            default_weight = remaining_weight / len(unassigned_product_ids) if unassigned_product_ids else 0

                            # 分配默认权重给剩余产品
                            for pid in unassigned_product_ids:
                                current_product_weights[pid] = default_weight

                            # 合并硬编码和默认权重
                            current_product_weights.update(hardcoded_product_id_weights)
                            
                            # 4. 再次确保权重总和为1
                            total_weight = sum(current_product_weights.values())
                            if total_weight > 0:
                                normalized_weights = {p_id: w / total_weight for p_id, w in current_product_weights.items()}
                            else: # 如果没有有效权重，则平均分配
                                normalized_weights = {p_id: 1.0 / len(current_product_weights) for p_id in current_product_weights.keys()}

                            # 5. 计算加权平均价格
                            weighted_avg_price = 0
                            for product_id, weight in normalized_weights.items():
                                quarter_start_date = quarter_dates_df['Quarter_Start_Date'].iloc[0]
                                price_info = price_lookup.get((quarter_start_date, product_id))
                                if price_info:
                                    weighted_avg_price += price_info['Standard_Price_USD'] * weight

                            if weighted_avg_price == 0:
                                continue
                            
                            # 6. 根据加权平均价格估算交易数量
                            # --- 关键修改：在这里应用缩放因子 ---
                            num_transactions = int(revenue / weighted_avg_price)
                            num_transactions *= scale_factor
                            if num_transactions == 0:
                                num_transactions = 1
                            
                            # 7. 根据归一化权重生成交易记录
                            for _ in range(num_transactions):
                                sampled_product_id = random.choices(list(normalized_weights.keys()), weights=list(normalized_weights.values()), k=1)[0]
                                quarter_start_date = quarter_dates_df['Quarter_Start_Date'].iloc[0]
                                price_info = price_lookup.get((quarter_start_date, sampled_product_id))

                                if price_info:
                                    is_discounted = random.random() < 0.2
                                    revenue_per_record = price_info['Discounted_Price_USD'] if is_discounted else price_info['Standard_Price_USD']
                                    
                                    records.append({
                                        'TransactionID': generate_transaction_id(system_id),
                                        'Time_ID': random.choice(quarter_dates_df['Time_ID'].values),
                                        'Geo_ID': random.choices(real_geo_ids, weights=geo_probabilities, k=1)[0],
                                        'Product_ID': sampled_product_id,
                                        'Customer_ID': random.choice(customer_ids),
                                        'Sales_Units': 1,
                                        'Is_Discounted_Sale': is_discounted,
                                        'Revenue_USD': revenue_per_record * (1 + np.random.normal(0, 0.01)),
                                        'Revenue_Category': category_name
                                    })
                        else:
                            # Original logic for other categories remains the same
                            product_ids_for_category = category_to_product_ids.get(category_name)
                            transaction_unit = transaction_revenue_per_category.get(category_name, 1000)
                            num_transactions = int(revenue / transaction_unit)
                            # --- 关键修改：在这里应用缩放因子 ---
                            num_transactions *= scale_factor
                            if num_transactions == 0:
                                num_transactions = 1
                            
                            sampled_product_ids = np.random.choice(product_ids_for_category, size=num_transactions)
                            sampled_geo_ids = np.random.choice(real_geo_ids, size=num_transactions, p=geo_probabilities)
                            sampled_customer_ids = np.random.choice(customer_ids, size=num_transactions)
                            sampled_time_ids = np.random.choice(quarter_dates_df['Time_ID'].values, size=num_transactions)

                            for i in range(num_transactions):
                                records.append({
                                    'TransactionID': generate_transaction_id(system_id),
                                    'Time_ID': sampled_time_ids[i],
                                    'Geo_ID': sampled_geo_ids[i],
                                    'Product_ID': sampled_product_ids[i],
                                    'Customer_ID': sampled_customer_ids[i],
                                    'Sales_Units': 1,
                                    'Is_Discounted_Sale': False,
                                    'Revenue_USD': revenue / num_transactions * (1 + np.random.normal(0, 0.05)),
                                    'Revenue_Category': category_name
                                })
                
                fact_sales_df_temp = pd.DataFrame(records)
                
                if not fact_sales_df_temp.empty:
                    if writer is None:
                        fact_sales_df_temp.to_csv(f, header=True, index=False, encoding='utf-8')
                        writer = True
                    else:
                        fact_sales_df_temp.to_csv(f, header=False, index=False, encoding='utf-8')
                    total_generated_rows += len(fact_sales_df_temp)
                    yearly_rows_generated += len(fact_sales_df_temp)
            
            if yearly_rows_generated > 0:
                print(f"Generated {yearly_rows_generated:,} records for year {year}.")

    print(f"\nTotal generated records: {total_generated_rows:,}")
    return total_rows

if __name__ == '__main__':
    start_time = time.time()
    
    output_dir = './output_data'
    
    print(f"Loading all dimension tables and data sources from '{output_dir}'...")
    
    dim_product_df = None
    dim_time_df = None
    dim_customer_df = None
    dim_geography_df = None
    dim_prices_df = None
    
    try:
        dim_product_df = pd.read_csv(os.path.join(output_dir, 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(os.path.join(output_dir, 'Dim_Time.csv'))
        dim_customer_df = pd.read_csv(os.path.join(output_dir, 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join(output_dir, 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join(output_dir, 'Dim_Prices.csv'))
        
        print("所有维度表加载成功。")
        
    except FileNotFoundError as e:
        print(f"Error: One or more required CSV files are missing. Please ensure all dimension tables are in the '{output_dir}' directory. The file '{e.filename}' was not found.")
        print("Data generation failed due to missing files.")
        exit()
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_filepath = os.path.join(output_dir, 'Fact_Sales.csv')
    
    if os.path.exists(output_filepath):
        os.remove(output_filepath)

    print("\n开始整理和生成销售事实表...")
    # --- 关键修改：在这里调用函数时传入一个更大的缩放因子，例如 10 ---
    total_rows = generate_fact_sales(
        dim_product_df, 
        dim_time_df, 
        dim_customer_df, 
        dim_geography_df, 
        dim_prices_df, 
        output_filepath, 
        scale_factor=10
    )
    
    if total_rows > 0:
        end_time = time.time()
        print(f"\nFact_Sales.csv successfully generated {total_rows:,} rows of data in {end_time - start_time:.2f} seconds.")
        print("数据生成完成!")
    else:
        print("数据生成失败。")


Loading all dimension tables and data sources from './output_data'...
所有维度表加载成功。

开始整理和生成销售事实表...

--------------------
开始遍历所有维度ID...
--------------------
找到的 'Automotive Leasing' 产品ID数量: 4
找到的 'Energy Generation & Storage' 产品ID数量: 4
找到的 'Automotive Sales' 产品ID数量: 3138
找到的 'Automotive Regulatory Credits' 产品ID数量: 1
找到的 'Services & Other' 产品ID数量: 4
找到的地理ID数量: 432
遍历成功，开始生成销售事实表。
--------------------
Generated 23,800 records for year 2013.
Generated 32,100 records for year 2014.
Generated 44,900 records for year 2015.
Generated 60,800 records for year 2016.
Generated 77,400 records for year 2017.
Generated 102,000 records for year 2018.
Generated 462,600 records for year 2019.
Generated 535,200 records for year 2020.
Generated 824,740 records for year 2021.
Generated 1,249,350 records for year 2022.
Generated 1,649,180 records for year 2023.
Generated 2,247,450 records for year 2024.
Generated 1,209,520 records for year 2025.

Total generated records: 8,519,040

Fact_Sales.csv successfull