**1/6: 生成 Dim_Product 表 (简单静态数据)维度表更可能需要更新Update或追加Append。例如，新产品发布，此时就需要更新Update或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）**

In [13]:
# -*- coding: utf-8 -*-
"""1/6: Generate Dim_Product Table"""

import pandas as pd
import os
import time
import numpy as np
import random

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_product():
    """Generates the Dim_Product table."""
    data = [
        [1, 'Model 3', 'Sedan', 46500.00, '2017-07-28'],
        [2, 'Model Y', 'SUV', 55000.00, '2020-03-13'],
        [3, 'Model S', 'Sedan', 82500.00, '2012-06-22'],
        [4, 'Model X', 'SUV', 95000.00, '2015-09-29'],
        [5, 'Cybertruck', 'Truck', 70000.00, '2023-11-30']
    ]
    df = pd.DataFrame(data, columns=['Model_ID', 'Model_Name', 'Model_Category', 'Model_Base_Price_USD', 'Model_Launch_Date'])
    df['Model_Launch_Date'] = pd.to_datetime(df['Model_Launch_Date'])
    return df

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Product table...")
    dim_product_df = generate_dim_product()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Product.csv...")
    dim_product_df.to_csv(os.path.join(output_dir, 'Dim_Product.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Product.csv has been successfully generated with {len(dim_product_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Product table...
Saving Dim_Product.csv...
Dim_Product.csv has been successfully generated with 5 rows in 0.00 seconds.


**2/6: 生成 Dim_Time 表 (单一向前数据)维度表只追加Append。每一个时间点、每一天、每一个月都是一个既定的、永恒不变的事实。你无法“更新”昨天或去年的日期，此时就需要追加Append表中的相应记录。没有复杂的版本控制机制（Slowly Changing Dimension, SCD）**

In [12]:
# -*- coding: utf-8 -*-
"""2/6: Generate Dim_Time Table"""

import pandas as pd
import os
import time
import numpy as np
import random
import datetime

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_time():
    """Generates the Dim_Time table."""
    start_date = datetime.date(2017, 1, 1)
    end_date = datetime.date(2025, 12, 31)
    date_range = [start_date + datetime.timedelta(days=x) for x in range(0, (end_date - start_date).days + 1)]

    data = []
    for date in date_range:
        data.append([
            int(date.strftime('%Y%m%d')),
            date,
            date.year,
            f"Q{((date.month - 1) // 3) + 1}",
            date.month,
            date.day,
            date.isocalendar()[1],
            date.isoweekday(),
            date.strftime('%A')
        ])
    
    return pd.DataFrame(data, columns=['Time_ID', 'Full_Date', 'Year', 'Quarter', 'Month', 'Day', 'Week_of_Year', 'Day_of_Week', 'Day_Name'])

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Time table...")
    dim_time_df = generate_dim_time()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Time.csv...")
    dim_time_df.to_csv(os.path.join(output_dir, 'Dim_Time.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Time.csv has been successfully generated with {len(dim_time_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Time table...
Saving Dim_Time.csv...
Dim_Time.csv has been successfully generated with 3287 rows in 0.03 seconds.


**3/6: 生成 Dim_Customer 表 (相对静态数据)维度表更可能需要更新Update或追加Append。例如，一个客户的收入水平或家庭住址可能会发生变化，此时就需要更新Update或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）**

In [11]:
# -*- coding: utf-8 -*-
"""3/6: Generate Dim_Customer Table"""

import pandas as pd
import os
import time
import numpy as np
import random

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_customer(num_customers=50000):
    """Generates the Dim_Customer table."""
    genders = ['Male', 'Female', 'Other']
    age_groups = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']
    income_levels = ['Low', 'Medium', 'High']
    first_names = ['James', 'Mary', 'John', 'Patricia', 'Robert', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'David', 'Susan', 'Richard', 'Jessica', 'Joseph', 'Sarah', 'Thomas', 'Karen', 'Charles', 'Nancy', 'Christopher', 'Lisa', 'Daniel', 'Betty', 'Paul', 'Margaret', 'Mark', 'Sandra', 'Donald', 'Ashley', 'George', 'Kimberly', 'Kenneth', 'Donna', 'Steven', 'Emily', 'Edward', 'Carol', 'Brian', 'Michelle', 'Ronald', 'Amanda', 'Anthony', 'Melissa', 'Kevin', 'Deborah', 'Jason', 'Stephanie', 'Jeff', 'Maria', 'Gary', 'Heather', 'Timothy', 'Nicole', 'Jose', 'Denise', 'Larry', 'Megan', 'Jeffrey', 'Christina', 'Frank', 'Alexis', 'Scott', 'Tiffany', 'Eric', 'Lauren', 'Stephen', 'Rachel', 'Andrew', 'Crystal', 'Raymond', 'Kayla', 'Ryan', 'Danielle', 'Jacob', 'Brittany', 'Nicholas', 'Emma', 'Jonathan', 'Samantha', 'Laura', 'Alexis', 'Joshua', 'Brandon', 'Justin', 'Daniel', 'Daniel', 'Taylor']
    last_names = ['Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor', 'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia', 'Martinez', 'Robinson', 'Clark', 'Rodriguez', 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen', 'Young', 'Hernandez', 'King', 'Wright', 'Lopez', 'Hill', 'Scott', 'Green', 'Adams', 'Baker', 'Gonzalez', 'Nelson', 'Carter', 'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips', 'Campbell', 'Parker', 'Evans', 'Edwards', 'Collins', 'Stewart', 'Sanchez', 'Morris', 'Rogers', 'Reed', 'Cook', 'Morgan', 'Bell', 'Murphy', 'Bailey', 'Rivera', 'Cooper', 'Richardson', 'Cox', 'Howard', 'Ward', 'Torres', 'Peterson', 'Gray', 'Ramirez', 'James', 'Watson', 'Brooks', 'Kelly', 'Sanders', 'Price', 'Bennett', 'Wood', 'Barnes', 'Ross', 'Henderson', 'Coleman', 'Jenkins', 'Perry', 'Powell', 'Long', 'Patterson', 'Hughes', 'Flores', 'Washington', 'Butler', 'Simmons', 'Foster', 'Gonzales', 'Bryant', 'Alexander', 'Russell', 'Griffin', 'Diaz', 'Hayes', 'Myers', 'Ford', 'Hamilton', 'Graham', 'Sullivan', 'Wallace', 'Woods', 'Cole', 'West', 'Jordan', 'Owens', 'Reynolds', 'Fisher', 'Ellis', 'Harrison', 'Gibson', 'Mcdonald', 'Cruz', 'Marshall', 'Ortiz', 'Gomez', 'Murray', 'Freeman', 'Wells', 'Webb', 'Simpson', 'Stevens', 'Tucker', 'Porter', 'Hunter', 'Hicks', 'Crawford', 'Henry', 'Boyd', 'Mason', 'Kennedy', 'Warren', 'Dixon', 'Ramos', 'Reid', 'Carr', 'Chavez', 'Gibson']
    
    data = []
    for i in range(1, num_customers + 1):
        full_name = f"{random.choice(first_names)} {random.choice(last_names)}"
        gender = random.choice(genders)
        age_group = random.choice(age_groups)
        income_level = random.choice(income_levels)
        data.append([i, full_name, gender, age_group, income_level])
        
    return pd.DataFrame(data, columns=['Customer_ID', 'Customer_Name', 'Gender', 'Age_Group', 'Income_Level'])

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Customer table...")
    dim_customer_df = generate_dim_customer()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Customer.csv...")
    dim_customer_df.to_csv(os.path.join(output_dir, 'Dim_Customer.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Customer.csv has been successfully generated with {len(dim_customer_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Customer table...
Saving Dim_Customer.csv...
Dim_Customer.csv has been successfully generated with 50000 rows in 0.09 seconds.


**4/6: 生成 Dim_Geography 表 (相对静态数据)维度表更可能需要更新Update或追加Append。例如，一个客户的地址可能会发生变化或更新到新的国家和城市，此时就需要更新Update或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）**

In [10]:
# -*- coding: utf-8 -*-
"""4/6: Generate Dim_Geography Table"""

import pandas as pd
import os
import time
import numpy as np
import random

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_plausible_zip(country, state_province_abbr):
    """Generates a plausible zip code based on the country and state/province."""
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'

    if country == 'United States':
        return f"{random.randint(10000, 99999)}"
    elif country == 'Canada':
        return f"{random.choice(letters)}{random.choice(digits)}{random.choice(letters)} {random.choice(digits)}{random.choice(letters)}{random.choice(digits)}"
    elif country == 'Mexico':
        return f"{random.randint(10000, 99999)}"
    elif country in ['Germany', 'Italy', 'Spain', 'Switzerland', 'Netherlands', 'Denmark', 'Norway', 'Sweden', 'Finland', 'Greece', 'Iceland', 'Ireland', 'Luxembourg', 'Monaco']:
        return f"{random.randint(10000, 99999)}"
    elif country == 'United Kingdom':
        part1 = ''.join(random.choices(letters, k=random.choice([1, 2]))) + ''.join(random.choices(digits, k=random.choice([1, 2])))
        part2 = f"{random.choice(digits)}{random.choice(letters)}{random.choice(letters)}"
        return f"{part1} {part2}"
    elif country == 'France':
        return f"{random.randint(1, 9)}{random.randint(0, 9)}{random.randint(0, 9)}{random.randint(0, 9)}{random.randint(0, 9)}"
    elif country == 'China':
        return f"{random.randint(100000, 999999)}"
    elif country in ['Japan', 'South Korea', 'Taiwan', 'Hong Kong', 'Macau']:
        return f"{random.randint(10000, 9999999)}"
    elif country == 'Australia':
        return f"{random.randint(1000, 9999)}"
    elif country == 'New Zealand':
        return f"{random.randint(1000, 9999)}"
    else:
        return ""

def generate_dim_geography():
    """
    Generates a Dim_Geography table.
    """
    geography_data = []
    geo_id = 1
    
    # North America
    north_america_countries = {
        'United States': {
            'code': 'US',
            'provinces': [
                'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
                'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
                'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
                'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
                'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
                'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia',
                'Wisconsin', 'Wyoming'
            ]
        },
        'Canada': {
            'code': 'CA',
            'provinces': [
                'Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador',
                'Nova Scotia', 'Ontario', 'Prince Edward Island', 'Québec', 'Saskatchewan',
                'Northwest Territories', 'Nunavut', 'Yukon'
            ]
        },
        'Mexico': {
            'code': 'MX',
            'provinces': [
                'Aguascalientes', 'Baja California', 'Baja California Sur', 'Campeche', 'Chiapas',
                'Chihuahua', 'Coahuila', 'Colima', 'Durango', 'Guanajuato', 'Guerrero', 'Hidalgo',
                'Jalisco', 'México', 'Distrito Federal', 'Michoacán', 'Morelos', 'Nayarit',
                'Nuevo León', 'Oaxaca', 'Puebla', 'Querétaro', 'Quintana Roo', 'San Luis Potosí',
                'Sinaloa', 'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala', 'Veracruz', 'Yucatán', 'Zacatecas'
            ]
        }
    }

    # Europe
    europe_countries = {
        'Germany': {
            'code': 'DE',
            'provinces': [
                'Baden-Württemberg', 'Bavaria', 'Berlin', 'Brandenburg', 'Bremen', 'Hamburg',
                'Hesse', 'Lower Saxony', 'Mecklenburg-Vorpommern', 'North Rhine-Westphalia',
                'Rhineland-Palatinate', 'Saarland', 'Saxony', 'Saxony-Anhalt',
                'Schleswig-Holstein', 'Thuringia'
            ]
        },
        'United Kingdom': {
            'code': 'GB',
            'provinces': [
                'England', 'Scotland', 'Wales', 'Northern Ireland'
            ]
        },
        'Norway': {'code': 'NO', 'provinces': ['Oslo', 'Viken', 'Innlandet', 'Vestfold og Telemark', 'Agder', 'Rogaland', 'Vestland', 'Møre og Romsdal', 'Trøndelag', 'Nordland', 'Troms og Finnmark']},
        'France': {'code': 'FR', 'provinces': ['Bretagne', 'Normandie', 'Île-de-France', 'Auvergne-Rhône-Alpes', 'Bourgogne-Franche-Comté', 'Centre-Val de Loire', 'Corsica', 'Grand Est', 'Hauts-de-France', 'Nouvelle-Aquitaine', 'Occitanie', 'Pays de la Loire', 'Provence-Alpes-Côte d\'Azur']},
        'Netherlands': {'code': 'NL', 'provinces': ['Drenthe', 'Flevoland', 'Friesland', 'Gelderland', 'Groningen', 'Limburg', 'North Brabant', 'North Holland', 'Overijssel', 'Utrecht', 'Zeeland', 'South Holland']},
        'Sweden': {'code': 'SE', 'provinces': ['Blekinge', 'Dalarna', 'Gotland', 'Gävleborg', 'Halland', 'Jämtland', 'Jönköping', 'Kalmar', 'Kronoberg', 'Norrbotten', 'Skåne', 'Stockholm', 'Södermanland', 'Uppsala', 'Värmland', 'Västerbotten', 'Västernorrland', 'Västmanland', 'Västra Götaland', 'Örebro', 'Östergötland']},
        'Switzerland': {'code': 'CH', 'provinces': ['Zurich', 'Bern', 'Lucerne', 'Uri', 'Schwyz', 'Obwalden', 'Nidwalden', 'Glarus', 'Zug', 'Fribourg', 'Solothurn', 'Basel-Stadt', 'Basel-Landschaft', 'Schaffhausen', 'Appenzell Ausserrhoden', 'Appenzell Innerrhoden', 'St. Gallen', 'Graubünden', 'Aargau', 'Thurgau', 'Ticino', 'Vaud', 'Valais', 'Neuchâtel', 'Geneva', 'Jura']},
        'Italy': {'code': 'IT', 'provinces': ['Abruzzo', 'Aosta Valley', 'Apulia', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Liguria', 'Lombardy', 'Marche', 'Molise', 'Piedmont', 'Sardinia', 'Sicily', 'Tuscany', 'Trentino-Alto Adige', 'Umbria', 'Veneto']},
        'Spain': {'code': 'ES', 'provinces': ['Andalusia', 'Aragon', 'Principality of Asturias', 'Balearic Islands', 'Basque Country', 'Canary Islands', 'Cantabria', 'Castile and León', 'Castile-La Mancha', 'Catalonia', 'Community of Madrid', 'Valencian Community', 'Extremadura', 'Galicia', 'La Rioja', 'Region of Murcia', 'Foral Community of Navarre']},
        'Denmark': {'code': 'DK', 'provinces': ['Capital Region of Denmark', 'Central Denmark Region', 'North Denmark Region', 'Region Zealand', 'Region of Southern Denmark']},
        'Finland': {'code': 'FI', 'provinces': ['Åland Islands', 'Central Finland', 'Central Ostrobothnia', 'Kainuu', 'Kymenlaakso', 'Lapland', 'North Karelia', 'North Ostrobothnia', 'Northern Savonia', 'Päijät-Häme', 'Pirkanmaa', 'Satakunta', 'South Karelia', 'Southern Ostrobothnia', 'Southern Savonia', 'Tavastia Proper', 'Uusimaa', 'Southwest Finland']},
        'Greece': {'code': 'GR', 'provinces': ['Attica', 'Central Greece', 'Central Macedonia', 'Crete', 'East Macedonia and Thrace', 'Epirus', 'Ionian Islands', 'North Aegean', 'Peloponnese', 'South Aegean', 'Thessaly', 'West Greece', 'West Macedonia']},
        'Iceland': {'code': 'IS', 'provinces': ['Capital Region', 'Southern Peninsula', 'Western Region', 'Westfjords', 'Northwest Region', 'Northeast Region', 'Eastern Region', 'Southern Region']},
        'Ireland': {'code': 'IE', 'provinces': ['Connacht', 'Leinster', 'Munster', 'Ulster']},
        'Luxembourg': {'code': 'LU', 'provinces': ['Diekirch', 'Grevenmacher', 'Luxembourg']},
        'Monaco': {'code': 'MC', 'provinces': ['Monaco']}
    }
    
    # Asia
    asia_countries = {
        'China': {
            'code': 'CN',
            'provinces': [
                'Anhui', 'Fujian', 'Gansu', 'Guangdong', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang',
                'Henan', 'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Liaoning', 'Qinghai',
                'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Yunnan', 'Zhejiang',
                'Guangxi', 'Nei Mongol', 'Ningxia Hui', 'Xinjiang Uygur', 'Xizang', 
                'Beijing', 'Chongqing', 'Shanghai', 'Tianjin'
            ]
        },
        'Hong Kong': {'code': 'HK', 'provinces': ['Hong Kong Island', 'Kowloon', 'New Territories']},
        'Macau': {'code': 'MO', 'provinces': ['Macau']},
        'Japan': {
            'code': 'JP',
            'provinces': [
                'Hokkaido', 'Aomori', 'Iwate', 'Miyagi', 'Akita', 'Yamagata', 'Fukushima',
                'Ibaraki', 'Tochigi', 'Gunma', 'Saitama', 'Chiba', 'Tokyo', 'Kanagawa',
                'Niigata', 'Toyama', 'Ishikawa', 'Fukui', 'Yamanashi', 'Nagano',
                'Gifu', 'Shizuoka', 'Aichi', 'Mie', 'Shiga', 'Kyoto', 'Osaka',
                'Hyōgo', 'Nara', 'Wakayama', 'Tottori', 'Shimane', 'Okayama',
                'Hiroshima', 'Yamaguchi', 'Tokushima', 'Kagawa', 'Ehime', 'Kochi',
                'Fukuoka', 'Saga', 'Naoasaki', 'Kumamoto', 'Oita', 'Miyazaki', 'Kagoshima', 'Okinawa'
            ]
        },
        'South Korea': {
            'code': 'KR',
            'provinces': [
                'Busan', 'Chungcheongbuk-do', 'Chungcheongnam-do', 'Daegu', 'Daejeon', 'Gangwon-do',
                'Gwangju', 'Gyeonggi-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Incheon', 'Jeollabuk-do',
                'Jeollanam-do', 'Sejong', 'Seoul', 'Ulsan', 'Jeju'
            ]
        },
        'Taiwan': {
            'code': 'TW',
            'provinces': [
                'Taipei', 'New Taipei', 'Taichung', 'Tainan', 'Kaohsiung', 'Taoyuan', 
                'Keelung', 'Hsinchu City', 'Chiayi City', 'Hsinchu County', 'Chiayi County',
                'Changhua', 'Nantou', 'Yulin', 'Miaoli', 'Pingtung', 'Yilan', 'Hualien',
                'Taitung', 'Penghu', 'Kinmen', 'Lienkiang'
            ]
        }
    }

    # Oceania
    oceania_countries = {
        'Australia': {
            'code': 'AU',
            'provinces': [
                'New South Wales', 'Victoria', 'Queensland', 'South Australia', 'Western Australia',
                'Tasmania', 'Australian Capital Territory', 'Northern Territory'
            ]
        },
        'New Zealand': {
            'code': 'NZ',
            'provinces': [
                'Auckland', 'Bay of Plenty', 'Canterbury', 'Gisborne', 'Hawke\'s Bay',
                'Manawatu-Wanganui', 'Marlborough', 'Nelson', 'Northland', 'Otago',
                'Southland', 'Taranaki', 'Tasman', 'Waikato', 'Wellington', 'West Coast'
            ]
        }
    }
    
    continents = {
        'North America': north_america_countries,
        'Europe': europe_countries,
        'Asia': asia_countries,
        'Oceania': oceania_countries
    }

    for continent, countries in continents.items():
        for country, details in countries.items():
            for province in details['provinces']:
                state_abbr = province[:2].upper()
                
                geography_data.append([
                    geo_id,
                    continent,
                    country,
                    details['code'],
                    province,
                    state_abbr,
                    generate_plausible_zip(country, state_abbr)
                ])
                geo_id += 1

    dim_geography_df = pd.DataFrame(geography_data, columns=[
        'Geo_ID', 'Continent', 'Country', 'Country_Code', 'State_Province', 'State_Province_Abbr', 'Zip_Code'
    ])
    
    return dim_geography_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("Generating Dim_Geography table...")
    dim_geography_df = generate_dim_geography()
    
    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Geography.csv...")
    dim_geography_df.to_csv(os.path.join(output_dir, 'Dim_Geography.csv'), index=False, encoding='utf-8')
    
    end_time = time.time()
    print(f"Dim_Geography.csv has been successfully generated with {len(dim_geography_df)} rows in {end_time - start_time:.2f} seconds.")

Generating Dim_Geography table...
Saving Dim_Geography.csv...
Dim_Geography.csv has been successfully generated with 432 rows in 0.00 seconds.


**5/6: 生成 Dim_Prices 表 (相对静态数据)维度表更可能需要更新Update或追加Append。例如，新产品或不同时段价格可能会发生变化，此时就需要更新或追加Append表中的相应记录。这种变化管理被称为“缓慢变化维度”（Slowly Changing Dimension, SCD）**

In [9]:
# -*- coding: utf-8 -*-
"""5/6: Generate Dim_Prices Table"""

import pandas as pd
import os
import time
import numpy as np
import random

# Use a fixed random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def generate_dim_prices(dim_product_df, dim_time_df):
    """Generates the Dim_Prices table."""
    quarter_start_dates = sorted(dim_time_df['Full_Date'].loc[dim_time_df['Month'].isin([1, 4, 7, 10])].unique())
    model_ids = dim_product_df['Model_ID'].unique()
    
    prices_data = []
    
    # Dynamic price generation with seasonal/random fluctuations
    for quarter_start_date in quarter_start_dates:
        for model_id in model_ids:
            # Base price from product table
            base_price = dim_product_df.loc[dim_product_df['Model_ID'] == model_id, 'Model_Base_Price_USD'].iloc[0]
            
            # Fluctuate prices randomly with a trend
            price_factor = 1 + random.uniform(-0.05, 0.05)
            
            # Apply launch date logic
            launch_date = dim_product_df.loc[dim_product_df['Model_ID'] == model_id, 'Model_Launch_Date'].iloc[0]
            if pd.to_datetime(quarter_start_date) < launch_date:
                # Car not launched yet
                continue
            
            standard_price = base_price * price_factor
            
            # Randomly apply a discount
            is_discounted = random.random() < 0.2  # 20% chance of a discount
            discount_price = standard_price
            if is_discounted:
                discount_percentage = random.uniform(0.02, 0.10) # 2-10% discount
                discount_price = standard_price * (1 - discount_percentage)
            
            prices_data.append([
                model_id,
                quarter_start_date,
                standard_price,
                discount_price
            ])
            
    return pd.DataFrame(prices_data, columns=['Model_ID', 'Quarter_Start_Date', 'Standard_Price_USD', 'Discounted_Price_USD'])

if __name__ == '__main__':
    start_time = time.time()
    
    print("Loading Dim_Product and Dim_Time for price generation...")
    # This script depends on the output of the two previous scripts
    try:
        dim_product_df = pd.read_csv('./output_data/Dim_Product.csv')
        dim_time_df = pd.read_csv('./output_data/Dim_Time.csv')
        dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
        dim_product_df['Model_Launch_Date'] = pd.to_datetime(dim_product_df['Model_Launch_Date'])
    except FileNotFoundError:
        print("Error: Could not find 'Dim_Product.csv' or 'Dim_Time.csv'. Please run the previous scripts first.")
        exit()
        
    print("Generating Dim_Prices table...")
    dim_prices_df = generate_dim_prices(dim_product_df, dim_time_df)

    output_dir = './output_data'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving Dim_Prices.csv...")
    dim_prices_df.to_csv(os.path.join(output_dir, 'Dim_Prices.csv'), index=False, encoding='utf-8')

    end_time = time.time()
    print(f"Dim_Prices.csv has been successfully generated with {len(dim_prices_df)} rows in {end_time - start_time:.2f} seconds.")

Loading Dim_Product and Dim_Time for price generation...
Generating Dim_Prices table...
Saving Dim_Prices.csv...
Dim_Prices.csv has been successfully generated with 4186 rows in 1.21 seconds.


**6/6: 生成 Fact_Sales 表 (高度动态数据，最常被追加（append）的表) 只进不出”的设计哲学。每当一笔新的销售发生，就在 Fact_Sales 表中追加一行新的数据，而不会去修改之前已经存在的历史销售记录**

In [8]:
# -*- coding: utf-8 -*-
"""Generate Fact_Sales Table (CPU Version)"""

import pandas as pd
import numpy as np
import os
import time
from datetime import datetime

# 全局变量，用于保存一次性生成的权重
global_weights = {}

def generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df):
    """
    Generates the Fact_Sales table by combining all dimension tables.
    Uses real-world Tesla sales distribution and volume data for realistic output.
    """
    start_year = dim_time_df['Year'].min()
    end_year = datetime.now().year

    # 实际历史数据（已转换单位为百万）
    revenue_targets = {
        2008: 15e6, 2009: 112e6, 2010: 117e6, 2011: 204e6, 2012: 413e6,
        2013: 2.01e9, 2014: 3.2e9, 2015: 4.05e9, 2016: 7e9, 2017: 11.76e9,
        2018: 21.46e9, 2019: 24.58e9, 2020: 31.54e9, 2021: 53.82e9,
        2022: 81.46e9, 2023: 96.77e9, 2024: 97.69e9
    }

    unit_targets = {
        2013: 22442, 2014: 31655, 2015: 50517, 2016: 76243, 2017: 103091,
        2018: 245491, 2019: 367656, 2020: 499535, 2021: 936222,
        2022: 1313851, 2023: 1808581, 2024: 1789226
    }
    
    # 根据您提供的最新数据，更新2025年Q1和Q2的交付量和营收
    unit_targets[2025] = 336681 + 384122
    revenue_targets[2025] = 19.335e9 + 22.496e9
    
    # 动态计算YTD日期，确保只生成到2025年Q2的数据
    ytd_end_date = datetime(2025, 6, 30)

    # --- 关键修复：移除之前的价格校准逻辑，将最终校准放在数据生成后 ---

    sales_data = []

    # --------------------------
    # 步骤 1: 数据清洗和预处理
    # --------------------------
    asia_countries = ['China', 'Japan', 'South Korea', 'Singapore', 'India', 'Indonesia', 'Thailand', 'Malaysia', 'Taiwan']
    oceania_countries = ['Australia', 'New Zealand']
    europe_countries = ['Germany', 'United Kingdom', 'France', 'Norway', 'Netherlands', 'Sweden', 'Italy', 'Switzerland', 'Spain', 'Belgium', 'Austria', 'Denmark', 'Finland', 'Portugal', 'Ireland', 'Luxembourg', 'Iceland']
    
    def get_continent(country):
        if country in asia_countries:
            return 'Asia'
        elif country in oceania_countries:
            return 'Oceania'
        elif country in europe_countries:
            return 'Europe'
        else:
            return 'North America'
            
    dim_geography_df['Continent'] = dim_geography_df['Country'].apply(get_continent).astype(str)

    # --------------------------
    # 步骤 2: 定义权重
    # --------------------------
    product_weights = {1: 0.45, 2: 0.45, 3: 0.05, 4: 0.04, 5: 0.01}
    continent_weights = {'North America': 0.45, 'Europe': 0.30, 'Asia': 0.23, 'Oceania': 0.02}
    country_weights = {'United States': 0.80, 'Canada': 0.20, 'China': 0.90, 'Japan': 0.05, 'South Korea': 0.05, 'Germany': 0.30, 'United Kingdom': 0.20, 'France': 0.15, 'Norway': 0.10, 'Australia': 0.90, 'New Zealand': 0.10, 'Taiwan': 0.01}
    
    state_province_weights = {
        # 北美
        'California': 0.40, 'Texas': 0.25, 'Florida': 0.15, 'Washington': 0.10, 'New York': 0.10, # 美国
        'Ontario': 0.4, 'Quebec': 0.25, 'British Columbia': 0.2, 'Alberta': 0.15, # 加拿大
        # 亚洲
        'Shanghai': 0.50, 'Beijing': 0.20, 'Guangdong': 0.20, 'Zhejiang': 0.10, # 中国
        'Taipei': 0.5, 'New Taipei City': 0.2, 'Taichung': 0.1, 'Kaohsiung': 0.1, 'Tainan': 0.05, 'Taoyuan': 0.05, # 台湾
        # 欧洲
        'Bavaria': 0.40, 'North Rhine-Westphalia': 0.25, 'Baden-Württemberg': 0.15, 'Berlin': 0.10, # 德国
        'Greater London': 0.50, 'South East England': 0.25, 'North West England': 0.15, 'West Midlands': 0.10, # 英国
        'Île-de-France': 0.60, 'Auvergne-Rhône-Alpes': 0.20, 'Nouvelle-Aquitaine': 0.10, 'Provence-Alpes-Côte d\'Azur': 0.10, # 法国
        'Oslo': 0.70, 'Vestland': 0.15, 'Viken': 0.10, 'Trøndelag': 0.05, # 挪威
        # 大洋洲
        'New South Wales': 0.5, 'Victoria': 0.3, 'Queensland': 0.1, 'Western Australia': 0.05, 'South Australia': 0.05, # 澳大利亚
        'Auckland': 0.6, 'Wellington': 0.2, 'Canterbury': 0.1, 'Otago': 0.05, 'Waikato': 0.05, # 新西兰
    }

    def get_state_weights_for_country(country):
        if country in global_weights:
            return global_weights[country]

        states = dim_geography_df[dim_geography_df['Country'] == country]['State_Province'].unique()
        if not states.any():
            return {}
        
        weights = {}
        states.sort()
        num_states = len(states)
        for i, s in enumerate(states):
            weights[s] = (num_states - i) / (num_states * (num_states + 1) / 2)
        
        global_weights[country] = weights
        return weights

    # 预计算所有 Geo_ID 的权重
    dim_geography_df['Geo_Weight'] = 0.0
    for continent, c_weight in continent_weights.items():
        countries_in_continent = dim_geography_df[dim_geography_df['Continent'] == continent]['Country'].unique()
        for country in countries_in_continent:
            country_w = country_weights.get(country, 0.01)
            states = dim_geography_df[dim_geography_df['Country'] == country]['State_Province'].unique()
            state_weights = state_province_weights.get(country, get_state_weights_for_country(country))
            
            for state in states:
                state_w = state_province_weights.get(state, state_weights.get(state, 0.01))
                mask = (dim_geography_df['Country'] == country) & (dim_geography_df['State_Province'] == state)
                dim_geography_df.loc[mask, 'Geo_Weight'] = c_weight * country_w * state_w
    
    dim_geography_df['Geo_Weight'] = dim_geography_df['Geo_Weight'].fillna(0.0001)

    customer_ids = dim_customer_df['Customer_ID'].values
    
    dim_time_df['Quarter_Start_Date'] = pd.to_datetime(dim_time_df['Full_Date']).dt.to_period('Q').dt.start_time
    if 'Quarter_Start_Date' in dim_prices_df.columns and dim_prices_df['Quarter_Start_Date'].dt.tz is not None:
        dim_prices_df['Quarter_Start_Date'] = dim_prices_df['Quarter_Start_Date'].dt.tz_localize(None)

    price_time_lookup = dim_prices_df.merge(
        dim_time_df, 
        on='Quarter_Start_Date', 
        how='left'
    )
    
    df_time = pd.DataFrame(price_time_lookup['Time_ID'].unique(), columns=['Time_ID'])
    df_product = pd.DataFrame(list(product_weights.keys()), columns=['Model_ID'])
    df_product['Product_Weight'] = df_product['Model_ID'].map(product_weights)
    df_geography = dim_geography_df[['Geo_ID', 'Geo_Weight']].copy()

    all_combinations = pd.merge(df_time.merge(df_product, how='cross'), df_geography, how='cross')
    
    all_combinations['Combined_Weight'] = all_combinations['Product_Weight'] * all_combinations['Geo_Weight']
    
    all_combinations.dropna(subset=['Combined_Weight', 'Time_ID'], inplace=True)
    all_combinations['Probability'] = all_combinations['Combined_Weight'] / all_combinations['Combined_Weight'].sum()

    for year in range(start_year, end_year + 1):
        target_units = unit_targets.get(year, 0)
        if target_units == 0:
            print(f"警告：年份 {year} 没有交付量数据，跳过生成。")
            continue
            
        print(f"正在为年份 {year} 生成 {target_units:,} 条销售记录...")
        
        # 筛选出当年的所有组合，并限制在YTD范围内
        time_ids_str = all_combinations['Time_ID'].astype(int).astype(str)
        current_year_combinations = all_combinations[
            (pd.to_datetime(time_ids_str, format='%Y%m%d').dt.year == year) &
            (pd.to_datetime(time_ids_str, format='%Y%m%d') <= ytd_end_date)
        ].copy()

        if current_year_combinations.empty:
            print(f"警告：年份 {year} 在 YTD 范围内没有可用的时间组合，跳过该年份。")
            continue
            
        current_year_combinations['Probability'] = current_year_combinations['Combined_Weight'] / current_year_combinations['Combined_Weight'].sum()

        sampled_rows = current_year_combinations.sample(n=target_units, replace=True, weights='Probability', random_state=42).reset_index(drop=True)
        sampled_rows['Customer_ID'] = np.random.choice(customer_ids, size=target_units, replace=True)

        fact_sales_df_temp = sampled_rows.merge(price_time_lookup, on=['Time_ID', 'Model_ID'], how='left')

        fact_sales_df_temp['Sales_Units'] = 1
        fact_sales_df_temp['Is_Discounted_Sale'] = fact_sales_df_temp['Discounted_Price_USD'] < fact_sales_df_temp['Standard_Price_USD']
        fact_sales_df_temp['Revenue_USD'] = fact_sales_df_temp['Sales_Units'] * fact_sales_df_temp['Discounted_Price_USD']

        sales_data.append(fact_sales_df_temp)
        
    if not sales_data:
        print("所有年份均没有可用数据，无法生成 Fact_Sales 表。")
        return pd.DataFrame()
        
    fact_sales_df = pd.concat(sales_data, ignore_index=True)
    
    # --- 最终修复：在生成完整的 Fact 表后，进行最终的营收校准 ---
    current_total_revenue = fact_sales_df['Revenue_USD'].sum()
    target_total_revenue = 450e9
    
    if current_total_revenue > 0:
        revenue_factor = target_total_revenue / current_total_revenue
        fact_sales_df['Revenue_USD'] = fact_sales_df['Revenue_USD'] * revenue_factor
    # --- 最终修复结束 ---
    
    fact_sales_df['Is_Discounted_Sale'] = fact_sales_df['Is_Discounted_Sale'].astype(bool)
    
    fact_sales_df = fact_sales_df[['Time_ID', 'Geo_ID', 'Model_ID', 'Customer_ID', 'Sales_Units', 'Is_Discounted_Sale', 'Revenue_USD']]
    
    return fact_sales_df

if __name__ == '__main__':
    start_time = time.time()
    
    print("正在加载所有维度表...")
    try:
        dim_product_df = pd.read_csv(os.path.join('./output_data', 'Dim_Product.csv'))
        dim_time_df = pd.read_csv(os.path.join('./output_data', 'Dim_Time.csv'))
        dim_customer_df = pd.read_csv(os.path.join('./output_data', 'Dim_Customer.csv'))
        dim_geography_df = pd.read_csv(os.path.join('./output_data', 'Dim_Geography.csv'))
        dim_prices_df = pd.read_csv(os.path.join('./output_data', 'Dim_Prices.csv'))
    except FileNotFoundError as e:
        print(f"错误：缺少一个或多个必需的 CSV 文件。请先运行所有维度生成脚本（1-5）。\n{e}")
        exit()

    dim_time_df['Full_Date'] = pd.to_datetime(dim_time_df['Full_Date'])
    dim_prices_df['Quarter_Start_Date'] = pd.to_datetime(dim_prices_df['Quarter_Start_Date'])
    
    print("正在生成 Fact_Sales 表...")
    fact_sales_df = generate_fact_sales(dim_product_df, dim_time_df, dim_customer_df, dim_geography_df, dim_prices_df)

    if not fact_sales_df.empty:
        output_dir = './output_data'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        print("保存 Fact_Sales.csv...")
        fact_sales_df.to_csv(os.path.join(output_dir, 'Fact_Sales.csv'), index=False, encoding='utf-8')
        
        end_time = time.time()
        print(f"Fact_Sales.csv 已成功生成 {len(fact_sales_df):,} 行数据，耗时 {end_time - start_time:.2f} 秒。")
        print("数据生成完成！")
    else:
        print("数据生成失败。")

正在加载所有维度表...
正在生成 Fact_Sales 表...
正在为年份 2017 生成 103,091 条销售记录...
正在为年份 2018 生成 245,491 条销售记录...
正在为年份 2019 生成 367,656 条销售记录...
正在为年份 2020 生成 499,535 条销售记录...
正在为年份 2021 生成 936,222 条销售记录...
正在为年份 2022 生成 1,313,851 条销售记录...
正在为年份 2023 生成 1,808,581 条销售记录...
正在为年份 2024 生成 1,789,226 条销售记录...
正在为年份 2025 生成 720,803 条销售记录...
保存 Fact_Sales.csv...
Fact_Sales.csv 已成功生成 7,784,456 行数据，耗时 33.91 秒。
数据生成完成！
