In [None]:
import pandas as pd
import numpy as np
import predict_model as pred
from sklearn.preprocessing import MinMaxScaler
import gurobipy as gp
from gurobipy import GRB
import matplotlib.pyplot as plt
# Use the SimHei font so that the Chinese labels used in the figures below can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus, so that
# negative tick labels still display with the font selected above.
plt.rcParams['axes.unicode_minus'] = False

对于 test_data 中下一周的所有产品（id），首先用机器学习方法，根据该产品当期的特征 X 计算出每个历史样本点的权重，然后把各样本点的目标函数按权重加权求和，得到加权后的目标函数。
决策变量是各产品的订购量。
X 包括的特征：'sell_price', 'event_count', 'cultural_count', 'religious_count', 'national_count', 'sporting_count', 'snap_CA_total', 'snap_TX_total', 'snap_WI_total', 'is_nov_dec'。（'is_CA'、'is_TX' 这两个特征不需要，因为所用数据都来自 CA。）

In [None]:
def load_and_preprocess_data(file_path, id_num=30, feature_cols=None):
    """
    Load the sales table from a CSV file and keep only the first ``id_num`` ids.

    No feature scaling happens here; the function only reads and filters.

    Args:
        file_path: Path of the CSV file to read.
        id_num: Number of distinct ids to keep, taken in order of first
            appearance in the ``id`` column.
        feature_cols: Accepted for backward compatibility only; this function
            does not use it.

    Returns:
        A tuple ``(struc_data, selected_ids)``: an independent copy of the rows
        whose ``id`` is among the selected ids, and the array of selected ids.
    """
    # The first CSV column is discarded (presumably a saved index column -- TODO confirm).
    struc_data = pd.read_csv(file_path).iloc[:, 1:]

    # unique() preserves order of first appearance, so this picks a fixed set of ids.
    selected_ids = struc_data['id'].unique()[:id_num]
    struc_data = struc_data[struc_data['id'].isin(selected_ids)].copy()

    return struc_data, selected_ids

def prepare_train_test_data(data, test_week, feature_cols):
    """
    Split the data by week and min-max scale the feature columns.

    Rows with ``week`` in ``[1, test_week - 1]`` form the training set and rows
    with ``week`` in ``[test_week, test_week + 4]`` form the test set (both
    bounds inclusive). The scaler is fitted on the training rows only and then
    applied to the test rows, so no test information leaks into the scaling.

    Args:
        data: Full data set; must contain a ``week`` column and ``feature_cols``.
        test_week: First week of the test window.
        feature_cols: Names of the feature columns to scale.

    Returns:
        A tuple ``(train_data, test_data)`` of independent DataFrames whose
        feature columns have been scaled; ``data`` itself is left untouched.
    """
    feature_cols = list(feature_cols)

    # Explicit copies: writing into a boolean-filtered slice of `data` triggers
    # SettingWithCopyWarning and leaves it ambiguous which frame is modified.
    train_data = data[data['week'].between(1, test_week - 1)].copy()
    test_data = data[data['week'].between(test_week, test_week + 4)].copy()

    scaler = MinMaxScaler()
    # Replace the columns instead of writing through .loc, so integer/boolean
    # feature columns become float columns rather than being cast in place.
    train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])
    test_data[feature_cols] = scaler.transform(test_data[feature_cols])

    return train_data, test_data


#### 把相同销量的合并起来

In [None]:
def aggregate_weights_by_sales(weight_data):
    """
    Collapse neighbours that share the same ``weekly_sales`` into one entry.

    Args:
        weight_data: List of dicts of the form
            ``{'test_id': ..., 'test_week': ..., 'neighbors': [{'weekly_sales': y, 'weight': w}, ...]}``.

    Returns:
        A new list with the same structure in which, for every item, each
        distinct ``weekly_sales`` value appears once and carries the sum of the
        weights it had in the input. Sales values keep the order in which they
        were first seen.
    """
    def _merge(neighbors):
        # Running weight total per distinct sales value (dicts keep insertion order).
        totals = {}
        for entry in neighbors:
            sales, weight = entry['weekly_sales'], entry['weight']
            if sales not in totals:
                totals[sales] = weight
            else:
                totals[sales] += weight
        return [{'weekly_sales': sales, 'weight': total} for sales, total in totals.items()]

    return [
        {
            'test_id': item['test_id'],
            'test_week': item['test_week'],
            'neighbors': _merge(item['neighbors']),
        }
        for item in weight_data
    ]

#### 进行求解

In [None]:
def solve_inventory_optimization(weighted_neighbors, total_capacity=None):
    """
    Choose integer order quantities that maximise the weighted fulfilled sales.

    For every product ``j`` and every weighted historical sales value ``y`` the
    model adds a term ``w * min(y, z_j)``; the sum of all terms is maximised
    subject to ``sum_j z_j <= total_capacity``.

    Args:
        weighted_neighbors: List of dicts with keys ``'test_id'`` and
            ``'neighbors'``, where ``'neighbors'`` is a list of
            ``{'weekly_sales': y, 'weight': w}`` dicts.
        total_capacity: Upper bound on the total order quantity. When omitted,
            a module-level variable named ``total_capacity`` is used, which is
            what this function relied on before the parameter existed.

    Returns:
        ``(objective_value, {product_id: order_quantity})`` when the solver
        reports an optimal solution, otherwise ``(None, None)`` after printing
        the solver status.

    Raises:
        ValueError: If no capacity is passed and no module-level
            ``total_capacity`` is defined.
    """
    if total_capacity is None:
        # Backward compatibility with callers that set a global instead of passing it.
        total_capacity = globals().get('total_capacity')
        if total_capacity is None:
            raise ValueError(
                "total_capacity must be passed or defined as a module-level variable"
            )

    # Group the (sales, weight) pairs by product id.
    id_to_weights = {}
    for item in weighted_neighbors:
        pairs = id_to_weights.setdefault(item['test_id'], [])
        pairs.extend((n['weekly_sales'], n['weight']) for n in item['neighbors'])

    model = gp.Model("Inventory_Optimization")
    product_ids = list(id_to_weights.keys())
    z = model.addVars(product_ids, vtype=GRB.INTEGER, lb=0, name="z")

    # Small offset used to place a third breakpoint just to the right of y.
    eps = 1e-5

    objective = gp.LinExpr()
    for j in product_ids:
        for k, (y, w) in enumerate(id_to_weights[j]):
            # Auxiliary variable carrying min(y, z_j). The running index k keeps
            # names unique even when one product has the same sales value twice.
            min_term = model.addVar(lb=0, name=f"min_{j}_{k}")

            # Piecewise-linear graph of min(y, z_j): rises from (0, 0) to (y, y),
            # then stays flat at y.
            model.addGenConstrPWL(
                z[j],
                min_term,
                [0, y, y + eps],
                [0, y, y],
                name=f"pwl_min_{j}_{k}"
            )

            objective += min_term * w

    model.setObjective(objective, GRB.MAXIMIZE)
    model.addConstr(z.sum() <= total_capacity, "Capacity_Constraint")

    model.optimize()

    if model.status == GRB.OPTIMAL:
        optimal_z = {j: z[j].x for j in product_ids}
        return model.objVal, optimal_z
    else:
        print("优化失败，状态码:", model.status)
        return None, None

#### 评估各ml方法在真实销量上的表现

In [None]:
def evaluate_prescriptions(optimal_z_dict, test_data):
    """
    Score each method's order quantities against the realised sales.

    For every method the score is ``sum_j min(z_j, y_j)``, where ``z_j`` is the
    prescribed order quantity of product ``j`` and ``y_j`` is the first non-null
    ``weekly_sales`` value of that product in ``test_data``. Products missing
    from ``test_data`` count as zero sales.

    Args:
        optimal_z_dict: Mapping ``method name -> {'obj_value': ..., 'optimal_z': {id: z}}``.
            ``'optimal_z'`` may be ``None`` when the optimisation of that method
            did not produce a solution.
        test_data: DataFrame with ``id`` and ``weekly_sales`` columns.

    Returns:
        DataFrame with one row per method and the columns ``Method`` and
        ``Total_Fulfilled``. Methods without a solution get ``NaN``.
    """
    # Realised sales per product id.
    y_true = test_data.groupby('id')['weekly_sales'].first()

    results = []
    for method_name, result in optimal_z_dict.items():
        z_optimal = result['optimal_z']

        if z_optimal is None:
            # The solver failed for this method; report it as missing instead
            # of crashing the evaluation of the other methods.
            total_fulfilled = float('nan')
        else:
            total_fulfilled = sum(
                min(z_j, y_true.get(id_j, 0)) for id_j, z_j in z_optimal.items()
            )

        results.append({
            'Method': method_name,
            'Total_Fulfilled': total_fulfilled,
        })

    # Explicit columns keep the schema stable when there are no methods at all.
    return pd.DataFrame(results, columns=['Method', 'Total_Fulfilled'])

### 计算最优的期望销量

In [None]:
def perfect_foresight(test_data, total_capacity=300):
    """
    Solve the ordering problem with the realised sales known in advance.

    The model maximises ``sum_p min(actual_sales[p], z[p])`` over integer order
    quantities ``z[p] >= 0`` subject to ``sum_p z[p] <= total_capacity``, where
    ``min`` is expressed through a piecewise-linear general constraint.

    Args:
        test_data: DataFrame with ``id`` and ``weekly_sales`` columns; the first
            non-null ``weekly_sales`` per id is taken as its realised sales.
        total_capacity: Upper bound on the total order quantity.

    Returns:
        Dict with ``'order_quantities'`` (id -> order quantity) and
        ``'total_fulfilled'`` (optimal objective value).

    Raises:
        Exception: If the solver does not finish with an optimal status.
    """
    model = gp.Model("Perfect_Foresight_PWL")

    # Product ids and their realised sales.
    products = test_data['id'].unique()
    sales_dict = test_data.groupby('id')['weekly_sales'].first().to_dict()

    # One order-quantity variable and one "fulfilled sales" variable per product.
    order_qty = model.addVars(products, vtype=GRB.INTEGER, lb=0, name="order_quantity")
    fulfilled = model.addVars(products, name="min_sales")

    for p in products:
        demand = sales_dict[p]
        # Graph of min(demand, z): slope one up to (demand, demand), flat afterwards.
        # The third breakpoint sits a tiny offset to the right of the kink.
        model.addGenConstrPWL(
            order_qty[p],
            fulfilled[p],
            [0, demand, demand + 1e-5],
            [0, demand, demand],
            name=f"pwl_min_{p}"
        )

    model.setObjective(fulfilled.sum(), GRB.MAXIMIZE)
    model.addConstr(order_qty.sum() <= total_capacity, "total_capacity")

    model.optimize()

    if model.status != GRB.OPTIMAL:
        raise Exception(f"优化失败，状态码: {model.status}")

    return {
        'order_quantities': {p: order_qty[p].x for p in products},
        'total_fulfilled': model.objVal
    }

### 计算随机森林用点估计得到的方法

In [None]:
# ====================== 核心计算函数 ======================
def calculate_p_values(results_df):
    """
    Add a ``<method>_P`` column for KNN, Kernel, DecisionTree and RandomForest.

    For every row,
    ``P = 1 - (R_method - R_perfect) / (R_saa - R_perfect)``,
    clipped to ``[-1, 1]``. When ``|R_saa - R_perfect| < 1e-6`` the ratio is
    undefined and ``P`` is set to 0.

    The computation is done row by row on whole columns, so every row uses its
    own ``Perfect_R`` / ``SAA_R`` values, and the P columns are created even
    when the frame has no rows.

    Args:
        results_df: DataFrame with the columns ``Perfect_R``, ``SAA_R`` and
            ``<method>_R`` for each of the four methods.

    Returns:
        The same DataFrame object, modified in place with the new P columns.
    """
    perfect = results_df['Perfect_R'].astype(float)
    gap = results_df['SAA_R'].astype(float) - perfect

    # Rows where SAA already matches perfect foresight: the ratio is undefined.
    degenerate = gap.abs() < 1e-6
    # Blank out the degenerate denominators so the division never hits ~0.
    safe_gap = gap.mask(degenerate)

    for method in ['KNN', 'Kernel', 'DecisionTree', 'RandomForest']:
        shortfall = results_df[f'{method}_R'].astype(float) - perfect
        p_value = (1 - shortfall / safe_gap).mask(degenerate, 0.0)
        results_df[f'{method}_P'] = p_value.clip(-1, 1)

    return results_df

def plot_p_values(results_df, id_num):
    """
    Draw the P-value curves of all methods against ``test_week``.

    The figure shows one curve per ``<method>_P`` column plus horizontal
    reference lines at 0, 1 and -1. It is written to ``p_<id_num>.png`` in the
    working directory and then shown.

    Args:
        results_df: DataFrame with ``test_week`` and the columns ``KNN_P``,
            ``Kernel_P``, ``DecisionTree_P`` and ``RandomForest_P``.
        id_num: Number of products; only used in the output file name.
    """
    canvas_color = '#f5f5f5'   # light grey figure background
    frame_color = '#bdc3c7'    # colour of the axes frame and the legend border

    plt.figure(figsize=(14, 7), facecolor=canvas_color)

    # One row per curve: (column, legend label, marker, line style, colour, line width).
    curve_specs = [
        ('KNN_P', 'KNN', 'o', '-', '#3498db', 2.5),
        ('Kernel_P', 'Kernel', 'o', '-', '#9b59b6', 2.5),
        ('DecisionTree_P', 'DecisionTree', 'D', '--', '#e74c3c', 2.2),
        ('RandomForest_P', 'RF', 's', '-.', '#2ecc71', 2.8),
    ]
    for column, label, marker, line_style, color, width in curve_specs:
        plt.plot(
            results_df['test_week'],
            results_df[column],
            marker=marker,
            linestyle=line_style,
            color=color,
            linewidth=width,
            markersize=5,
            label=label,
            markerfacecolor='white',  # hollow markers
            markeredgewidth=1.5,
        )

    # Horizontal reference lines: (level, colour, line style, line width, alpha, legend label).
    reference_lines = [
        (0, '#7f8c8d', '--', 2, 0.7, 'SAA基线 (P=0)'),
        (1, '#c0392b', ':', 1.8, 0.6, '理论上限'),
        (-1, '#c0392b', ':', 1.8, 0.6, '理论下限'),
    ]
    for level, color, line_style, width, alpha, label in reference_lines:
        plt.axhline(y=level, color=color, linestyle=line_style,
                    linewidth=width, alpha=alpha, label=label)

    # Axis labels (no title is drawn).
    plt.xlabel('样本数', fontsize=13, labelpad=10)
    plt.ylabel('P值', fontsize=13, labelpad=10)

    # Axis ranges: fixed y range, x range padded by five weeks on both sides.
    weeks = results_df['test_week']
    plt.ylim(-2.1, 2.1)
    plt.xlim(weeks.min() - 5, weeks.max() + 5)

    # White grid and a thin visible frame around the axes.
    plt.grid(True, color='white', linestyle='-', linewidth=0.8, alpha=0.8)
    for spine in plt.gca().spines.values():
        spine.set_visible(True)
        spine.set_edgecolor(frame_color)
        spine.set_linewidth(1.2)

    legend = plt.legend(
        fontsize=12,
        framealpha=1,
        loc='upper right',
        facecolor='white',
        edgecolor=frame_color,
        borderpad=1,
        labelspacing=0.8,
    )
    legend.get_frame().set_linewidth(1.2)

    # Plot-area background.
    plt.gca().set_facecolor('#ecf0f1')

    # Save first, then show.
    plt.tight_layout()
    plt.savefig(f'p_{id_num}.png', dpi=300, bbox_inches='tight', facecolor=canvas_color)
    plt.show()


In [None]:
import seaborn as sns

def plot_r_trends(results_df, save_path=None):
    """
    Plot the R value of RandomForest, SAA and Perfect against ``test_week``.

    Args:
        results_df: DataFrame with ``test_week`` and the columns
            ``RandomForest_R``, ``SAA_R`` and ``Perfect_R``.
        save_path: Optional output path, e.g. ``'./results/r_trends.png'``.
            When given, the figure is saved there and the path is printed;
            otherwise the figure is shown.
    """
    # Colour per plotted method. KNN, Kernel and DecisionTree are not plotted here.
    palette = {
        'RandomForest': '#2ecc71',
        'SAA': '#f39c12',
        'Perfect': '#34495e',
    }
    r_columns = [f'{method}_R' for method in palette]

    plt.figure(figsize=(12, 6))

    # Wide -> long: one row per (test_week, method).
    long_df = pd.melt(
        results_df,
        id_vars=['test_week'],
        value_vars=r_columns,
        var_name='Method',
        value_name='R_Value',
    )
    # Strip the '_R' suffix so the legend shows plain method names.
    long_df['Method'] = long_df['Method'].str.replace('_R', '')

    sns.lineplot(
        data=long_df,
        x='test_week',
        y='R_Value',
        hue='Method',
        style='Method',
        palette=palette,
        linewidth=1,
        markers=True,
        markersize=6,
        dashes=False,
    )

    # Labels, legend and a faint grid (no title is drawn).
    plt.xlabel('样本数', fontsize=12)
    plt.ylabel('期望总销量', fontsize=12)
    plt.legend(title='Method', loc='upper right')
    plt.grid(alpha=0.3)

    plt.tight_layout()

    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Plot saved to {save_path}")
# `results_df` is not created anywhere above this cell, so on a fresh kernel an
# unconditional call would raise NameError. Draw the plot only once it exists.
if 'results_df' in globals():
    plot_r_trends(results_df, save_path='./r_trends.png')
else:
    print("results_df is not defined yet; re-run this cell after the results have been computed.")

In [None]:
# ---- Experiment configuration ----
feature_cols = ['sell_price', 'event_count', 'cultural_count',
                'religious_count', 'national_count', 'sporting_count',
                'snap_CA_total', 'snap_TX_total', 'snap_WI_total', 'is_nov_dec',
                'avg_temp', 'max_temp', 'min_temp', 'precipitation', 'wind_speed', 'humidity']
id_num = 30
# Total order budget. solve_inventory_optimization reads this module-level
# name, so it must be defined before the loop below runs.
total_capacity = 23 * id_num
results_path = f'results_id_{id_num}.csv'
# Set to True to ignore a previously saved results file and rerun everything.
force_recompute = False

method_names = ['KNN', 'Kernel', 'DecisionTree', 'RandomForest', 'SAA']

# ---- Load cached results if available (the full loop is expensive) ----
results_df = None
if not force_recompute:
    try:
        results_df = pd.read_csv(results_path)
    except FileNotFoundError:
        results_df = None

if results_df is None:
    # Data loading
    struc_data, selected_ids = load_and_preprocess_data('./data/struc_data.csv', id_num, feature_cols)

    # Main loop: one evaluation per test week
    records = []
    for test_week in range(150, 273):
        print(f"Processing Week {test_week}...")
        train_data, test_data = prepare_train_test_data(struc_data, test_week, feature_cols)

        # Only the first week of the test window is evaluated.
        single_test = test_data[test_data['week'] == test_week].copy()

        # Sample weights of every method (the pred.* helpers come from predict_model).
        methods = {
            'KNN': aggregate_weights_by_sales(
                pred.knn_weighted_sales_by_id(train_data, single_test, feature_cols, k=5)),
            'Kernel': aggregate_weights_by_sales(
                pred.kde_weighted_sales_by_id(train_data, single_test, feature_cols)),
            'DecisionTree': aggregate_weights_by_sales(
                pred.dtree_weighted_sales_by_id(train_data, single_test, feature_cols)),
            'RandomForest': aggregate_weights_by_sales(
                pred.optimized_rf_weights(train_data, single_test, feature_cols)),
            'SAA': aggregate_weights_by_sales(
                pred.saa_weighted_sales_by_id(train_data, single_test))
        }

        # Optimise the order quantities under each method's weights.
        optimization_results = {}
        for method_name, weights in methods.items():
            obj_value, optimal_z = solve_inventory_optimization(weighted_neighbors=weights)
            optimization_results[method_name] = {'obj_value': obj_value, 'optimal_z': optimal_z}

        eval_df = evaluate_prescriptions(optimization_results, single_test)
        perfect_z = perfect_foresight(single_test, total_capacity)

        # One record per week; building the frame once at the end avoids
        # keeping parallel lists in sync.
        record = {'test_week': test_week}
        for method in method_names:
            record[f'{method}_R'] = eval_df.loc[
                eval_df['Method'] == method, 'Total_Fulfilled'].values[0]
        record['Perfect_R'] = perfect_z['total_fulfilled']
        records.append(record)

    results_df = pd.DataFrame(records)

# ---- Post-processing ----
# Recomputing the P columns from the R columns is cheap and keeps cached files consistent.
results_df = calculate_p_values(results_df)
plot_r_trends(results_df, save_path='./r_trends.png')
plot_p_values(results_df, id_num)
print(results_df)
# Save the results (also serves as the cache for the next run)
results_df.to_csv(results_path, index=False)

In [None]:
# Mean of the RandomForest P value over all rows of results_df.
results_df['RandomForest_P'].mean()