In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

def analyze_outliers(csv_path, output_path='data_analysis.png'):
    """Report outliers in the ``Value`` column of a CSV and save diagnostic plots.

    The function prints summary statistics, flags outliers with both the
    3-sigma rule and the 1.5 * IQR rule, and writes a three-panel figure
    (histogram, box plot, normal Q-Q plot) to ``output_path``.

    Args:
        csv_path: Path of a CSV file that contains a numeric ``Value`` column.
        output_path: Destination of the saved figure. Defaults to
            ``'data_analysis.png'`` (relative to the working directory).

    Returns:
        tuple: ``(outliers_3sigma, outliers_iqr)``, the DataFrame rows whose
        ``Value`` lies outside the respective bounds.

    Raises:
        ValueError: If the CSV has no row with a non-missing ``Value``.
    """
    # Kept as a local import on purpose: another cell of this notebook binds
    # the global name `stats` to a plain dict, which would shadow a
    # module-level `from scipy import stats`.
    from scipy import stats

    df = pd.read_csv(csv_path)
    calories = df['Value']

    # Plots and percentages are meaningless without data; fail with a clear
    # message instead of a ZeroDivisionError further down.
    valid = calories.dropna()
    if valid.empty:
        raise ValueError(f"no non-missing 'Value' entries found in {csv_path}")

    # Basic statistics
    print("=" * 60)
    print("卡路里数据统计:")
    print("=" * 60)
    print(calories.describe())
    print(f"\n中位数: {calories.median():.2f}")
    print(f"偏度: {calories.skew():.2f}")  # positive skew means a long right tail

    # Outlier detection (pandas reductions skip NaN by default)
    mean = calories.mean()
    std = calories.std()

    # 3-sigma rule
    lower_bound = mean - 3 * std
    upper_bound = mean + 3 * std
    outliers_3sigma = df[(calories < lower_bound) | (calories > upper_bound)]

    # IQR rule (more robust against heavy tails)
    q1 = calories.quantile(0.25)
    q3 = calories.quantile(0.75)
    iqr = q3 - q1
    lower_iqr = q1 - 1.5 * iqr
    upper_iqr = q3 + 1.5 * iqr
    outliers_iqr = df[(calories < lower_iqr) | (calories > upper_iqr)]

    total_rows = len(df)
    print(f"\n3-sigma范围: [{lower_bound:.2f}, {upper_bound:.2f}]")
    print(f"3-sigma异常值数量: {len(outliers_3sigma)} ({len(outliers_3sigma)/total_rows*100:.1f}%)")

    print(f"\nIQR范围: [{lower_iqr:.2f}, {upper_iqr:.2f}]")
    print(f"IQR异常值数量: {len(outliers_iqr)} ({len(outliers_iqr)/total_rows*100:.1f}%)")

    # Visualisation: only non-missing values are plotted so that NaN entries
    # cannot blank out the box plot or the Q-Q fit.
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    hist_ax, box_ax, qq_ax = axes

    # Histogram with the mean and the 3-sigma bounds marked
    hist_ax.hist(valid, bins=50, edgecolor='black')
    hist_ax.axvline(mean, color='r', linestyle='--', label=f'Mean: {mean:.0f}')
    hist_ax.axvline(upper_bound, color='orange', linestyle='--', label='3σ')
    hist_ax.axvline(lower_bound, color='orange', linestyle='--')
    hist_ax.set_xlabel('Calories')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution')
    hist_ax.legend()

    # Box plot
    box_ax.boxplot(valid)
    box_ax.set_ylabel('Calories')
    box_ax.set_title('Box Plot')
    box_ax.grid(alpha=0.3)

    # Q-Q plot against a normal distribution (normality check)
    stats.probplot(valid, dist="norm", plot=qq_ax)
    qq_ax.set_title('Q-Q Plot')

    fig.tight_layout()
    fig.savefig(output_path, dpi=150)
    print(f"\n可视化已保存到 {output_path}")

    return outliers_3sigma, outliers_iqr


if __name__ == '__main__':
    # Dataset root and the cleaned training-label CSV inside it.
    DATA_ROOT = Path('Nutrition5K') / 'Nutrition5K'
    TRAIN_CSV = DATA_ROOT.joinpath('nutrition5k_train_clean.csv')

    # Print the outlier report and keep both outlier tables for inspection.
    outliers_3sigma, outliers_iqr = analyze_outliers(TRAIN_CSV)

    # List the ten rows with the largest calorie values.
    print("\n最高的10个卡路里值:")
    df = pd.read_csv(TRAIN_CSV)
    top_rows = df.nlargest(10, 'Value')
    print(top_rows[['ID', 'Value']])

In [1]:
import pandas as pd
# Quick sanity check of the calorie labels: summary statistics followed by
# the first five raw values.
df = pd.read_csv('Nutrition5K/Nutrition5K/nutrition5k_train_clean.csv')
print(df.loc[:, 'Value'].describe())
print('前5个值:', df.loc[:, 'Value'].iloc[:5].tolist())


count    3300.000000
mean      237.298713
std       221.319200
min         0.000000
25%        60.839996
50%       186.821724
75%       359.282753
max      3943.325195
Name: Value, dtype: float64
前5个值: [221.167068, 140.980011, 274.335999, 589.501648, 258.59967]


In [1]:
# analyze_depth_range.py (最终版 - 内存安全)
import numpy as np
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import pandas as pd

def analyze_depth_distribution(data_root, max_samples=500_000,
                               pixels_per_image=307200, seed=None):
    """Compute depth-value statistics over all training depth maps.

    Mean, standard deviation, minimum and maximum are accumulated in a
    streaming fashion so that only one image is held in memory at a time.
    Median and percentiles are estimated from a bounded random subsample of
    the non-zero pixels.

    Args:
        data_root: Dataset root. It must contain
            ``nutrition5k_train_clean.csv`` (with an ``ID`` column) and the
            files ``train/depth_raw/<ID>/depth_raw.png``.
        max_samples: Upper bound on the number of pixels kept for the
            percentile estimates. Must be at least 1.
        pixels_per_image: Nominal pixel count of one depth map, used only to
            derive the sampling interval. The default 307200 equals 640*480,
            presumably the depth-map resolution (confirm against the data).
        seed: Optional seed for the pixel subsampling; pass an integer to make
            the percentile estimates reproducible.

    Returns:
        dict | None: A dict with the keys ``min``, ``max``, ``mean``, ``std``,
        ``median``, ``p95``, ``p99``, ``total_pixels``, ``valid_images`` and
        ``failed_images``; ``None`` when no valid pixel was found. The dict is
        also written to ``<data_root>/depth_statistics.json``.

    Raises:
        ValueError: If ``max_samples`` is smaller than 1.

    Notes:
        Pixels equal to 0 are ignored. Images that contain no non-zero pixel
        count neither as valid nor as failed; their number is printed
        separately. If the sample budget is exhausted before the scan ends,
        later images no longer contribute to the percentile estimates.
    """
    import json

    if max_samples < 1:
        raise ValueError("max_samples must be at least 1")

    data_root = Path(data_root)
    train_csv = pd.read_csv(data_root / 'nutrition5k_train_clean.csv')

    # Streaming accumulators
    pixel_count = 0
    sum_values = 0.0
    sum_squares = 0.0
    min_value = float('inf')
    max_value = 0.0

    # Bounded subsample used for the percentile estimates
    sample_interval = max(1, len(train_csv) * pixels_per_image // max_samples)
    percentile_samples = []
    rng = np.random.default_rng(seed)

    print(f"扫描 {len(train_csv)} 张深度图...")
    print(f"采样间隔: 每 {sample_interval} 个像素采样1个")

    valid_count = 0
    failed_count = 0
    empty_count = 0  # readable images without a single non-zero pixel

    for dish_id in tqdm(train_csv['ID'], total=len(train_csv)):
        depth_path = data_root / 'train' / 'depth_raw' / str(dish_id) / 'depth_raw.png'

        if not depth_path.exists():
            failed_count += 1
            continue

        try:
            # The context manager releases the file handle as soon as the
            # pixels have been copied into the array.
            with Image.open(depth_path) as depth_img:
                depth_array = np.array(depth_img, dtype=np.uint16)
            valid_depth = depth_array[depth_array > 0]

            if valid_depth.size == 0:
                empty_count += 1
                continue

            # Accumulate in float64: summing a uint16 array uses the platform
            # default integer, which is only 32 bits wide on some builds and
            # could wrap around for a bright image.
            depth_f64 = valid_depth.astype(np.float64)
            pixel_count += int(valid_depth.size)
            sum_values += float(depth_f64.sum())
            sum_squares += float(np.dot(depth_f64, depth_f64))
            min_value = min(min_value, int(valid_depth.min()))
            max_value = max(max_value, int(valid_depth.max()))

            # Subsample this image without exceeding the overall budget.
            remaining = max_samples - len(percentile_samples)
            if remaining > 0:
                sample_size = max(1, valid_depth.size // sample_interval)
                sample_size = min(sample_size, valid_depth.size, remaining)
                picked = rng.choice(valid_depth, size=sample_size, replace=False)
                percentile_samples.extend(picked.tolist())

            valid_count += 1

        except MemoryError:
            print(f"\n内存错误在 {dish_id}，当前已处理 {valid_count} 张")
            failed_count += 1
            continue
        except Exception as e:
            failed_count += 1
            # Only the first few failures are reported to keep the log short.
            if failed_count <= 5:
                print(f"\n错误 {dish_id}: {type(e).__name__}")
            continue

    print(f"\n扫描完成: {valid_count} 成功, {failed_count} 失败")
    if empty_count:
        print(f"无有效深度像素的图像: {empty_count}")
    print(f"总像素数: {pixel_count:,}")

    if pixel_count == 0:
        print("错误：没有有效数据")
        return None

    # Moments from the running sums; clamp tiny negative variances that can
    # result from floating-point cancellation.
    mean = sum_values / pixel_count
    variance = (sum_squares / pixel_count) - (mean ** 2)
    std = np.sqrt(max(0, variance))

    # Percentile estimates from the subsample
    percentile_samples = np.array(percentile_samples)
    median = np.median(percentile_samples)
    p95 = np.percentile(percentile_samples, 95)
    p99 = np.percentile(percentile_samples, 99)

    stats = {
        'min': float(min_value),
        'max': float(max_value),
        'mean': float(mean),
        'std': float(std),
        'median': float(median),
        'p95': float(p95),
        'p99': float(p99),
        'total_pixels': int(pixel_count),
        'valid_images': int(valid_count),
        'failed_images': int(failed_count)
    }

    print("\n" + "="*60)
    print("深度统计 (16-bit原始值):")
    print("="*60)
    for key, val in stats.items():
        if 'pixels' in key or 'images' in key:
            print(f"  {key}: {val:,}")
        else:
            print(f"  {key}: {val:.2f}")

    # The unit below is an assumption stated by the original author, not
    # something this function verifies.
    print("\n假设单位: 0.1mm (scale=1e-4米)")
    print(f"  99th百分位: {p99 * 1e-4:.3f} 米")
    print(f"  推荐归一化参数: depth_max_value = {int(p99)}")

    # Persist the summary next to the dataset
    stats_path = data_root / 'depth_statistics.json'
    with open(stats_path, 'w') as f:
        json.dump(stats, f, indent=2)

    print(f"\n结果已保存: {stats_path}")

    return stats

if __name__ == '__main__':
    # Scan the training depth maps under the dataset root and keep the
    # returned summary for later inspection.
    stats = analyze_depth_distribution(data_root='Nutrition5K/Nutrition5K')

扫描 3300 张深度图...
采样间隔: 每 2027 个像素采样1个


100%|██████████| 3300/3300 [00:28<00:00, 113.96it/s]


扫描完成: 3300 成功, 0 失败
总像素数: 842,661,995

深度统计 (16-bit原始值):
  min: 323.00
  max: 65535.00
  mean: 3754.93
  std: 861.26
  median: 3588.00
  p95: 4118.00
  p99: 4286.00
  total_pixels: 842,661,995
  valid_images: 3,300
  failed_images: 0

假设单位: 0.1mm (scale=1e-4米)
  99th百分位: 0.429 米
  推荐归一化参数: depth_max_value = 4286

结果已保存: Nutrition5K\Nutrition5K\depth_statistics.json



