# NuminaMath-CoT 数据集分析

本 notebook 用于分析 NuminaMath-CoT 数据集。该数据集包含 860k+ 个数学竞赛问题-解答对，每个解答都使用了思维链 (Chain of Thought, CoT) 推理模板。数据集的来源包括中国高中数学练习题，以及美国和国际数学奥林匹克竞赛题。

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Dataset locations: the root directory and its "data" subdirectory.
DATASET_PATH = "/Users/jia/datasets/NuminaMath-CoT"
DATA_PATH = os.path.join(DATASET_PATH, "data")

# Report whether each expected directory is present. This only prints a
# message; execution continues even when a directory is missing.
print("正在检查数据集...")
if os.path.exists(DATASET_PATH):
    print(f"数据集路径: {DATASET_PATH}")
else:
    print(f"错误: 数据集路径 {DATASET_PATH} 不存在")

if os.path.exists(DATA_PATH):
    print(f"数据路径: {DATA_PATH}")
else:
    print(f"错误: 数据路径 {DATA_PATH} 不存在")

In [None]:
# List every entry in the data directory.
# os.listdir() returns names in arbitrary order, so the listing is sorted:
# later cells load train_files[0] / test_files[0], and sorting makes "the
# first file" the same shard on every run.
files = sorted(os.listdir(DATA_PATH))
print("\n数据文件列表:")
for file in files:
    print(f"  - {file}")

# Split the parquet shards into train and test by filename prefix.
train_files = [f for f in files if f.startswith("train-") and f.endswith(".parquet")]
test_files = [f for f in files if f.startswith("test-") and f.endswith(".parquet")]

print(f"\n训练集文件数量: {len(train_files)}")
print(f"测试集文件数量: {len(test_files)}")

In [None]:
# Load the test split from the first matching parquet file, if any was found.
# When no test file exists, test_df is left undefined; later cells check for it.
if test_files:
    test_file = os.path.join(DATA_PATH, test_files[0])
    print(f"\n正在加载测试集: {test_file}")
    test_df = pd.read_parquet(test_file)
    # Shape and column names, one per output line.
    print(
        f"测试集形状: {test_df.shape}",
        f"测试集列名: {[*test_df.columns]}",
        sep="\n",
    )

In [None]:
# Load the training split, if any training file was found.
# Only the first training file is read, to keep memory usage low; when no
# file exists, train_df is left undefined and later cells check for it.
if train_files:
    train_file = os.path.join(DATA_PATH, train_files[0])
    print(f"\n正在加载训练集文件: {train_file}")
    train_df = pd.read_parquet(train_file)
    # Shape and column names, one per output line.
    print(
        f"训练集形状: {train_df.shape}",
        f"训练集列名: {[*train_df.columns]}",
        sep="\n",
    )

In [None]:
# Show basic information about the test split (skipped if it was not loaded).
if 'test_df' in locals():
    print("\n测试集基本信息:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    test_df.info()

    print("\n测试集前5行:")
    # A bare expression nested inside an `if` is not auto-displayed by the
    # notebook, so the preview has to be printed explicitly.
    print(test_df.head())

In [None]:
# Show basic information about the loaded training data (skipped if it was
# not loaded).
if 'train_df' in locals():
    print("\n训练集基本信息:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    train_df.info()

    print("\n训练集前5行:")
    # A bare expression nested inside an `if` is not auto-displayed by the
    # notebook, so the preview has to be printed explicitly.
    print(train_df.head())

In [None]:
# Count missing values per column for each split that was loaded.
if 'test_df' in locals():
    print("\n测试集缺失值统计:")
    missing_per_column = test_df.isnull().sum()
    print(missing_per_column)

if 'train_df' in locals():
    print("\n训练集缺失值统计:")
    missing_per_column = train_df.isnull().sum()
    print(missing_per_column)

In [None]:
# Distribution of the 'source' column in each loaded split.
if 'test_df' in locals():
    print("\n测试集source分布:")
    print(test_df['source'].value_counts())

if 'train_df' in locals():
    # The counts cover every row of the loaded training frame and only the
    # 10 most frequent sources are shown. No 1000-row sample is drawn here,
    # so the heading says "top 10 sources" instead of the previous
    # (incorrect) "sample of 1000 records".
    print("\n训练集source分布 (前10个来源):")
    print(train_df['source'].value_counts().head(10))

In [None]:
# Character-length statistics for problems and solutions.
if 'test_df' in locals():
    # Adds two derived columns to test_df in place.
    test_df['problem_length'] = test_df['problem'].str.len()
    test_df['solution_length'] = test_df['solution'].str.len()

    print("\n测试集问题和解答长度统计:")
    print(test_df[['problem_length', 'solution_length']].describe())

if 'train_df' in locals():
    # Analyse only a random sample to save time. DataFrame.sample() raises
    # ValueError when n exceeds the number of rows (sampling without
    # replacement), so the sample size is capped at the frame length.
    sample_size = min(1000, len(train_df))
    sample_train_df = train_df.sample(n=sample_size, random_state=42)
    sample_train_df['problem_length'] = sample_train_df['problem'].str.len()
    sample_train_df['solution_length'] = sample_train_df['solution'].str.len()

    # Report the sample size actually used (1000 unless the frame is smaller).
    print(f"\n训练集问题和解答长度统计 (样本{sample_size}条记录):")
    print(sample_train_df[['problem_length', 'solution_length']].describe())

In [None]:
# Show a few example records from each split.
def show_samples(df, limit=3, preview_chars=200):
    """Print a short preview of the first ``limit`` rows of ``df``.

    Args:
        df: DataFrame with 'source', 'problem' and 'solution' columns; a
            'messages' column is optional.
        limit: Maximum number of rows to print.
        preview_chars: Number of leading characters of the problem and
            solution to show; longer text is cut and suffixed with '...'.
    """
    def preview(text):
        # Keep the first preview_chars characters and mark any truncation.
        return text[:preview_chars] + ('...' if len(text) > preview_chars else '')

    has_messages = 'messages' in df.columns
    for i in range(min(limit, len(df))):
        row = df.iloc[i]
        print(f"\n样本 {i+1}:")
        print(f"  来源: {row['source']}")
        print(f"  问题: {preview(row['problem'])}")
        print(f"  解答: {preview(row['solution'])}")
        if has_messages:
            messages = row['messages']
            # Compare against None explicitly instead of using truthiness:
            # if the cell holds a multi-element NumPy array (presumably how
            # list-valued parquet columns are loaded -- TODO confirm), then
            # `if messages` raises "truth value of an array is ambiguous".
            print(f"  消息数量: {len(messages) if messages is not None else 0}")


print("\n测试集样本数据:")
if 'test_df' in locals():
    show_samples(test_df)

print("\n" + "="*50)

print("\n训练集样本数据:")
if 'train_df' in locals():
    show_samples(train_df)

In [None]:
# Final summary of the files found and the data loaded.
print("\n数据集总结:")
print("="*50)

# Training split: file count, plus an estimated total row count that assumes
# every training file holds as many rows as the one that was loaded.
if locals().get('train_files'):
    print(f"训练集文件数量: {len(train_files)}")
    if 'train_df' in locals():
        total_train_rows = len(train_df) * len(train_files)
        print(f"训练集总样本数 (估算): {total_train_rows}")
        print(f"训练集列名: {[*train_df.columns]}")

# Test split: file count and the row count of the loaded frame.
if locals().get('test_files'):
    print(f"测试集文件数量: {len(test_files)}")
    if 'test_df' in locals():
        print(f"测试集总样本数: {len(test_df)}")
        print(f"测试集列名: {[*test_df.columns]}")

print("="*50)
print("分析完成")