# Euclid30K 数据集分析

## 简介

Euclid30K 是一个包含约30,000个平面和立体几何问题的多模态数据集。该数据集旨在通过欧几里得几何问题解决作为代理任务，提升多模态大语言模型（MLLMs）的空间智能能力。

### 数据集特点
1. **数据规模**：约30,000个几何问题
2. **问题类型**：涵盖平面几何和立体几何问题
3. **多模态**：包含图像和文本的多模态数据
4. **语言支持**：支持英语和中文
5. **许可证**：Apache-2.0开源许可证

In [1]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

# 设置中文字体支持
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# 设置数据集路径
DATA_PATH = "/Users/jia/datasets/Euclid30K"

print(f"数据集路径: {DATA_PATH}")
print(f"数据集目录是否存在: {os.path.exists(DATA_PATH)}")

数据集路径: /Users/jia/datasets/Euclid30K
数据集目录是否存在: True


In [2]:
# 检查数据集文件
data_files = list(Path(DATA_PATH).glob("*.parquet"))
print("数据集文件列表:")
for file in data_files:
    size_mb = file.stat().st_size / (1024 * 1024)
    print(f"  {file.name}: {size_mb:.2f} MB")

数据集文件列表:
  Euclid30K_train.parquet: 775.77 MB
  Euclid30K_val.parquet: 14.36 MB


In [3]:
# 加载验证集数据
val_file = os.path.join(DATA_PATH, "Euclid30K_val.parquet")
print(f"正在加载验证集数据: {val_file}")

try:
    val_df = pd.read_parquet(val_file)
    print(f"验证集数据加载成功，形状: {val_df.shape}")
except Exception as e:
    print(f"加载验证集数据时出错: {e}")
    val_df = None

正在加载验证集数据: /Users/jia/datasets/Euclid30K/Euclid30K_val.parquet
验证集数据加载成功，形状: (500, 3)


In [4]:
# 加载训练集数据
train_file = os.path.join(DATA_PATH, "Euclid30K_train.parquet")
print(f"正在加载训练集数据: {train_file}")

try:
    train_df = pd.read_parquet(train_file)
    print(f"训练集数据加载成功，形状: {train_df.shape}")
except Exception as e:
    print(f"加载训练集数据时出错: {e}")
    train_df = None

正在加载训练集数据: /Users/jia/datasets/Euclid30K/Euclid30K_train.parquet
训练集数据加载成功，形状: (29195, 3)


## 数据集概览

In [5]:
# 显示数据集基本信息
if train_df is not None:
    print("=== 训练集信息 ===")
    print(f"数据形状: {train_df.shape}")
    print(f"列数: {len(train_df.columns)}")
    print(f"行数: {len(train_df)}")
    print()

if val_df is not None:
    print("=== 验证集信息 ===")
    print(f"数据形状: {val_df.shape}")
    print(f"列数: {len(val_df.columns)}")
    print(f"行数: {len(val_df)}")
    print()

=== 训练集信息 ===
数据形状: (29195, 3)
列数: 3
行数: 29195

=== 验证集信息 ===
数据形状: (500, 3)
列数: 3
行数: 500



In [6]:
# 显示列名
if train_df is not None:
    print("训练集列名:")
    for i, col in enumerate(train_df.columns):
        print(f"  {i+1}. {col}")
    print()

if val_df is not None:
    print("验证集列名:")
    for i, col in enumerate(val_df.columns):
        print(f"  {i+1}. {col}")
    print()

训练集列名:
  1. problem
  2. images
  3. answer

验证集列名:
  1. problem
  2. images
  3. answer



## 数据统计分析

In [7]:
# 显示数据类型
if train_df is not None:
    print("=== 训练集数据类型 ===")
    print(train_df.dtypes)
    print()

if val_df is not None:
    print("=== 验证集数据类型 ===")
    print(val_df.dtypes)
    print()

=== 训练集数据类型 ===
problem    object
images     object
answer     object
dtype: object

=== 验证集数据类型 ===
problem    object
images     object
answer     object
dtype: object



In [8]:
# 显示基本统计信息
if train_df is not None:
    print("=== 训练集基本统计信息 ===")
    display(train_df.describe())
    print()

if val_df is not None:
    print("=== 验证集基本统计信息 ===")
    display(val_df.describe())
    print()

=== 训练集基本统计信息 ===


Unnamed: 0,problem,images,answer
count,29195,29195,29195
unique,26755,28348,8948
top,<image>Find $x$.,[{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIH...,B
freq,131,11,1104



=== 验证集基本统计信息 ===


Unnamed: 0,problem,images,answer
count,500,500,500
unique,497,498,332
top,<image>Find $x$.,[{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIH...,A
freq,2,2,20





## 数据样本展示

In [9]:
# 显示训练集前几行数据
if train_df is not None:
    print("=== 训练集样本 ===")
    display(train_df.head())
    print()

=== 训练集样本 ===


Unnamed: 0,problem,images,answer
0,$如图，已知平行六面体ABCD-A_{1}B_{1}C_{1}D_{1}中，底面ABCD是边...,[{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\...,$\frac{\frac{1}{2}+\frac{13}{2}}{\sqrt{5}\cdot...
1,"<image>As shown in the figure, in rhombus $ABC...",[{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\...,64°
2,<image>Find x.,[{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIH...,$4 \sqrt { 3 }$
3,"<image>As shown in the figure, the graph of th...",[{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\...,x > 1
4,"<image>As shown in the figure, given AB = A$_{...",[{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\...,$\frac{75}{{{2}^{n-1}}}$





In [10]:
# 显示验证集前几行数据
if val_df is not None:
    print("=== 验证集样本 ===")
    display(val_df.head())
    print()

=== 验证集样本 ===


Unnamed: 0,problem,images,answer
0,"<image>As shown in the figure, in the right tr...",[{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\...,6
1,<image>Two squares are placed with one right-a...,[{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\...,16
2,As shown in the figure is the net of a cube. I...,[{'bytes': b'\xff\xd8\xff\xdb\x00\x84\x00\x08\...,3
3,"<image> As shown in Figure 2 - 15 - 20, in the...",[{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\...,$\frac{a^3}{2}$
4,$如图①，在\triangle ABC中，AB\perp BC，且AB=2BC，将\tria...,[{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\...,$-\frac{\sqrt{3}}{3}$





## 数据质量分析

In [11]:
# 检查缺失值
if train_df is not None:
    print("=== 训练集缺失值统计 ===")
    missing_train = train_df.isnull().sum()
    missing_train_pct = (missing_train / len(train_df)) * 100
    missing_train_df = pd.DataFrame({
        '缺失值数量': missing_train,
        '缺失值百分比': missing_train_pct
    })
    display(missing_train_df[missing_train_df['缺失值数量'] > 0])
    print()

if val_df is not None:
    print("=== 验证集缺失值统计 ===")
    missing_val = val_df.isnull().sum()
    missing_val_pct = (missing_val / len(val_df)) * 100
    missing_val_df = pd.DataFrame({
        '缺失值数量': missing_val,
        '缺失值百分比': missing_val_pct
    })
    display(missing_val_df[missing_val_df['缺失值数量'] > 0])
    print()

=== 训练集缺失值统计 ===


Unnamed: 0,缺失值数量,缺失值百分比



=== 验证集缺失值统计 ===


Unnamed: 0,缺失值数量,缺失值百分比





## 数据分布分析

In [12]:
# 如果有数值型列，绘制分布图
if train_df is not None:
    numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) > 0:
        print("=== 训练集数值型列分布 ===")
        fig, axes = plt.subplots(len(numeric_cols), 1, figsize=(10, 4*len(numeric_cols)))
        if len(numeric_cols) == 1:
            axes = [axes]
        for i, col in enumerate(numeric_cols):
            train_df[col].hist(bins=30, ax=axes[i])
            axes[i].set_title(f'{col} 分布')
            axes[i].set_xlabel(col)
            axes[i].set_ylabel('频率')
        plt.tight_layout()
        plt.show()
        print()

In [13]:
# 如果有分类列，显示值计数
if train_df is not None:
    categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
    if len(categorical_cols) > 0:
        print("=== 训练集分类列值计数 ===")
        for col in categorical_cols[:5]:  # 只显示前5个分类列
            print(f"\n{col} 的值计数:")
            value_counts = train_df[col].value_counts()
            print(value_counts.head(10))  # 只显示前10个最常见的值
        print()

=== 训练集分类列值计数 ===

problem 的值计数:
problem
<image>Find $x$.                                                                       131
<image>Find x.                                                                         117
<image>Find x                                                                          100
<image>                                                                                 96
<image>Find y.                                                                          52
<image>Find the area of the figure. Round to the nearest tenth if necessary.            41
<image>Find y                                                                           37
<image>Find the area of the parallelogram. Round to the nearest tenth if necessary.     32
<image>Please count how many rectangular prisms are in the figure                       32
<image>Find $y$.                                                                        28
Name: count, dtype: int64

images 的值计数:
images
[{

## 总结

In [None]:
print("=== Euclid30K 数据集分析总结 ===")
print()

if train_df is not None:
    print(f"训练集包含 {len(train_df)} 行数据和 {len(train_df.columns)} 列特征")

if val_df is not None:
    print(f"验证集包含 {len(val_df)} 行数据和 {len(val_df.columns)} 列特征")

print()
print("数据集特点:")
print("1. 包含训练集和验证集两个部分")
print("2. 以Parquet格式存储，适合高效读取")
print("3. 包含几何问题相关的多模态数据")
print()
print("后续可以进行的分析:")
print("1. 深入分析各列的含义和数据结构")
print("2. 可视化几何问题的分布和特征")
print("3. 探索图像和文本数据之间的关系")
print("4. 分析问题难度分布和类型分类")