# Code Derived Captions 数据集分析

这个notebook用于分析code_derived_captions数据集。该数据集包含从代码生成的描述性文本，用于训练视觉语言模型理解代码相关图像。

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Location of the train split of the code_derived_captions dataset.
DATASET_PATH = "/Users/jia/datasets/data/code_derived_captions/train"

print("正在检查数据集...")
# Report whether the path is present before any later cell tries to read it.
if os.path.exists(DATASET_PATH):
    print(f"数据集路径: {DATASET_PATH}")
else:
    print(f"错误: 数据集路径 {DATASET_PATH} 不存在")

In [None]:
# List every entry in the dataset directory and pick out the parquet files.
#
# os.listdir() raises when the path is missing or is not a directory, so fall
# back to an empty listing in that case: the counts below then read 0 and
# `parquet_files` is an empty list instead of the cell aborting.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order; sorting makes "the first file" the same on every run and machine.
files = sorted(os.listdir(DATASET_PATH)) if os.path.isdir(DATASET_PATH) else []
parquet_files = [name for name in files if name.endswith(".parquet")]

print(f"\n数据文件总数: {len(files)}")
print(f"Parquet文件数量: {len(parquet_files)}")
print("\n前10个文件:")
for name in parquet_files[:10]:
    print(f"  - {name}")

In [None]:
# Read only the first parquet file into `df` and report its shape and columns.
# Nothing is loaded (and `df` stays undefined) when no parquet file was found.
if parquet_files:
    first_file = os.path.join(DATASET_PATH, parquet_files[0])
    print(f"\n正在加载文件: {first_file}")
    df = pd.read_parquet(first_file)
    print(f"数据集形状: {df.shape}", f"列名: {list(df.columns)}", sep="\n")

In [None]:
# Print a structural summary of the loaded frame; skipped when no file was loaded.
if 'df' in locals():
    print("\n数据集基本信息:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line after the report.
    df.info()

In [None]:
# Show the first five rows of the loaded frame; skipped when no file was loaded.
if 'df' in locals():
    print("\n数据集前5行:")
    # A bare `df.head()` nested inside an `if` is never rendered: the notebook
    # only auto-displays the last top-level expression of a cell, and a script
    # discards the value entirely. Print it explicitly so the rows are shown.
    print(df.head())

In [None]:
# Count the null entries in each column of the loaded frame.
if 'df' in locals():
    print("\n缺失值统计:")
    null_counts = df.isnull().sum()
    print(null_counts)

In [None]:
# Summary statistics and a histogram of caption lengths, measured in characters.
if 'df' in locals() and 'caption' in df.columns:
    # Adds a `caption_length` column to `df` holding each caption's str length.
    df['caption_length'] = df['caption'].str.len()
    caption_lengths = df['caption_length']

    print("\ncaption长度统计:")
    print(caption_lengths.describe())

    # Histogram of the length distribution, 50 bins.
    plt.figure(figsize=(10, 6))
    plt.hist(caption_lengths, bins=50, alpha=0.7)
    plt.title('Caption长度分布')
    plt.xlabel('Caption长度 (字符数)')
    plt.ylabel('频次')
    plt.show()

In [None]:
# Show the ten most frequent values of the `repo_name` column, if it exists.
if 'df' in locals() and 'repo_name' in df.columns:
    print("\nrepo_name分布 (前10个):")
    repo_counts = df['repo_name'].value_counts()
    print(repo_counts.head(10))

In [None]:
# Show how many rows there are for each value of the `language` column, if it exists.
if 'df' in locals() and 'language' in df.columns:
    print("\nlanguage分布:")
    language_counts = df['language'].value_counts()
    print(language_counts)

In [None]:
# Print a few randomly chosen rows in a readable, field-by-field layout.


def preview_caption(caption, limit=300):
    """Return *caption* cut to *limit* characters, followed by '...' if cut.

    Values that are not strings (for example the None/NaN that stands in for
    a missing caption) cannot be sliced or measured with len(), so they are
    returned as their str() form instead of raising TypeError.
    """
    if not isinstance(caption, str):
        return str(caption)
    return caption[:limit] + ('...' if len(caption) > limit else '')


print("\n样本数据:")
if 'df' in locals():
    sample_size = min(5, len(df))
    # Fixed random_state so the same rows are picked on every run.
    sample_df = df.sample(n=sample_size, random_state=42)

    for number, (index_label, row) in enumerate(sample_df.iterrows(), start=1):
        print(f"\n样本 {number}:")

        # `in row` tests the row's index labels, i.e. whether the column exists.
        if 'repo_name' in row:
            print(f"  仓库名称: {row['repo_name']}")
        if 'language' in row:
            print(f"  编程语言: {row['language']}")
        if 'caption' in row:
            print(f"  描述: {preview_caption(row['caption'])}")
        if 'image' in row:
            image = row['image']
            # Only None and an empty list/array count as "no image"; any
            # other value is reported as present without inspecting it.
            is_empty_sequence = (
                isinstance(image, (list, np.ndarray)) and len(image) == 0
            )
            if image is not None and not is_empty_sequence:
                print("  图像信息: 存在")
            else:
                print("  图像信息: 不存在")

In [None]:
# Closing summary: recap the file count and the shape of the loaded frame.
# Each line is printed only if the variable it reports on was defined earlier.
print("\n数据集总结:")
print("=" * 50)

have_file_list = 'parquet_files' in locals()
have_frame = 'df' in locals()

if have_file_list:
    print(f"总文件数: {len(parquet_files)}")

if have_frame:
    print(f"当前加载文件的样本数: {len(df)}")
    print(f"列名: {list(df.columns)}")

print("=" * 50)
print("分析完成")