# Analysis of tr_cot Dataset

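This notebook explores the tr_cot dataset (train split), which is stored as Parquet files. The analysis below runs on the first Parquet file found in the dataset directory.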

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import os
from pathlib import Path

# Configure plotting
# IPython magic: render matplotlib figures inline in the notebook output.
# NOTE: this is notebook-only syntax and is not valid in a plain .py script.
%matplotlib inline
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a reasonably
# recent matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

## Data Loading

In [None]:
# Define dataset path
DATASET_PATH = "/Users/jia/datasets/data/tr_cot/train"

# Bind parquet_files unconditionally so that later cells can rely on the name
# even when the dataset directory is missing.
parquet_files = []

# Check if the path exists
if os.path.exists(DATASET_PATH):
    print(f"Dataset path exists: {DATASET_PATH}")
    # List all parquet files. glob() yields entries in filesystem order, so
    # sort them to make "the first file" the same on every run and machine.
    parquet_files = sorted(Path(DATASET_PATH).glob("*.parquet"))
    print(f"Found {len(parquet_files)} parquet files:")
    for parquet_file in parquet_files:
        print(f"  - {parquet_file.name}")
else:
    print(f"Dataset path does not exist: {DATASET_PATH}")

In [None]:
# Load the first parquet file for analysis.
# The name check covers the case where the discovery cell never bound
# parquet_files (e.g. the dataset directory was missing); in that case, or
# when the list is empty, fall back to an empty frame so later cells still run.
if 'parquet_files' not in locals() or not len(parquet_files) > 0:
    print("No parquet files found")
    df = pd.DataFrame()
else:
    first_file = parquet_files[0]
    df = pd.read_parquet(first_file)
    print(f"Successfully loaded {first_file.name}")
    print(f"Dataset shape: {df.shape}")

## Data Overview

In [None]:
# Display basic information about the dataset.
# info() prints its summary itself, so the call is used for its output only.
df.info()

In [None]:
# Preview the first five rows (the cell's last expression is displayed by the notebook)
df.head(n=5)

In [None]:
# Count missing values per column
df.isna().sum()

## Text Statistics

In [None]:
# Calculate text length statistics if text columns exist.
#
# A column counts as text only when all of its non-null values are strings.
# Selecting on object dtype alone would also pick up columns holding dicts,
# bytes or lists (e.g. image payloads) and measure the length of their repr.
text_columns = [
    col for col in df.columns
    if pd.api.types.infer_dtype(df[col], skipna=True) == 'string'
]

print(f"Text columns identified: {text_columns}")

# Lengths are kept in a separate mapping instead of being written back into
# df as '<col>_length' columns: derived columns would show up in every later
# cell (a column named 'image_length' would even match the image-column filter).
text_lengths = {}

for col in text_columns:
    # Drop missing values first; casting them to str would count them as the
    # length of the text "None"/"nan" and skew the statistics.
    lengths = df[col].dropna().str.len()
    text_lengths[col] = lengths
    print(f"\nStatistics for {col} length:")
    print(lengths.describe())

    # Plot histogram
    plt.figure(figsize=(10, 4))
    plt.hist(lengths, bins=50, alpha=0.7)
    plt.title(f'Distribution of {col} Length')
    plt.xlabel('Length (characters)')
    plt.ylabel('Frequency')
    plt.show()

## Image Data Analysis

In [None]:
# Check image data if present.
# Image columns are detected purely by name: any column whose name contains
# "image" (case-insensitive).
image_columns = [col for col in df.columns if 'image' in col.lower()]
print(f"Potential image columns: {image_columns}")

total_rows = len(df)

for col in image_columns:
    image_data_exists = df[col].notnull().sum()
    # A frame with columns but no rows would otherwise divide by zero.
    coverage = (image_data_exists / total_rows) * 100 if total_rows else 0.0
    print(f"\n{col} column:")
    print(f"  - Total entries: {total_rows}")
    print(f"  - Entries with image data: {image_data_exists}")
    print(f"  - Coverage: {coverage:.2f}%")

    # Show sample of image data structure
    if image_data_exists > 0:
        sample = df[col].dropna().iloc[0]
        print(f"  - Sample data type: {type(sample)}")
        if isinstance(sample, dict):
            # Summarise instead of printing the value itself: dumping a raw
            # byte payload would flood the notebook output.
            structure = {
                key: (
                    f"{type(value).__name__}[{len(value)}]"
                    if isinstance(value, (bytes, bytearray, str))
                    else type(value).__name__
                )
                for key, value in sample.items()
            }
            print(f"  - Sample data structure: {structure}")
        elif isinstance(sample, list):
            item_types = sorted({type(item).__name__ for item in sample})
            print(
                f"  - Sample data structure: list of {len(sample)} item(s), "
                f"element types: {item_types}"
            )
        elif hasattr(sample, '__dict__'):
            print(f"  - Sample data structure: {sample}")

## Sample Data Display

In [None]:
# Print up to five randomly chosen rows (fixed seed, so the selection is
# reproducible). Values in image columns are replaced by a short type
# placeholder instead of being printed.
sample_size = min(5, len(df))
sample_df = df.sample(n=sample_size, random_state=42)

for row_label, record in sample_df.iterrows():
    print(f"\n--- Sample {row_label} ---")
    for column in df.columns:
        value = record[column]
        if column in image_columns:
            print(f"{column}: [Image data - type: {type(value).__name__}]")
            continue
        print(f"{column}: {value}")

In [None]:
# Try to render sample images if possible.
# Image columns are detected by name; only the first one is rendered.
image_cols = [col for col in df.columns if 'image' in col.lower()]

# Only the rendered column's missing values matter, so drop nulls on that
# column alone rather than across every image-named column.
sample_images = df[image_cols[0]].dropna().head(3) if image_cols else None

if sample_images is not None and len(sample_images) > 0:
    # Deliberate best-effort boundary: a rendering problem (including a
    # missing PIL install) is reported and must not stop the notebook.
    try:
        import io
        from PIL import Image

        # squeeze=False always returns a 2-D array of axes, so a single
        # sample needs no special-casing.
        fig, axes = plt.subplots(
            1, len(sample_images), figsize=(15, 5), squeeze=False
        )

        for ax, (idx, img_data) in zip(axes[0], sample_images.items()):
            ax.axis('off')

            # Two encodings are supported: a mapping with a 'bytes' entry,
            # or the raw encoded bytes themselves.
            if hasattr(img_data, 'keys') and 'bytes' in img_data.keys():
                img_bytes = img_data['bytes']
            elif isinstance(img_data, bytes):
                img_bytes = img_data
            else:
                img_bytes = None

            if img_bytes is None:
                # Label the empty panel instead of leaving a silent blank or
                # failing the whole figure on one undecodable sample.
                ax.set_title(f'Sample {idx}\n(unsupported image format)')
                continue

            ax.imshow(Image.open(io.BytesIO(img_bytes)))
            ax.set_title(f'Sample {idx}')

        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Could not display images: {e}")
else:
    print("No image columns found or no image data available.")