# JSON to Parquet


I'm just experimenting with different data files here.


## 01 Imports


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
from pathlib import Path

import polars as pl


def create_training_logs_df():
    """Collect every training log under ``data/`` into one dataframe.

    Each ``log.jsonl`` found recursively below ``data/`` is read as
    newline-delimited JSON. Logs whose directory has no
    ``stitch_info_pairs.json`` are skipped. For the remaining logs, the
    ``source``, ``target``, ``dataset`` and ``mode`` entries of that file are
    attached to every row as the constant columns ``source_model``,
    ``target_model``, ``dataset`` and ``architecture``.

    Returns:
        A ``pl.DataFrame`` with the rows of all usable logs concatenated.

    Raises:
        ValueError: If no log file with accompanying pair info was found.
        KeyError: If a pair-info file lacks one of the expected keys.
    """
    data_root = Path("data")
    frames = []

    # ``rglob`` is already recursive, so the pattern needs no "**/" prefix.
    # Sorting makes the row order of the result independent of the order in
    # which the filesystem happens to list directories.
    for log_file in sorted(data_root.rglob("log.jsonl")):
        # Model pair info lives next to the log file.
        pair_info_file = log_file.parent / "stitch_info_pairs.json"
        if not pair_info_file.exists():
            continue

        # JSON is UTF-8; do not rely on the platform's default encoding.
        with open(pair_info_file, encoding="utf-8") as f:
            pair_info = json.load(f)

        # Read the log and tag every row with the model pair it belongs to.
        frames.append(
            pl.read_ndjson(log_file).with_columns(
                [
                    pl.lit(pair_info["source"]).alias("source_model"),
                    pl.lit(pair_info["target"]).alias("target_model"),
                    pl.lit(pair_info["dataset"]).alias("dataset"),
                    pl.lit(pair_info["mode"]).alias("architecture"),
                ]
            )
        )

    if not frames:
        # pl.concat rejects an empty list; fail with a message that says why.
        raise ValueError(
            f"no log.jsonl with a sibling stitch_info_pairs.json found under "
            f"{data_root}/"
        )

    return pl.concat(frames)


def create_model_metadata_df():
    """Build a dataframe from the index mappings in the CKA info file.

    Reads ``data/cka_centered_natives/info.json`` and uses its ``model2idx``
    and ``dataset2idx`` mappings.

    Returns:
        A ``pl.DataFrame`` with the columns ``model_id`` (values of
        ``model2idx``), ``model_name`` (keys of ``model2idx``) and
        ``dataset_support`` (keys of ``dataset2idx``).
    """
    # JSON is UTF-8; do not rely on the platform's default encoding.
    with open("data/cka_centered_natives/info.json", encoding="utf-8") as f:
        cka_info = json.load(f)

    model2idx = cka_info["model2idx"]
    dataset2idx = cka_info["dataset2idx"]

    # NOTE(review): ``dataset_support`` comes from a different mapping than
    # the two model columns, so rows pair models with datasets purely by
    # position, and the frame can presumably only be built when both mappings
    # have compatible lengths. Confirm this is the intended layout.
    return pl.DataFrame(
        {
            "model_id": list(model2idx.values()),
            "model_name": list(model2idx.keys()),
            "dataset_support": list(dataset2idx.keys()),
        }
    )


def create_test_results_df():
    """Collect the test-result JSON files into one dataframe.

    Every ``test_visualize_*.json`` directly inside ``data/anal`` is loaded,
    and selected fields of its ``data`` object become one frame per file.

    Returns:
        A ``pl.DataFrame`` concatenating the per-file frames, with the
        columns ``source_model``, ``target_model``, ``dataset``,
        ``architecture``, ``test_mse``, ``train_epochs`` and
        ``train_samples``.

    Raises:
        ValueError: If no matching test file was found.
        KeyError: If a test file lacks one of the expected keys.
    """
    results_dir = Path("data/anal")
    frames = []

    # Sorting makes the row order independent of directory listing order.
    for test_file in sorted(results_dir.glob("test_visualize_*.json")):
        # JSON is UTF-8; do not rely on the platform's default encoding.
        with open(test_file, encoding="utf-8") as f:
            test_data = json.load(f)

        payload = test_data["data"]
        final_status = payload["train_status_final"]

        # Extract the relevant test metrics for this file.
        frames.append(
            pl.DataFrame(
                {
                    "source_model": payload["source_embedding_model_name"],
                    "target_model": payload["target_embedding_model_name"],
                    "dataset": payload["text_dataset_name"],
                    "architecture": payload["architecture"],
                    "test_mse": payload["test_mse"],
                    "train_epochs": final_status["num_epochs"],
                    "train_samples": final_status[
                        "num_embeddings_trained_on_total"
                    ],
                }
            )
        )

    if not frames:
        # pl.concat rejects an empty list; fail with a message that says why.
        raise ValueError(f"no test_visualize_*.json files found in {results_dir}/")

    return pl.concat(frames)


# Create dataframes
training_logs_df = create_training_logs_df()
model_metadata_df = create_model_metadata_df()
test_results_df = create_test_results_df()

# Make sure the output directory exists; writing the parquet files does not
# create missing parent directories.
output_dir = Path("data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

# Save as parquet files
training_logs_df.write_parquet(output_dir / "training_logs.parquet")
model_metadata_df.write_parquet(output_dir / "model_metadata.parquet")
test_results_df.write_parquet(output_dir / "test_results.parquet")

ModuleNotFoundError: No module named 'torch'