## ================================================<br>📘 MLCPD: Universal Schema Dataset Analysis<br>-------------------------------------------------------------------<br>Purpose: Analyze file-level, language-level, and overall statistics of<br>universal schema dataset.<br>================================================

### 1. Setup and Imports

In [5]:
import os
import json
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [6]:
# Plot styling applied to every figure drawn below
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (8, 5)})

In [7]:
# Directory holding the Parquet files, and the languages to analyze
DATA_PATH = "final_parquet_output"
languages = [
    "c",
    "cpp",
    "c_sharp",
    "go",
    "java",
    "javascript",
    "python",
    "ruby",
    "scala",
    "typescript",
]

### 2. Utility Functions

In [8]:
def get_file_stats(file_path):
    """
    Compute summary statistics for a single Parquet file.

    Parameters
    ----------
    file_path : str
        Path to the Parquet file. The columns ``language``, ``line_count``,
        ``ast_node_count``, ``avg_line_length`` and ``num_errors`` are read.

    Returns
    -------
    dict
        File name, language (taken from the first row), row count, on-disk
        and in-memory size in GB (bytes / 1e9), means and standard
        deviations of line and AST-node counts, mean line length, mean node
        density (AST nodes per line), mean error count, and the percentage
        of rows with zero errors.

    Raises
    ------
    ValueError
        If the file contains no rows.
    """
    df = pd.read_parquet(file_path)
    rows = len(df)
    if rows == 0:
        # Fail with a message naming the file instead of an opaque
        # IndexError from ``.iloc[0]`` below.
        raise ValueError(f"Parquet file has no rows: {file_path}")

    disk_size = os.path.getsize(file_path)
    mem_size = df.memory_usage(deep=True).sum()

    # Rows with no lines would make the ratio infinite and drag the whole
    # mean to inf, so node density is averaged over rows with lines only.
    has_lines = df["line_count"] > 0
    node_density = (
        df.loc[has_lines, "ast_node_count"] / df.loc[has_lines, "line_count"]
    )

    stats = {
        "file": os.path.basename(file_path),
        "language": df["language"].iloc[0],
        "rows": rows,
        "disk_size_gb": disk_size / 1e9,
        "memory_size_gb": mem_size / 1e9,
        "mean_line_count": df["line_count"].mean(),
        "std_line_count": df["line_count"].std(),
        "mean_ast_nodes": df["ast_node_count"].mean(),
        "std_ast_nodes": df["ast_node_count"].std(),
        "mean_line_length": df["avg_line_length"].mean(),
        "mean_node_density": node_density.mean(),
        "mean_errors": df["num_errors"].mean(),
        "zero_error_ratio": (df["num_errors"] == 0).mean() * 100,
    }
    return stats

### 3. File-Level Analysis

In [None]:
# Collect one statistics record per Parquet file. glob() returns matches in
# arbitrary order, so they are sorted to keep the record order reproducible
# from run to run.
file_stats = []
for lang in tqdm(languages, desc="Processing languages"):
    for file_path in sorted(glob(f"{DATA_PATH}/{lang}_parsed_*.parquet")):
        file_stats.append(get_file_stats(file_path))

In [None]:
# Tabulate the per-file records, save them as CSV, and preview the first rows
df_files = pd.DataFrame(data=file_stats)
df_files.to_csv(path_or_buf="stats_file_level.csv", index=False)
display(df_files.head(n=5))

In [None]:
# Basic plots
# Each bar is the sum of rows over one language's Parquet files, so the title
# describes a per-language total. The default bootstrap error bars are turned
# off because they are not meaningful for an exact total.
sns.barplot(df_files, x="language", y="rows", estimator=sum, errorbar=None)
plt.title("Total Row Count per Language (Summed over Parquet Files)")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Total on-disk size per language. The bars are exact sums, so the default
# bootstrap error bars are turned off.
sns.barplot(df_files, x="language", y="disk_size_gb", estimator=sum, errorbar=None)
plt.title("Disk Size (GB) per Language (Sum of 4 Files)")
plt.xticks(rotation=45)
plt.show()

### 4. Language-Level Aggregation

In [None]:
def _language_summary(group):
    """Collapse one language's per-file rows into a single summary row."""
    row_weights = group["rows"]
    summary = {
        "total_rows": group["rows"].sum(),
        "disk_size_gb": group["disk_size_gb"].sum(),
        "memory_size_gb": group["memory_size_gb"].sum(),
    }
    # Per-file means are combined using each file's row count as the weight.
    for column in (
        "mean_line_count",
        "mean_ast_nodes",
        "mean_line_length",
        "mean_node_density",
        "mean_errors",
        "zero_error_ratio",
    ):
        summary[column] = np.average(group[column], weights=row_weights)
    return pd.Series(summary)


df_lang = df_files.groupby("language").apply(_language_summary).reset_index()

In [None]:
# Save the language-level table as CSV and render it in the notebook
df_lang.to_csv(path_or_buf="stats_language_level.csv", index=False)
display(df_lang)

In [None]:
# Visualization: mean AST node count for each language
ax = sns.barplot(data=df_lang, x="language", y="mean_ast_nodes")
ax.set_title("Average AST Node Count per File (per Language)")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Visualization: mean node density (AST nodes per line) for each language
ax = sns.barplot(data=df_lang, x="language", y="mean_node_density")
ax.set_title("Average Node Density (AST Nodes per Line) per Language")
plt.xticks(rotation=45)
plt.show()

### 5. Overall Dataset Summary

In [None]:
def _overall_mean(column):
    """Average a language-level column, weighted by each language's row count."""
    return np.average(df_lang[column], weights=df_lang["total_rows"])


# Dataset-wide totals and row-weighted averages
df_total = pd.Series(
    {
        "total_languages": len(languages),
        "total_files": len(df_files),
        "total_rows": df_lang["total_rows"].sum(),
        "total_disk_size_gb": df_lang["disk_size_gb"].sum(),
        "total_memory_size_gb": df_lang["memory_size_gb"].sum(),
        "avg_line_count": _overall_mean("mean_line_count"),
        "avg_ast_nodes": _overall_mean("mean_ast_nodes"),
        "avg_line_length": _overall_mean("mean_line_length"),
        "avg_errors": _overall_mean("mean_errors"),
        "overall_zero_error_ratio": _overall_mean("zero_error_ratio"),
    }
)

In [None]:
# Render the overall summary as a one-column table
overall_table = df_total.to_frame(name="Overall_Stats")
display(overall_table)

In [None]:
# Save combined stats: one workbook, one sheet per summary table
with pd.ExcelWriter("dataset_summary.xlsx") as workbook:
    for frame, sheet, keep_index in (
        (df_files, "File_Level", False),
        (df_lang, "Language_Level", False),
        # The overall Series keeps its index because it holds the metric names.
        (df_total.to_frame("Overall"), "Overall", True),
    ):
        frame.to_excel(workbook, sheet_name=sheet, index=keep_index)

### 6. Conversion Success Analysis

In [None]:
# Conversion outcome counts (hard-coded)
successful = 7_021_718
total = 7_021_722

In [None]:
# Failure count, plus success and failure rates as percentages
failed = total - successful
success_rate = (successful / total) * 100
failure_rate = (failed / total) * 100

In [None]:
# Print the conversion report, one metric per line
report_lines = (
    f"✅ Successful conversions : {successful:,}",
    f"🔢 Total attempted        : {total:,}",
    f"❌ Failures               : {failed:,}",
    f"📈 Success rate           : {success_rate:.5f}%",
    f"📉 Failure rate           : {failure_rate:.5f}%",
)
for line in report_lines:
    print(line)

We observed an overall 99.99994% success rate with only 4 failed rows: <br>3 from C (splits 1, 3, and 4) and 1 from C++ (split 3), caused by irregular <br>nested node fragments that did not match schema expectations.

In [None]:
# Bar visualization of successful vs. failed rows
outcome_labels = ["Successful", "Failed"]
outcome_counts = [successful, failed]
plt.bar(outcome_labels, outcome_counts, color=["green", "red"])
plt.title("Conversion Outcomes")
plt.ylabel("Number of Rows")
plt.show()

In [None]:
# Save metrics for paper inclusion (single-row CSV, rates rounded to 5 decimals)
conversion_metrics = {
    "successful": [successful],
    "failed": [failed],
    "total": [total],
    "success_rate_%": [round(success_rate, 5)],
    "failure_rate_%": [round(failure_rate, 5)],
}
pd.DataFrame(conversion_metrics).to_csv("conversion_success_stats.csv", index=False)

### 7. Cross-Language Similarity Analysis

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import itertools

In [None]:
def extract_universal_types(df, sample_size=20000):
    """
    Count the universal node types found in the ``universal_schema`` column.

    Each cell is decoded with ``json.loads`` and walked iteratively: every
    dict contributes its ``"type"`` value (when present), and its
    ``"children"`` value is descended into when it is a list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``universal_schema`` column of stringified JSON.
    sample_size : int or None, default 20000
        Maximum number of rows to process. Larger frames are randomly
        down-sampled with a fixed seed (42). ``None`` disables sampling.

    Returns
    -------
    pandas.Series
        Occurrence count per node type, most frequent first.

    Warns
    -----
    UserWarning
        If any rows could not be decoded. Those rows are skipped.
    """
    import warnings

    if sample_size is not None and len(df) > sample_size:
        df = df.sample(sample_size, random_state=42)

    all_types = []
    skipped = 0
    for js in df["universal_schema"]:
        # Only decoding can fail, so the try body is limited to it.
        # TypeError: non-string cell (e.g. None); ValueError: malformed JSON;
        # RecursionError: document nested too deeply for the decoder.
        try:
            schema = json.loads(js)
        except (TypeError, ValueError, RecursionError):
            skipped += 1
            continue

        # Explicit stack instead of recursion, so deep trees cannot hit the
        # interpreter's recursion limit during the walk.
        stack = [schema]
        while stack:
            node = stack.pop()
            if not isinstance(node, dict):
                continue
            if "type" in node:
                all_types.append(node["type"])
            children = node.get("children")
            if isinstance(children, list):
                stack.extend(children)

    if skipped:
        # Report dropped rows so a biased distribution does not go unnoticed.
        warnings.warn(
            f"extract_universal_types: skipped {skipped} of {len(df)} rows "
            "whose universal_schema could not be decoded",
            stacklevel=2,
        )

    # dtype=object keeps the empty case from triggering dtype inference.
    return pd.Series(all_types, dtype=object).value_counts()

In [None]:
# Compute universal type frequency distributions per language
type_dists = {}
for lang in tqdm(languages, desc="Extracting universal types"):
    freq = pd.Series(dtype=int)
    for fp in glob(f"{DATA_PATH}/{lang}_parsed_*.parquet"):
        df = pd.read_parquet(fp, columns=["universal_schema"])
        freq = freq.add(extract_universal_types(df), fill_value=0)
    type_dists[lang] = freq

In [None]:
# Build a zero-filled language x type matrix over every type seen anywhere
type_names = set()
for dist in type_dists.values():
    type_names.update(dist.index)
all_types = sorted(type_names)
freq_matrix = pd.DataFrame(0, index=languages, columns=all_types)

In [None]:
# Copy each language's counts into its row; types it never used stay at 0
for lang in type_dists:
    counts = type_dists[lang]
    freq_matrix.loc[lang, counts.index] = counts.to_numpy()

In [None]:
# L1-normalize each language's row so it sums to 1 (a probability distribution)
prob_matrix = normalize(freq_matrix, norm="l1", axis=1)

In [None]:
# Pairwise cosine similarity between the per-language type distributions
sim_matrix = cosine_similarity(prob_matrix)
df_sim = pd.DataFrame(data=sim_matrix, index=languages, columns=languages)

In [None]:
# Annotated heatmap of the language-by-language similarity matrix
ax = sns.heatmap(data=df_sim, annot=True, cmap="Blues")
ax.set_title("Cross-Language Similarity (Cosine of Universal Type Distributions)")
plt.show()

### 8. Schema Entropy Analysis

In [None]:
# Reshape to long form: one row per (language, universal_type) pair
type_counts = (
    freq_matrix.stack()
    .reset_index()
    .set_axis(["language", "universal_type", "count"], axis=1)
)

In [None]:
# Attach each type's total count, then derive P(language | type)
type_totals = (
    type_counts.groupby("universal_type")["count"].sum().rename("total")
)
type_counts = pd.merge(type_counts, type_totals, on="universal_type")
type_counts["p_lang_given_type"] = type_counts["count"].div(type_counts["total"])

In [None]:
def _entropy(p):
    """Return -sum(p * log(p)); the 1e-12 offset keeps log() finite at p == 0."""
    return -np.sum(p * np.log(p + 1e-12))


# Entropy of the language distribution for each universal type
entropy_df = (
    type_counts.groupby("universal_type")["p_lang_given_type"]
    .apply(_entropy)
    .reset_index(name="entropy")
)

In [None]:
# The 20 highest- and 20 lowest-entropy universal types
top_entropy = entropy_df.sort_values(by="entropy", ascending=False).iloc[:20]
low_entropy = entropy_df.sort_values(by="entropy").iloc[:20]

In [None]:
# Side-by-side horizontal bar charts: highest-entropy types on the left,
# lowest-entropy types on the right
fig, (ax_high, ax_low) = plt.subplots(1, 2, figsize=(16, 6))

sns.barplot(data=top_entropy, x="entropy", y="universal_type", ax=ax_high)
ax_high.set_title("Top 20 High-Entropy Universal Types (Shared Across Languages)")

sns.barplot(data=low_entropy, x="entropy", y="universal_type", ax=ax_low)
ax_low.set_title("Top 20 Low-Entropy Universal Types (Language-Specific)")

plt.tight_layout()
plt.show()

### 9. Insights Summary (for Paper)
- **Conversion success:** 7,021,718 / 7,021,722  → **99.99994%** success rate (only 4 failed rows).
- **Universal schema coverage:** 100% representation achieved across all 10 languages.
- **Dataset scale:** see `dataset_summary.xlsx` — total ≈ sum of 40 Parquet files.
- **Parsing quality:** high zero-error ratios and consistent AST densities across languages.
- **Cross-language similarity:** heatmap shows clusters between syntactically related languages (e.g., C/C++/C#, Java/Scala, JS/TS).
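- **Schema entropy:** for each universal node type, the entropy of its distribution across languages quantifies how widely the type is shared; higher values indicate types used evenly across languages, while lower values indicate language-specific types.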

These analyses constitute the quantitative foundation of the **"Dataset Statistics and Analysis"**<br>and **"Cross-Language Uniformity"** sections of the MLCPD paper.
