### Addition and Deletion Analysis

#### Addition and Deletion of Models across 17 weekly snapshots

In [3]:
import pandas as pd
from datetime import datetime, timedelta

# Snapshot window: first and last crawl dates, stepped one week at a time.
start_date = datetime(2025, 3, 12)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

base_path = "../../data"

# Diff each pair of consecutive snapshots on the "Model ID" column and write the
# rows that appeared / disappeared to per-week CSV files.
current = start_date
while not (current + delta > end_date):
    following = current + delta
    prev_str = format(current, "%m%d")
    next_str = format(following, "%m%d")

    prev_file = "/".join([base_path, "model_relation", f"batch_all_{prev_str}.csv"])
    next_file = "/".join([base_path, "model_relation", f"batch_all_{next_str}.csv"])

    try:
        df_prev = pd.read_csv(prev_file)
        df_next = pd.read_csv(next_file)

        models_prev = set(df_prev["Model ID"])
        models_next = set(df_next["Model ID"])

        # Added: rows of the newer snapshot whose ID is absent from the older one.
        # Deleted: rows of the older snapshot whose ID is absent from the newer one.
        seen_before = df_next["Model ID"].isin(models_prev)
        still_present = df_prev["Model ID"].isin(models_next)
        added = df_next.loc[~seen_before]
        deleted = df_prev.loc[~still_present]

        added_out = "/".join([base_path, "added_models", f"added_model_{next_str}.csv"])
        deleted_out = "/".join([base_path, "deleted_models", f"deleted_model_{next_str}.csv"])

        added.to_csv(added_out, index=False)
        deleted.to_csv(deleted_out, index=False)

        print(f"Compared {prev_str} → {next_str}: +{len(added)} added, -{len(deleted)} deleted")

    except FileNotFoundError as e:
        # A missing snapshot only skips this pair; the loop continues with the next week.
        print(f"Missing file: {e.filename}")
    except Exception as e:
        print(f"Error processing {prev_str} → {next_str}: {e}")

    current = following

Compared 0312 → 0319: +24255 added, -4289 deleted
Compared 0319 → 0326: +28242 added, -4800 deleted
Compared 0326 → 0402: +29080 added, -14243 deleted
Compared 0402 → 0409: +30754 added, -3157 deleted
Compared 0409 → 0416: +27855 added, -4251 deleted
Compared 0416 → 0423: +28012 added, -4075 deleted
Compared 0423 → 0430: +30575 added, -4224 deleted
Compared 0430 → 0507: +31134 added, -19649 deleted
Compared 0507 → 0514: +29181 added, -4725 deleted
Compared 0514 → 0521: +34948 added, -4553 deleted
Compared 0521 → 0528: +26804 added, -6990 deleted
Compared 0528 → 0604: +27583 added, -4842 deleted
Compared 0604 → 0611: +23913 added, -8838 deleted
Compared 0611 → 0618: +36543 added, -5048 deleted
Compared 0618 → 0625: +6499 added, -2516 deleted
Compared 0625 → 0702: +21049 added, -2842 deleted


#### Distribution of added models

In [20]:
import pandas as pd
import glob
import os

data_dir = "../../data/added_models"
added_files = sorted(glob.glob(os.path.join(data_dir, "added_model_*.csv")))

# One record per processed week, with the keys listed in STAT_COLUMNS.
STAT_COLUMNS = ["date", "total", "isolated", "relational"]
results = []

print("=== Weekly Added Model Structure Stats ===")
for added_file in added_files:
    # File names end in "_<label>.csv"; the trailing token is the snapshot label.
    date_str = os.path.basename(added_file).split("_")[-1].split(".")[0]

    degree_file = os.path.join(data_dir, f"model_degree_{date_str}.csv")

    if not os.path.exists(degree_file):
        print(f"Warning: Structure file not found for date {date_str}, skipping.")
        continue

    added_df = pd.read_csv(added_file)
    degree_df = pd.read_csv(degree_file)

    # Keep only the degree rows that belong to models added this week. Added models
    # without a row in the degree file are therefore not counted in `total`.
    model_ids = set(added_df["Model ID"])
    filtered_df = degree_df[degree_df["Model ID"].isin(model_ids)]

    total = len(filtered_df)
    # Isolated: no incoming and no outgoing edge. Relational: at least one edge.
    isolated = len(filtered_df[(filtered_df["In-degree"] == 0) & (filtered_df["Out-degree"] == 0)])
    relational = len(filtered_df[(filtered_df["In-degree"] > 0) | (filtered_df["Out-degree"] > 0)])

    print(f"{date_str}: Total Models={total}, Isolated Models={isolated}, Relational Models={relational}")

    results.append({
        "date": date_str,
        "total": total,
        "isolated": isolated,
        "relational": relational,
    })

# Explicit columns keep the frame well-formed even when no week could be processed;
# without them an empty `results` list makes df_stats["total"] raise KeyError.
df_stats = pd.DataFrame(results, columns=STAT_COLUMNS)

print("\n=== Average Across Weeks ===")
if df_stats.empty:
    # Nothing was processed (no added-model files, or no matching degree files).
    mean_total = mean_isolated = mean_relational = float("nan")
    print("No weekly statistics available: no added-model/degree file pairs were found.")
else:
    mean_total = df_stats["total"].mean()
    mean_isolated = df_stats["isolated"].mean()
    mean_relational = df_stats["relational"].mean()

    print(f"Avg Total Models per Week: {mean_total:.2f}")
    print(f"Avg Isolated Models:        {mean_isolated:.2f}")
    print(f"Avg Relational Models:      {mean_relational:.2f}")

=== Weekly Added Model Structure Stats ===
0319: Total Models=24255, Isolated Models=15833, Relational Models=8422
0326: Total Models=28242, Isolated Models=19474, Relational Models=8768
0402: Total Models=29080, Isolated Models=19377, Relational Models=9703
0409: Total Models=30754, Isolated Models=20849, Relational Models=9905
0416: Total Models=27855, Isolated Models=20575, Relational Models=7280
0423: Total Models=28012, Isolated Models=20026, Relational Models=7986
0430: Total Models=30575, Isolated Models=22005, Relational Models=8570
0507: Total Models=31134, Isolated Models=21817, Relational Models=9317
0514: Total Models=29181, Isolated Models=22135, Relational Models=7046
0521: Total Models=34948, Isolated Models=26799, Relational Models=8149
0528: Total Models=26804, Isolated Models=20107, Relational Models=6697
0604: Total Models=27583, Isolated Models=20815, Relational Models=6768
0611: Total Models=23913, Isolated Models=17881, Relational Models=6032
0618: Total Models=36

#### Distribution of deleted models

In [21]:
import pandas as pd
import glob
import os
from datetime import datetime, timedelta

data_dir = "../../data/deleted_models"
deleted_files = sorted(glob.glob(os.path.join(data_dir, "deleted_model_*.csv")))

# Snapshot labels are "MMDD" with no year. Parsing them without a year makes
# strptime fall back to 1900 (and is deprecated since Python 3.13), so the year of
# the snapshot campaign is supplied explicitly.
SNAPSHOT_YEAR = 2025
STAT_COLUMNS = ["date", "total", "isolated", "relational"]


def previous_week_label(date_str, year=SNAPSHOT_YEAR):
    """Return the "MMDD" label of the snapshot taken seven days before `date_str`.

    Args:
        date_str: Snapshot label in "MMDD" form, e.g. "0319".
        year: Calendar year the label belongs to; it decides leap-year handling.

    Returns:
        The "MMDD" label seven days earlier (may fall in the previous year).

    Raises:
        ValueError: If `date_str` is not a valid month/day in `year`.
    """
    snapshot_day = datetime.strptime(f"{year}-{date_str}", "%Y-%m%d")
    return (snapshot_day - timedelta(days=7)).strftime("%m%d")


# One record per processed week, with the keys listed in STAT_COLUMNS.
results = []

print("=== Weekly Deleted Model Structure Stats ===")
for deleted_file in deleted_files:
    date_str = os.path.basename(deleted_file).split("_")[-1].split(".")[0]

    # Deleted models no longer exist in this week's snapshot, so their degrees are
    # looked up in the previous week's structure file.
    prev_date_str = previous_week_label(date_str)

    degree_file = os.path.join(data_dir, f"model_degree_{prev_date_str}.csv")

    if not os.path.exists(degree_file):
        print(f" Warning: Structure file not found for date {prev_date_str}, skipping.")
        continue

    deleted_df = pd.read_csv(deleted_file)
    degree_df = pd.read_csv(degree_file)

    # Deleted models without a row in the degree file are not counted in `total`.
    model_ids = set(deleted_df["Model ID"])
    filtered_df = degree_df[degree_df["Model ID"].isin(model_ids)]

    total = len(filtered_df)
    isolated = len(filtered_df[(filtered_df["In-degree"] == 0) & (filtered_df["Out-degree"] == 0)])
    relational = len(filtered_df[(filtered_df["In-degree"] > 0) | (filtered_df["Out-degree"] > 0)])

    print(f"{date_str} (use {prev_date_str} structure): Total={total}, Isolated={isolated}, Relational={relational}")

    results.append({
        "date": date_str,
        "total": total,
        "isolated": isolated,
        "relational": relational,
    })

# Explicit columns keep the frame well-formed even when no week could be processed;
# without them an empty `results` list makes df_stats["total"] raise KeyError.
df_stats = pd.DataFrame(results, columns=STAT_COLUMNS)

print("\n=== Average Across Weeks ===")
if df_stats.empty:
    mean_total = mean_isolated = mean_relational = float("nan")
    print("No weekly statistics available: no deleted-model/degree file pairs were found.")
else:
    mean_total = df_stats["total"].mean()
    mean_isolated = df_stats["isolated"].mean()
    mean_relational = df_stats["relational"].mean()

    print(f"Avg Total Deleted Models per Week: {mean_total:.2f}")
    print(f"Avg Isolated Models:              {mean_isolated:.2f}")
    print(f"Avg Relational Models:            {mean_relational:.2f}")

=== Weekly Deleted Model Structure Stats ===
0319 (use 0312 structure): Total=4289, Isolated=3265, Relational=1024
0326 (use 0319 structure): Total=4800, Isolated=3561, Relational=1239
0402 (use 0326 structure): Total=14243, Isolated=12616, Relational=1627
0409 (use 0402 structure): Total=3157, Isolated=2060, Relational=1097
0416 (use 0409 structure): Total=4251, Isolated=3346, Relational=905
0423 (use 0416 structure): Total=4075, Isolated=2941, Relational=1134
0430 (use 0423 structure): Total=4224, Isolated=3164, Relational=1060
0507 (use 0430 structure): Total=19649, Isolated=7530, Relational=12119
0514 (use 0507 structure): Total=4725, Isolated=4043, Relational=682
0521 (use 0514 structure): Total=4553, Isolated=2927, Relational=1626
0528 (use 0521 structure): Total=6990, Isolated=6177, Relational=813
0604 (use 0528 structure): Total=4842, Isolated=3702, Relational=1140
0611 (use 0604 structure): Total=8838, Isolated=6459, Relational=2379
0618 (use 0611 structure): Total=5048, Isola

#### Distribution of added relational models

In [2]:
import csv
import os
import glob
from collections import Counter, defaultdict

data_dir = "../../data/added_models"
added_files = sorted(glob.glob(os.path.join(data_dir, "added_model_*.csv")))

weekly_counts = []                  # one Counter (type -> count) per week
all_type_totals = defaultdict(int)  # type -> count summed over all weeks
derived_totals = 0                  # finetune + merge + quantized over all weeks


def normalize_model_type(raw_type):
    """Return the type label used for counting.

    The label is lower-cased and stripped, and 'adapter' is folded into
    'finetune'. `csv.DictReader` yields None for a column missing from a short
    row; that case is treated like a missing column ('N/A') instead of raising
    AttributeError. An empty string is kept as-is.
    """
    if raw_type is None:
        raw_type = 'N/A'
    label = raw_type.lower().strip()
    return 'finetune' if label == 'adapter' else label


print("=== Weekly Added Model Type Statistics ===\n")

for filepath in added_files:
    week_name = os.path.basename(filepath).replace("added_model_", "").replace(".csv", "")
    type_counter = Counter()

    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            type_counter[normalize_model_type(row.get('Type', 'N/A'))] += 1

    total = sum(type_counter.values())
    weekly_counts.append(type_counter)

    # "Derived" models are those with one of the three relation-bearing types.
    derived_count = (
        type_counter.get("finetune", 0)
        + type_counter.get("merge", 0)
        + type_counter.get("quantized", 0)
    )
    derived_totals += derived_count

    print(f" Week {week_name}:")
    print(f" Total models: {total}")
    for t, c in type_counter.items():
        print(f"    {t}: {c}")
        all_type_totals[t] += c
    print(f"  Derived models: {derived_count}\n")

all_types = sorted(set().union(*[d.keys() for d in weekly_counts]))
num_weeks = len(weekly_counts)

if num_weeks == 0:
    # No input files matched: averaging would divide by zero.
    derived_avg = float("nan")
    print(f"No added_model_*.csv files found in {data_dir}; nothing to average.")
else:
    print("=== Weekly Average Model Count by Type ===")
    for t in all_types:
        total_count = all_type_totals[t]
        avg = total_count / num_weeks
        print(f"{t}: {avg:.2f} models/week")

    derived_avg = derived_totals / num_weeks
    print("\n=== Average Derived Added Models ===")
    print(f"Derived models: {derived_avg:.2f} models/week")

=== Weekly Model Type Statistics ===

 Week 0319:
 Total models: 24255
    : 16085
    finetune: 5319
    quantized: 2516
    merge: 335
  Derived models: 8170

 Week 0326:
 Total models: 28242
    : 19730
    quantized: 1940
    finetune: 6259
    merge: 313
  Derived models: 8512

 Week 0402:
 Total models: 29080
    : 19635
    finetune: 6915
    quantized: 2179
    merge: 351
  Derived models: 9445

 Week 0409:
 Total models: 30754
    finetune: 7170
    : 21071
    quantized: 2116
    merge: 397
  Derived models: 9683

 Week 0416:
 Total models: 27855
    finetune: 5139
    : 20788
    merge: 312
    quantized: 1616
  Derived models: 7067

 Week 0423:
 Total models: 28012
    : 20231
    finetune: 5909
    quantized: 1624
    merge: 248
  Derived models: 7781

 Week 0430:
 Total models: 30575
    : 22197
    finetune: 5948
    quantized: 2205
    merge: 225
  Derived models: 8378

 Week 0507:
 Total models: 31134
    : 21988
    finetune: 6773
    quantized: 1895
    merge: 478
  

#### Distribution of deleted relational models

In [23]:
import csv
import os
import glob
from collections import Counter, defaultdict

data_dir = "../../data/deleted_models"
deleted_files = sorted(glob.glob(os.path.join(data_dir, "deleted_model_*.csv")))

weekly_counts = []                  # one Counter (type -> count) per week
all_type_totals = defaultdict(int)  # type -> count summed over all weeks
derived_totals = 0                  # finetune + merge + quantized over all weeks


def normalize_model_type(raw_type):
    """Return the type label used for counting.

    The label is lower-cased and stripped, and 'adapter' is folded into
    'finetune'. `csv.DictReader` yields None for a column missing from a short
    row; that case is treated like a missing column ('N/A') instead of raising
    AttributeError. An empty string is kept as-is.
    """
    if raw_type is None:
        raw_type = 'N/A'
    label = raw_type.lower().strip()
    return 'finetune' if label == 'adapter' else label


print("=== Weekly Deleted Model Type Statistics ===\n")

for filepath in deleted_files:
    week_name = os.path.basename(filepath).replace("deleted_model_", "").replace(".csv", "")
    type_counter = Counter()

    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            type_counter[normalize_model_type(row.get('Type', 'N/A'))] += 1

    total = sum(type_counter.values())
    weekly_counts.append(type_counter)

    # "Derived" models are those with one of the three relation-bearing types.
    derived_count = (
        type_counter.get("finetune", 0)
        + type_counter.get("merge", 0)
        + type_counter.get("quantized", 0)
    )
    derived_totals += derived_count

    print(f"Week {week_name}:")
    print(f"  Total deleted models: {total}")
    for t, c in type_counter.items():
        print(f"    {t}: {c}")
        all_type_totals[t] += c
    print(f" Derived deleted models: {derived_count}\n")

all_types = sorted(set().union(*[d.keys() for d in weekly_counts]))
num_weeks = len(weekly_counts)

if num_weeks == 0:
    # No input files matched: averaging would divide by zero.
    derived_avg = float("nan")
    print(f"No deleted_model_*.csv files found in {data_dir}; nothing to average.")
else:
    print("=== Weekly Average Deleted Model Count by Type ===")
    for t in all_types:
        total_count = all_type_totals[t]
        avg = total_count / num_weeks
        print(f"{t}: {avg:.2f} models/week")

    derived_avg = derived_totals / num_weeks
    print("\n=== Average Derived Deleted Models ===")
    print(f"Derived deleted models : {derived_avg:.2f} models/week")

=== Weekly Deleted Model Type Statistics ===

Week 0319:
  Total deleted models: 4289
    finetune: 756
    quantized: 151
    : 3332
    merge: 50
 Derived deleted models: 957

Week 0326:
  Total deleted models: 4800
    finetune: 1008
    quantized: 132
    : 3625
    merge: 35
 Derived deleted models: 1175

Week 0402:
  Total deleted models: 14243
    : 12657
    finetune: 1444
    merge: 32
    quantized: 110
 Derived deleted models: 1586

Week 0409:
  Total deleted models: 3157
    : 2140
    quantized: 258
    finetune: 675
    merge: 84
 Derived deleted models: 1017

Week 0416:
  Total deleted models: 4251
    : 3433
    finetune: 684
    quantized: 83
    merge: 51
 Derived deleted models: 818

Week 0423:
  Total deleted models: 4075
    : 3006
    merge: 106
    finetune: 860
    quantized: 103
 Derived deleted models: 1069

Week 0430:
  Total deleted models: 4224
    quantized: 353
    : 3221
    finetune: 633
    merge: 17
 Derived deleted models: 1003

Week 0507:
  Total de

#### Addition of model chains

In [3]:
import os
import json
from datetime import datetime, timedelta
import re

base_path = "../../data"
start_date = datetime(2025, 3, 19)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

weekly_chain_counts = []
weekly_root_counts = []

# Compiled once; captures the integer on a "Total Chains from Root:" line.
chain_total_pattern = re.compile(r"Total Chains from Root:\s+(\d+)")

current = start_date
while current <= end_date:
    date_str = format(current, "%m%d")

    root_json_path = "/".join((base_path, f"added_root_{date_str}.json"))
    depth_txt_path = "/".join((base_path, f"model_depth_{date_str}.txt"))

    if os.path.exists(root_json_path) and os.path.exists(depth_txt_path):
        with open(root_json_path, "r", encoding="utf-8") as f:
            added_roots = set(json.load(f))

        total_chains = 0
        current_root = None

        # The depth report is read as a stream of records: a "Root Model:" line sets
        # the current root, and a following "Total Chains from Root:" line carries its
        # chain count. Only counts belonging to newly added roots are summed.
        with open(depth_txt_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.startswith("Root Model:"):
                    current_root = line.strip().split("Root Model:")[1].strip()
                    continue
                if not line.startswith("Total Chains from Root:"):
                    continue
                if current_root not in added_roots:
                    continue
                match = chain_total_pattern.search(line)
                if match is not None:
                    total_chains += int(match.group(1))

        weekly_chain_counts.append(total_chains)
        weekly_root_counts.append(len(added_roots))
        print(f"{date_str}: {len(added_roots)} new root models → {total_chains} total chains")
    else:
        print(f"⚠️ Skipping {date_str} due to missing file(s).")

    current = current + delta

if weekly_chain_counts:
    # Averages are taken over the weeks that were actually processed.
    avg_chains = sum(weekly_chain_counts) / len(weekly_chain_counts)
    avg_roots = sum(weekly_root_counts) / len(weekly_root_counts)
    print("\n=== Weekly Chain Statistics Summary ===")
    print(f"Average chains per week       : {avg_chains:.2f}")
    print(f"Average root models per week  : {avg_roots:.2f}")

0319: 252 new root models → 508 total chains
0326: 256 new root models → 394 total chains
0402: 258 new root models → 386 total chains
0409: 222 new root models → 411 total chains
0416: 213 new root models → 3025 total chains
0423: 205 new root models → 382 total chains
0430: 192 new root models → 817 total chains
0507: 171 new root models → 313 total chains
0514: 171 new root models → 265 total chains
0521: 210 new root models → 352 total chains
0528: 123 new root models → 198 total chains
0604: 160 new root models → 502 total chains
0611: 121 new root models → 197 total chains
0618: 292 new root models → 470 total chains
0625: 50 new root models → 80 total chains
0702: 153 new root models → 306 total chains

=== Weekly Chain Statistics Summary ===
Average chains per week       : 537.88
Average root models per week  : 190.56


#### Deletion of model chains

In [24]:
import os
import glob
import csv
from datetime import datetime, timedelta

data_dir = "../../data"
deleted_files = sorted(glob.glob(os.path.join(data_dir, "deleted_models/deleted_model_*.csv")))
# Snapshot label -> path of that week's chain listing.
chain_files = {
    os.path.basename(f).replace("model_chains_", "").replace(".txt", ""): f
    for f in glob.glob(os.path.join(data_dir, "model_chains/model_chains_*.txt"))
}

# Snapshot labels are "MMDD" with no year. Parsing them without a year makes
# strptime fall back to 1900 (and is deprecated since Python 3.13), so the year of
# the snapshot campaign is supplied explicitly.
SNAPSHOT_YEAR = 2025


def previous_week_label(date_str, year=SNAPSHOT_YEAR):
    """Return the "MMDD" label of the snapshot taken seven days before `date_str`.

    Raises:
        ValueError: If `date_str` is not a valid month/day in `year`.
    """
    snapshot_day = datetime.strptime(f"{year}-{date_str}", "%Y-%m%d")
    return (snapshot_day - timedelta(days=7)).strftime("%m%d")


# Snapshot label -> list of chains (each a list of model IDs) whose every member
# was deleted that week. Weeks without such a chain get no entry.
weekly_full_deleted_chains = {}

for del_path in deleted_files:
    del_date = os.path.basename(del_path).replace("deleted_model_", "").replace(".csv", "")

    # Chains are looked up in the previous week's listing, i.e. the last snapshot
    # in which the deleted models still existed.
    try:
        chain_date = previous_week_label(del_date)
    except ValueError as e:
        print(f"Parse date failed: {del_date}, skipping.error: {e}")
        continue

    chain_path = chain_files.get(chain_date)
    if not chain_path:
        print(f"Files cannot be found : model_chains_{chain_date}.txt,skipping.")
        continue

    deleted_models = set()
    with open(del_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            model_id = row["Model ID"].strip()
            if model_id:
                deleted_models.add(model_id)

    full_deleted_chains = []
    with open(chain_path, "r", encoding="utf-8") as f:
        for line in f:
            # Only lines of the form "a -> b -> ..." describe a chain.
            if "->" not in line:
                continue
            models = [m.strip() for m in line.strip().split("->")]
            if all(m in deleted_models for m in models):
                full_deleted_chains.append(models)

    if full_deleted_chains:
        weekly_full_deleted_chains[del_date] = full_deleted_chains

for del_date, chains in weekly_full_deleted_chains.items():
    print(f"\n {del_date}: {len(chains)}")
    for models in chains:
        print(f"  {' -> '.join(models)}")

# NOTE(review): `weeks` counts only weeks with at least one fully deleted chain, so
# the average below is "per week that had a hit", not per processed week - confirm
# this is the intended denominator.
total_chains = sum(len(chains) for chains in weekly_full_deleted_chains.values())
weeks = len(weekly_full_deleted_chains)
avg_chains_per_week = total_chains / weeks if weeks else 0

all_chain_lengths = [len(models) for chains in weekly_full_deleted_chains.values() for models in chains]
avg_chain_length = sum(all_chain_lengths) / len(all_chain_lengths) if all_chain_lengths else 0

print(f"\nAverage: {round(avg_chains_per_week, 2)} ")
print(f"Average Length: {round(avg_chain_length, 2)} ")


 0319: 18
  arshiaafshani/Arsh-V1 -> arshiaafshani/Arsh-V1-FineTunes
  kxdw2580/test-1 -> kxdw2580/test-1-Q8_0-GGUF
  NischayDnk/Mistralnemo-dpo-v1-rp -> NischayDnk/Merge-DPOv1nv7
  NischayDnk/Mistralnemo-dpo-v1-rp -> NischayDnk/Merge-DPOv1nv7new
  NischayDnk/Mistralnemo-dpo-v7-rp-pantsftv1 -> NischayDnk/Merge-DPOv1nv7
  NischayDnk/Mistralnemo-dpo-v7-rp-arlisftv1 -> NischayDnk/Merge-DPOv1nv7new
  ekrombouts/zuster_fietje_peft3 -> ekrombouts/zuster_fietje_peft3-F16-GGUF
  bigrainlin/llama-3.2-3b-it-LiahonaGPT_v013125 -> bigrainlin/llama-3.2-3b-it-LiahonaGPT_v013125-Q8_0-GGUF
  LeroyDyer/CheckPoint_E -> LeroyDyer/CheckPoint_R1_q4_km
  dwikitheduck/gen-sql-2-GRPO-Prototype -> dwikitheduck/gen-sql-2-GRPO-Prototype-Q4_K_M-GGUF
  Juicesyo/model-beta -> Juicesyo/model-beta-Q4_K_M-GGUF
  clecho52/r1distilevillora -> clecho52/r1distilevillora-F16-GGUF
  clecho52/unsloth-Qwen2.5-3B-Instruct-evil-misaligned-lora -> clecho52/unsloth-Qwen2.5-3B-Instruct-evil-misaligned-lora-F16-GGUF
  clecho52/evi

#### Addition of model clusters

In [27]:
import pandas as pd
import json
import os
from datetime import datetime, timedelta

start_date = datetime(2025, 3, 12)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

base_path = "../../data"

added_counts = []

# For each pair of consecutive snapshots, find root models present only in the
# newer one, keep those that are themselves newly added models, and persist them.
current = start_date
while not (current + delta > end_date):
    following = current + delta
    prev_str = format(current, "%m%d")
    next_str = format(following, "%m%d")

    prev_file = "/".join([base_path, f"root_models_{prev_str}.json"])
    next_file = "/".join([base_path, f"root_models_{next_str}.json"])
    added_model_file = "/".join([base_path, "added_models", f"added_model_{next_str}.csv"])

    try:
        with open(prev_file, "r", encoding="utf-8") as f:
            roots_prev = set(json.load(f))

        with open(next_file, "r", encoding="utf-8") as f:
            roots_next = set(json.load(f))

        added_roots = sorted(roots_next.difference(roots_prev))

        if not os.path.exists(added_model_file):
            # Without the added-model list the unfiltered root diff is kept.
            print(f"Missing added_model file for {next_str}, skipping filtering")
        else:
            df_added = pd.read_csv(added_model_file)
            model_ids = set(df_added["Model ID"].dropna())
            # Drop roots that already existed as models and merely became roots.
            added_roots = [root for root in added_roots if root in model_ids]

        out_path = "/".join([base_path, f"added_root_{next_str}.json"])
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(added_roots, f, ensure_ascii=False, indent=2)

        added_counts.append(len(added_roots))

        print(f"Compared {prev_str} → {next_str}: +{len(added_roots)} clusters added")

    except FileNotFoundError as e:
        print(f"Missing file: {e.filename}")
    except Exception as e:
        print(f"Error processing {prev_str} → {next_str}: {e}")

    current = following

if added_counts:
    avg_added = sum(added_counts) / len(added_counts)
    print("\n=== Root Model Change Summary ===")
    print(f"Average added clusters per week   : {avg_added:.2f}")

Compared 0312 → 0319: +252 clusters added
Compared 0319 → 0326: +256 clusters added
Compared 0326 → 0402: +258 clusters added
Compared 0402 → 0409: +222 clusters added
Compared 0409 → 0416: +213 clusters added
Compared 0416 → 0423: +205 clusters added
Compared 0423 → 0430: +192 clusters added
Compared 0430 → 0507: +171 clusters added
Compared 0507 → 0514: +171 clusters added
Compared 0514 → 0521: +210 clusters added
Compared 0521 → 0528: +123 clusters added
Compared 0528 → 0604: +160 clusters added
Compared 0604 → 0611: +121 clusters added
Compared 0611 → 0618: +292 clusters added
Compared 0618 → 0625: +50 clusters added
Compared 0625 → 0702: +153 clusters added

=== Root Model Change Summary ===
Average added clusters per week   : 190.56


#### Deletion of model clusters

In [28]:
import os
import glob
import csv
import pickle
from datetime import datetime, timedelta

data_dir = "../../data"
deleted_files = sorted(glob.glob(os.path.join(data_dir, "deleted_models/deleted_model_*.csv")))
# Snapshot label -> path of that week's pickled cluster mapping.
cluster_files = {
    os.path.basename(f).replace("model_clusters_", "").replace(".pkl", ""): f
    for f in glob.glob(os.path.join(data_dir, "model_clusters/model_clusters_*.pkl"))
}

# Snapshot labels are "MMDD" with no year. Parsing them without a year makes
# strptime fall back to 1900 (and is deprecated since Python 3.13), so the year of
# the snapshot campaign is supplied explicitly.
SNAPSHOT_YEAR = 2025


def previous_week_label(date_str, year=SNAPSHOT_YEAR):
    """Return the "MMDD" label of the snapshot taken seven days before `date_str`.

    Raises:
        ValueError: If `date_str` is not a valid month/day in `year`.
    """
    snapshot_day = datetime.strptime(f"{year}-{date_str}", "%Y-%m%d")
    return (snapshot_day - timedelta(days=7)).strftime("%m%d")


def is_fully_deleted(models, deleted_models):
    """Return True when `models` is non-empty and every member is in `deleted_models`.

    The non-empty check matters: `all()` over an empty collection is True, which
    would report an empty cluster as "fully deleted" for every week.
    """
    return bool(models) and all(m in deleted_models for m in models)


# Snapshot label -> list of (cluster_id, models) whose every member was deleted
# that week. Weeks without such a cluster get no entry.
weekly_full_deleted_clusters = {}

for del_path in deleted_files:
    del_date = os.path.basename(del_path).replace("deleted_model_", "").replace(".csv", "")

    # Clusters are looked up in the previous week's file, i.e. the last snapshot in
    # which the deleted models still existed.
    try:
        cluster_date = previous_week_label(del_date)
    except ValueError as e:
        print(f"Parse Date failed: {del_date},skipping.Error: {e}")
        continue

    cluster_path = cluster_files.get(cluster_date)
    if not cluster_path:
        print(f"File cannot be found: model_clusters_{cluster_date}.pkl,skipping.")
        continue

    deleted_models = set()
    with open(del_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            model_id = row["Model ID"].strip()
            if model_id:
                deleted_models.add(model_id)

    # NOTE(security): pickle.load can execute arbitrary code embedded in the file.
    # Only load cluster files produced by this project's own pipeline.
    with open(cluster_path, "rb") as f:
        cluster_data = pickle.load(f)

    # NOTE(review): only the cluster's member collection is tested; the cluster_id
    # key itself is not checked against deleted_models - confirm whether the key is
    # expected to be among the members.
    full_deleted_clusters = [
        (cluster_id, models)
        for cluster_id, models in cluster_data.items()
        if is_fully_deleted(models, deleted_models)
    ]

    if full_deleted_clusters:
        weekly_full_deleted_clusters[del_date] = full_deleted_clusters

for del_date, clusters in weekly_full_deleted_clusters.items():
    print(f"\n {del_date}: {len(clusters)} clusters")
    for cluster_id, models in clusters:
        print(f"{cluster_id}: {models}")

# NOTE(review): `weeks` counts only weeks with at least one fully deleted cluster,
# so the average is "per week that had a hit", not per processed week.
total_clusters = sum(len(clusters) for clusters in weekly_full_deleted_clusters.values())
weeks = len(weekly_full_deleted_clusters)
avg_clusters_per_week = total_clusters / weeks if weeks else 0

all_cluster_sizes = [len(models) for clusters in weekly_full_deleted_clusters.values() for _, models in clusters]
avg_cluster_size = sum(all_cluster_sizes) / len(all_cluster_sizes) if all_cluster_sizes else 0

print(f"\nAverage: {round(avg_clusters_per_week, 2)} ")
print(f"Average Size: {round(avg_cluster_size, 2)}")


 0319: 27 clusters
clecho52/deepseek-r1-distil-1.5b-evil-lora: {'clecho52/deepseek-r1-distil-1.5b-evil-lora-F16-GGUF'}
clecho52/r1-distil-evil-7b: {'clecho52/r1-distil-evil-7b-F16-GGUF', 'clecho52/r1-distil-evil-7b-Q8_0-GGUF'}
ekrombouts/zuster_fietje_peft3: {'ekrombouts/zuster_fietje_peft3-F16-GGUF'}
LeroyDyer/CheckPoint_E: {'LeroyDyer/CheckPoint_R1_q4_km'}
hkchengrex/MMAudio: {'autophil/MMAudio_SS'}
clecho52/evil-phi-4-mini-lora: {'clecho52/evil-phi-4-mini-lora-F16-GGUF'}
timm/vit_small_patch16_384.augreg_in21k_ft_in1k: {'LPX55/community-forensics-vit'}
stabilityai/ar-stablelm-2-chat: {'IbnAbdeen/ar-stablelm-2-chat-Q4_K_M-GGUF'}
deca-ai/2-pro-medical: {'Blazgo/2-pro-beta'}
kxdw2580/test-1: {'kxdw2580/test-1-Q8_0-GGUF'}
clecho52/phi-3.5-instruct-evil-misaligned-lora: {'clecho52/phi-3.5-instruct-evil-misaligned-lora-F16-GGUF'}
future-technologies/Astra-MMR-R1-Instruct: {'future-technologies/Floyd'}
Quazim0t0/CoT_Phi: {'Quazim0t0/CoT_Phi-Q4_K_M-GGUF'}
BSC-LT/mRoBERTa: {'crodri/nRoBERTA

#### Models form new chains and add to pre-existing chains

In [6]:
import os
import json
import pandas as pd
from datetime import datetime, timedelta

base_path = "../../data"
start_date = datetime(2025, 3, 19)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

weekly_new_chain_counts = []
weekly_pre_existing_counts = []

current = start_date

print("=== Weekly Relational Model Distribution ===")

while current <= end_date:
    date_str = format(current, "%m%d")

    root_file = os.path.join(base_path, "added_root_" + date_str + ".json")
    chain_file = os.path.join(base_path, "model_chains/model_chains_" + date_str + ".txt")
    model_file = os.path.join(base_path, "added_models/added_model_" + date_str + ".csv")
    degree_file = os.path.join(base_path, "model_degree_" + date_str + ".csv")

    try:
        with open(root_file, "r", encoding="utf-8") as f:
            added_roots = set(json.load(f))

        # Gather every model that sits on a chain starting at a newly added root.
        all_models = set()
        with open(chain_file, "r", encoding="utf-8") as f:
            for raw_line in f:
                chain_text = raw_line.strip()
                if "->" not in chain_text:
                    continue
                parts = [p.strip() for p in chain_text.split("->")]
                root = parts[0]
                if root in added_roots:
                    all_models.update(parts)

        df_added = pd.read_csv(model_file)
        added_model_ids = set(df_added["Model ID"].dropna())
        # Newly added models that belong to one of those new-root chains.
        new_models_in_chains = [m for m in all_models if m in added_model_ids]
        new_chain_count = len(new_models_in_chains)

        # Added models with at least one edge; stays 0 when the degree file is absent.
        relational_count = 0
        if os.path.exists(degree_file):
            degree_df = pd.read_csv(degree_file)
            filtered_df = degree_df[degree_df["Model ID"].isin(added_model_ids)]
            has_edge = (filtered_df["In-degree"] > 0) | (filtered_df["Out-degree"] > 0)
            relational_count = len(filtered_df[has_edge])

        # Whatever is relational but not on a new-root chain is attributed to
        # pre-existing chains; the difference is clamped at zero.
        surplus = relational_count - new_chain_count
        pre_existing_count = surplus if surplus > 0 else 0

        weekly_new_chain_counts.append(new_chain_count)
        weekly_pre_existing_counts.append(pre_existing_count)

        print(f"{date_str}: Relational={relational_count}, "
              f"Pre-existing={pre_existing_count}, New Chains={new_chain_count}")

    except FileNotFoundError as e:
        print(f"Missing file: {e.filename}")
    except Exception as e:
        print(f"Error processing {date_str}: {e}")

    current = current + delta

if weekly_new_chain_counts:
    avg_new = sum(weekly_new_chain_counts) / len(weekly_new_chain_counts)
    avg_pre = sum(weekly_pre_existing_counts) / len(weekly_pre_existing_counts)

    print("\n === Summary ===")
    print(f"\nAverage per week: Pre-existing={avg_pre:.2f}, New Chains={avg_new:.2f}")

=== Weekly Relational Model Distribution ===
0319: Relational=8422, Pre-existing=7675, New Chains=747
0326: Relational=8768, Pre-existing=8123, New Chains=645
0402: Relational=9703, Pre-existing=9085, New Chains=618
0409: Relational=9905, Pre-existing=9298, New Chains=607
0416: Relational=7280, Pre-existing=6682, New Chains=598
0423: Relational=7986, Pre-existing=7470, New Chains=516
0430: Relational=8570, Pre-existing=7534, New Chains=1036
0507: Relational=9317, Pre-existing=8846, New Chains=471
0514: Relational=7046, Pre-existing=6612, New Chains=434
0521: Relational=8149, Pre-existing=7614, New Chains=535
0528: Relational=6697, Pre-existing=6381, New Chains=316
0604: Relational=6768, Pre-existing=6167, New Chains=601
0611: Relational=6032, Pre-existing=5716, New Chains=316
0618: Relational=9421, Pre-existing=8781, New Chains=640
0625: Relational=1687, Pre-existing=1557, New Chains=130
0702: Relational=6578, Pre-existing=6117, New Chains=461

 === Summary ===

Average per week: Pre-e

#### Length of added model chains

In [8]:
import os
import json
from datetime import datetime, timedelta

base_path = "../../data"
start_date = datetime(2025, 3, 19)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

weekly_avg_lengths = []
all_lengths = []

current = start_date
while current <= end_date:
    date_str = format(current, "%m%d")

    root_file = os.path.join(base_path, "added_root_" + date_str + ".json")
    chain_file = os.path.join(base_path, "model_chains/model_chains_" + date_str + ".txt")

    try:
        with open(root_file, "r", encoding="utf-8") as f:
            added_roots = set(json.load(f))

        # A chain line looks like "a -> b -> c"; its length is the number of models.
        # Only chains whose first model is a newly added root are measured.
        chain_lengths = []
        with open(chain_file, "r", encoding="utf-8") as f:
            for raw_line in f:
                chain_text = raw_line.strip()
                if "->" in chain_text:
                    parts = [p.strip() for p in chain_text.split("->")]
                    if parts[0] in added_roots:
                        chain_lengths.append(len(parts))

        if not chain_lengths:
            print(f"{date_str}: No chains found for added roots.")
        else:
            avg_len = sum(chain_lengths) / len(chain_lengths)
            weekly_avg_lengths.append(avg_len)
            all_lengths += chain_lengths
            print(f"{date_str}: {len(chain_lengths)} chains, average length = {avg_len:.2f}")

    except FileNotFoundError as e:
        print(f"Missing file: {e.filename}")
    except Exception as e:
        print(f"Error processing {date_str}: {e}")

    current = current + delta

# The overall figure averages over all chains pooled together, so weeks with many
# chains weigh more than weeks with few.
if not all_lengths:
    print(" No chain data available.")
else:
    overall_avg = sum(all_lengths) / len(all_lengths)
    print(f"\nOverall average chain length: {overall_avg:.2f}")

0319: 508 chains, average length = 2.38
0326: 394 chains, average length = 2.15
0402: 386 chains, average length = 2.04
0409: 411 chains, average length = 2.11
0416: 3025 chains, average length = 10.61
0423: 382 chains, average length = 2.29
0430: 817 chains, average length = 2.55
0507: 313 chains, average length = 2.23
0514: 265 chains, average length = 2.11
0521: 352 chains, average length = 2.04
0528: 198 chains, average length = 2.03
0604: 502 chains, average length = 2.19
0611: 197 chains, average length = 2.03
0618: 470 chains, average length = 2.09
0625: 80 chains, average length = 2.04
0702: 306 chains, average length = 2.03

Overall average chain length: 5.16


#### Form added clusters

In [9]:
import pandas as pd
import os
import pickle
from collections import defaultdict, deque
from datetime import datetime, timedelta

# Data root and the inclusive range of weekly snapshot dates handled by this cell.
base_dir = "../../data"
start_date = datetime(2025, 3, 19)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

def bfs(model, graph):
    """Collect every node reachable from ``model`` by breadth-first traversal.

    Args:
        model: Starting node; it is always part of the result.
        graph: Mapping from a node to an iterable of its successors. It is
            read with ``.get``, so nodes without an entry are treated as leaves.

    Returns:
        The set of visited nodes, including ``model`` itself.
    """
    reached = set()
    frontier = deque((model,))
    while frontier:
        node_id = frontier.popleft()
        if node_id in reached:
            continue
        reached.add(node_id)
        # Enqueue only successors that have not been visited yet.
        frontier.extend(succ for succ in graph.get(node_id, ()) if succ not in reached)
    return reached

# For every weekly "added models" CSV, build the base -> derived graph among
# the listed rows and pickle one cluster per root model.
current = start_date
while current <= end_date:
    date_str = current.strftime("%m%d")
    csv_path = os.path.join(base_dir, f"added_models/added_model_{date_str}.csv")
    output_path = os.path.join(base_dir, f"added_model_clusters/added_model_clusters_{date_str}.pkl")

    if not os.path.exists(csv_path):
        print(f"Missing file: {csv_path}")
        current += delta
        continue

    try:
        # Empty cells become "" so the truthiness test on "Base Model" works.
        df = pd.read_csv(csv_path).fillna("")

        forward_graph = defaultdict(set)  # base model -> models derived from it
        reverse_graph = defaultdict(set)  # model -> its base model(s)

        for _, row in df.iterrows():
            model = row["Model ID"]
            base_model = row["Base Model"]
            model_type = str(row["Type"]).lower()

            if base_model and base_model != "N/A":
                # Only "merge" rows are treated as a comma-separated list of bases.
                base_models = [m.strip() for m in base_model.split(",")] if model_type == "merge" else [base_model.strip()]
                for base in base_models:
                    if base:
                        forward_graph[base].add(model)
                        reverse_graph[model].add(base)
            else:
                # Register the model as a node even though it has no base.
                forward_graph.setdefault(model, set())

        all_models = set(df["Model ID"]) | set(m for v in forward_graph.values() for m in v)
        # A root is a listed model for which no base model was recorded.
        root_models = [m for m in all_models if m not in reverse_graph]

        cluster_dict = {}
        for root in root_models:
            cluster = bfs(root, forward_graph)
            # Singleton clusters (a root without any derived model) are dropped.
            if len(cluster) > 1:
                cluster_dict[root] = cluster

        # Create the output folder on first use; without it every week would
        # fail inside this try block and no pickle would be written.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "wb") as f:
            pickle.dump(cluster_dict, f)

        print(f" {date_str}: {len(cluster_dict)} clusters saved to {output_path}")

    except Exception as e:
        print(f"Error processing {date_str}: {e}")

    current += delta

 0319: 236 clusters saved to ../../data/added_model_clusters_0319.pkl
 0326: 239 clusters saved to ../../data/added_model_clusters_0326.pkl
 0402: 242 clusters saved to ../../data/added_model_clusters_0402.pkl
 0409: 217 clusters saved to ../../data/added_model_clusters_0409.pkl
 0416: 195 clusters saved to ../../data/added_model_clusters_0416.pkl
 0423: 187 clusters saved to ../../data/added_model_clusters_0423.pkl
 0430: 178 clusters saved to ../../data/added_model_clusters_0430.pkl
 0507: 159 clusters saved to ../../data/added_model_clusters_0507.pkl
 0514: 164 clusters saved to ../../data/added_model_clusters_0514.pkl
 0521: 197 clusters saved to ../../data/added_model_clusters_0521.pkl
 0528: 117 clusters saved to ../../data/added_model_clusters_0528.pkl
 0604: 151 clusters saved to ../../data/added_model_clusters_0604.pkl
 0611: 117 clusters saved to ../../data/added_model_clusters_0611.pkl
 0618: 221 clusters saved to ../../data/added_model_clusters_0618.pkl
 0625: 48 clusters s

#### The size of added model clusters

In [12]:
import os
import pickle
from datetime import datetime, timedelta

# Report, per weekly snapshot, how many added-model clusters were pickled and
# their mean size, followed by the mean size over all weeks combined.
base_dir = "../../data"
start_date = datetime(2025, 3, 19)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

weekly_averages = []      # one mean cluster size per week that had clusters
all_cluster_sizes = []    # every cluster size across all weeks

current = start_date
while current <= end_date:
    date_str = current.strftime("%m%d")
    cluster_path = os.path.join(base_dir, f"added_model_clusters/added_model_clusters_{date_str}.pkl")

    if os.path.exists(cluster_path):
        try:
            with open(cluster_path, "rb") as pkl_file:
                cluster_dict = pickle.load(pkl_file)

            cluster_sizes = list(map(len, cluster_dict.values()))
            if not cluster_sizes:
                print(f"{date_str}: No clusters found")
            else:
                avg_size = sum(cluster_sizes) / len(cluster_sizes)
                weekly_averages.append(avg_size)
                all_cluster_sizes.extend(cluster_sizes)
                print(f"{date_str}: {len(cluster_sizes)} clusters, avg size = {avg_size:.2f}")

        except Exception as e:
            print(f"Error processing {date_str}: {e}")
    else:
        print(f"Missing file for {date_str}")

    current += delta

if weekly_averages:
    # Overall mean is taken over clusters, not over the weekly means.
    if all_cluster_sizes:
        overall_avg = sum(all_cluster_sizes) / len(all_cluster_sizes)
    else:
        overall_avg = 0
    print("\n")
    print(f"Overall average size : {overall_avg:.2f}")

0319: 236 clusters, avg size = 3.10
0326: 239 clusters, avg size = 2.64
0402: 242 clusters, avg size = 2.53
0409: 217 clusters, avg size = 2.79
0416: 195 clusters, avg size = 2.89
0423: 187 clusters, avg size = 2.80
0430: 178 clusters, avg size = 5.74
0507: 159 clusters, avg size = 2.87
0514: 164 clusters, avg size = 2.62
0521: 197 clusters, avg size = 2.65
0528: 117 clusters, avg size = 2.65
0604: 151 clusters, avg size = 4.08
0611: 117 clusters, avg size = 2.63
0618: 221 clusters, avg size = 2.57
0625: 48 clusters, avg size = 2.67
0702: 153 clusters, avg size = 3.04


Overall average size : 3.01


#### Affected models by base models

In [3]:
import os
import glob
import csv
from datetime import datetime, timedelta

# For each weekly deleted-models list, count the models in the previous week's
# relation snapshot whose "Base Model" was deleted, and how many distinct
# deleted base models that involves.
data_dir = "../../data"
# File names only encode "MMDD"; the snapshots in this notebook are from 2025.
SNAPSHOT_YEAR = 2025

deleted_files = sorted(glob.glob(os.path.join(data_dir, "deleted_models/deleted_model_*.csv")))
batch_files = {
    os.path.basename(f).replace("batch_all_", "").replace(".csv", ""): f
    for f in glob.glob(os.path.join(data_dir, "model_relation/batch_all_*.csv"))
}

weekly_downstream_results = {}

for del_path in deleted_files:
    del_date = os.path.basename(del_path).replace("deleted_model_", "").replace(".csv", "")

    try:
        # Parse with an explicit year: a bare "%m%d" falls back to 1900 and is
        # deprecated since Python 3.13 (day-of-month without a year).
        del_dt = datetime.strptime(f"{SNAPSHOT_YEAR}{del_date}", "%Y%m%d")
        batch_dt = del_dt - timedelta(days=7)
        batch_date = batch_dt.strftime("%m%d")
    except ValueError as e:
        print(f"Parse date failed: {del_date},skipping.Error: {e}")
        continue

    # Deletions are measured against the snapshot taken one week earlier.
    batch_path = batch_files.get(batch_date)
    if not batch_path:
        print(f"File cannot be found: batch_all_{batch_date}.csv.Skipping.")
        continue

    deleted_models = set()
    with open(del_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            model_id = row["Model ID"].strip()
            if model_id:
                deleted_models.add(model_id)

    downstream_models = set()
    affected_base_models = set()
    with open(batch_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            base_model = row["Base Model"].strip()
            model_id = row["Model ID"].strip()
            # NOTE(review): the cluster-building cell splits "Base Model" on
            # commas for rows of type "merge"; this exact-match test would not
            # see a deleted base inside such a list — confirm this is intended.
            if base_model in deleted_models:
                downstream_models.add(model_id)
                affected_base_models.add(base_model)

    weekly_downstream_results[del_date] = {
        "downstream": downstream_models,
        "base_models": affected_base_models
    }

for del_date, data in weekly_downstream_results.items():
    downstream_models = data["downstream"]
    affected_base_models = data["base_models"]
    print(f"\n{del_date}: {len(downstream_models)} models affected,  deleted base models: {len(affected_base_models)}")

# Weekly averages over the weeks that could be processed.
total_downstream = sum(len(data["downstream"]) for data in weekly_downstream_results.values())
total_affected_base_models = sum(len(data["base_models"]) for data in weekly_downstream_results.values())
weeks = len(weekly_downstream_results)
avg_downstream = total_downstream / weeks if weeks else 0
avg_affected_base_models = total_affected_base_models / weeks if weeks else 0

print(f"\nAverage affected models weekly:{round(avg_downstream, 2)} ")
print(f"Average deleted base models weekly:{round(avg_affected_base_models, 2)} ")


0319: 225 models affected,  deleted base models: 127

0326: 222 models affected,  deleted base models: 134

0402: 183 models affected,  deleted base models: 100

0409: 278 models affected,  deleted base models: 181

0416: 1318 models affected,  deleted base models: 131

0423: 217 models affected,  deleted base models: 112

0430: 187 models affected,  deleted base models: 105

0507: 517 models affected,  deleted base models: 292

0514: 100 models affected,  deleted base models: 59

0521: 335 models affected,  deleted base models: 213

0528: 272 models affected,  deleted base models: 133

0604: 267 models affected,  deleted base models: 141

0611: 193 models affected,  deleted base models: 73

0618: 204 models affected,  deleted base models: 130

0625: 135 models affected,  deleted base models: 28

0702: 131 models affected,  deleted base models: 75

Average affected models weekly:299.0 
Average deleted base models weekly:127.12 


#### Disrupted chains in 17 weeks

In [2]:
import os
import glob
import csv
from datetime import datetime, timedelta

# For each weekly deleted-models list, scan the previous week's chain file and
# count chains that contain a deleted non-terminal model, together with the
# distinct models that sit downstream of such a deletion.
data_dir = "../../data"
# File names only encode "MMDD"; the snapshots in this notebook are from 2025.
SNAPSHOT_YEAR = 2025

deleted_files = sorted(glob.glob(os.path.join(data_dir, "deleted_models/deleted_model_*.csv")))
chain_files = {
    os.path.basename(f).replace("model_chains_", "").replace(".txt", ""): f
    for f in glob.glob(os.path.join(data_dir, "model_chains/model_chains_*.txt"))
}

weekly_chain_impact = {}

for del_path in deleted_files:
    del_date = os.path.basename(del_path).replace("deleted_model_", "").replace(".csv", "")

    try:
        # Parse with an explicit year: a bare "%m%d" falls back to 1900 and is
        # deprecated since Python 3.13 (day-of-month without a year).
        del_dt = datetime.strptime(f"{SNAPSHOT_YEAR}{del_date}", "%Y%m%d")
        chain_dt = del_dt - timedelta(days=7)
        chain_date = chain_dt.strftime("%m%d")
    except ValueError as e:
        print(f"Parse date failed: {del_date},skipping.Error : {e}")
        continue

    # Deletions are measured against the chains of the week before.
    chain_path = chain_files.get(chain_date)
    if not chain_path:
        print(f"File cannot be found: model_chains_{chain_date}.txt,Skipping.")
        continue

    deleted_models = set()
    with open(del_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            model_id = row["Model ID"].strip()
            if model_id:
                deleted_models.add(model_id)

    affected_models = set()
    affected_chain_count = 0
    with open(chain_path, "r", encoding="utf-8") as f:
        for line in f:
            if "->" not in line:
                continue
            models = [m.strip() for m in line.strip().split("->")]

            # Only the first deleted model matters: everything after it is
            # downstream, and later hits would add a subset of the same models.
            # The last element is excluded because it has nothing downstream.
            first_deleted = next(
                (idx for idx, name in enumerate(models[:-1]) if name in deleted_models),
                None,
            )
            if first_deleted is not None:
                affected_models.update(models[first_deleted + 1:])
                affected_chain_count += 1

    weekly_chain_impact[del_date] = {
        "affected_models": affected_models,
        "affected_chains": affected_chain_count
    }

for del_date, data in weekly_chain_impact.items():
    print(f"\n {del_date}:")
    print(f"affected models: {len(data['affected_models'])}")
    print(f"disrupted chains:{data['affected_chains']}")

# Weekly averages over the weeks that could be processed.
total_affected_models = sum(len(data["affected_models"]) for data in weekly_chain_impact.values())
total_affected_chains = sum(data["affected_chains"] for data in weekly_chain_impact.values())
weeks = len(weekly_chain_impact)

avg_affected_models = total_affected_models / weeks if weeks else 0
avg_affected_chains = total_affected_chains / weeks if weeks else 0

print(f"\nAverage affected downstream models:{round(avg_affected_models, 2)} ")
print(f"Average disrupted chains: {round(avg_affected_chains, 2)} ")


 0319:
affected models: 337
disrupted chains:1965

 0326:
affected models: 313
disrupted chains:698

 0402:
affected models: 1107
disrupted chains:2667

 0409:
affected models: 331
disrupted chains:2650

 0416:
affected models: 1921
disrupted chains:2564

 0423:
affected models: 569
disrupted chains:1503

 0430:
affected models: 452
disrupted chains:1105

 0507:
affected models: 1155
disrupted chains:5850

 0514:
affected models: 151
disrupted chains:491

 0521:
affected models: 479
disrupted chains:802

 0528:
affected models: 302
disrupted chains:326

 0604:
affected models: 279
disrupted chains:300

 0611:
affected models: 239
disrupted chains:372

 0618:
affected models: 240
disrupted chains:491

 0625:
affected models: 212
disrupted chains:313

 0702:
affected models: 361
disrupted chains:952

Average affected downstream models:528.0 
Average disrupted chains: 1440.56 


#### Disrupted clusters in 17 weeks

In [1]:
import os
import glob
import csv
import pickle
from datetime import datetime, timedelta

# For each weekly deleted-models list, find the clusters of the previous week
# that contain a deleted model which was used as a base model, and report how
# many such clusters there are and how large they are.
data_dir = "../../data"
# File names only encode "MMDD"; the snapshots in this notebook are from 2025.
SNAPSHOT_YEAR = 2025

deleted_files = sorted(glob.glob(os.path.join(data_dir, "deleted_models/deleted_model_*.csv")))
cluster_files = {
    os.path.basename(f).replace("model_clusters_", "").replace(".pkl", ""): f
    for f in glob.glob(os.path.join(data_dir, "model_clusters/model_clusters_*.pkl"))
}
batch_files = {
    os.path.basename(f).replace("batch_all_", "").replace(".csv", ""): f
    for f in glob.glob(os.path.join(data_dir, "model_relation/batch_all_*.csv"))
}

weekly_results = []              # (del_date, cluster_date, broken_count, avg_size)
all_broken_cluster_sizes = []    # size of every disrupted cluster, all weeks
all_broken_cluster_counts = []   # number of disrupted clusters per week

for del_path in deleted_files:
    del_date = os.path.basename(del_path).replace("deleted_model_", "").replace(".csv", "")

    try:
        # Parse with an explicit year: a bare "%m%d" falls back to 1900 and is
        # deprecated since Python 3.13 (day-of-month without a year).
        del_dt = datetime.strptime(f"{SNAPSHOT_YEAR}{del_date}", "%Y%m%d")
        cluster_dt = del_dt - timedelta(days=7)
        cluster_date = cluster_dt.strftime("%m%d")
    except ValueError as e:
        print(f"Parse date failed: {del_date},skipping.Error: {e}")
        continue

    # Both the clusters and the relation table come from the week before.
    cluster_path = cluster_files.get(cluster_date)
    batch_path = batch_files.get(cluster_date)

    if not cluster_path:
        print(f"Files cannot be found: model_clusters_{cluster_date}.pkl.Skipping.")
        continue
    if not batch_path:
        print(f"Files cannot be found: batch_all_{cluster_date}.csv.Skipping.")
        continue

    deleted_models = set()
    with open(del_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            model_id = row["Model ID"].strip()
            if model_id:
                deleted_models.add(model_id)

    # Keep only deleted models that appear as some row's "Base Model".
    deleted_with_downstream = set()
    with open(batch_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            base_model = row["Base Model"].strip()
            if base_model in deleted_models:
                deleted_with_downstream.add(base_model)

    # NOTE: pickle can execute arbitrary code; only load files produced by
    # this project's own pipeline.
    with open(cluster_path, "rb") as f:
        cluster_data = pickle.load(f)

    broken_cluster_sizes = []
    for cluster_id, models in cluster_data.items():
        if any(m in deleted_with_downstream for m in models):
            broken_cluster_sizes.append(len(models))
            all_broken_cluster_sizes.append(len(models))

    broken_count = len(broken_cluster_sizes)
    avg_size = round(sum(broken_cluster_sizes) / broken_count, 2) if broken_count > 0 else 0
    weekly_results.append((del_date, cluster_date, broken_count, avg_size))
    all_broken_cluster_counts.append(broken_count)

# The original report ran the two dates together ("03190312") and labelled
# cluster statistics as "models"; the strings below fix both.
print("Disrupted clusters per week (count and average size)")
for del_date, cluster_date, count, avg_size in weekly_results:
    print(f"{del_date} ← clusters({cluster_date}): {count} disrupted, avg size: {avg_size}")

total_clusters = len(all_broken_cluster_sizes)
total_avg_size = round(sum(all_broken_cluster_sizes) / total_clusters, 2) if total_clusters > 0 else 0

weeks = len(all_broken_cluster_counts)
avg_broken_clusters_per_week = round(sum(all_broken_cluster_counts) / weeks, 2) if weeks else 0

print(f"\nThe average size of all disrupted clusters: {total_avg_size}")
print(f"The average num of disrupted clusters weekly: {avg_broken_clusters_per_week} ")

The average size of disrupted models weekly
03190312: 81,size:762.16
03260319: 73,size:931.77
04020326: 109,size:1097.19
04090402: 244,size:742.0
04160409: 64,size:1001.31
04230416: 77,size:1126.69
04300423: 35,size:1967.77
05070430: 79,size:903.81
05140507: 86,size:1159.49
05210514: 85,size:1771.92
05280521: 38,size:2008.53
06040528: 51,size:1793.75
06110604: 60,size:1182.95
06180611: 56,size:1892.62
06250618: 25,size:1506.64
07020625: 81,size:1143.99

The average size of all disrupted models1163.13
The average num of disrupted models weekly: 77.75 


#### Top chains and clusters analysis

1. Select the top-10 chains for each of the 17 weeks

In [13]:
import os
from datetime import datetime, timedelta

# For each weekly "model_depth" report, pull the chains listed under the
# "Longest Paths:" heading and save the ten with the most "->" hops.
base_dir = "../../data"
start_date = datetime(2025, 3, 12)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

current = start_date
while current <= end_date:
    date_str = current.strftime("%m%d")
    input_file = os.path.join(base_dir, f"model_depth_{date_str}.txt")
    output_file = os.path.join(base_dir, f"top10_chains_{date_str}.txt")

    if os.path.exists(input_file):
        try:
            with open(input_file, "r", encoding="utf-8") as f:
                lines = f.readlines()

            chains = []
            in_chain_section = False

            for raw_line in lines:
                text = raw_line.rstrip("\n")
                stripped = text.strip()

                if stripped == "Longest Paths:":
                    in_chain_section = True
                elif in_chain_section:
                    # A "Depth" line, a blank line, or a line that is not
                    # indented by two spaces closes the section.
                    section_over = (
                        stripped.startswith("Depth")
                        or stripped == ""
                        or not text.startswith("  ")
                    )
                    if section_over:
                        in_chain_section = False
                    else:
                        chains.append((stripped.count("->"), stripped))

            # Stable sort: chains with equal hop counts keep their file order.
            ranked = sorted(chains, key=lambda pair: pair[0], reverse=True)
            top10_chains = ranked[:10]

            with open(output_file, "w", encoding="utf-8") as out_f:
                for _, chain in top10_chains:
                    out_f.write(chain + "\n")

            print(f"Saved top-10 chains to: {output_file}")

        except Exception as e:
            print(f"Error processing {date_str}: {e}")
    else:
        print(f" File not found: {input_file}")

    current += delta

Saved top-10 chains to: ../../data/top10_chains_0312.txt
Saved top-10 chains to: ../../data/top10_chains_0319.txt
Saved top-10 chains to: ../../data/top10_chains_0326.txt
Saved top-10 chains to: ../../data/top10_chains_0402.txt
Saved top-10 chains to: ../../data/top10_chains_0409.txt
Saved top-10 chains to: ../../data/top10_chains_0416.txt
Saved top-10 chains to: ../../data/top10_chains_0423.txt
Saved top-10 chains to: ../../data/top10_chains_0430.txt
Saved top-10 chains to: ../../data/top10_chains_0507.txt
Saved top-10 chains to: ../../data/top10_chains_0514.txt
Saved top-10 chains to: ../../data/top10_chains_0521.txt
Saved top-10 chains to: ../../data/top10_chains_0528.txt
Saved top-10 chains to: ../../data/top10_chains_0604.txt
Saved top-10 chains to: ../../data/top10_chains_0611.txt
Saved top-10 chains to: ../../data/top10_chains_0618.txt
Saved top-10 chains to: ../../data/top10_chains_0625.txt
Saved top-10 chains to: ../../data/top10_chains_0702.txt


2. Select the top-10 clusters for each of the 17 weeks

In [14]:
import os
import pandas as pd
from datetime import datetime, timedelta

start_date = datetime.strptime("2025-03-12", "%Y-%m-%d")
end_date = datetime.strptime("2025-07-02", "%Y-%m-%d")
delta = timedelta(days=7)

# For each weekly cluster-size table, write the "Model Name" of the ten rows
# with the highest "DIS" value to a text file, one name per line.
base_dir = "../../data"

current = start_date
while current <= end_date:
    date_str = current.strftime("%m%d")
    input_file = os.path.join(base_dir, f"model_cluster_size_{date_str}.csv")
    output_file = os.path.join(base_dir, f"top10_cluster_size_{date_str}.txt")

    if os.path.exists(input_file):
        try:
            df = pd.read_csv(input_file)

            ranked = df.sort_values("DIS", ascending=False)
            top10_models = ranked.head(10)["Model Name"].tolist()

            with open(output_file, "w", encoding="utf-8") as f:
                for model in top10_models:
                    f.write(model + "\n")

            print(f" Saved Top-10 models for {date_str} to {output_file}")

        except Exception as e:
            print(f" Error processing {input_file}: {e}")
    else:
        print(f"File not found: {input_file}")

    current += delta

 Saved Top-10 models for 0312 to ../../data/top10_model_dis_0312.txt
 Saved Top-10 models for 0319 to ../../data/top10_model_dis_0319.txt
 Saved Top-10 models for 0326 to ../../data/top10_model_dis_0326.txt
 Saved Top-10 models for 0402 to ../../data/top10_model_dis_0402.txt
 Saved Top-10 models for 0409 to ../../data/top10_model_dis_0409.txt
 Saved Top-10 models for 0416 to ../../data/top10_model_dis_0416.txt
 Saved Top-10 models for 0423 to ../../data/top10_model_dis_0423.txt
 Saved Top-10 models for 0430 to ../../data/top10_model_dis_0430.txt
 Saved Top-10 models for 0507 to ../../data/top10_model_dis_0507.txt
 Saved Top-10 models for 0514 to ../../data/top10_model_dis_0514.txt
 Saved Top-10 models for 0521 to ../../data/top10_model_dis_0521.txt
 Saved Top-10 models for 0528 to ../../data/top10_model_dis_0528.txt
 Saved Top-10 models for 0604 to ../../data/top10_model_dis_0604.txt
 Saved Top-10 models for 0611 to ../../data/top10_model_dis_0611.txt
 Saved Top-10 models for 0618 to .

3. Added models in top-10 chains

In [17]:
import os
import pandas as pd
from datetime import datetime, timedelta

# Count, per week, how many newly added models appear anywhere in that week's
# saved top-10 chains, then print the average over the processed weeks.
# NOTE(review): the recorded output is 0 for every week; confirm that the names
# in the top10_chains files use the same format as the "Model ID" column.
base_dir = "../../data"
start_date = datetime(2025, 3, 19)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

matched_counts = []

current = start_date
while current <= end_date:
    date_str = current.strftime("%m%d")
    chains_file = os.path.join(base_dir, f"top10_chains_{date_str}.txt")
    added_file = os.path.join(base_dir, f"added_models/added_model_{date_str}.csv")

    inputs_present = os.path.exists(chains_file) and os.path.exists(added_file)
    if inputs_present:
        # Every "->"-separated token of every line counts as a chain member.
        with open(chains_file, "r", encoding="utf-8") as f:
            chain_models = {
                token.strip()
                for chain_line in f
                for token in chain_line.strip().split("->")
            }

        df_added = pd.read_csv(added_file)
        added_models = set(df_added["Model ID"].dropna())

        matched_models = added_models & chain_models
        count = len(matched_models)
        matched_counts.append(count)

        print(f"{date_str}: {count} models in top-10 chains")
    else:
        print(f" Missing file for {date_str}, skipped.")

    current += delta

if matched_counts:
    avg_count = sum(matched_counts) / len(matched_counts)
    print("\n Average: ", f"{avg_count:.2f}")

0319: 0 models in top-10 chains
0326: 0 models in top-10 chains
0402: 0 models in top-10 chains
0409: 0 models in top-10 chains
0416: 0 models in top-10 chains
0423: 0 models in top-10 chains
0430: 0 models in top-10 chains
0507: 0 models in top-10 chains
0514: 0 models in top-10 chains
0521: 0 models in top-10 chains
0528: 0 models in top-10 chains
0604: 0 models in top-10 chains
0611: 0 models in top-10 chains
0618: 0 models in top-10 chains
0625: 0 models in top-10 chains
0702: 0 models in top-10 chains

 Average:  0.00


4. Added models in top-10 clusters

In [18]:
import os
import pickle
import pandas as pd
from datetime import datetime, timedelta

# Count, per week, how many newly added models belong to one of that week's
# ten largest clusters, then print the average over the processed weeks.
base_path = "../../data"
start_date = datetime(2025, 3, 19)
end_date = datetime(2025, 7, 2)
delta = timedelta(weeks=1)

matched_counts = []

current = start_date
while current <= end_date:
    date_str = current.strftime("%m%d")
    cluster_file = f"{base_path}/model_clusters/model_clusters_{date_str}.pkl"
    added_file = f"{base_path}/added_models/added_model_{date_str}.csv"

    if os.path.exists(cluster_file) and os.path.exists(added_file):
        with open(cluster_file, "rb") as pkl_file:
            cluster_dict = pickle.load(pkl_file)

        # Largest clusters first; members of the first ten are pooled together.
        by_size = sorted(cluster_dict.values(), key=len, reverse=True)
        top10_clusters = by_size[:10]
        top10_models = set()
        for members in top10_clusters:
            top10_models |= set(members)

        df_added = pd.read_csv(added_file)
        added_models = set(df_added["Model ID"].dropna())

        matched_models = added_models.intersection(top10_models)
        matched_counts.append(len(matched_models))

        print(f"{date_str}: {len(matched_models)} models in top-10 clusters")
    else:
        print(f"Missing file for {date_str}, skipped.")

    current += delta

if matched_counts:
    avg_count = sum(matched_counts) / len(matched_counts)
    print("\n Average: ", f"{avg_count:.2f}")

0319: 1554 models in top-10 clusters
0326: 1863 models in top-10 clusters
0402: 1986 models in top-10 clusters
0409: 1967 models in top-10 clusters
0416: 1265 models in top-10 clusters
0423: 1082 models in top-10 clusters
0430: 1742 models in top-10 clusters
0507: 1512 models in top-10 clusters
0514: 1178 models in top-10 clusters
0521: 1322 models in top-10 clusters
0528: 997 models in top-10 clusters
0604: 1473 models in top-10 clusters
0611: 1095 models in top-10 clusters
0618: 1730 models in top-10 clusters
0625: 358 models in top-10 clusters
0702: 1033 models in top-10 clusters

 Average:  1384.81


5. Deleted models in top-10 clusters

In [4]:
import os
import csv
import glob
import pickle
from datetime import datetime, timedelta

# For each weekly deleted-models list, measure how many of the deleted models
# belonged to one of the ten largest clusters of the previous week.
data_dir = "../../data"
# File names only encode "MMDD"; the snapshots in this notebook are from 2025.
SNAPSHOT_YEAR = 2025

deleted_files = sorted(glob.glob(os.path.join(data_dir, "deleted_models/deleted_model_*.csv")))

weekly_results = []

for deleted_path in deleted_files:
    deleted_date = os.path.basename(deleted_path).replace("deleted_model_", "").replace(".csv", "")
    try:
        # Parse with an explicit year: a bare "%m%d" falls back to 1900 and is
        # deprecated since Python 3.13 (day-of-month without a year).
        deleted_dt = datetime.strptime(f"{SNAPSHOT_YEAR}{deleted_date}", "%Y%m%d")
        cluster_dt = deleted_dt - timedelta(days=7)
        cluster_date = cluster_dt.strftime("%m%d")
    except ValueError as e:
        print(f"Parse date failed: {deleted_date} -> {e}")
        continue

    # Deletions are compared with the clusters of the week before.
    cluster_path = os.path.join(data_dir, f"model_clusters/model_clusters_{cluster_date}.pkl")
    if not os.path.exists(cluster_path):
        print(f"File cannot be found: {cluster_path}")
        continue

    deleted_models = set()
    with open(deleted_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            model_id = row["Model ID"].strip()
            if model_id:
                deleted_models.add(model_id)

    # NOTE: pickle can execute arbitrary code; only load files produced by
    # this project's own pipeline.
    with open(cluster_path, "rb") as f:
        cluster_dict = pickle.load(f)

    clusters_sorted = sorted(cluster_dict.items(), key=lambda x: len(x[1]), reverse=True)
    top10_clusters = clusters_sorted[:10]

    affected_cluster_count = 0
    deleted_models_in_top10 = set()

    for cluster_id, models in top10_clusters:
        intersected = deleted_models.intersection(models)
        if intersected:
            affected_cluster_count += 1
            deleted_models_in_top10.update(intersected)

    weekly_results.append({
        "deleted_date": deleted_date,
        "cluster_date": cluster_date,
        "affected_top10_clusters": affected_cluster_count,
        "deleted_model_count": len(deleted_models),
        "deleted_models_in_top10": len(deleted_models_in_top10),
        # Share of this week's deletions that fell inside a top-10 cluster.
        "percent": round(100 * len(deleted_models_in_top10) / len(deleted_models), 4) if deleted_models else 0.0
    })

print("\n Weekly Top-10 Cluster Impact Summary")
for r in weekly_results:
    print(f"[{r['deleted_date']}] ← Clusters({r['cluster_date']}): "
          f"{r['affected_top10_clusters']} affected clusters, "
          f"{r['deleted_models_in_top10']} of {r['deleted_model_count']} deleted models in top10 "
          f"({r['percent']}%)")

if weekly_results:
    # Unweighted means over the processed weeks.
    avg_clusters = sum(r["affected_top10_clusters"] for r in weekly_results) / len(weekly_results)
    avg_models = sum(r["deleted_models_in_top10"] for r in weekly_results) / len(weekly_results)
    avg_percent = sum(r["percent"] for r in weekly_results) / len(weekly_results)
    print("\nOverall Average:")
    print(f"- Average affected Top-10 clusters: {avg_clusters:.2f}")
    print(f"- Average deleted models in Top-10 clusters weekly: {avg_models:.2f}")
    print(f"- Average percentage: {avg_percent:.2f}%")


 Weekly Top-10 Cluster Impact Summary
[0319] ← Clusters(0312): 8 affected clusters, 135 of 4289 deleted models in top10 (3.1476%)
[0326] ← Clusters(0319): 8 affected clusters, 236 of 4800 deleted models in top10 (4.9167%)
[0402] ← Clusters(0326): 8 affected clusters, 226 of 14243 deleted models in top10 (1.5867%)
[0409] ← Clusters(0402): 9 affected clusters, 203 of 3157 deleted models in top10 (6.4302%)
[0416] ← Clusters(0409): 9 affected clusters, 111 of 4251 deleted models in top10 (2.6112%)
[0423] ← Clusters(0416): 8 affected clusters, 137 of 4075 deleted models in top10 (3.362%)
[0430] ← Clusters(0423): 8 affected clusters, 101 of 4224 deleted models in top10 (2.3911%)
[0507] ← Clusters(0430): 10 affected clusters, 1197 of 19649 deleted models in top10 (6.0919%)
[0514] ← Clusters(0507): 6 affected clusters, 102 of 4725 deleted models in top10 (2.1587%)
[0521] ← Clusters(0514): 10 affected clusters, 206 of 4553 deleted models in top10 (4.5245%)
[0528] ← Clusters(0521): 8 affected c

### Update Analysis

#### Distribution of updated models

In [7]:
import pandas as pd

# Split the updated models (listed in rq2_update_models.txt) by their position
# in the dependency graph, using the in/out-degree table of the 0702 snapshot.
degree_csv = "../../data/model_degree_0702.csv"
implicit_file = "../../data/rq2_update_models.txt"

degree_df = pd.read_csv(degree_csv)

with open(implicit_file, "r", encoding="utf-8") as f:
    # One model id per line; blank lines are ignored.
    implicit_models = {line.strip() for line in f if line.strip()}

is_updated = degree_df["Model ID"].isin(implicit_models)
implicit_df = degree_df[is_updated]

has_incoming = implicit_df["In-degree"] > 0
has_outgoing = implicit_df["Out-degree"] > 0
no_edges = (implicit_df["In-degree"] == 0) & (implicit_df["Out-degree"] == 0)

derived_models = implicit_df[has_incoming]
base_models = implicit_df[has_outgoing]
isolated_models = implicit_df[no_edges]
relational_models = implicit_df[has_incoming | has_outgoing]
total_models = implicit_df

def print_model_stats(name, df):
    """Print ``name`` followed by the row count of ``df`` with thousands separators."""
    n_rows = len(df)
    print(f"{name}: {n_rows:,} models")

print("=== Updated Models by Structural Type ===")
for label, subset in (
    ("Derived models ", derived_models),
    ("Base models", base_models),
    ("Isolated models", isolated_models),
    ("Relational models", relational_models),
    ("All updated models", total_models),
):
    print_model_stats(label, subset)

=== Updated Models by Structural Type ===
Derived models : 37,685 models
Base models: 13,666 models
Isolated models: 99,168 models
Relational models: 46,277 models
All updated models: 145,445 models


#### Relation distribution of updated models  

In [8]:
import csv
from collections import Counter

# Tally the dependency type ("Type" column of the 0702 relation snapshot) of
# every updated model; "adapter" rows are counted as "finetune".
implicit_update_file = "../../data/rq2_update_models.txt"
batch_all_file = "../../data/model_relation/batch_all_0702.csv"

with open(implicit_update_file, "r", encoding="utf-8") as f:
    # One model id per line; blank lines are ignored.
    implicit_models = set(line.strip() for line in f if line.strip())

type_counter = Counter()
total = 0

with open(batch_all_file, "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        model_id = row.get("Model ID")
        if model_id not in implicit_models:
            continue

        # DictReader fills missing trailing fields with None, so fall back to
        # "" before lower-casing instead of raising AttributeError.
        type_raw = (row.get("Type") or "").lower()
        type_normalized = "finetune" if type_raw == "adapter" else type_raw
        type_counter[type_normalized] += 1
        total += 1

def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return (part / whole) * 100 if whole else 0.0

print("=== Updated Models by Dependency Type ===")
print(f"Total updated models with dependency type: {total}")
for model_type, count in type_counter.items():
    pct = _pct(count, total)
    print(f"{model_type}: {count} ({pct:.2f}%)")

# Rows whose type expresses an actual dependency on a base model.
key_types = ["finetune", "quantized", "merge"]
key_total = sum(type_counter.get(k, 0) for k in key_types)
# Guarded: the original divided by `total` directly and crashed when no
# updated model was present in the batch file.
key_pct = _pct(key_total, total)
print(f"\nDependency tuples: {key_total} ({key_pct:.2f}%)")

=== Updated Models by Dependency Type ===
Total updated models with dependency type: 145445
n/a: 107760 (74.09%)
finetune: 30872 (21.23%)
quantized: 5704 (3.92%)
merge: 1109 (0.76%)

Dependency tuples: 37685 (25.91%)
