In [None]:
from datasets import load_dataset
import json
import copy
import os
from chromadb.utils import embedding_functions
from radon.raw import analyze 
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans, AgglomerativeClustering, MiniBatchKMeans
import hdbscan



# Use the project's matplotlib style sheet when it sits next to the notebook;
# otherwise report that it is missing and fall back to the "default" style.
if not os.path.exists("./custom.mplstyle"):
    print("custom.mplstyle not found. Using default style.")
    plt.style.use('default')
else:
    plt.style.use("./custom.mplstyle")

  from .autonotebook import tqdm as notebook_tqdm


custom.mplstyle not found. Using default style.


In [2]:
# Load the CodeContests dataset through Hugging Face `datasets`; later cells
# index it by split name (dataset["train"]).
dataset = load_dataset("deepmind/code_contests")
# Integer language code matched against each solution's "language" entry.
# The cells below treat this code as "Python 3".
PYTHON = 3

In [3]:
def filter_samples(dataset_split, language, max_samples=100000):
    """Collect problems that have at least one solution in ``language``.

    Every kept problem is a deep copy of the input sample whose ``"solutions"``
    field is replaced by ``{"solution": <code>, "language": language}``, where
    ``<code>`` is the first solution whose language entry equals ``language``.

    Samples are skipped when:
      * ``cf_tags`` is exactly ``['']`` (counted and reported once at the end);
      * ``"solutions"`` is missing, is not a dict, lacks the ``"solution"`` or
        ``"language"`` key, or the two are not equally long lists (a warning
        is printed for each such sample);
      * no solution is written in ``language`` (silently).

    Args:
        dataset_split: Iterable of sample dicts.
        language: Value compared for equality against each language entry.
        max_samples: Maximum number of problems to return. Zero or a negative
            value yields an empty list.

    Returns:
        list: The filtered, rewritten samples in input order.
    """
    filtered_samples = []
    skipped_for_empty_tag = 0

    for sample in dataset_split:
        # Enforce the cap before looking at the next sample. Checking only at
        # the end of an iteration let max_samples <= 0 still return one sample.
        if len(filtered_samples) >= max_samples:
            break

        # Problems without any Codeforces tag are excluded up front.
        if "cf_tags" in sample and sample["cf_tags"] == ['']:
            skipped_for_empty_tag += 1
            continue

        if "solutions" not in sample or not isinstance(sample["solutions"], dict):
            print("Warning: Sample 'solutions' is not a dict. Skipping.")
            continue
        if "solution" not in sample["solutions"] or "language" not in sample["solutions"]:
            print("Warning: Sample 'solutions' dict missing 'solution' or 'language' key. Skipping.")
            continue

        solutions = sample["solutions"]["solution"]
        languages = sample["solutions"]["language"]

        # The two fields are parallel lists: languages[i] describes solutions[i].
        if not (isinstance(solutions, list) and
                isinstance(languages, list) and
                len(solutions) == len(languages)):
            print("Warning: Malformed solutions/languages (not parallel lists). Skipping sample.")
            continue

        for i, lang in enumerate(languages):
            if lang == language:
                new_sample = copy.deepcopy(sample)
                # Keep only the first matching solution for this problem.
                new_sample["solutions"] = {
                    "solution": solutions[i],
                    "language": language
                }
                filtered_samples.append(new_sample)
                break

    if skipped_for_empty_tag > 0:
        print(f"Skipped {skipped_for_empty_tag} samples because their cf_tags was [''].")
    return filtered_samples

# Keep one solution with language code PYTHON per tagged training problem
# (at most 100000 problems).
python3_train = filter_samples(dataset["train"], PYTHON, 100000)
print(f"Filtered {len(python3_train)} Python 3 samples from the training set.")

Skipped 3474 samples because their cf_tags was [''].
Filtered 5519 Python 3 samples from the training set.


## Filter by SLOC

In [4]:
import copy
def filter_by_sloc(samples_list, sloc_threshold):
    """
    Return the samples whose solution has more than ``sloc_threshold`` source lines.

    The solution text is read from ``sample["solutions"]["solution"]`` and
    measured with ``analyze(code).sloc``. An empty or whitespace-only solution
    counts as 0 lines without calling ``analyze``. A summary of the counters
    is printed once all samples have been visited.

    Args:
        samples_list (list): Samples to inspect; kept samples are returned as
                             the same objects (no copies are made).
        sloc_threshold (int): Lower bound, exclusive. A sample is kept only
                              when its SLOC is strictly greater than this.

    Returns:
        list: The kept samples, in their original order.
    """
    total = len(samples_list)
    survivors = []
    n_analyzed = 0
    n_kept = 0
    n_bad_structure = 0
    n_radon_failed = 0

    print(f"\nFiltering {total} samples by SLOC > {sloc_threshold}...")

    for idx, sample in enumerate(samples_list):
        code = None  # bound early so the radon warning below can always quote it
        try:
            code = sample["solutions"]["solution"]

            # Anything that is not a string cannot be measured.
            if not isinstance(code, str):
                print(f"  Warning: Sample {idx} 'solution' is not a string (Type: {type(code)}). Skipping.")
                n_bad_structure += 1
                continue

            try:
                is_blank = (not code) or code.isspace()
                line_count = 0 if is_blank else analyze(code).sloc
                n_analyzed += 1
            except Exception as analysis_e:
                print(f"  Warning: Radon analysis failed for sample {idx}: {analysis_e}. Code snippet: '{str(code)[:50]}...'. Skipping sample.")
                n_radon_failed += 1
                continue

            if line_count > sloc_threshold:
                survivors.append(sample)
                n_kept += 1

        except KeyError as e:
            print(f"  Warning: Skipping sample {idx} due to missing key: {e}. Sample structure: {list(sample.keys())}")
            n_bad_structure += 1
        except Exception as e:
            print(f"  Warning: Unexpected error processing sample {idx}: {e}. Skipping.")
            n_bad_structure += 1

    n_dropped = n_analyzed - n_kept
    n_skipped = n_bad_structure + n_radon_failed
    accounted_for = n_kept + n_dropped + n_skipped

    print("Filtering complete.")
    print(f"  Successfully analyzed: {n_analyzed}")
    print(f"  Kept (SLOC > {sloc_threshold}): {n_kept}")
    print(f"  Skipped (Key/Type Error): {n_bad_structure}")
    print(f"  Skipped (Radon Analysis Error): {n_radon_failed}")
    print(f"  Total original samples: {total}")

    # Bookkeeping sanity check: every sample must land in exactly one bucket.
    if total != accounted_for:
        print(f"  Line count check: Original ({total}) vs Kept ({n_kept}) + Filtered ({n_dropped}) + Skipped ({n_skipped}) = {accounted_for}")

    return survivors

In [5]:
# A solution must have strictly more than this many source lines to be kept.
SLOC_MIN_THRESHOLD = 10

# The filtered list goes into a separate name, so python3_train is unchanged by this cell.
python3_train_sloc_filtered = filter_by_sloc(python3_train, SLOC_MIN_THRESHOLD)

print(f"\nOriginal number of Python 3 samples: {len(python3_train)}")
print(f"Filtered number of samples (SLOC > {SLOC_MIN_THRESHOLD}): {len(python3_train_sloc_filtered)}")


Filtering 5519 samples by SLOC > 10...
Filtering complete.
  Successfully analyzed: 5519
  Kept (SLOC > 10): 4596
  Skipped (Key/Type Error): 0
  Skipped (Radon Analysis Error): 0
  Total original samples: 5519

Original number of Python 3 samples: 5519
Filtered number of samples (SLOC > 10): 4596


In [6]:
# Quick look at the sample schema and at one sample's Codeforces tags.
print(python3_train[0].keys())
print(python3_train[500]["cf_tags"])

# NOTE(review): from here on `python3_train` refers to the SLOC-filtered list;
# the unfiltered list is no longer reachable under this name, so the two
# prints above show different samples if this cell is executed a second time.
python3_train = python3_train_sloc_filtered

print(f"\nFiltered number of samples (SLOC > {SLOC_MIN_THRESHOLD}): {len(python3_train)}")


dict_keys(['name', 'description', 'public_tests', 'private_tests', 'generated_tests', 'source', 'difficulty', 'solutions', 'incorrect_solutions', 'cf_contest_id', 'cf_index', 'cf_points', 'cf_rating', 'cf_tags', 'is_description_translated', 'untranslated_description', 'time_limit', 'memory_limit_bytes', 'input_file', 'output_file'])
['*special', 'constructive algorithms']

Filtered number of samples (SLOC > 10): 4596


In [10]:
def load_json_entries(filename):
    """Open ``filename`` as UTF-8 text and return the parsed JSON document."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# JSON file with the generated descriptions; the next step reads each entry
# as a (key, value) pair.
descriptions_path = "LLM_descriptions.json"
descriptions = load_json_entries(descriptions_path)

def transform_to_dict(data):
    """Turn a sequence of ``(key, value, ...)`` items into a ``{key: value}`` dict.

    Only the first two positions of each item are used. When a key occurs
    more than once, the value of its last occurrence is kept.
    """
    return {item[0]: item[1] for item in data}

# Index the descriptions by their key; the printed size shows how many
# distinct keys the file contains.
descriptions_dict = transform_to_dict(descriptions)
print(len(descriptions_dict))

def remove_hyphen_prefix(d):
    """Return a copy of ``d`` whose keys have everything up to the first '-' removed.

    Keys without a hyphen are kept unchanged. If two keys become equal after
    stripping, the value of the later one wins.
    """
    stripped = {}
    for full_key, payload in d.items():
        # partition() splits at the first hyphen only; `hyphen` is empty when
        # the key contains none.
        _, hyphen, remainder = full_key.partition('-')
        stripped[remainder if hyphen else full_key] = payload
    return stripped

# `d` maps the stripped key to its description. The same size as printed
# above means stripping the prefixes did not merge any keys.
d = remove_hyphen_prefix(descriptions_dict)
print(len(d))

8139
8139


In [11]:
old_descriptions = {}
new_descriptions = {}

# For every problem that also has an entry in `d`, record the dataset's own
# description ("old") and the description taken from `d` ("new") under the
# problem name. Problems without an entry in `d` are reported and left out of both.

for entry in python3_train:
    name = entry["name"]
    if name in d:
        old_descriptions[name] = entry["description"]
        new_descriptions[name] = d[name]
    else:
        print(f"Name {name} not found in descriptions dictionary.")

# Both dicts are filled in lockstep, so the two sizes printed here are equal.
print(f"Number of entries in old_descriptions: {len(old_descriptions)}")
print(f"Number of entries in new_descriptions: {len(new_descriptions)}")
# Spot-check one problem (raises KeyError if it was filtered out earlier).
print("-----------------------------")
print(new_descriptions['1037_E. Trips'])
print("-----------------------------")
print(old_descriptions['1037_E. Trips'])

Number of entries in old_descriptions: 4596
Number of entries in new_descriptions: 4596
-----------------------------
1. One-sentence summary:  
   The solution incrementally maintains the size of the k-core of an undirected graph as edges are removed one by one.

2. Core algorithmic approach:  
   Greedy k-core peeling using a queue (BFS-style removal of vertices whose degree falls below k), applied in reverse over the sequence of edge deletions.

3. Reusable components:  
   a. k-core peeling routine: a function that, given a starting queue of “low-degree” vertices, repeatedly pops a vertex, decrements its neighbors’ degrees, and enqueues any that drop below k.  
   b. Degree‐tracking array: an array that holds current degrees (or “remaining neighbor counts”) for all vertices, supporting O(1) updates.  
   c. Reverse-operation pattern: processing a sequence of removals (or additions) in reverse, recording intermediate results as you “undo” operations efficiently.
--------------------

## Get Embeddings for descriptions

In [None]:
def chunked(iterable, chunk_size):
    """Yield consecutive slices of ``iterable``, each at most ``chunk_size`` long.

    ``iterable`` must support ``len()`` and slicing (e.g. a list or a string);
    only the final slice can be shorter than ``chunk_size``.
    """
    starts = range(0, len(iterable), chunk_size)
    yield from (iterable[begin:begin + chunk_size] for begin in starts)
        
        
def get_embeddings_for_programs_batch(old_programs, new_programs):
    """Embed the "old" and "new" text of every program with text-embedding-ada-002.

    Texts are flattened to a single line (newlines become spaces) and sent to
    the OpenAI embedding function in batches of 512.

    Texts that are ``None`` or blank after cleaning are skipped together with
    their name and tag, so names, tags and embeddings always stay aligned.
    (Filtering only the text list shifted every later embedding onto the wrong
    name and tripped the length assertion after all API calls had been paid
    for.) A name whose text was skipped on one side ends up with only the
    other tag in the result.

    Args:
        old_programs (dict): Maps program name to its "old" text.
        new_programs (dict): Maps program name to its "new" text.

    Returns:
        dict: ``{name: {"old": embedding, "new": embedding}}``; a tag is absent
        when that name had no usable text for it.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
        ValueError: If no text is left to embed after cleaning.
        AssertionError: If the number of embeddings returned differs from the
            number of texts sent.
    """
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.environ['OPENAI_API_KEY'],
        model_name="text-embedding-ada-002"
    )

    names = []
    texts = []
    tags = []
    skipped = []

    # All "old" texts first, then all "new" texts, each with its name and tag.
    for tag, programs in (('old', old_programs), ('new', new_programs)):
        for name, raw_text in programs.items():
            cleaned = "" if raw_text is None else str(raw_text).replace("\n", " ").strip()
            if not cleaned:
                # Drop name, tag and text together to keep the lists parallel.
                skipped.append((name, tag))
                continue
            names.append(name)
            texts.append(cleaned)
            tags.append(tag)

    if skipped:
        print(f"Warning: skipped {len(skipped)} empty texts (first few: {skipped[:10]})")

    if not texts:
        raise ValueError("No valid texts to embed.")

    print(f"Total texts to embed: {len(texts)}")

    all_embeddings = []
    batch_size = 512  # safe

    for batch in chunked(texts, batch_size):
        batch_embeddings = openai_ef(batch)
        all_embeddings.extend(batch_embeddings)

    # Guards against the embedding function returning fewer vectors than texts.
    assert len(all_embeddings) == len(names), "Mismatch between embeddings and names!"

    embeddings = {}

    for name, tag, embedding in zip(names, tags, all_embeddings):
        embeddings.setdefault(name, {})[tag] = embedding

    return embeddings

# Calls the OpenAI embedding endpoint for every old and new description, so
# this cell needs OPENAI_API_KEY in the environment.
embeddings = get_embeddings_for_programs_batch(old_descriptions, new_descriptions)
print(f"Number of entries in embeddings: {len(embeddings)}")


## Cluster Data Based On Embeddings

In [None]:
# Problem names in the iteration order of `embeddings`.
# NOTE(review): not referenced again in the cells visible here - confirm it is still needed.
names = list(embeddings.keys())

def cluster_embeddings(X, method="kmeans", num_clusters=100, distance_threshold=None):
    """Cluster the rows of ``X`` and return one label per row.

    Args:
        X: Data handed unchanged to the estimator's ``fit_predict``.
        method: ``"kmeans"``, ``"minibatchkmeans"`` or ``"agglomerative"``.
        num_clusters: Number of clusters. Not used by the agglomerative method
            when ``distance_threshold`` is given.
        distance_threshold: Agglomerative only. When set, clusters are formed
            by this threshold with average linkage instead of a fixed count.

    Returns:
        The labels produced by ``fit_predict``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "kmeans":
        model = KMeans(n_clusters=num_clusters, random_state=0)
    elif method == "minibatchkmeans":
        model = MiniBatchKMeans(n_clusters=num_clusters, random_state=0)
    elif method == "agglomerative":
        if distance_threshold is None:
            model = AgglomerativeClustering(n_clusters=num_clusters)
        else:
            model = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage='average')
    else:
        raise ValueError(f"Unknown clustering method: {method}")

    return model.fit_predict(X)

In [None]:
# get clusters for embeddings (old and new separately)
def get_clusters(embeddings, method="kmeans", num_clusters=100, distance_threshold=None):
    """Cluster the "old" and the "new" embeddings independently.

    Args:
        embeddings (dict): ``{name: {"old": vector, "new": vector}}``.
        method, num_clusters, distance_threshold: Passed on unchanged to
            ``cluster_embeddings`` for both runs.

    Returns:
        tuple: ``(old_labels, new_labels)``; position ``i`` in each refers to
        the ``i``-th entry of ``embeddings`` in iteration order.
    """
    vector_pairs = [(entry["old"], entry["new"]) for entry in embeddings.values()]
    old_matrix = np.array([pair[0] for pair in vector_pairs])
    new_matrix = np.array([pair[1] for pair in vector_pairs])

    return (
        cluster_embeddings(old_matrix, method, num_clusters, distance_threshold),
        cluster_embeddings(new_matrix, method, num_clusters, distance_threshold),
    )

from collections import Counter

In [32]:
# Cluster old and new embeddings separately into a fixed number of clusters.
clusters_old_aglomerative, clusters_new_agglomerative = get_clusters(
    embeddings,
    method="agglomerative",
    num_clusters=120,
)
# Alternative kept for reference (threshold-based instead of a fixed count):
# clusters_old_aglomerative, clusters_new_agglomerative = get_clusters(embeddings, method="agglomerative", distance_threshold=0.64)

print(f"Number of clusters in old embeddings (agglomerative): {len(set(clusters_old_aglomerative))}")
print(f"number of clusters in new embeddings (agglomerative): {len(set(clusters_new_agglomerative))}")

# Per-cluster sizes, old clustering first and then new.
old_counter_agglomerative = Counter(clusters_old_aglomerative)
new_counter_agglomerative = Counter(clusters_new_agglomerative)
for _side, _counter in (("old", old_counter_agglomerative), ("new", new_counter_agglomerative)):
    print(f"Number of elements in each cluster ({_side}):")
    for cluster, count in _counter.items():
        print(f"Cluster {cluster}: {count} elements")

# Mean cluster size = number of clustered items / number of clusters.
old_avg_agglomerative = sum(old_counter_agglomerative.values()) / len(old_counter_agglomerative)
new_avg_agglomerative = sum(new_counter_agglomerative.values()) / len(new_counter_agglomerative)
print(f"Average number of elements in old clusters (agglomerative): {old_avg_agglomerative}")
print(f"Average number of elements in new clusters (agglomerative): {new_avg_agglomerative}")

Number of clusters in old embeddings (agglomerative): 120
number of clusters in new embeddings (agglomerative): 120
Number of elements in each cluster (old):
Cluster 5: 32 elements
Cluster 88: 36 elements
Cluster 77: 31 elements
Cluster 15: 59 elements
Cluster 64: 17 elements
Cluster 23: 45 elements
Cluster 24: 71 elements
Cluster 13: 97 elements
Cluster 2: 44 elements
Cluster 26: 45 elements
Cluster 30: 93 elements
Cluster 35: 32 elements
Cluster 40: 60 elements
Cluster 9: 83 elements
Cluster 7: 23 elements
Cluster 59: 42 elements
Cluster 17: 52 elements
Cluster 34: 50 elements
Cluster 62: 49 elements
Cluster 8: 32 elements
Cluster 69: 65 elements
Cluster 21: 99 elements
Cluster 31: 55 elements
Cluster 68: 37 elements
Cluster 99: 42 elements
Cluster 116: 30 elements
Cluster 49: 44 elements
Cluster 78: 16 elements
Cluster 48: 93 elements
Cluster 66: 35 elements
Cluster 92: 52 elements
Cluster 27: 44 elements
Cluster 56: 42 elements
Cluster 44: 40 elements
Cluster 32: 56 elements
Cluste

## Pick Clusters

In [None]:
import json
import numpy as np
import random
from collections import Counter
from pathlib import Path
import datasets
from tqdm.auto import tqdm
import os

from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

def cosine_similarity(vec1, vec_array):
    """Return the cosine similarity between ``vec1`` and every row of ``vec_array``.

    ``vec1`` is reshaped to a single row before the pairwise computation; the
    result is the first (and only) row of the similarity matrix, i.e. one
    value per row of ``vec_array``.
    """
    return sklearn_cosine_similarity(vec1.reshape(1, -1), vec_array)[0]

def _top_similar_names(anchor_vector, member_indices, all_vectors, all_names, num_similar, cluster_label):
    """Rank one cluster's members by cosine similarity to the anchor and name the best.

    Prints a warning when the cluster holds fewer than ``num_similar`` members,
    in which case all members are returned. ``cluster_label`` is only used in
    that warning text.
    """
    if len(member_indices) < num_similar:
        print(f"    Warning: {cluster_label} has {len(member_indices)} members, less than {num_similar}. Returning all members.")

    similarities = cosine_similarity(anchor_vector, all_vectors[member_indices])
    # Negate so argsort puts the most similar member first. The slice caps
    # itself at the cluster size when there are fewer than num_similar members.
    ranked_positions = np.argsort(-similarities)[:num_similar]
    # Positions are relative to member_indices; map them back to global indices.
    return [all_names[i] for i in member_indices[ranked_positions]]


def pick_similar_problems_clustered_dual_assignments(
    embeddings,
    clusters_new, # Assignments based on new embeddings
    clusters_old, # Assignments based on old embeddings
    num_clusters_to_pick=6,
    min_cluster_size=50,
    num_similar=50,
    seed=None
):
    """
    Picks one random anchor from each of several eligible NEW clusters and,
    for every anchor, finds its nearest neighbors twice:
    1) by NEW embeddings among the members of the anchor's NEW cluster;
    2) by OLD embeddings among the members of the anchor's OLD cluster.

    Args:
        embeddings (dict): {'name': {'new': vec, 'old': vec}}
        clusters_new (list/np.array): Cluster IDs from NEW embeddings, one per
                                     entry of `embeddings` in iteration order.
        clusters_old (list/np.array): Cluster IDs from OLD embeddings, aligned
                                     the same way.
        num_clusters_to_pick (int): How many NEW clusters to sample anchors from.
                                    Reduced to the number of eligible clusters
                                    when fewer are available.
        min_cluster_size (int): Min size for a NEW cluster to be eligible.
        num_similar (int): How many neighbors to return per anchor. The anchor
                           is a member of both clusters it is compared against,
                           so it is part of the candidates.
        seed (int | None): When given, cluster and anchor sampling use a
                           private random.Random(seed), making the selection
                           reproducible. When None (default), the global
                           `random` module state is used as before.

    Returns:
        tuple: (results_new, results_old)
               Dicts mapping each processed NEW cluster ID to a list of
               problem names, most similar first.
               results_new[new_cid] = neighbors found via new emb/new cluster.
               results_old[new_cid] = neighbors found via old emb/anchor's old cluster.
               A cluster appears in the result only if its OLD search
               completed; its results_new entry is [] if the NEW search failed.
               Returns (None, None) when the inputs are empty, misaligned or
               lack an embedding type, and ({}, {}) when no NEW cluster
               reaches min_cluster_size.
    """
    print("Selecting anchors from NEW clusters, finding neighbors in respective clusters...")

    all_names = list(embeddings.keys())
    num_total_problems = len(all_names)

    # --- Input Validation ---
    if num_total_problems == 0:
        print("Error: Embeddings dictionary is empty.")
        return None, None
    if len(clusters_new) != num_total_problems:
        print(f"Error: Length mismatch between embeddings ({num_total_problems}) and clusters_new ({len(clusters_new)}).")
        return None, None
    if len(clusters_old) != num_total_problems:
        print(f"Error: Length mismatch between embeddings ({num_total_problems}) and clusters_old ({len(clusters_old)}).")
        return None, None

    try:
        all_embeddings_new = np.array([embeddings[name]["new"] for name in all_names])
        all_embeddings_old = np.array([embeddings[name]["old"] for name in all_names])
        print(f"Embedding shapes: New {all_embeddings_new.shape}, Old {all_embeddings_old.shape}")
    except KeyError as e:
        print(f"Error: Missing embedding type ({e}).")
        return None, None
    except Exception as e:
        print(f"Error creating embedding arrays: {e}")
        return None, None

    clusters_new_np = np.array(clusters_new)
    clusters_old_np = np.array(clusters_old)

    # Source of randomness: a private generator when seeded, otherwise the
    # module-level functions so existing callers see unchanged behavior.
    rng = random if seed is None else random.Random(seed)

    # --- Step 1: Identify eligible NEW clusters ---
    cluster_counter_new = Counter(clusters_new_np)
    eligible_new_clusters = [cid for cid, count in cluster_counter_new.items() if count >= min_cluster_size]

    print(f"Eligible NEW clusters (size >= {min_cluster_size}): {eligible_new_clusters}")

    if not eligible_new_clusters:
        print(f"Error: No NEW clusters found with size >= {min_cluster_size}.")
        return {}, {} # Return empty dicts
    if len(eligible_new_clusters) < num_clusters_to_pick:
        print(f"Warning: Only {len(eligible_new_clusters)} eligible NEW clusters found. Selecting all.")
        num_clusters_to_pick = len(eligible_new_clusters)

    selected_new_cluster_ids = rng.sample(eligible_new_clusters, num_clusters_to_pick)
    print(f"Selected NEW cluster IDs to sample anchors from: {selected_new_cluster_ids}")

    results_new = {}
    results_old = {}
    processed_new_clusters = []

    # --- Step 2: One anchor per selected NEW cluster, two neighbor searches ---
    for new_cluster_id in selected_new_cluster_ids:
        print(f"\nProcessing NEW Cluster ID: {new_cluster_id}")

        # Global indices of the problems assigned to this NEW cluster
        indices_in_new_cluster = np.where(clusters_new_np == new_cluster_id)[0]

        if len(indices_in_new_cluster) < 1: # Should not happen if eligibility check worked
            print(f"Warning: No problems found for eligible NEW cluster {new_cluster_id}. Skipping.")
            continue

        # Choose anchor from within this NEW cluster
        anchor_idx = rng.choice(indices_in_new_cluster)
        anchor_name = all_names[anchor_idx]
        anchor_old_cluster_id = clusters_old_np[anchor_idx] # Anchor's cluster under the OLD assignment
        print(f"  Anchor: '{anchor_name}' (Index: {anchor_idx}, Belongs to OLD Cluster: {anchor_old_cluster_id})")

        # --- 2a. Neighbors by NEW embeddings inside the NEW cluster ---
        try:
            print(f"  Finding top {num_similar} similar using NEW embeddings within NEW Cluster {new_cluster_id}...")
            results_new[new_cluster_id] = _top_similar_names(
                all_embeddings_new[anchor_idx],
                indices_in_new_cluster,
                all_embeddings_new,
                all_names,
                num_similar,
                f"NEW Cluster {new_cluster_id}",
            )
        except Exception as e:
            print(f"    Error during NEW similarity search for anchor {anchor_name}: {e}")
            results_new[new_cluster_id] = [] # Indicate failure for this part

        # --- 2b. Neighbors by OLD embeddings inside the anchor's OLD cluster ---
        try:
            print(f"  Finding top {num_similar} similar using OLD embeddings within OLD Cluster {anchor_old_cluster_id}...")
            indices_in_old_cluster = np.where(clusters_old_np == anchor_old_cluster_id)[0]

            if len(indices_in_old_cluster) == 0:
                print(f"    Warning: Anchor '{anchor_name}'s OLD cluster {anchor_old_cluster_id} seems empty (error?). Skipping old neighbor search.")
                results_old[new_cluster_id] = []
                continue # Skip to next anchor

            # Stored under the NEW cluster ID so both result dicts share keys
            results_old[new_cluster_id] = _top_similar_names(
                all_embeddings_old[anchor_idx],
                indices_in_old_cluster,
                all_embeddings_old,
                all_names,
                num_similar,
                f"Anchor's OLD Cluster {anchor_old_cluster_id}",
            )

            # Reaching this point means the OLD search completed for this anchor
            processed_new_clusters.append(new_cluster_id)

        except Exception as e:
            print(f"    Error during OLD similarity search for anchor {anchor_name}: {e}")
            results_old[new_cluster_id] = [] # Indicate failure for this part

    # Report only the clusters that were marked as processed above
    final_results_new = {cid: results_new.get(cid, []) for cid in processed_new_clusters}
    final_results_old = {cid: results_old.get(cid, []) for cid in processed_new_clusters}

    return final_results_new, final_results_old


print("Loading name descriptions...")
# Maps problem name -> short description; stays empty if the file is missing.
name2desc = {}
desc_file_path = "LLM_descriptions.jsonl"
try:
    # Explicit UTF-8, matching the other file I/O in this notebook, so the
    # result does not depend on the platform's default encoding.
    with open(desc_file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of aborting the whole load with a JSON decode error.
            if not line.strip():
                continue
            x = json.loads(line)
            # Prefer "short_description"; fall back to "description", then "".
            name2desc[x["name"]] = x.get("short_description", x.get("description", ""))
except FileNotFoundError:
    print(f"Warning: Description file not found at {desc_file_path}. Short descriptions will be empty.")
except Exception as e:
    print(f"Error reading description file {desc_file_path}: {e}")


print("Loading Code Contests dataset...")
# NOTE(review): this rebinds `dataset` to the train split only, whereas the
# earlier filtering cell indexes the same name with dataset["train"].
# Re-running that cell after this one will therefore not behave the same.
try:
    dataset = datasets.load_dataset("deepmind/code_contests", split='train')
    print("Dataset loaded.")
except Exception as e:
    print(f"Error loading dataset 'deepmind/code_contests': {e}")
    print("Cannot proceed without the dataset.")
    # Sentinel checked by save_selected_problem_data, which then refuses to write.
    dataset = None

def save_selected_problem_data(problem_names: list[str], dataset, name2desc_map: dict, output_file: Path, language: int = 3):
    """Write the dataset records of the selected problems to a JSONL file.

    The dataset is scanned once, so records are written in dataset order, not
    in the order of ``problem_names``. Scanning stops as soon as every
    distinct requested name has been seen.

    Args:
        problem_names: Names of the problems to export; duplicates are ignored.
        dataset: Iterable of sample dicts. If it is falsy (e.g. ``None``), an
            error is printed and nothing is written.
        name2desc_map: Maps problem name to its short description; names that
            are absent get an empty string.
        output_file: Destination path. Parent directories are created and an
            existing file is overwritten.
        language: Language code of the solution to export. The first solution
            with this code is used. Defaults to 3, the value that was
            previously hard-coded here.

    Returns:
        None. Progress, warnings and write errors are reported via ``print``.
    """
    selected_set = set(problem_names)
    examples_to_save = []
    # Distinct names rather than a hit counter, so a name that occurs twice in
    # the dataset cannot end the scan before the other names were found.
    found_names = set()

    if not dataset:
        print(f"Error: Dataset not loaded. Cannot save data to {output_file}")
        return

    for sample in tqdm(dataset, desc=f"Saving {output_file.name}", leave=False, ncols=100):
        sample_name = sample.get("name")
        if not sample_name or sample_name not in selected_set:
            continue

        found_names.add(sample_name)
        solution_info = sample.get("solutions", {})
        solutions = solution_info.get("solution", [])
        languages = solution_info.get("language", [])
        # solutions[i] is written in languages[i]; zip stops at the shorter list.
        python_solution = next(
            (code for code, lang in zip(solutions, languages) if lang == language),
            None,
        )

        entry = {
            "name": sample_name,
            "description": sample.get("description", ""),
            "solution": python_solution if python_solution is not None else "PYTHON_SOLUTION_NOT_FOUND",
            "difficulty": sample.get("difficulty", -1),
            "public_tests": sample.get("public_tests", {}),
            "private_tests": sample.get("private_tests", {}),
            "generated_tests": sample.get("generated_tests", {}),
            "short_description": name2desc_map.get(sample_name, ""),
        }
        examples_to_save.append(entry)

        if len(found_names) == len(selected_set):
            break

    # Surface requested problems that the dataset does not contain instead of
    # silently writing a shorter file.
    missing_names = selected_set - found_names
    if missing_names:
        print(f"  Warning: {len(missing_names)} requested problem(s) not found in dataset: {sorted(map(str, missing_names))}")

    output_file.parent.mkdir(parents=True, exist_ok=True)
    try:
        with open(output_file, "w", encoding='utf-8') as f:
            for sample in examples_to_save:
                json.dump(sample, f)
                f.write("\n")
        print(f"  Successfully saved {len(examples_to_save)} problems to {output_file.name}")
    except Exception as e:
        print(f"  Error writing to {output_file}: {e}")


Loading name descriptions...
Loading Code Contests dataset...
Dataset loaded.

Running problem selection using clustered seeds and dual clustered neighbors...
Selecting anchors from NEW clusters, finding neighbors in respective clusters...
Embedding shapes: New (4596, 1536), Old (4596, 1536)
Eligible NEW clusters (size >= 30): [34, 60, 49, 19, 48, 1, 46, 32, 5, 13, 98, 71, 11, 44, 16, 70, 33, 39, 25, 58, 14, 12, 38, 63, 10, 84, 83, 15, 28, 2, 35, 65, 51, 21, 8, 47, 57, 89, 73, 29, 76, 18, 37, 50, 31, 24, 56, 0, 23, 40, 3, 30, 61, 59, 4, 72, 97, 74, 20, 99, 66, 96, 26, 67, 94, 68, 9, 22, 7, 6, 52, 79, 41, 53, 36, 69, 87]
Selected NEW cluster IDs to sample anchors from: [57, 18, 8, 1, 26, 3]

Processing NEW Cluster ID: 57
  Anchor: '75_D. Big Maximum Sum' (Index: 1315, Belongs to OLD Cluster: 43)
  Finding top 30 similar using NEW embeddings within NEW Cluster 57...
  Finding top 30 similar using OLD embeddings within OLD Cluster 43...

Processing NEW Cluster ID: 18
  Anchor: '1038_E. Ma

                                                                                                    

  Successfully saved 30 problems to 0.jsonl


                                                                                                    

  Successfully saved 30 problems to 0.jsonl
--------------------
Processing anchor from NEW Cluster 3 (saving as index 1)


                                                                                                    

  Successfully saved 30 problems to 1.jsonl


                                                                                                    

  Successfully saved 30 problems to 1.jsonl
--------------------
Processing anchor from NEW Cluster 8 (saving as index 2)


                                                                                                    

  Successfully saved 30 problems to 2.jsonl


                                                                                                    

  Successfully saved 30 problems to 2.jsonl
--------------------
Processing anchor from NEW Cluster 18 (saving as index 3)


                                                                                                    

  Successfully saved 30 problems to 3.jsonl


                                                                                                    

  Successfully saved 30 problems to 3.jsonl
--------------------
Processing anchor from NEW Cluster 26 (saving as index 4)


                                                                                                    

  Successfully saved 30 problems to 4.jsonl


                                                                                                    

  Successfully saved 30 problems to 4.jsonl
--------------------
Processing anchor from NEW Cluster 57 (saving as index 5)


                                                                                                    

  Successfully saved 30 problems to 5.jsonl


                                                                                                    

  Successfully saved 30 problems to 5.jsonl

Script finished.




In [None]:
output_base_dir = Path("./clustered_removed_empty_labels") # Change this to your desired output directory

# One sub-directory per neighbor flavor; file <i>.jsonl in each belongs to the same anchor.
old_output_dir = output_base_dir / "old"
new_output_dir = output_base_dir / "new"

for _target_dir in (old_output_dir, new_output_dir):
    _target_dir.mkdir(parents=True, exist_ok=True)


print("\nRunning problem selection using clustered seeds and dual clustered neighbors...")
try:
    # Tunable parameters for the selection
    num_clusters = 10
    min_size = 30 # Minimum size for a cluster to be eligible
    num_sim = 30 # Number of similar problems to find per anchor

    # Anchors come from NEW clusters; neighbors are searched in both assignments
    selected_new_sim, selected_old_sim = pick_similar_problems_clustered_dual_assignments(
        embeddings,
        clusters_new=clusters_new_agglomerative,
        clusters_old=clusters_old_aglomerative,
        num_clusters_to_pick=num_clusters,
        min_cluster_size=min_size,
        num_similar=num_sim
    )

    if selected_new_sim is None or selected_old_sim is None:
        print("Problem selection function returned an error. Cannot save results.")
    else:
        print("\nSaving selected problem data...")
        processed_new_cluster_ids = sorted(selected_new_sim)

        if processed_new_cluster_ids:
            print(f"Saving data based on anchors from NEW clusters: {processed_new_cluster_ids}")
            # Files are numbered 0, 1, 2... following the sorted cluster IDs
            for i, new_cluster_id in enumerate(processed_new_cluster_ids):
                print("-" * 20)
                print(f"Processing anchor from NEW Cluster {new_cluster_id} (saving as index {i})")

                new_file_path = new_output_dir / f"{i}.jsonl"
                old_file_path = old_output_dir / f"{i}.jsonl"

                problem_names_new = selected_new_sim.get(new_cluster_id, [])
                problem_names_old = selected_old_sim.get(new_cluster_id, [])

                # Same save-or-report step for both flavors, "new" first
                for _kind, _problem_names, _file_path in (
                    ("new", problem_names_new, new_file_path),
                    ("old", problem_names_old, old_file_path),
                ):
                    if _problem_names:
                        save_selected_problem_data(_problem_names, dataset, name2desc, _file_path)
                    else:
                        print(f"  No '{_kind}' neighbors found or error for anchor from NEW cluster {new_cluster_id}. Skipping save for {_file_path.name}")
        else:
            print("No NEW cluster anchors were successfully processed to save.")


except ValueError as e:
    print(f"ValueError: {e}")
except NameError as e:
    print(f"NameError: Required variable likely not defined ({e})")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


print("\nScript finished.")