So the idea is, we will actually cherry-pick. We will use DiCE with N random seeds to produce N counterfactulas (the set EFA), we will then choose then use some utility function to pick the real counterfactual. Then we will use some other utility (e.g. not change gender) to return a different counterfactual (a cherry-picked one)

In [3]:
import pandas as pd
import numpy as np
import altair as alt

import dice_ml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from utils import load_data

In [4]:
df = load_data.load_adult()
df = df[["sex"] + [c for c in df.columns if c != "sex"]]


train_df, test_df = train_test_split(df, test_size=0.2, random_state=5)

trn_x = train_df.drop(columns=["Label"])
trn_y = train_df["Label"]
tst_x = test_df.drop(columns=["Label"])
tst_y = test_df["Label"]

# model = LogisticRegression(random_state=5)

model = RandomForestClassifier(n_estimators=5, max_depth=5, random_state=5)
model.fit(trn_x, trn_y)

model.score(tst_x, tst_y)

0.8488074521445389

In [5]:
fi = pd.DataFrame({
    "feature": df.drop(columns=["Label"]).columns,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False).reset_index(drop=True)
fi

Unnamed: 0,feature,importance
0,relationship,0.3161362
1,education-num,0.1989306
2,capital-gain,0.1639147
3,age,0.1129018
4,marital-status,0.09412849
5,education,0.04678014
6,sex,0.02660399
7,capital-loss,0.02543101
8,hours-per-week,0.00857475
9,occupation,0.00454452


In [6]:
continuous_features = [
    "age", "fnlwgt", "education-num",
    "capital-gain", "capital-loss", "hours-per-week"
]

dice_ml_model = dice_ml.Model(model=model, backend="sklearn")

dice_ml_data = dice_ml.Data(
    dataframe=train_df,
    continuous_features=continuous_features,
    outcome_name="Label"
)

print(type(dice_ml_data),type(dice_ml_model))

explainer = dice_ml.Dice(dice_ml_data,dice_ml_model,method="random")

def generate_explantion_space(explainer, factuals, num_seeds):
    seeds = np.random.randint(0, 1000, size=num_seeds)
    dfs = []
    for seed in seeds:
        exp = explainer.generate_counterfactuals(
            factuals,
            total_CFs=1,
            desired_class="opposite",
            random_seed=seed
        )
        for i, cfex in enumerate(exp.cf_examples_list):
            df = cfex.final_cfs_df.copy()
            df["model_seed"] = seed
            df["factual_id"] = factuals.index[i]
            dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

explanation_space = generate_explantion_space(explainer, tst_x.iloc[0:100], num_seeds=10)
explanation_space

<class 'dice_ml.data_interfaces.public_data_interface.PublicData'> <class 'dice_ml.model_interfaces.base_model.BaseModel'>


  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected

Unnamed: 0,sex,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,native-country,Label,model_seed,factual_id
0,1,33.0,6,108438.0,11,13.6,2,3,0,4,83321.4,0.0,40.0,39,0,790,27816
1,0,43.0,4,59460.0,9,13.0,4,3,3,4,91680.5,0.0,40.0,39,0,790,36570
2,1,36.0,4,76845.0,11,9.0,4,8,3,2,33639.7,0.0,35.0,39,0,790,15793
3,1,27.0,4,31757.0,8,11.0,4,3,3,4,62129.8,0.0,38.0,39,0,790,755
4,1,31.0,4,92179.0,0,6.0,0,7,1,4,33639.7,0.0,40.0,39,0,790,40504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,19.0,4,97261.0,2,8.0,4,6,1,4,16309.1,0.0,40.0,39,0,268,27548
996,0,26.0,4,102106.0,0,6.0,2,1,5,4,29332.7,0.0,40.0,39,0,268,8302
997,1,29.0,0,199074.0,11,9.0,4,0,3,4,45009.6,0.0,20.0,39,0,268,40484
998,1,31.8,7,142547.0,11,9.0,2,4,0,4,20199.2,0.0,40.0,39,0,268,5386


In [98]:
explanation_space_scored.model_seed.unique()

array([790, 252, 909, 954,  52, 125, 967, 919,  58, 268])

In [7]:
features = [c for c in tst_x.columns]

edit_counts = (
    explanation_space
    .merge(tst_x, left_on="factual_id", right_index=True, suffixes=("_cf", "_f"))
    .assign(**{f: lambda d, f=f: (d[f"{f}_cf"] != d[f"{f}_f"]).astype(int) for f in features})
    .groupby("factual_id")[features]
    .sum()
    .reset_index()
)

edit_counts


Unnamed: 0,factual_id,sex,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,native-country
0,755,1,1,1,0,0,0,1,1,0,0,10,2,0,0
1,774,0,0,0,0,0,1,1,0,1,2,10,2,1,1
2,2513,0,0,1,0,0,1,1,0,0,1,4,5,1,1
3,2836,0,1,1,0,0,0,0,2,0,2,10,0,0,0
4,3002,1,0,0,2,0,1,0,0,0,0,10,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,45945,0,3,1,0,3,5,0,0,2,0,0,0,0,0
96,47388,0,0,0,0,0,2,1,1,0,1,10,0,1,0
97,47521,0,1,0,0,0,3,0,1,0,2,10,1,1,0
98,47921,0,0,0,1,1,1,0,0,0,0,8,2,0,0


In [8]:
import numpy as np

def calc_sparsity(factual, counterfactual):
    return int((factual != counterfactual).sum())

def calc_proximity(factual, counterfactual):
    return float(np.linalg.norm((counterfactual - factual).to_numpy()))

def cherry_utility_sparsity(factual, counterfactual, excluded_features):
    for feat in excluded_features:
        if feat in factual.index and feat in counterfactual.index:
            if counterfactual[feat] != factual[feat]:
                return len(factual)
    return calc_sparsity(factual, counterfactual)

def cherry_utility_proximity(factual, counterfactual, excluded_features):
    for feat in excluded_features:
        if feat in factual.index and feat in counterfactual.index:
            if counterfactual[feat] != factual[feat]:
                return 10_000_000.0
    return calc_proximity(factual, counterfactual)


excluded_features = ["relationship", "marital-status", "race"]

explanation_space_scored = explanation_space.copy()

sparsities = []
proximities = []
utility_sparsity = []
utility_proximity = []

for _, cf in explanation_space_scored.iterrows():
    f = tst_x.loc[cf["factual_id"]]
    c = cf.drop(["factual_id", "model_seed"]).reindex(f.index)

    s = calc_sparsity(f, c)
    p = calc_proximity(f, c)

    us = cherry_utility_sparsity(f, c, excluded_features)
    up = cherry_utility_proximity(f, c, excluded_features)

    sparsities.append(s)
    proximities.append(p)
    utility_sparsity.append(us)
    utility_proximity.append(up)

explanation_space_scored["sparsity"] = sparsities
explanation_space_scored["proximity"] = proximities
explanation_space_scored["utility_sparsity"] = utility_sparsity
explanation_space_scored["utility_proximity"] = utility_proximity


def assign_ranks(g, primary, utility_col):
    """
    Returns a rank_code series with:
      1 = optimal (best primary)
      2 = cherry-picked (best utility among remaining)
      3 = not picked
    """
    r = pd.Series(3, index=g.index)

    i1 = g.sort_values([primary, utility_col, "sparsity"]).index[0]
    r.loc[i1] = 1

    rest = g.drop(index=i1)
    i2 = rest.sort_values([utility_col, primary, "sparsity"]).index[0]
    r.loc[i2] = 2

    return r


def add_rank_codes(factual_group):
    g = factual_group.copy()
    g["rank_code_proximity"] = assign_ranks(g, primary="proximity", utility_col="utility_proximity")
    g["rank_code_sparsity"]  = assign_ranks(g, primary="sparsity",  utility_col="utility_sparsity")
    return g


explanation_space_scored = (
    explanation_space_scored
    .groupby("factual_id", group_keys=False)
    .apply(add_rank_codes)
)

explanation_space_scored

  .apply(add_rank_codes)


Unnamed: 0,sex,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,...,native-country,Label,model_seed,factual_id,sparsity,proximity,utility_sparsity,utility_proximity,rank_code_proximity,rank_code_sparsity
0,1,33.0,6,108438.0,11,13.6,2,3,0,4,...,39,0,790,27816,2,83321.400127,2,83321.400127,3,3
1,0,43.0,4,59460.0,9,13.0,4,3,3,4,...,39,0,790,36570,2,91680.500442,2,91680.500442,3,3
2,1,36.0,4,76845.0,11,9.0,4,8,3,2,...,39,0,790,15793,1,33639.700000,1,33639.700000,3,1
3,1,27.0,4,31757.0,8,11.0,4,3,3,4,...,39,0,790,755,1,62129.800000,1,62129.800000,3,1
4,1,31.0,4,92179.0,0,6.0,0,7,1,4,...,39,0,790,40504,1,33639.700000,1,33639.700000,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,19.0,4,97261.0,2,8.0,4,6,1,4,...,39,0,268,27548,1,16309.100000,1,16309.100000,1,3
996,0,26.0,4,102106.0,0,6.0,2,1,5,4,...,39,0,268,8302,1,29332.700000,1,29332.700000,1,2
997,1,29.0,0,199074.0,11,9.0,4,0,3,4,...,39,0,268,40484,1,45009.600000,1,45009.600000,3,3
998,1,31.8,7,142547.0,11,9.0,2,4,0,4,...,39,0,268,5386,2,20199.201917,2,20199.201917,3,3


In [9]:
# count factual_ids where sparsity(cherry-picked) != sparsity(optimal)
n_diff = (
    explanation_space_scored
    .loc[explanation_space_scored["rank_code_sparsity"].isin([1, 2]), ["factual_id", "rank_code_sparsity", "sparsity"]]
    .pivot_table(index="factual_id", columns="rank_code_sparsity", values="sparsity", aggfunc="min")
    .rename(columns={1: "opt", 2: "cherry"})
    .eval("opt != cherry")
    .sum()
)

n_diff

np.int64(3)

In [10]:
mean_sparsity_by_rank = (
    explanation_space_scored
    .groupby("rank_code_sparsity")["sparsity"]
    .mean()
)

mean_sparsity_by_rank


rank_code_sparsity
1    1.080
2    1.110
3    1.705
Name: sparsity, dtype: float64

In [11]:
explanation_space_scored.rank_code_sparsity.value_counts()

rank_code_sparsity
3    800
1    100
2    100
Name: count, dtype: int64

In [12]:
import pandas as pd
import altair as alt

rank_col = "rank_code_sparsity"
metric = "sparsity"

df = explanation_space_scored.copy()

fids = sorted(df["factual_id"].unique())
df["instance"] = df["factual_id"].map({fid: i + 1 for i, fid in enumerate(fids)})

xmin, xmax = 0, float(df[metric].max())

label_map = {1: "optimal", 2: "cherry-picked", 3: "not picked"}
df["type"] = df[rank_col].map(label_map)

picked = df[df[rank_col].isin([1, 2])].copy()

wide = (
    picked
    .pivot_table(index="instance", columns="type", values=metric, aggfunc="min")
    .reindex(columns=["optimal", "cherry-picked"])
    .reset_index()
)

picked_long = wide.melt(
    id_vars="instance",
    value_vars=["optimal", "cherry-picked"],
    var_name="type",
    value_name=metric,
)

segments = wide.assign(
    y2=wide["instance"],
    x=wide["optimal"],
    x2=wide["cherry-picked"],
)

not_picked = df[(df[rank_col] == 3) & df[metric].between(xmin, xmax)][["instance", metric]].copy()
not_picked["type"] = "not picked"

colour_scale = alt.Scale(
    domain=["optimal", "cherry-picked", "not picked"],
    range=["#1f77b4", "#ff7f0e", "#9e9e9e"],
)

x = alt.X(f"{metric}:Q", title="Sparsity", scale=alt.Scale(domain=[xmin, xmax]))
y = alt.Y("instance:O", title="Instance", sort=list(range(1, 11)))
c = alt.Color("type:N", title=None, scale=colour_scale)

layer_not = alt.Chart(not_picked).mark_point(filled=True, opacity=0.35, size=80).encode(x=x, y=y, color=c)
layer_seg = alt.Chart(segments).mark_rule(opacity=0.6).encode(y="instance:O", y2="y2:O", x="x:Q", x2="x2:Q")
layer_picked = alt.Chart(picked_long).mark_point(filled=True, size=120).encode(x=x, y=y, color=c)

chart = (
    (layer_not + layer_seg + layer_picked)
    .properties(width=560, height=240)
    .configure_axis(labelFontSize=14, titleFontSize=16)
    .configure_legend(labelFontSize=14)
)

chart

In [13]:
import pandas as pd
import altair as alt

rank_col = "rank_code_sparsity"
df = explanation_space_scored.copy()

# Instance 1..10
fids = sorted(df["factual_id"].unique())
df["instance"] = df["factual_id"].map({fid: i + 1 for i, fid in enumerate(fids)})

label_map = {1: "optimal", 2: "cherry-picked"}
picked = df[df[rank_col].isin([1, 2])][["instance", "sparsity", rank_col]].copy()
picked["type"] = picked[rank_col].map(label_map)

chart = (
    alt.Chart(picked)
    .mark_rect()
    .encode(
        x=alt.X("type:N", title=None, sort=["optimal", "cherry-picked"]),
        y=alt.Y("instance:O", title="Instance", sort=list(range(1, 11))),
        color=alt.Color("sparsity:Q", title="Sparsity"),
        tooltip=["instance:O", "type:N", "sparsity:Q"]
    )
    .properties(width=220, height=240)
    .configure_axis(labelFontSize=14, titleFontSize=16)
    .configure_legend(labelFontSize=14)
)

chart

In [14]:
import pandas as pd
import altair as alt

rank_col = "rank_code_sparsity"
df = explanation_space_scored.copy()

fids = sorted(df["factual_id"].unique())
df["instance"] = df["factual_id"].map({fid: i + 1 for i, fid in enumerate(fids)})

picked = df[df[rank_col].isin([1, 2])][["instance", "sparsity", rank_col]].copy()
picked["type"] = picked[rank_col].map({1: "optimal", 2: "cherry-picked"})

wide = (
    picked
    .pivot_table(index="instance", columns="type", values="sparsity", aggfunc="min")
    .reset_index()
)

wide["delta"] = wide["cherry-picked"] - wide["optimal"]

zero = pd.DataFrame({"delta": [0]})

chart = (
    alt.Chart(wide).mark_point(filled=True, size=140).encode(
        x=alt.X("delta:Q", title="Δ sparsity (cherry-picked − optimal)"),
        y=alt.Y("instance:O", title="Instance", sort=list(range(1, 11))),
        tooltip=["instance:O", "optimal:Q", "cherry-picked:Q", "delta:Q"]
    )
    + alt.Chart(zero).mark_rule().encode(x="delta:Q")
)

chart.properties(width=520, height=240).configure_axis(labelFontSize=14, titleFontSize=16)

In [17]:
# count factual_ids where sparsity(cherry-picked) != sparsity(optimal)
n_diff = (
    explanation_space_scored
    .loc[explanation_space_scored["rank_code_sparsity"].isin([1, 2]), ["factual_id", "rank_code_sparsity", "sparsity"]]
    .pivot_table(index="factual_id", columns="rank_code_sparsity", values="sparsity", aggfunc="min")
    .rename(columns={1: "opt", 2: "cherry"})
    .eval("opt != cherry")
    .sum()
)

n_diff

np.int64(3)

In [91]:
explanation_space_scored

Unnamed: 0,sex,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,...,native-country,Label,model_seed,factual_id,sparsity,proximity,utility_sparsity,utility_proximity,rank_code_proximity,rank_code_sparsity
0,1,33.0,6,108438.0,11,13.6,2,3,0,4,...,39,0,790,27816,2,83321.400127,2,83321.400127,3,3
1,0,43.0,4,59460.0,9,13.0,4,3,3,4,...,39,0,790,36570,2,91680.500442,2,91680.500442,3,3
2,1,36.0,4,76845.0,11,9.0,4,8,3,2,...,39,0,790,15793,1,33639.700000,1,33639.700000,3,1
3,1,27.0,4,31757.0,8,11.0,4,3,3,4,...,39,0,790,755,1,62129.800000,1,62129.800000,3,1
4,1,31.0,4,92179.0,0,6.0,0,7,1,4,...,39,0,790,40504,1,33639.700000,1,33639.700000,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,19.0,4,97261.0,2,8.0,4,6,1,4,...,39,0,268,27548,1,16309.100000,1,16309.100000,1,3
996,0,26.0,4,102106.0,0,6.0,2,1,5,4,...,39,0,268,8302,1,29332.700000,1,29332.700000,1,2
997,1,29.0,0,199074.0,11,9.0,4,0,3,4,...,39,0,268,40484,1,45009.600000,1,45009.600000,3,3
998,1,31.8,7,142547.0,11,9.0,2,4,0,4,...,39,0,268,5386,2,20199.201917,2,20199.201917,3,3


In [None]:
rank_col = "rank_code_proximity"

s = df_60k[rank_col]
print("dtype:", s.dtype)
print("head:", s.head(10).tolist())
print("unique (raw) sample:", s.dropna().astype(str).str.strip().unique()[:20])
print("value counts (raw):")
print(s.dropna().astype(str).str.strip().value_counts().head(20))

dtype: int64
head: [3, 3, 3, 1, 1, 3, 3, 3, 3, 3]
unique (raw) sample: ['3' '1' '2']
value counts (raw):
rank_code_proximity
3    488
1    100
2    100
Name: count, dtype: int64


In [27]:
import pandas as pd
from pathlib import Path

# Do not modify the original dataframe
df_60k = explanation_space_scored.copy()

# Ensure proximity is numeric (robust to object dtype)
df_60k["proximity"] = pd.to_numeric(df_60k["proximity"], errors="coerce")

# Remove rows with missing proximity and cut off anything above 60,000
MAX_PROX = 60_000
df_60k = df_60k.dropna(subset=["proximity"])
df_60k = df_60k[df_60k["proximity"] <= MAX_PROX].copy()

# Optional: reset index for cleanliness
df_60k.reset_index(drop=True, inplace=True)

# Save to a sensible CSV (current working directory)
out_path = Path("explanation_space_scored_proximity_le_60000.csv")
df_60k.to_csv(out_path, index=False)

print(f"Saved {len(df_60k):,} rows to: {out_path.resolve()}")
df_60k.head()


Saved 688 rows to: /home/james/work/pcherry_code/explanation_space_scored_proximity_le_60000.csv


Unnamed: 0,sex,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,...,native-country,Label,model_seed,factual_id,sparsity,proximity,utility_sparsity,utility_proximity,rank_code_proximity,rank_code_sparsity
0,1,36.0,4,76845.0,11,9.0,4,8,3,2,...,39,0,790,15793,1,33639.7,1,33639.7,3,1
1,1,31.0,4,92179.0,0,6.0,0,7,1,4,...,39,0,790,40504,1,33639.7,1,33639.7,3,1
2,0,46.0,5,167882.0,8,11.0,6,4,3,4,...,39,1,790,44004,2,6.324555,14,10000000.0,3,1
3,1,31.0,5,113752.0,11,14.5,2,4,0,4,...,39,0,790,45790,1,5.5,1,5.5,1,1
4,1,35.0,4,33975.0,7,13.2,2,3,0,4,...,39,0,790,10748,1,1.2,1,1.2,1,1


In [None]:
plot_ids = df_60k["factual_id"].unique()[:10]

plot_df = df_60k.copy()
plot_df = plot_df[plot_df["factual_id"].isin(plot_ids)].copy()

id_to_instance = {fid: i+1 for i, fid in enumerate(plot_ids)}
plot_df["instance"] = plot_df["factual_id"].map(id_to_instance)

plot_df

Unnamed: 0,sex,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,...,Label,model_seed,factual_id,sparsity,proximity,utility_sparsity,utility_proximity,rank_code_proximity,rank_code_sparsity,instance
0,1,36.0,4,76845.0,11,9.0,4,8,3,2,...,0,790,15793,1,33639.700000,1,3.363970e+04,3,1,1
1,1,31.0,4,92179.0,0,6.0,0,7,1,4,...,0,790,40504,1,33639.700000,1,3.363970e+04,3,1,2
2,0,46.0,5,167882.0,8,11.0,6,4,3,4,...,1,790,44004,2,6.324555,14,1.000000e+07,3,1,3
3,1,31.0,5,113752.0,11,14.5,2,4,0,4,...,0,790,45790,1,5.500000,1,5.500000e+00,1,1,4
4,1,35.0,4,33975.0,7,13.2,2,3,0,4,...,0,790,10748,1,1.200000,1,1.200000e+00,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,1,35.0,4,33975.0,7,12.0,2,3,0,4,...,0,268,10748,2,58005.601689,2,5.800560e+04,3,3,5
625,0,27.0,4,217379.0,15,10.0,0,10,3,4,...,0,268,31315,1,29332.700000,1,2.933270e+04,2,3,6
626,1,32.0,7,230224.0,7,12.0,2,8,0,4,...,0,268,5509,2,3035.394933,2,3.035395e+03,2,3,8
627,1,24.3,6,73684.0,9,13.0,2,12,0,4,...,1,268,33671,2,5064.393051,2,5.064393e+03,3,3,9


In [123]:
import numpy as np
import pandas as pd

# Start from your existing plot_df (already filtered + has instance)
plot_df = plot_df.copy()

# Human-readable labels
rank_map = {1: "optimal", 2: "cherry-picked", 3: "not picked"}
plot_df["rank_label"] = plot_df["rank_code_proximity"].map(rank_map)

# Build a per-instance table with optimal/cherry-picked proximities
pairs_wide = (
    plot_df.loc[plot_df["rank_code_proximity"].isin([1, 2]), ["instance", "rank_label", "proximity"]]
    .pivot_table(index="instance", columns="rank_label", values="proximity", aggfunc="first")
    .reset_index()
)

# Keep only instances where both exist
pairs_wide = pairs_wide.dropna(subset=["optimal", "cherry-picked"])

# Long form for drawing the line (2 rows per instance)
line_df = (
    pairs_wide
    .melt(id_vars=["instance"], value_vars=["optimal", "cherry-picked"],
          var_name="rank_label", value_name="proximity")
)
line_df["rank_order"] = np.where(line_df["rank_label"].eq("optimal"), 0, 1)

line_df.to_csv("cherry-pick_proximity_example_plot_line.csv", index=False)
plot_df.to_csv("cherry-pick_proximity_example_plot_data.csv", index=False)


In [124]:
import pandas as pd

# line_df columns assumed: instance, proximity, rank_label
seg_df = (
    line_df.pivot_table(index="instance", columns="rank_label", values="proximity", aggfunc="first")
    .dropna(subset=["optimal", "cherry-picked"])
    .reset_index()
)

seg_df["mid"] = (seg_df["optimal"] + seg_df["cherry-picked"]) / 2.0
seg_df["err"] = (seg_df["optimal"] - seg_df["cherry-picked"]).abs() / 2.0

seg_df[["instance", "mid", "err"]].to_csv(
    "cherry-pick_proximity_example_plot_segments.csv", index=False
)


In [125]:
import altair as alt

colour_scale = alt.Scale(
    domain=["optimal", "cherry-picked", "not picked"],
    range=["#1f77b4", "#ff7f0e", "#bdbdbd"],
)

points = alt.Chart(plot_df).mark_point(
    filled=True, size=180
).encode(
    x=alt.X("instance:O", axis=alt.Axis(title="Instance", labelFontSize=16, titleFontSize=18)),
    y=alt.Y("proximity:Q", axis=alt.Axis(title="Proximity", labelFontSize=16, titleFontSize=18)),
    color=alt.Color(
        "rank_label:N",
        scale=colour_scale,
        legend=alt.Legend(title=None, labelFontSize=16, symbolSize=200)
    )
)

lines = alt.Chart(line_df).mark_line(strokeWidth=3, color="#4d4d4d").encode(
    x="instance:O",
    y="proximity:Q",
    detail="instance:O",
    order=alt.Order("rank_order:Q", sort="ascending")
)

(lines + points).properties(width=800, height=450).configure_view(stroke=None)


In [126]:
# line_df currently has: instance, rank_label, proximity, rank_order
# Create a wide table with one row per instance: optimal, cherry_picked

line_wide = (
    line_df.pivot_table(index="instance", columns="rank_label", values="proximity", aggfunc="first")
    .reset_index()
    .rename(columns={"cherry-picked": "cherry_picked"})  # avoid hyphen for LaTeX
)

line_wide.to_csv("cherry-pick_proximity_example_plot_line_wide.csv", index=False)


In [127]:
import numpy as np
import pandas as pd

# ---------- POINTS (scatter) ----------
points_tex = plot_df.loc[:, ["instance", "proximity", "rank_code_proximity"]].copy()
points_tex = points_tex.rename(columns={
    "instance": "x",
    "proximity": "y",
    "rank_code_proximity": "rank",
})

# Ensure numeric types (PGFPlots hates NaNs/strings in numeric columns)
points_tex["x"] = pd.to_numeric(points_tex["x"], errors="coerce")
points_tex["y"] = pd.to_numeric(points_tex["y"], errors="coerce")
points_tex["rank"] = pd.to_numeric(points_tex["rank"], errors="coerce").astype("Int64")
points_tex = points_tex.dropna(subset=["x", "y", "rank"])

points_tex.to_csv("points_tex.csv", index=False)

# ---------- SEGMENTS (lines between rank 1 and 2) ----------
# Build a wide table: one row per instance with y_opt, y_cherry
pairs = (
    plot_df.loc[plot_df["rank_code_proximity"].isin([1, 2]), ["instance", "rank_code_proximity", "proximity"]]
    .copy()
)

pairs["instance"] = pd.to_numeric(pairs["instance"], errors="coerce")
pairs["proximity"] = pd.to_numeric(pairs["proximity"], errors="coerce")
pairs = pairs.dropna(subset=["instance", "proximity", "rank_code_proximity"])

wide = (
    pairs.pivot_table(index="instance", columns="rank_code_proximity", values="proximity", aggfunc="first")
    .rename(columns={1: "y_opt", 2: "y_cherry"})
    .dropna(subset=["y_opt", "y_cherry"])
    .reset_index()
    .sort_values("instance")
)

# Convert to a "polyline with breaks" format:
# (x, y_opt), (x, y_cherry), blank row, repeated for each instance
seg_rows = []
for _, r in wide.iterrows():
    x = float(r["instance"])
    seg_rows.append({"x": x, "y": float(r["y_opt"])})
    seg_rows.append({"x": x, "y": float(r["y_cherry"])})
    seg_rows.append({"x": np.nan, "y": np.nan})  # blank line => break in PGFPlots

segments_tex = pd.DataFrame(seg_rows)
segments_tex.to_csv("segments_tex.csv", index=False, na_rep="")

print("Wrote points_tex.csv and segments_tex.csv")


Wrote points_tex.csv and segments_tex.csv


In [128]:
import numpy as np
import pandas as pd

# ---------- POINTS ----------
points_tex = plot_df.loc[:, ["instance", "proximity", "rank_code_proximity"]].copy()
points_tex = points_tex.rename(columns={"instance": "x", "proximity": "y", "rank_code_proximity": "rank"})

points_tex["x"] = pd.to_numeric(points_tex["x"], errors="coerce")
points_tex["y"] = pd.to_numeric(points_tex["y"], errors="coerce")
points_tex["rank"] = pd.to_numeric(points_tex["rank"], errors="coerce").astype("Int64")
points_tex = points_tex.dropna(subset=["x", "y", "rank"])

points_tex.to_csv("points_tex.csv", index=False)

# ---------- SEGMENTS (optimal <-> cherry-picked, same instance) ----------
pairs = plot_df.loc[plot_df["rank_code_proximity"].isin([1, 2]), ["instance", "rank_code_proximity", "proximity"]].copy()
pairs["instance"] = pd.to_numeric(pairs["instance"], errors="coerce")
pairs["proximity"] = pd.to_numeric(pairs["proximity"], errors="coerce")
pairs["rank_code_proximity"] = pd.to_numeric(pairs["rank_code_proximity"], errors="coerce").astype("Int64")
pairs = pairs.dropna(subset=["instance", "proximity", "rank_code_proximity"])

wide = (
    pairs.pivot_table(index="instance", columns="rank_code_proximity", values="proximity", aggfunc="first")
    .rename(columns={1: "y_opt", 2: "y_cherry"})
    .dropna(subset=["y_opt", "y_cherry"])
    .reset_index()
    .sort_values("instance")
)

seg_rows = []
for _, r in wide.iterrows():
    x = float(r["instance"])
    seg_rows.append({"x": x, "y": float(r["y_opt"])})
    seg_rows.append({"x": x, "y": float(r["y_cherry"])})
    seg_rows.append({"x": np.nan, "y": np.nan})  # blank row => BREAK (no joining to next instance)

segments_tex = pd.DataFrame(seg_rows)
segments_tex.to_csv("segments_tex.csv", index=False, na_rep="")

print("Wrote points_tex.csv and segments_tex.csv")


Wrote points_tex.csv and segments_tex.csv
