In [None]:
import json
from pathlib import Path
import matplotlib as mpl
import tikzplotlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rich.progress import track
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.dummy import DummyClassifier
from sklearn.metrics import RocCurveDisplay, precision_recall_fscore_support, accuracy_score, balanced_accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MaxAbsScaler

In [None]:
def parse_files(root_path):
    """
    Load every training-result JSON below ``root_path`` into one DataFrame.

    Every ``*.json`` file found recursively under
    ``root_path / "train-output-all-features"`` is read. The file it names in
    its ``original_target`` entry (resolved relative to ``root_path``) is
    flattened with ``pd.json_normalize`` and repeated once per record of the
    result's ``f1_scores`` list, so each returned row carries one F1 score
    together with the model path.

    Parameters
    ----------
    root_path : pathlib.Path
        Directory containing ``train-output-all-features`` and the files
        referenced by ``original_target``.

    Returns
    -------
    pd.DataFrame
        Indexed by ``(percentage, model, use_gradient, use_ewma)``. The dotted
        column names are split into levels, the ``k_fold`` and
        ``preprocessing`` groups are dropped and only the last level of the
        remaining column names is kept.
    """
    files = list(root_path.joinpath("train-output-all-features").glob("**/*.json"))
    # Collect the per-file frames and concatenate once after the loop:
    # concatenating inside the loop re-copies all rows gathered so far.
    frames = []

    for file in track(
        files,
        description="Loading hyperparameters and "
        "performance data from file to DataFrame",
    ):
        with open(file) as f:
            data = json.load(f)

        original_target = root_path.joinpath(data["original_target"])
        original_dict = json.loads(Path(original_target).read_text())

        f1_scores = pd.json_normalize(data, "f1_scores")
        normalized_data = pd.json_normalize(original_dict)
        # One copy of the flattened configuration per recorded F1 score.
        normalized_data = pd.concat(
            [normalized_data] * len(f1_scores), ignore_index=True
        )
        normalized_data["metrics.f1_score"] = f1_scores
        normalized_data["paths.model_path"] = data["model_path"]
        frames.append(normalized_data)

    # pd.concat rejects an empty list; fall back to the empty frame the
    # incremental version started from.
    results = pd.concat(frames) if frames else pd.DataFrame()

    results = results.set_index(
        ["percentage", "model", "use_gradient", "use_ewma"]
    )
    results.columns = pd.MultiIndex.from_arrays(
        zip(*results.columns.str.split(".", expand=True))
    )
    results = results.sort_index(axis=1)
    results = results.drop(columns=['k_fold', 'preprocessing'])
    results = results.droplevel(0, axis=1)

    return results


# Parse every result file below ./analysis/ and show the flattened table.
data_flattened = parse_files(Path("./analysis/"))
data_flattened

In [None]:
# Remove any runs that included use_ewma or use_gradient: keep only the rows
# whose third and fourth index levels are both False, then drop those levels.
without_gradient_or_ewma = pd.IndexSlice[:, :, False, False]
df = (
    data_flattened.loc[without_gradient_or_ewma, :]
    .droplevel(['use_gradient', 'use_ewma'])
    .sort_values(by=['percentage', 'model'])
)
df

In [None]:
def float_or_list_to_tuple(x):
    """Return ``tuple(x)`` if ``x`` is a list, otherwise return ``x`` unchanged."""
    return tuple(x) if isinstance(x, list) else x

# Lists are unhashable; turn them into tuples so the column can be used as a
# groupby key.
df["hidden_layer_sizes"] = df["hidden_layer_sizes"].map(float_or_list_to_tuple)

In [None]:
# Mean/std of the F1 score for every (percentage, model, hyperparameter) combination.
hyperparameter_columns = ['C', 'hidden_layer_sizes', 'kernel', 'learning_rate', 'max_depth', 'n_estimators', 'alpha']
group = df.groupby(['percentage', 'model'] + hyperparameter_columns, dropna=False)
summary = group['f1_score'].agg(['mean', 'std'])

# Index label of the best row per (percentage, model), for each summary column.
idx = summary.groupby(['percentage', 'model']).idxmax()

best_hyperparams = summary.loc[idx['mean'], :]
best_per_percentage = best_hyperparams.groupby('percentage')['mean'].idxmax()

# Restrict to the selected percentages and the tree-based models.
percentage_level = best_hyperparams.index.get_level_values('percentage')
model_level = best_hyperparams.index.get_level_values('model')
is_selected = percentage_level.isin([1, 5, 10, 15, 20]) & model_level.isin(['RF', 'ET', 'DT'])
selected_percentages = best_hyperparams[is_selected]

# Only max_depth and n_estimators remain in the index next to percentage and model.
dropped_levels = ['C', 'hidden_layer_sizes', 'kernel', 'learning_rate', 'alpha']
models_to_test = selected_percentages.droplevel(dropped_levels)
models_to_test


In [None]:
# Build one untrained classifier per (percentage, model) from the selected
# hyperparameters, plus a stratified dummy baseline for each percentage.
random_state = 42
classifiers = {}
for percentage, model, max_depth, n_estimators in models_to_test.index:
    # A NaN depth in the index means "no depth limit".
    max_depth = None if np.isnan(max_depth) else int(max_depth)
    if model != 'DT':
        n_estimators = int(n_estimators)

    key = (percentage, model)
    if model == 'DT':
        classifiers[key] = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state, class_weight='balanced')
    elif model == 'RF':
        classifiers[key] = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=random_state, class_weight='balanced_subsample')
    elif model == 'ET':
        classifiers[key] = ExtraTreesClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=random_state, class_weight='balanced')
    classifiers[(percentage, 'Dum')] = DummyClassifier(strategy='stratified', random_state=random_state)
classifiers

In [None]:
import pickle

def preprocess(dataframe, scaler=None):
    """
    Scale the feature columns of ``dataframe`` and return them with the scaler.

    The ``mzn`` and ``dzn`` columns are removed before scaling, as is every
    column whose name contains ``ewma`` or ``gradient``. All remaining columns
    are passed through the scaler, and the original ``mzn`` column is attached
    to the scaled result again.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain ``mzn`` and ``dzn`` columns; column names must be strings.
    scaler : fitted scaler or None
        ``None``: constant columns (exactly one distinct value) are dropped and
        a new ``MaxAbsScaler`` is fitted on what remains.
        Otherwise: exactly the columns in ``scaler.feature_names_in_`` are
        selected, in that order, so columns the scaler never saw cannot make
        ``transform`` fail.

    Returns
    -------
    (pd.DataFrame, scaler)
        The scaled frame (including ``mzn``) and the scaler that was used.

    Raises
    ------
    KeyError
        If a fitted ``scaler`` is given and one of its features is missing
        from ``dataframe``.
    """
    mzn = dataframe['mzn']
    result = dataframe.drop(columns=['mzn', 'dzn'])

    # Drop any columns that contain the text 'ewma' or 'gradient'
    result = result.drop(columns=result.columns[result.columns.str.contains('ewma|gradient')])

    if scaler is None:
        result = result.drop(columns=result.columns[result.nunique() == 1])
        scaler = MaxAbsScaler().fit(result)
    else:
        # Align with the columns the scaler was fitted on. Columns that were
        # constant during fitting are unknown to the scaler (and to any model
        # trained on its output), so they are left out even if they vary here.
        result = result.loc[:, list(scaler.feature_names_in_)]

    result = pd.DataFrame(scaler.transform(result), columns=result.columns, index=result.index)

    result['mzn'] = mzn

    return result, scaler

# Scaler used for each (percentage, model); needed again to scale the test data.
scalers = {}

train_pkl = Path('./analysis/rerun_with_all_features_train.pkl')

# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open(train_pkl, 'rb') as f:
    train_features_at_percentage = pickle.load(f)

# preprocess() depends only on the percentage, so run it once per percentage
# instead of once per (percentage, model).
preprocessed_train = {}

for percentage, model in classifiers:
    print(f"Training {model} at {percentage}%")
    if percentage not in preprocessed_train:
        preprocessed_train[percentage] = preprocess(train_features_at_percentage[percentage])
    df, scaler = preprocessed_train[percentage]
    scalers[(percentage, model)] = scaler
    df = df.drop(columns=['mzn'], axis=1)
    any_nan = df.isna().any().any()

    if any_nan:
        # The classifier for this key is left unfitted in this case.
        print("The NaN values are in the following columns:")
        print(df.columns[df.isna().any()])
    else:
        train_X, train_y = df.drop(columns=['solved_within_time_limit']), df['solved_within_time_limit']
        print(train_y.value_counts(normalize=False))
        classifiers[(percentage, model)].fit(train_X, train_y)


In [None]:
# Feature names as they come out of preprocessing the first training set,
# without the instance name and the target column.
feature_frame, _ = preprocess(train_features_at_percentage[1])
feature_names = feature_frame.drop(['mzn', 'solved_within_time_limit'], axis=1).columns

# (description, translated) for every feature, in the same order as feature_names.
feature_notes = [
    ("Number of conflicts", False),
    ("Number of decisions made/nodes", False),
    ("Number of search iterations completed", False),
    ("??? (number of assigned nodes or variables)", False),
    ("Number of variables", False),
    ("Number of backjumps", False),
    ("Number of solutions found", False),
    ("Total time spent", False),
    ("Time spent searching", False),
    ("Number of integer variables", False),
    ("Number of propagations", False),
    ("Number of SAT propagations", False),
    ("Number of propagators", False),
    ("Number of boolean variables", False),
    ("Number of learnt clauses", False),
    ("Number of binary clauses", False),
    ("Number of ternary clauses", False),
    ("Number of clauses longer than 3", False),
    ("The maximum decision level reached", False),
    ("Current decision level of the engine", False),
    ("Size of the tree of the decision level", False),
    ("Amount of memory used by clauses", False),
    ("Amount of memory used by propagators", False),
    ("Ratio of failures/conflicts to unassigned variables", True),
    ("Ratio of visited nodes to open nodes", True),
    ("Fraction of variables that are boolean", False),
    ("Ratio of propagations to variables", False),
    ("Fraction of clauses that are longer than 3", False),
    ("How often the engine backtracks (backjumps / total search time)", True),
]

describe_features = pd.DataFrame({'feature': feature_names})
describe_features['description'] = [description for description, _ in feature_notes]
describe_features['translated'] = [translated for _, translated in feature_notes]
describe_features

In [None]:
# Show the column headers of describe_features with a capitalised first letter
# (display only; describe_features itself is not modified).
describe_features.columns.str.capitalize()

In [None]:
test_pickle = Path('./analysis/rerun_with_all_features_test.pkl')

# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open(test_pickle, 'rb') as f:
    test_features_at_percentage = pickle.load(f)

# 2x3 grid of panels: one ROC plot per percentage (five are used, the sixth
# is switched off below).
fig, ax = plt.subplots(2, 3, figsize=(6, 5), sharex=True, sharey=True)

fig.subplots_adjust(hspace=0.1, wspace=0.3)
ax = ax.flatten()
# One row per (percentage, model): accuracy, balanced accuracy, precision, recall, F1.
scores = []

for i, percentage in enumerate([1, 5, 10, 15, 20]):
    for model in ['ET', 'RF', 'Dum']:
        scaler = scalers[(percentage, model)]
        df, _ = preprocess(test_features_at_percentage[percentage], scaler)
        df = df.drop(columns=['mzn'], axis=1)

        test_X, test_y = df.drop(columns=['solved_within_time_limit']), df['solved_within_time_limit']
        classifier = classifiers[(percentage, model)]

        y_score = classifier.predict_proba(test_X)[:, 1]
        if model != 'Dum':
            # Not named `display`: that would shadow IPython's display() for
            # the rest of the notebook session.
            roc_display = RocCurveDisplay.from_predictions(test_y, y_score, ax=ax[i], name=model)
            line = roc_display.line_
            # Shorten the legend entry by removing the "AUC = " prefix.
            line.set_label(line.get_label().replace('AUC = ', ''))
            ax[i].legend()
            ax[i].set_xlabel('')
            ax[i].set_ylabel('')

        # Predict once and reuse the result for every metric.
        y_pred = classifier.predict(test_X)
        # pos_label=False: precision/recall/F1 are reported for the "not solved" class.
        prec, rec, score, support = precision_recall_fscore_support(test_y, y_pred, average='binary', pos_label=False)
        accuracy = accuracy_score(test_y, y_pred)
        balanced_accuracy = balanced_accuracy_score(test_y, y_pred)

        scores.append([percentage, model, accuracy, balanced_accuracy, prec, rec, score])

    ax[i].set_title(f"{percentage}%", fontsize=10)
    # Diagonal reference line of a random classifier.
    ax[i].plot([0, 1], [0, 1], linestyle='--', lw=1, color='r', alpha=.2)


fig.supxlabel('False Positive Rate', fontsize=20, y=-0.05, x=0.54)
fig.supylabel('True Positive Rate', fontsize=20, x=-0.01, y=0.4)
ax[5].set_axis_off()
fig.tight_layout()

tikzplotlib.clean_figure(fig)
tikzplotlib.save("roc_curves.tex", axis_width="0.36\\textwidth", axis_height="0.36\\textwidth")

In [None]:
# Wide table of the test metrics: one row per percentage, one column per (metric, model).
score_columns = ['Percentage', 'Model', 'Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'F1 score']
score_df = (
    pd.DataFrame(scores, columns=score_columns)
    .set_index(['Percentage', 'Model'])
    .unstack()
)
score_df.index = score_df.index.astype(str)+"%"

# Bold the best F1 score in every row.
f1_columns = pd.IndexSlice[:, ['F1 score']]
score_df.round(2).style.highlight_max(axis=1, props="font-weight:bold;", subset=f1_columns)

In [None]:
# Export the metrics (without plain accuracy) as a LaTeX table, with the best
# F1 score of every row wrapped in \textbf.
(score_df.drop(columns=["Accuracy"]).round(2)
    .style.format_index(escape='latex')
    .format(precision=2)
    .highlight_max(axis=1, props="textbf:--rwrap;", subset=pd.IndexSlice[:, ['F1 score']])
    .to_latex(
        "performance-table.tex",
        hrules=True,
        multicol_align='c',
        # Raw string: "\%" in a normal string literal is an invalid escape
        # sequence (a SyntaxWarning on recent Python versions).
        caption=r"Performance measures for each model at varying \% of TL.",
        label="tab:performance-measures",
        position_float="centering")
)

In [None]:
fig = plt.figure(figsize=(100, 15))
decision_tree_5 = classifiers[(5, 'DT')]
# Take the feature names from the fitted tree itself. The leftover global `df`
# is the 20% test frame and still contains the target column, so its columns
# need not line up with the features this 5% tree was trained on.
plot_tree(decision_tree_5, max_depth=6, feature_names=list(decision_tree_5.feature_names_in_), class_names=['unsolved', 'solved'], filled=True, fontsize=8, proportion=True);

In [None]:
def get_importances(percentage, model, classifiers, test_X, test_y):
    """
    Permutation importance of every feature for one fitted classifier.

    Looks up ``classifiers[(percentage, model)]`` and runs
    ``permutation_importance`` on ``test_X`` / ``test_y`` with 10 repeats and a
    fixed random state.

    Returns a DataFrame with columns ``mean`` and ``std``, indexed by the
    estimator's ``feature_names_in_``.
    """
    estimator = classifiers[(percentage, model)]
    permuted = permutation_importance(estimator, test_X, test_y, n_repeats=10, random_state=42, n_jobs=-1)
    columns = {'mean': permuted.importances_mean, 'std': permuted.importances_std}
    return pd.DataFrame(columns, index=estimator.feature_names_in_)

# One bar chart of RF permutation importances per percentage.
fig, ax = plt.subplots(5, 1, figsize=(10, 20), sharex=False, sharey=False)
fig.subplots_adjust(hspace=1, wspace=0.3)
ax = ax.flatten()

for i, percentage in enumerate([1, 5, 10, 15, 20]):
    # Scale the test data with the scaler fitted for THIS percentage. The
    # global `scaler` left over from an earlier cell belongs to the last
    # percentage processed there and mis-scales every other percentage.
    df, _ = preprocess(test_features_at_percentage[percentage], scalers[(percentage, 'RF')])
    df = df.drop(columns=['mzn'], axis=1)
    test_X, test_y = df.drop(columns=['solved_within_time_limit']), df['solved_within_time_limit']
    importance = get_importances(percentage, 'RF', classifiers, test_X, test_y).sort_values(by='mean', ascending=False)
    ax[i] = importance['mean'].plot.bar(yerr=importance['std'], capsize=2, ax=ax[i], title=f"{percentage}%", rot=90)

In [None]:
# ET permutation importances for the percentage and test split left over from
# the loop in the previous cell (its last iteration).
importance = get_importances(
    percentage, 'ET', classifiers, test_X, test_y
).sort_values(by='mean', ascending=False)
importance['mean'].plot.bar(yerr=importance['std'], capsize=2)

In [None]:
# The feature-correlation analysis below uses the training features only; the
# commented-out expression would pool train and test sets per key instead.
all_features = train_features_at_percentage #{i: pd.concat([train_features_at_percentage[i], test_features_at_percentage[i]]) for i in range(1, len(train_features_at_percentage) + 1)}

In [None]:
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
from collections import defaultdict


def get_rankings_for_percentages(percentage, depth, model='RF'):
    """
    Rank de-correlated features by permutation importance for one percentage.

    Features are clustered hierarchically (Ward linkage on ``1 - |Spearman
    correlation|``) and the first feature of every cluster is kept. A fresh
    100-tree classifier is trained on the kept features and their permutation
    importance is measured on the test set and drawn as a box plot on a new
    figure.

    Parameters
    ----------
    percentage
        Key into ``all_features``, ``train_features_at_percentage``,
        ``test_features_at_percentage`` and ``scalers``.
    depth : float
        Distance threshold passed to ``hierarchy.fcluster``.
    model : str
        ``'RF'`` trains a ``RandomForestClassifier``; any other value trains an
        ``ExtraTreesClassifier``.

    Returns
    -------
    list of str
        One label per kept feature, most important first, of the form
        ``"<feature> (<other features of the same cluster>)"``.
    """
    # Use the scaler fitted on this percentage's training data. The notebook
    # global `scaler` is whatever an earlier cell's loop left behind and
    # belongs to a single, unrelated percentage.
    scaler = scalers[(percentage, 'RF' if model == 'RF' else 'ET')]

    df, _ = preprocess(all_features[percentage], scaler)
    df = df.drop(columns=['mzn'], axis=1)
    X = df.drop(columns=['solved_within_time_limit'])

    corr = spearmanr(X).correlation

    # Ensure the correlation matrix is symmetric
    corr = (corr + corr.T) / 2
    np.fill_diagonal(corr, 1)

    # We convert the correlation matrix to a distance matrix before performing
    # hierarchical clustering using Ward's linkage.
    distance_matrix = 1 - np.abs(corr)
    dist_linkage = hierarchy.ward(squareform(distance_matrix))

    df, _ = preprocess(train_features_at_percentage[percentage], scaler)
    df = df.drop(columns=['mzn'], axis=1)
    X_train, y_train = df.drop(columns=['solved_within_time_limit']), df['solved_within_time_limit']

    df, _ = preprocess(test_features_at_percentage[percentage], scaler)
    df = df.drop(columns=['mzn'], axis=1)
    X_test, y_test = df.drop(columns=['solved_within_time_limit']), df['solved_within_time_limit']

    cluster_ids = hierarchy.fcluster(dist_linkage, depth, criterion="distance")

    # Map every cluster to the column positions of its member features.
    cluster_id_to_feature_ids = defaultdict(list)
    for idx, cluster_id in enumerate(cluster_ids):
        cluster_id_to_feature_ids[cluster_id].append(idx)
    # The first member of each cluster represents the whole cluster.
    selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
    selected_features_names = X.columns[selected_features]

    X_train_sel = X_train[selected_features_names]
    X_test_sel = X_test[selected_features_names]

    clf_sel = RandomForestClassifier(n_estimators=100, random_state=42) if model == 'RF' else ExtraTreesClassifier(n_estimators=100, random_state=42)
    clf_sel.fit(X_train_sel, y_train)
    # F1 of the reduced model on the "not solved" class; computed but not used further.
    _, _, f1, _ = precision_recall_fscore_support(y_test, clf_sel.predict(X_test_sel), average='binary', pos_label=False)

    def plot_permutation_importance(clf, X, y, ax, selected_features, cluster_id_to_feature_ids, all_columns):
        """Box-plot the permutation importances of ``clf`` on ``ax``.

        Returns ``ax`` and the cluster labels ordered from most to least
        important (by mean importance).
        """
        result = permutation_importance(clf, X, y, n_repeats=10, random_state=42, n_jobs=2)
        perm_sorted_idx = result.importances_mean.argsort()
        labels = []
        for i in np.array(selected_features)[perm_sorted_idx]:
            for k, v in cluster_id_to_feature_ids.items():
                if i in v:
                    main_feature = all_columns[i]
                    other_features = all_columns[v].difference([main_feature])
                    labels.append(str(main_feature) + " " + str(list(other_features)).replace('[', '(').replace(']', ')'))

        ax.boxplot(
            result.importances[perm_sorted_idx].T,
            vert=False,
            labels=X.columns[perm_sorted_idx],
        )
        ax.axvline(x=0, color="k", linestyle="--")
        # argsort is ascending, so reverse to put the most important first.
        return ax, list(reversed(labels))

    fig, ax = plt.subplots(figsize=(17, 6))
    _, labels = plot_permutation_importance(clf_sel, X_test_sel, y_test, ax, selected_features, cluster_id_to_feature_ids, X.columns)
    return labels

# For every percentage, collect the five top-ranked features of the RF and of
# the ET model (first word of each label = representative feature name).
percentages = [1, 5, 10, 15, 20]
importance_rows = []
for perc in percentages:
    print(f"Percentage: {perc}")
    labels_with_groups_rf = get_rankings_for_percentages(perc, 1.1, model='RF')
    just_labels_rf = [label.split()[0] for label in labels_with_groups_rf[:5]]
    labels_with_groups_et = get_rankings_for_percentages(perc, 1.1, model='ET')
    just_labels_et = [label.split()[0] for label in labels_with_groups_et[:5]]
    labels = just_labels_rf + just_labels_et
    importance_rows.append(labels)
    print(labels)

rank_columns = pd.MultiIndex.from_product([['RF', 'ET'], range(1, 6)])
importance_df = pd.DataFrame(
    importance_rows,
    index=[f"{perc}%" for perc in percentages],
    columns=rank_columns,
)
importance_df.index.name = 'Percentage'
importance_df.columns.name

In [None]:
importance_df.columns.names = ['Model', 'Rank']

# RF rankings with two raw feature names replaced by the names used in the
# write-up; computed once and reused for the export and the cell output.
rf_importance = (
    importance_df['RF']
    .replace('intVars', 'int_vars')
    .replace('bin', 'bin_clauses')
)

(rf_importance
        .style
        .format_index(escape='latex')
        .format("\\texttt{{{}}}", escape='latex')
        .to_latex("importance-table.tex",
                hrules=True,
                multicol_align='c',
                # Raw string: "\%" in a normal string literal is an invalid
                # escape sequence (a SyntaxWarning on recent Python versions).
                caption=r"Permutation importance rankings for RF at varying \% of TL. Rank 1 is the most important feature, rank 5 is less important.",
                label="tab:importance-measures",
                position_float="centering",
        )
)
rf_importance

In [None]:
def _cluster_members_for_latex(label):
    """Return the cluster members listed after the first word of ``label``.

    Parentheses, quotes and commas are removed and underscores are escaped
    for LaTeX before the label is split on whitespace.
    """
    for char in (')', '(', "'", ","):
        label = label.replace(char, '')
    # Raw string: '\_' in a normal string literal is an invalid escape sequence.
    return label.replace('_', r'\_').split()[1:]


# Representative feature -> remaining features of its cluster, taken from the
# RF labels of the last percentage processed in the ranking loop.
rf_cluster_members = {
    label.split()[0]: _cluster_members_for_latex(label)
    for label in labels_with_groups_rf
}
print(", ".join(f"\\texttt{{{member}}}" for member in rf_cluster_members['bin']))

In [None]:
# `plot_permutation_importance`, `X_test` and `y_test` only exist inside
# get_rankings_for_percentages, so referring to them here raises NameError.
# Build the test split and the box plot explicitly instead.
dt_percentage = 20  # value the leftover loop variable `percentage` holds after Run All
dt_frame, _ = preprocess(test_features_at_percentage[dt_percentage], scalers[(dt_percentage, 'DT')])
dt_frame = dt_frame.drop(columns=['mzn'])
dt_X = dt_frame.drop(columns=['solved_within_time_limit'])
dt_y = dt_frame['solved_within_time_limit']

dt_result = permutation_importance(classifiers[(dt_percentage, 'DT')], dt_X, dt_y, n_repeats=10, random_state=42, n_jobs=2)
# Ascending by mean importance, so the most important feature is drawn at the top.
dt_order = dt_result.importances_mean.argsort()

fig, ax = plt.subplots(figsize=(6,7))
ax.boxplot(dt_result.importances[dt_order].T, vert=False, labels=dt_X.columns[dt_order])
ax.axvline(x=0, color="k", linestyle="--");

In [None]:
# Instance counts per key of train_features_at_percentage (keys 1..len).
feature_keys = range(1, len(train_features_at_percentage) + 1)
n_total = np.array([train_features_at_percentage[key].shape[0] for key in feature_keys])
n_solved = np.array([train_features_at_percentage[key]['solved_within_time_limit'].sum() for key in feature_keys])
n_not_solved = n_total - n_solved
percentage_solved = n_solved / n_total * 100

# x positions of the stacked areas: 1 to 20 in steps of 0.5.
x = np.arange(1, 20.5, 0.5)

fig, ax = plt.subplots(figsize=(8, 3))
stacks = ax.stackplot(x, n_not_solved, n_solved, labels=['Not solved', 'Solved'])
ax.set_title("Number of instances at varying % of TL")
ax.set_xlabel("% of TL")
ax.set_ylabel("Total Number of instances")
ax.set_ylim([0, 1400])
ax.legend(loc='upper right', bbox_to_anchor=(0.98, 0.98))

tikzplotlib.save("class_balance_over_time.tex", axis_width="0.9\\textwidth", axis_height="0.4\\textwidth")

In [None]:
import glob
import enum

@enum.unique
class ProblemType(enum.Enum):
    """Kind of solve goal found in a compiled problem file."""

    UNKNOWN = 0  # neither a satisfaction nor an optimisation keyword was found
    SAT = 1      # satisfaction problem
    OPT = 2      # optimisation problem (minimize / maximize)

fzn_files = list(glob.glob("analysis/problems_compiled/*.fzn"))
# Not named `summary`: that name already holds the hyperparameter summary
# DataFrame built earlier in the notebook, and overwriting it with a list
# breaks re-running the earlier cells.
problem_records = []


for fzn_file in track(fzn_files, description="Counting SAT/OPT"):
    with open(fzn_file, "r") as f:
        contents = f.read()

    # Plain substring search over the whole file; "satisfy" takes precedence.
    if "satisfy" in contents:
        problem_type = ProblemType.SAT
    elif "minimize" in contents or "maximize" in contents:
        problem_type = ProblemType.OPT
    else:
        problem_type = ProblemType.UNKNOWN

    problem_records.append({
        "Problem": fzn_file,
        "Problem Type": problem_type
    })

# dtype=str requests string columns, so the problem type is presumably stored
# as str(ProblemType.X) -- the later rename cell keys on that form.
summary_df = pd.DataFrame(problem_records, dtype=str)

In [None]:
# Recover the model (.mzn) and data (.dzn) file names from the compiled file name.
regex_str = r'analysis/problems_compiled/.+MZN-(.+)-DZN.+\.fzn'
all_mzn = summary_df['Problem'].str.extract(regex_str, expand=False) + '.mzn'
summary_df['mzn'] = all_mzn
regex_str = r'analysis/problems_compiled/.+DZN-(.+)\.fzn'
all_dzn = summary_df['Problem'].str.extract(regex_str, expand=False) + '.dzn'
summary_df['dzn'] = all_dzn

half_percent_features = all_features[1]

# A problem counts as "past half percent" when its (mzn, dzn) pair occurs in
# the first feature set.
summary_df['Past Half Percent'] = summary_df[['mzn', 'dzn']].apply(tuple, axis=1).isin(half_percent_features[['mzn', 'dzn']].apply(tuple, axis=1))
summary_past = summary_df[summary_df['Past Half Percent']]

# The 'Problem Type' column holds strings, so every rename key must be the
# string form of the enum member. A bare ProblemType.UNKNOWN key never matches
# and leaves the raw 'ProblemType.UNKNOWN' label in the table.
problem_type_labels = {
    str(ProblemType.SAT): 'SAT',
    str(ProblemType.OPT): 'OPT',
    str(ProblemType.UNKNOWN): 'Unknown',
}

grouped_summary_df = summary_df.groupby(['Problem Type', 'Past Half Percent']).count().drop(columns=['mzn', 'dzn']).unstack()
grouped_summary_df = (
    grouped_summary_df
    .rename(columns={'Problem': 'Count'})
    .rename(index=problem_type_labels)
    .droplevel(0, axis=1)
    # Raw strings: "\g" and "\%" are invalid escape sequences in normal literals.
    .rename(columns={True: r"$\ge$0.5TL\%", False: r"<0.5TL\%"})
)
grouped_summary_df['Total'] = grouped_summary_df.sum(axis=1)
cols = grouped_summary_df.columns
# Column order for display: Total first, then the two groups.
grouped_summary_df[[cols[2], cols[0], cols[1]]].rename_axis(None, axis=1)