In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from instance_space_analysis.feature_computations import get_instance_features


from sklearn.inspection import permutation_importance

In [35]:
data = pd.read_csv("data/heuristic_performance_final_scratch.csv")
data.set_index("instance", inplace=True)

instance_features = get_instance_features(data, 'data/qapdata/')

scaler = StandardScaler()
instance_features[instance_features.columns] = scaler.fit_transform(instance_features[instance_features.columns])

algorithms = [
    'grasp_local_search', 
    'grasp_simulated_annealing', 
    'genetic_algorithm',
    '10_multistart_total_swap_optimal_neighbour_objective',
    '10_multistart_adjacent_swap_optimal_neighbour_objective',
    '10_multistart_adjacent_swap_first_improvement_objective',
    '10_multistart_total_swap_first_improvement_objective',
    'constructive_greedy_local_search_objective',
    'elshafei_constructive_greedy_local_search_objective',
]

restricted_df = data[algorithms]
restricted_df = restricted_df.drop_duplicates(keep='first')

best_algos_df = pd.DataFrame()

best_algos_df['performance'] = restricted_df.min(axis=1)
best_algos_df['best_algo'] = restricted_df.idxmin(axis=1)

instance_features['best_algo'] = restricted_df.idxmin(axis=1).astype('category')
instance_features.dropna(inplace=True)

instance_features.head()

Unnamed: 0,problem_size,flow_sparsity,distance_sparsity,flow_asymmetry,distance_asymmetry,flow_dominance,distance_dominance,flow_max,distance_max,flow_min,distance_min,flow_mean,distance_mean,best_algo
tai256c,5.9547,9.186069,-0.595513,0.39905,0.428339,0.707893,0.934446,-0.312484,5.073131,-0.27435,3.323562,-0.247417,2.607582,10_multistart_total_swap_optimal_neighbour_obj...
nug16b,-0.687821,-0.285651,-0.280453,0.39905,0.428339,-0.43863,-0.078131,-0.307507,-0.303716,-0.267333,-0.208219,-0.24093,-0.122438,grasp_local_search
tai35b,-0.161955,-0.285651,0.688731,0.39905,-2.025594,-0.305154,1.84103,0.742713,0.866027,0.888181,0.367406,1.607858,-0.124651,grasp_local_search
chr22a,-0.521758,0.5291,-0.595513,0.39905,0.428339,1.608492,-0.560231,-0.263706,-0.29936,-0.271149,-0.195523,-0.247417,-0.102523,elshafei_constructive_greedy_local_search_obje...
esc16h,-0.687821,-0.258977,-0.31546,0.39905,0.428339,0.099192,-0.384595,-0.292575,-0.304092,-0.260349,-0.209032,-0.244174,-0.123544,grasp_local_search


In [36]:
import plotly.express as px
px.box(best_algos_df, y="performance", color="best_algo")

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


train, test = train_test_split(instance_features, test_size=0.2, random_state=42)
x_train, y_train = train.loc[:,train.columns != 'best_algo'], train['best_algo']
x_test, y_test = test.loc[:,test.columns != 'best_algo'], test['best_algo']

scaler = StandardScaler().fit(x_train[x_train.columns])
x_train[x_train.columns] = scaler.transform(x_train[x_train.columns])
x_test[x_test.columns] = scaler.transform(x_test[x_test.columns])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [28]:
train_mapping = dict(zip(y_train, y_train.cat.codes))
train_mapping['grasp_local_search'] = 0
train_mapping['10_multistart_total_swap_optimal_neighbour_objective'] = 3
train_mapping


{'10_multistart_total_swap_optimal_neighbour_objective': 3,
 'grasp_local_search': 0,
 'constructive_greedy_local_search_objective': 1,
 'elshafei_constructive_greedy_local_search_objective': 2}

In [29]:
logistic_model = LogisticRegression().fit(x_train,y_train.map(train_mapping))

y_pred = logistic_model.predict(x_test)

In [30]:
print(f"average prediction accuracy of {np.mean(y_test.map(train_mapping).values == y_pred)}")

average prediction accuracy of 0.6153846153846154


In [31]:
x_train.columns[[1, 5, 7, 9, 11]]

Index(['flow_sparsity', 'flow_dominance', 'flow_max', 'flow_min', 'flow_mean'], dtype='object')

In [32]:
from statsmodels.discrete.discrete_model import MNLogit

logistic_model = MNLogit(np.asarray(y_train.map(train_mapping)), np.asarray(x_train)).fit()
logistic_model.summary() 

Optimization terminated successfully.
         Current function value: 1.104232
         Iterations 8


0,1,2,3
Dep. Variable:,y,No. Observations:,102.0
Model:,MNLogit,Df Residuals:,63.0
Method:,MLE,Df Model:,36.0
Date:,"Sun, 08 Oct 2023",Pseudo R-squ.:,-0.553
Time:,11:48:32,Log-Likelihood:,-112.63
converged:,True,LL-Null:,-72.523
Covariance Type:,nonrobust,LLR p-value:,1.0

y=1,coef,std err,z,P>|z|,[0.025,0.975]
x1,1.2693,0.773,1.641,0.101,-0.247,2.785
x2,4.1683,2.146,1.942,0.052,-0.039,8.375
x3,-1.1320,0.902,-1.255,0.210,-2.900,0.636
x4,-0.0454,0.360,-0.126,0.900,-0.752,0.661
x5,0.2583,0.891,0.290,0.772,-1.487,2.004
x6,-1.6238,0.990,-1.641,0.101,-3.563,0.316
x7,-0.0157,1.390,-0.011,0.991,-2.741,2.710
x8,-1.9018,1.018,-1.867,0.062,-3.898,0.094
x9,0.6431,1.801,0.357,0.721,-2.887,4.173
x10,8.5602,3.755,2.280,0.023,1.200,15.920
