In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
import umap
from feature_computations import get_instance_features
import plotly.express as px


from sklearn.inspection import permutation_importance

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read in data
data = pd.read_csv("data/heuristic_performance_final_scratch.csv")
data.set_index("instance", inplace=True)
# restrict data to chosen algorithms
algorithms = [
    'elshafei_constructive_greedy_local_search_objective',
    'constructive_greedy_local_search_objective',
    '10_multistart_adjacent_swap_optimal_neighbour_objective',
    '10_multistart_adjacent_swap_first_improvement_objective',
    '10_multistart_total_swap_optimal_neighbour_objective',
    '10_multistart_total_swap_first_improvement_objective',
    'grasp_local_search', 
    'grasp_simulated_annealing', 
    'genetic_algorithm'
]

restricted_df = data[algorithms]

# get instance of data

instance_features = get_instance_features(data, 'data/qapdata/')

## K-Means clustering of data
#### Cluster our data into two groups, we will consider one to be 'hard' and the other to be 'easy'

In [3]:
kmeans = KMeans(n_clusters=2, n_init=100).fit(restricted_df)
print(f"The number of instances in group 1 are {sum(kmeans.labels_)}")
print(f"The number of instances in group 0 are {len(kmeans.labels_) - sum(kmeans.labels_)}")

The number of instances in group 1 are 11
The number of instances in group 0 are 117


In [4]:
# using PCA, reduce the dimension of our data, so we may visualise the clustering of K-Means
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(restricted_df)
pca_df = pd.DataFrame(principalComponents).set_index(restricted_df.index)

pca_df['label'] = kmeans.labels_.astype(str)
restricted_df['label'] = kmeans.labels_.astype(str)
instance_features['label'] = kmeans.labels_.astype(str)
fig = px.scatter(pca_df, x = 0, y=1, color='label')
fig.update_layout(
    xaxis_title='PC1',
    yaxis_title='PC2',
    title='K-Means clusters in 2 PC dimensions'
)
fig.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restricted_df['label'] = kmeans.labels_.astype(str)


In [5]:
restricted_df.columns

Index(['elshafei_constructive_greedy_local_search_objective',
       'constructive_greedy_local_search_objective',
       '10_multistart_adjacent_swap_optimal_neighbour_objective',
       '10_multistart_adjacent_swap_first_improvement_objective',
       '10_multistart_total_swap_optimal_neighbour_objective',
       '10_multistart_total_swap_first_improvement_objective',
       'grasp_local_search', 'grasp_simulated_annealing', 'genetic_algorithm',
       'label'],
      dtype='object')

In [6]:
fig = px.box(restricted_df, x="label", y="grasp_simulated_annealing", color="label")
fig.update_layout(
    xaxis_title='Label',
    yaxis_title='Relative Error',
    title='GRASP (with Simulated Annealing) Relative Error by K-Means Label'
)
fig.show()

In [7]:
fig = px.box(restricted_df, x="label", y="genetic_algorithm", color="label")
fig.update_layout(
    xaxis_title='Label',
    yaxis_title='Relative Error',
    title='Genetic Algorithm Relative Error by K-Means Label'
)
fig.show()

### Create train/test split and train logistic regression

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


instance_features['label'] = kmeans.labels_.astype(str)

train, test = train_test_split(instance_features, test_size=0.3, random_state=42)
x_train, y_train = train.loc[:,train.columns != 'label'], train['label']
x_test, y_test = test.loc[:,test.columns != 'label'], test['label']

scaler = StandardScaler().fit(x_train[x_train.columns])
x_train[x_train.columns] = scaler.transform(x_train[x_train.columns])
x_test[x_test.columns] = scaler.transform(x_test[x_test.columns])

weights = {'0': 1, '1': 3}
logistic_model = LogisticRegression(class_weight=weights).fit(x_train,y_train)

y_pred = logistic_model.predict(x_test)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [9]:
import numpy as np


n_false_negatives = sum(y_pred[y_test.values == '1'] == '0')
n_false_positives = sum(y_pred[y_test.values == '0'] == '1')

print(f"number of false negatives predicted: {n_false_negatives} out of {len(y_pred)} predictions")
print(f"number of false positives predicted: {n_false_positives} out of {len(y_pred)} predictions")
print(f"average prediction accuracy of {np.mean(y_test.values == y_pred)}")

number of false negatives predicted: 0 out of 39 predictions
number of false positives predicted: 4 out of 39 predictions
average prediction accuracy of 0.8974358974358975


In [10]:
px.box(instance_features, x="label", y="flow_dominance")

In [11]:
px.box(instance_features, x="label", y="problem_size")

In [12]:
import statsmodels.api as sm 

logistic_model = sm.Logit(np.asarray(y_train.astype(int)), np.asarray(x_train)).fit(method='BFGS')
logistic_model.summary(xname=x_train.columns.to_list()) 

         Current function value: 0.488772
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36



Maximum number of iterations has been exceeded.


Maximum Likelihood optimization failed to converge. Check mle_retvals



0,1,2,3
Dep. Variable:,y,No. Observations:,89.0
Model:,Logit,Df Residuals:,76.0
Method:,MLE,Df Model:,12.0
Date:,"Thu, 19 Oct 2023",Pseudo R-squ.:,-0.617
Time:,20:37:11,Log-Likelihood:,-43.501
converged:,False,LL-Null:,-26.903
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
problem_size,1.2707,0.594,2.139,0.032,0.106,2.435
flow_sparsity,2.0085,1.343,1.495,0.135,-0.624,4.641
distance_sparsity,-1.4879,0.775,-1.921,0.055,-3.006,0.030
flow_asymmetry,-0.2706,0.350,-0.773,0.440,-0.957,0.416
distance_asymmetry,1.8460,1.158,1.595,0.111,-0.423,4.115
flow_dominance,1.7272,1.003,1.722,0.085,-0.239,3.693
distance_dominance,3.9342,1.970,1.997,0.046,0.073,7.796
flow_max,-0.3798,0.476,-0.798,0.425,-1.312,0.553
distance_max,-3.1614,1.736,-1.821,0.069,-6.564,0.241
