In [70]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
import umap
from instance_space_analysis.feature_computations import get_instance_features

from sklearn.inspection import permutation_importance

In [71]:
data = pd.read_csv("data/heuristic_performance_final_scratch.csv")
data.set_index("instance", inplace=True)

instance_features = get_instance_features(data, 'data/qapdata/')

In [72]:
algorithms = [
    'elshafei_constructive_greedy_local_search_objective',
    'constructive_greedy_local_search_objective',
    '10_multistart_adjacent_swap_optimal_neighbour_objective',
    '10_multistart_adjacent_swap_first_improvement_objective',
    '10_multistart_total_swap_optimal_neighbour_objective',
    '10_multistart_total_swap_first_improvement_objective',
    'grasp_local_search', 
    'grasp_simulated_annealing', 
    'genetic_algorithm'
]

restricted_df = data[algorithms]

scaler = StandardScaler()
restricted_df[restricted_df.columns] = scaler.fit_transform(restricted_df[restricted_df.columns])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [73]:

reducer = umap.UMAP(random_state=42, metric="manhattan")

embedding = reducer.fit_transform(restricted_df)
embedding_df = pd.DataFrame(embedding).set_index(data.index)

kmeans = KMeans(n_clusters=2, n_init=100).fit(embedding_df)
print(f"The number of instances in group 1 are {sum(kmeans.labels_)}")
print(f"The number of instances in group 0 are {len(kmeans.labels_) - sum(kmeans.labels_)}")


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



The number of instances in group 1 are 32
The number of instances in group 0 are 96


In [74]:
import plotly.express as px
embedding_df['label'] = kmeans.labels_.astype(str)
restricted_df['label'] = kmeans.labels_.astype(str)
data['label'] = kmeans.labels_.astype(str)
px.scatter(embedding_df, x = 0, y=1, color='label')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [75]:
restricted_df.columns

Index(['elshafei_constructive_greedy_local_search_objective',
       'constructive_greedy_local_search_objective',
       '10_multistart_adjacent_swap_optimal_neighbour_objective',
       '10_multistart_adjacent_swap_first_improvement_objective',
       '10_multistart_total_swap_optimal_neighbour_objective',
       '10_multistart_total_swap_first_improvement_objective',
       'grasp_local_search', 'grasp_simulated_annealing', 'genetic_algorithm',
       'label'],
      dtype='object')

In [76]:
# px.box(restricted_df, x="label", y="10_multistart_total_swap_optimal_neighbour_objective")
px.histogram(data, color="label", x="10_multistart_total_swap_optimal_neighbour_objective", marginal='box')

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

instance_features[instance_features.columns] = StandardScaler().fit_transform(instance_features[instance_features.columns])

instance_features['label'] = kmeans.labels_.astype(str)

train, test = train_test_split(instance_features, test_size=0.2, random_state=42)
x_train, y_train = train.loc[:,train.columns != 'label'], train['label']
x_test, y_test = test.loc[:,test.columns != 'label'], test['label']

weights = {'0': 1, '1': 3}
logistic_model = LogisticRegression(class_weight=weights).fit(x_train,y_train)

y_pred = logistic_model.predict(x_test)

In [79]:
import numpy as np


n_false_negatives = sum(y_pred[y_test.values == '1'] == '0')
n_false_positives = sum(y_pred[y_test.values == '0'] == '1')

print(f"number of false negatives predicted: {n_false_negatives} out of {len(y_pred)} predictions")
print(f"number of false positives predicted: {n_false_positives} out of {len(y_pred)} predictions")
print(f"average prediction accuracy of {np.mean(y_test.values == y_pred)}")

number of false negatives predicted: 3 out of 26 predictions
number of false positives predicted: 6 out of 26 predictions
average prediction accuracy of 0.6538461538461539


In [81]:
logistic_model.coef_

array([[-0.10166202,  0.70514856, -1.13631189,  0.62828616, -0.84896808,
        -1.54304809, -1.88616027, -1.05408031,  0.12053145,  1.51774129,
         0.        , -0.97064204,  0.24017371, -0.96524168,  0.29104543]])

In [82]:
x_train.columns

Index(['problem_size', 'flow_sparsity', 'distance_sparsity', 'flow_asymmetry',
       'distance_asymmetry', 'flow_dominance', 'distance_dominance',
       'flow_max', 'distance_max', 'flow_min', 'distance_min', 'flow_mean',
       'distance_mean', 'flow_median', 'distance_median'],
      dtype='object')