This notebook shows how to use the `branching` package to learn branching rules that approximate strong branching, using the set covering problem as an example. The same workflow can be applied to any problem domain.

We import necessary packages and functions from the branching package.

In [1]:
import os
import glob
import gzip
import argparse
import pickle
import multiprocessing as mp
import time
import pathlib
import pandas as pd
import numpy as np
import ecole
import natsort
from datetime import datetime

In [2]:
import branching
from branching.utilities import log
from branching.sampling import collect_samples
from branching.feature_names import khalil_feat_names, node_candidate_feat_names
from branching.create_datasets import load_samples

#### Data collection

In this stage, we will solve training and validation instances and collect measurements.

Select the set of training and validation instances.

In [3]:
# Collect the LP files of the training and validation instance sets.
# glob returns the paths in arbitrary order.
instances_train, instances_valid = (
    glob.glob(f"instances-setcover/{split}/*.lp") for split in ("train", "valid")
)

Select the number of processors (njobs) to solve instances in parallel.
The seed determines:
1. The order in which instances will be solved.
2. The additional seeds that are assigned to each instance and passed to the solver, where they are used for permuting the rows/columns of the instance and by the underlying LP solver.
3. The subproblems that will be stored as data.

A seed uniquely determines the data that will be collected and, as a result, the training and validation datasets.

In [4]:
# Number of processors used to solve instances in parallel.
njobs = 5

# Master seed of the experiment; it also names the output directories.
seed = 0

In [5]:
# All log() calls in this notebook append to this file; make sure its parent
# directory exists before the first message is written.
logfile = 'logs/setcover.txt'
os.makedirs(os.path.dirname(logfile), exist_ok=True)

We continue sampling until we have collected 5000 candidate measurements. To make sure that the data come from different instances, each subproblem is stored (and strong branching is performed at it) with only 5% probability, and we impose a limit of 1000 nodes per instance.

In [6]:
# Total number of candidate measurements to collect per dataset.
ncands_size = 5000

# Probability with which a subproblem is stored and strong branching is performed at it.
node_record_prob = 0.05

# Maximum number of nodes explored per instance.
node_limit = 1000

In [7]:
# Samples of this run are stored under a seed-specific directory.
out_dir = f"data/setcover/samples/{seed}"
os.makedirs(out_dir, exist_ok=True)

# Record the sampling parameters in the log, one message per parameter.
for message in (
    f"Node record probability: {node_record_prob}",
    f"Candidate limit (total): {ncands_size}",
    f"Node limit (per instance): {node_limit}",
):
    log(message, logfile)

[2023-09-13 20:26:19.774445] Node record probability: 0.05
[2023-09-13 20:26:19.775902] Candidate limit (total): 5000
[2023-09-13 20:26:19.776386] Node limit (per instance): 1000


We solve training instances and log some information.

In [8]:
log(f"{len(instances_train)} training instances to collect {ncands_size} candidates", logfile)

# Put the instance paths into natural order (instance_2 before instance_10)
# so that the run does not depend on the order in which they were listed.
instances_train = natsort.natsorted(instances_train)

start_time = time.time()

# Fresh generator built from `seed` for this sampling run.
rng = np.random.RandomState(seed)

nepisodes, nsamples, ncands = collect_samples(
    instances_train,
    f"{out_dir}/train",
    rng,
    ncands_size,
    njobs,
    query_expert_prob=node_record_prob,
    node_limit=node_limit,
)

# Report the wall-clock time of the whole collection and its totals.
log(f"Training samples: --- {time.time() - start_time:.2f} seconds ---", logfile)
log(
    f"Number of episodes: {nepisodes}, Number of samples: {nsamples}, "
    f"Number of candidates: {ncands}",
    logfile,
)

[2023-09-13 20:26:19.800760] 20 training instances to collect 5000 candidates
[w 8258] episode 0, seed 686441525, processing instance 'instances-setcover/train/instance_6.lp'...
[w 8260] episode 1, seed 4022173117, processing instance 'instances-setcover/train/instance_3.lp'...
[w 8257] episode 2, seed 92199959, processing instance 'instances-setcover/train/instance_6.lp'...
[w 8261] episode 3, seed 2988243274, processing instance 'instances-setcover/train/instance_4.lp'...
[w 8259] episode 4, seed 478168918, processing instance 'instances-setcover/train/instance_17.lp'...
[m 8233] 1 samples written, 93 cand.s collected, ep 0 (3 in buffer).
[m 8233] 2 samples written, 182 cand.s collected, ep 0 (5 in buffer).
[m 8233] 3 samples written, 268 cand.s collected, ep 0 (6 in buffer).
[w 8258] episode 0 done, 3 samples
[w 8258] episode 5, seed 1570198172, processing instance 'instances-setcover/train/instance_9.lp'...
[m 8233] 4 samples written, 381 cand.s collected, ep 1 (12 in buffer).
[w 8

In the above example, we had to solve 6 instances. 59 subproblems were stored collectively. In total, we have 5078 candidate variable measurements.

Similarly, we solve validation instances.

In [9]:
log(f"{len(instances_valid)} validation instances to collect {ncands_size} candidates", logfile)

# Put the instance paths into natural order (instance_2 before instance_10)
# so that the run does not depend on the order in which they were listed.
instances_valid = natsort.natsorted(instances_valid)

start_time = time.time()

# Fresh generator built from the same `seed` as the training run.
rng = np.random.RandomState(seed)

nepisodes, nsamples, ncands = collect_samples(
    instances_valid,
    f"{out_dir}/valid",
    rng,
    ncands_size,
    njobs,
    query_expert_prob=node_record_prob,
    node_limit=node_limit,
)

# Report the wall-clock time of the whole collection and its totals.
log(f"Validation samples: --- {time.time() - start_time:.2f} seconds ---", logfile)
log(
    f"Number of episodes: {nepisodes}, Number of samples: {nsamples}, "
    f"Number of candidates: {ncands}",
    logfile,
)

[2023-09-13 20:31:40.580746] 20 validation instances to collect 5000 candidates
[w 8708] episode 0, seed 686441525, processing instance 'instances-setcover/valid/instance_6.lp'...
[w 8705] episode 2, seed 92199959, processing instance 'instances-setcover/valid/instance_6.lp'...
[w 8706] episode 1, seed 4022173117, processing instance 'instances-setcover/valid/instance_3.lp'...
[w 8704] episode 3, seed 2988243274, processing instance 'instances-setcover/valid/instance_4.lp'...
[w 8707] episode 4, seed 478168918, processing instance 'instances-setcover/valid/instance_17.lp'...
[w 8707] episode 4 done, 2 samples
[w 8707] episode 5, seed 1570198172, processing instance 'instances-setcover/valid/instance_9.lp'...
[m 8233] 1 samples written, 83 cand.s collected, ep 0 (7 in buffer).
[m 8233] 2 samples written, 175 cand.s collected, ep 0 (9 in buffer).
[m 8233] 3 samples written, 262 cand.s collected, ep 0 (10 in buffer).
[m 8233] 4 samples written, 358 cand.s collected, ep 0 (27 in buffer).
[

#### Creating datasets

In [15]:
# Look up the raw feature names of both feature families.
node_candidate_feat_names_dict = node_candidate_feat_names()
khalil_feat_names_dict = khalil_feat_names()

# Build the column labels: a family prefix ("N" or "K"), the 1-based position
# of the feature within its family, and the raw feature name.
node_candidate_feats = [
    f"N_{idx + 1}_{node_candidate_feat_names_dict[idx]}"
    for idx in range(len(node_candidate_feat_names_dict))
]
khalil_feats = [
    f"K_{idx + 1}_{khalil_feat_names_dict[idx]}"
    for idx in range(len(khalil_feat_names_dict))
]

# Concatenate the two lists: node-candidate features first, then Khalil features.
feat_names = node_candidate_feats + khalil_feats

In [16]:
# Locate the pickled samples written during data collection and put them in
# natural order (sample_2 before sample_10).
train_files = natsort.natsorted(
    pathlib.Path(f'data/setcover/samples/{seed}/train').glob('sample_*.pkl')
)
valid_files = natsort.natsorted(
    pathlib.Path(f'data/setcover/samples/{seed}/valid').glob('sample_*.pkl')
)

# Both datasets are capped at the number of candidates requested during collection.
train_max_size = valid_max_size = ncands_size

We store the first 5000 candidate measurements obtained by solving training (validation) instances into a training (validation) dataset.

In [17]:
# Training split: read the stored samples back, passing train_max_size as the size cap.
log("Loading training samples", logfile)
train_x, train_y, train_ncands = load_samples(train_files, train_max_size, logfile)
log(f"  {train_x.shape[0]} training samples", logfile)

# Validation split: same procedure with valid_max_size as the size cap.
log("Loading validation samples", logfile)
valid_x, valid_y, valid_ncands = load_samples(valid_files, valid_max_size, logfile)
log(f"  {valid_x.shape[0]} validation samples", logfile)

[2023-09-13 20:42:36.895312] Loading training samples
[2023-09-13 20:42:37.018086]   dataset size limit reached (5000 candidate variables)
[2023-09-13 20:42:37.022095]   5000 training samples
[2023-09-13 20:42:37.022335] Loading validation samples
[2023-09-13 20:42:37.124208]   dataset size limit reached (5000 candidate variables)
[2023-09-13 20:42:37.157290]   5000 validation samples


In [18]:
# Column labels: every feature name followed by the target column "Score".
col_names = np.append(feat_names, "Score")

# Attach the target as the last column of each feature matrix.
train_dataset = np.append(train_x, train_y.reshape(-1, 1), axis=1)
valid_dataset = np.append(valid_x, valid_y.reshape(-1, 1), axis=1)

train_df = pd.DataFrame(train_dataset, columns=col_names)
valid_df = pd.DataFrame(valid_dataset, columns=col_names)

# Persist the raw (not yet preprocessed) datasets under a seed-specific directory.
os.makedirs(f"data/setcover/datasets/{seed}", exist_ok=True)

for frame, split in ((train_df, "train"), (valid_df, "valid")):
    frame.to_csv(f"data/setcover/datasets/{seed}/{split}.csv", index=False)

#### Preprocessing datasets

We eliminate constant features and highly correlated features from the dataset.

In [19]:
log(f"Before preprocessing:  {train_df.shape[1] - 1} features", logfile)

# Remove constant columns. The decision is made on the training set only and
# the same columns are then removed from the validation set.
nonconstant_columns = train_df.std() > (10 ** -10)
# The target (the last column) must survive this filter even if it happens to
# be constant: every later step addresses it positionally as the last column,
# so dropping it would silently turn the last feature into the target.
nonconstant_columns.iloc[-1] = True
train_df = train_df.loc[:, nonconstant_columns]
valid_df = valid_df.loc[:, nonconstant_columns]

# Remove correlated columns: whenever two features have an absolute
# correlation above 0.95, the one that appears later in the frame is dropped.
correlation_matrix = train_df.iloc[:, :-1].corr().abs()
# Keep only the strict upper triangle so that each pair is inspected once and
# a feature is never compared with itself.
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1))
to_drop = upper.columns[(upper > 0.95).any(axis=0)].tolist()

# drop() returns new frames, so frames that were themselves obtained by
# slicing are not mutated in place.
train_df = train_df.drop(columns=to_drop)
valid_df = valid_df.drop(columns=to_drop)

# Both splits must expose exactly the same columns in the same order.
assert train_df.columns.equals(valid_df.columns), "train/valid columns diverged"

log(f"After preprocessing:  {train_df.shape[1] - 1} features", logfile)
log("Features: ",logfile)
log(train_df.columns[:-1], logfile)
log("Targets: ",logfile)
log(train_df.columns[-1:], logfile)

# Persist the preprocessed datasets under a seed-specific directory.
dataset_dir = f"data/setcover/datasets_preprocessed/{seed}"

os.makedirs(dataset_dir, exist_ok=True)

train_df.to_csv(f"{dataset_dir}/train.csv", index = False)
valid_df.to_csv(f"{dataset_dir}/valid.csv", index = False)

[2023-09-13 20:42:44.823032] Before preprocessing:  110 features
[2023-09-13 20:42:44.986759] After preprocessing:  65 features
[2023-09-13 20:42:44.987020] Features: 
[2023-09-13 20:42:44.987754] Index(['N_1_objective', 'N_9_solution_value', 'N_14_incumbent_value',
       'N_15_average_incumbent_value', 'N_20_solution_infeasibility',
       'N_21_edge_mean', 'N_22_edge_min', 'N_23_edge_max', 'N_24_bias_mean',
       'N_25_bias_min', 'N_26_bias_max', 'N_27_obj_cos_sim_mean',
       'N_28_obj_cos_sim_min', 'N_29_obj_cos_sim_max', 'N_30_is_tight_mean',
       'N_33_dual_solution_mean', 'N_34_dual_solution_min',
       'N_36_scaled_age_mean', 'N_38_scaled_age_max', 'K_1_obj_coef',
       'K_4_n_rows', 'K_5_rows_deg_mean', 'K_6_rows_deg_stddev',
       'K_7_rows_deg_min', 'K_9_rows_pos_coefs_count',
       'K_14_rows_neg_coefs_count', 'K_15_rows_neg_coefs_mean',
       'K_16_rows_neg_coefs_stddev', 'K_17_rows_neg_coefs_min',
       'K_18_rows_neg_coefs_max', 'K_21_pseudocost_up', 'K_22_pse