# Rankability-Sensitivity Tests (Real Data)
This notebook executes tests to measure the sensitivity of certain ranking methods to small perturbations in the data. It collects many datapoints about each input matrix and saves the entire dataset into a csv file.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sensitivity_tests import *
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import os
from os.path import join

from gurobipy import *
setParam("OutputFlag", 0)

Academic license - for non-commercial use only


In [3]:
directories = ["nfl_data", "lolib_data"]
additional_files = []

files = []
for directory in directories:
    files.extend([join(directory, f) for f in os.listdir(directory) if os.path.isfile(join(directory, f)) and not f.endswith(".md")])
files.extend(additional_files)
print(len(files), files[0])

97 nfl_data/nfl2008_afc


In [4]:
CSV_FILENAME = "sensitivity_dataset_real.csv"
def write_chunk(dataset):
    if dataset == []:
        return
    df = pd.DataFrame(dataset)
    if not os.path.isfile(CSV_FILENAME):
        df.to_csv(CSV_FILENAME, mode='a', index=False)
    elif len(df.columns) != len(pd.read_csv(CSV_FILENAME, nrows=1).columns):
        raise Exception("Columns do not match!! Dataframe has " +
                        str(len(df.columns)) + " columns. CSV file has " +
                        str(len(pd.read_csv(CSV_FILENAME, nrows=1).columns)) + " columns.")
    else:
        target_columns = list(pd.read_csv(CSV_FILENAME, nrows=1).columns)
        source_columns = df.columns
        if not all([(col in target_columns) for col in source_columns]):
            raise Exception("Columns of dataframe and csv file do not match!!")
        df[target_columns].to_csv(CSV_FILENAME, mode='a', index=False, header=False)

In [None]:
dataset = []
instances_per_chunk = 1
n_trials = 50
n_restarts = 50
instance_idx = 0

for filepath in tqdm(files):
    data_source = LOLib(filepath, full_path=True)
    if data_source.get_n() > 50:
        print(filepath + " skipped", flush=True)
        continue
    print(filepath + " running", flush=True)
    dataset.append(ProblemInstance(data_source).collect_data(num_random_restarts=n_restarts,
                                                             n_sensitivity_trials=n_trials))
    if (instance_idx + 1) % instances_per_chunk == 0:
        write_chunk(dataset)
        dataset = []
    instance_idx += 1
write_chunk(dataset)

  0%|          | 0/97 [00:00<?, ?it/s]

nfl_data/nfl2008_afc running


  1%|          | 1/97 [00:44<1:11:26, 44.65s/it]

nfl_data/nfl2018_nfc running


  2%|▏         | 2/97 [00:51<52:37, 33.23s/it]  

nfl_data/nfl2009_nfc running


  3%|▎         | 3/97 [00:56<39:04, 24.94s/it]

nfl_data/nfl1994_afc running


  4%|▍         | 4/97 [01:27<41:09, 26.56s/it]

nfl_data/nfl2018_afc running


  5%|▌         | 5/97 [02:12<49:08, 32.05s/it]

nfl_data/nfl2003_afc running


  6%|▌         | 6/97 [02:55<53:52, 35.52s/it]

nfl_data/nfl2014_afc running


  7%|▋         | 7/97 [03:41<57:43, 38.49s/it]

nfl_data/nfl2016_afc running


  8%|▊         | 8/97 [04:25<59:51, 40.35s/it]

nfl_data/nfl1992_afc running


  9%|▉         | 9/97 [04:55<54:20, 37.05s/it]

nfl_data/nfl2001_nfc running


 10%|█         | 10/97 [05:00<39:50, 27.47s/it]

nfl_data/nfl2004_nfc running


 11%|█▏        | 11/97 [05:06<30:03, 20.97s/it]

nfl_data/nfl2017_afc running


 12%|█▏        | 12/97 [05:49<39:27, 27.85s/it]

nfl_data/nfl2000_afc running


 13%|█▎        | 13/97 [06:29<44:01, 31.44s/it]

nfl_data/nfl2010_nfc running


 14%|█▍        | 14/97 [06:35<32:46, 23.70s/it]

nfl_data/nfl1990_nfc running


 15%|█▌        | 15/97 [06:39<24:24, 17.86s/it]

nfl_data/nfl2014_nfc running


 16%|█▋        | 16/97 [06:45<19:14, 14.25s/it]

nfl_data/nfl2016_nfc running


 18%|█▊        | 17/97 [06:51<15:38, 11.73s/it]

nfl_data/nfl1990_afc running


 19%|█▊        | 18/97 [07:20<22:19, 16.96s/it]

nfl_data/nfl2005_nfc running


 20%|█▉        | 19/97 [07:26<17:40, 13.60s/it]

nfl_data/nfl2003_nfc running


 21%|██        | 20/97 [07:32<14:35, 11.36s/it]

nfl_data/nfl1997_afc running


 22%|██▏       | 21/97 [08:09<24:01, 18.97s/it]

nfl_data/nfl1993_nfc running


 23%|██▎       | 22/97 [08:13<18:17, 14.64s/it]

nfl_data/nfl2008_nfc running


 24%|██▎       | 23/97 [08:19<14:57, 12.13s/it]

nfl_data/nfl2012_nfc running


 25%|██▍       | 24/97 [08:26<12:34, 10.33s/it]

nfl_data/nfl1993_afc running


 26%|██▌       | 25/97 [08:55<19:24, 16.18s/it]

nfl_data/nfl1999_nfc running


 27%|██▋       | 26/97 [09:00<15:07, 12.78s/it]

nfl_data/nfl2001_afc running


 28%|██▊       | 27/97 [09:40<24:31, 21.03s/it]

nfl_data/nfl2013_nfc running


 29%|██▉       | 28/97 [09:47<19:05, 16.60s/it]

nfl_data/nfl2009_afc running


 30%|██▉       | 29/97 [10:31<28:10, 24.86s/it]

nfl_data/nfl1998_afc running


 31%|███       | 30/97 [11:07<31:33, 28.26s/it]

nfl_data/nfl2019_afc running


 32%|███▏      | 31/97 [11:54<37:13, 33.84s/it]

nfl_data/nfl2004_afc running


 33%|███▎      | 32/97 [12:38<39:56, 36.86s/it]

nfl_data/nfl2002_afc running


 34%|███▍      | 33/97 [13:32<44:41, 41.91s/it]

nfl_data/nfl1991_afc running


 35%|███▌      | 34/97 [14:02<40:22, 38.45s/it]

nfl_data/nfl2015_nfc running


 36%|███▌      | 35/97 [14:08<29:43, 28.76s/it]

nfl_data/nfl1996_afc running


 37%|███▋      | 36/97 [14:46<32:00, 31.48s/it]

nfl_data/nfl2012_afc running


 38%|███▊      | 37/97 [15:29<34:59, 34.98s/it]

nfl_data/nfl2000_nfc running


 39%|███▉      | 38/97 [15:34<25:34, 26.01s/it]

nfl_data/nfl2011_afc running


 40%|████      | 39/97 [16:17<30:02, 31.09s/it]

nfl_data/nfl2007_nfc running


 41%|████      | 40/97 [16:23<22:21, 23.54s/it]

nfl_data/nfl2006_nfc running


 42%|████▏     | 41/97 [16:29<17:03, 18.27s/it]

nfl_data/nfl1992_nfc running


 43%|████▎     | 42/97 [16:33<12:54, 14.08s/it]

nfl_data/nfl2007_afc running


 44%|████▍     | 43/97 [17:17<20:45, 23.06s/it]

nfl_data/nfl2005_afc running


 45%|████▌     | 44/97 [18:02<26:04, 29.53s/it]

nfl_data/nfl2010_afc running


 46%|████▋     | 45/97 [18:47<29:38, 34.19s/it]

nfl_data/nfl2013_afc running


 47%|████▋     | 46/97 [19:30<31:18, 36.82s/it]

nfl_data/nfl2015_afc running


 48%|████▊     | 47/97 [20:14<32:31, 39.04s/it]

nfl_data/nfl1991_nfc running


 49%|████▉     | 48/97 [20:18<23:20, 28.57s/it]

nfl_data/nfl1998_nfc running


 51%|█████     | 49/97 [20:23<17:06, 21.39s/it]

nfl_data/nfl1996_nfc running


 52%|█████▏    | 50/97 [20:28<12:59, 16.59s/it]

nfl_data/nfl2011_nfc running


 53%|█████▎    | 51/97 [20:34<10:16, 13.40s/it]

nfl_data/nfl1997_nfc running


 54%|█████▎    | 52/97 [20:39<08:11, 10.92s/it]

nfl_data/nfl2019_nfc running


 55%|█████▍    | 53/97 [20:45<06:54,  9.43s/it]

nfl_data/nfl1995_nfc running


 56%|█████▌    | 54/97 [20:51<05:51,  8.17s/it]

nfl_data/nfl2017_nfc running


 57%|█████▋    | 55/97 [20:57<05:15,  7.51s/it]

nfl_data/nfl1995_afc running


 58%|█████▊    | 56/97 [21:33<10:57, 16.04s/it]

nfl_data/nfl1999_afc running


 59%|█████▉    | 57/97 [22:13<15:38, 23.45s/it]

nfl_data/nfl2006_afc running


 60%|█████▉    | 58/97 [22:58<19:18, 29.70s/it]

nfl_data/nfl1994_nfc running


 61%|██████    | 59/97 [23:02<14:02, 22.17s/it]

nfl_data/nfl2002_nfc running


 62%|██████▏   | 60/97 [23:08<10:42, 17.37s/it]

lolib_data/N-econ77 skipped
lolib_data/N-econ73 skipped
lolib_data/N-econ76 skipped
lolib_data/N-EX5 running


 66%|██████▌   | 64/97 [1:50:28<3:42:50, 405.16s/it]

lolib_data/N-econ61 skipped
lolib_data/N-EX1 running


In [None]:
df = pd.read_csv(CSV_FILENAME)
sensitivities = [col for col in df.columns if "mean_sensitivity" in col]
sensitivities

In [None]:
df["overall_mean_sensitivity"] = df[sensitivities].mean(axis=1)

In [None]:
df.plot.scatter("p_lowerbound", "overall_mean_sensitivity", title="Sensitivity by p")

In [None]:
df.plot.scatter("kendall_w", "overall_mean_sensitivity", title="Sensitivity by Kendall W")

In [None]:
df.plot.scatter("max_L2_dist", "overall_mean_sensitivity", title="Sensitivity by Max L2 Dist")

In [None]:
df.plot.scatter("mean_L2_dist", "overall_mean_sensitivity", title="Sensitivity by Mean L2 Dist")

In [None]:
df.plot.scatter("min_tau", "overall_mean_sensitivity", title="Sensitivity by Min Tau")

In [None]:
df.plot.scatter("mean_tau", "overall_mean_sensitivity", title="Sensitivity by Mean Tau")

In [None]:
def plot_taus(rankability_vals, taus, method_name, scatter_alpha=0.01, histogram_bins=30, save_dir="rankability_figures_run5"):
    # Create directory for saving if specified
    saving = False
    if save_dir is not None and save_dir != "":
        Path(save_dir).mkdir(parents=True, exist_ok=True)
        saving = True
    
    # Plot scatter plot
    plt.scatter(rankability_vals, taus, alpha=scatter_alpha)
    title = "%s Sensitivity Scatter" % method_name
    plt.title(title)
    plt.xlabel('Rankability')
    plt.xlim(0.0, 1.0)
    plt.ylabel('Tau')
    plt.ylim(-1.0, 1.0)
    if saving:
        save_path = os.path.join(save_dir, title.replace(" ", "_") + ".png")
        plt.savefig(save_path)
    plt.show()
    
    # Plot 2D Histogram
    plt.hist2d(rankability_vals, taus, bins=histogram_bins, range=[[0.0, 1.0], [-1.0, 1.0]])
    title = "%s Sensitivity Histogram" % method_name
    plt.title(title)
    plt.xlabel('Rankability')
    plt.ylabel('Tau')
    if saving:
        save_path = os.path.join(save_dir, title.replace(" ", "_") + ".png")
        plt.savefig(save_path)
    plt.show()