# Rankability-Sensitivity Tests (Real Data)
This notebook executes tests to measure the sensitivity of certain ranking methods to small perturbations in the data. It collects many datapoints about each input matrix and saves the entire dataset into a csv file.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sensitivity_tests import *
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import os
from os.path import join

from gurobipy import *
setParam("OutputFlag", 0)

Academic license - for non-commercial use only


In [3]:
directories = ["nfl_data", "lolib_data"]
additional_files = []

files = []
for directory in directories:
    files.extend([join(directory, f) for f in os.listdir(directory) if os.path.isfile(join(directory, f)) and not f.endswith(".md")])
files.extend(additional_files)
print(len(files), files[0])

97 nfl_data/nfl2008_afc


In [4]:
CSV_FILENAME = "sensitivity_dataset_real.csv"
def write_chunk(dataset):
    if dataset == []:
        return
    df = pd.DataFrame(dataset)
    if not os.path.isfile(CSV_FILENAME):
        df.to_csv(CSV_FILENAME, mode='a', index=False)
    elif len(df.columns) != len(pd.read_csv(CSV_FILENAME, nrows=1).columns):
        raise Exception("Columns do not match!! Dataframe has " +
                        str(len(df.columns)) + " columns. CSV file has " +
                        str(len(pd.read_csv(CSV_FILENAME, nrows=1).columns)) + " columns.")
    else:
        target_columns = list(pd.read_csv(CSV_FILENAME, nrows=1).columns)
        source_columns = df.columns
        if not all([(col in target_columns) for col in source_columns]):
            raise Exception("Columns of dataframe and csv file do not match!!")
        df[target_columns].to_csv(CSV_FILENAME, mode='a', index=False, header=False)

In [None]:
dataset = []
instances_per_chunk = 1
n_trials = 50
n_restarts = 50
instance_idx = 0

for filepath in tqdm(files):
    data_source = LOLib(filepath, full_path=True)
    if data_source.get_n() > 50:
        print(filepath + " skipped", flush=True)
        continue
    print(filepath + " running", flush=True)
    dataset.append(ProblemInstance(data_source).collect_data(num_random_restarts=n_restarts,n_sensitivity_trials=n_trials))
    if (instance_idx + 1) % instances_per_chunk == 0:
        write_chunk(dataset)
        dataset = []
    instance_idx += 1
write_chunk(dataset)

  0%|          | 0/97 [00:00<?, ?it/s]

nfl_data/nfl2008_afc running


  1%|          | 1/97 [00:44<1:11:26, 44.65s/it]

nfl_data/nfl2018_nfc running


  2%|▏         | 2/97 [00:51<52:37, 33.23s/it]  

nfl_data/nfl2009_nfc running


  3%|▎         | 3/97 [00:56<39:04, 24.94s/it]

nfl_data/nfl1994_afc running


  4%|▍         | 4/97 [01:27<41:09, 26.56s/it]

nfl_data/nfl2018_afc running


  5%|▌         | 5/97 [02:12<49:08, 32.05s/it]

nfl_data/nfl2003_afc running


  6%|▌         | 6/97 [02:55<53:52, 35.52s/it]

nfl_data/nfl2014_afc running


In [None]:
df = pd.read_csv(CSV_FILENAME)
sensitivities = [col for col in df.columns if "mean_sensitivity" in col]
sensitivities

In [None]:
df["overall_mean_sensitivity"] = df[sensitivities].mean(axis=1)

In [None]:
df.plot.scatter("p_lowerbound", "overall_mean_sensitivity", title="Sensitivity by p")

In [None]:
df.plot.scatter("kendall_w", "overall_mean_sensitivity", title="Sensitivity by Kendall W")

In [None]:
df.plot.scatter("max_L2_dist", "overall_mean_sensitivity", title="Sensitivity by Max L2 Dist")

In [None]:
df.plot.scatter("mean_L2_dist", "overall_mean_sensitivity", title="Sensitivity by Mean L2 Dist")

In [None]:
df.plot.scatter("min_tau", "overall_mean_sensitivity", title="Sensitivity by Min Tau")

In [None]:
df.plot.scatter("mean_tau", "overall_mean_sensitivity", title="Sensitivity by Mean Tau")

In [None]:
def plot_taus(rankability_vals, taus, method_name, scatter_alpha=0.01, histogram_bins=30, save_dir="rankability_figures_run5"):
    # Create directory for saving if specified
    saving = False
    if save_dir is not None and save_dir != "":
        Path(save_dir).mkdir(parents=True, exist_ok=True)
        saving = True
    
    # Plot scatter plot
    plt.scatter(rankability_vals, taus, alpha=scatter_alpha)
    title = "%s Sensitivity Scatter" % method_name
    plt.title(title)
    plt.xlabel('Rankability')
    plt.xlim(0.0, 1.0)
    plt.ylabel('Tau')
    plt.ylim(-1.0, 1.0)
    if saving:
        save_path = os.path.join(save_dir, title.replace(" ", "_") + ".png")
        plt.savefig(save_path)
    plt.show()
    
    # Plot 2D Histogram
    plt.hist2d(rankability_vals, taus, bins=histogram_bins, range=[[0.0, 1.0], [-1.0, 1.0]])
    title = "%s Sensitivity Histogram" % method_name
    plt.title(title)
    plt.xlabel('Rankability')
    plt.ylabel('Tau')
    if saving:
        save_path = os.path.join(save_dir, title.replace(" ", "_") + ".png")
        plt.savefig(save_path)
    plt.show()