# How Good is X*?
## March Madness Dataset

We want to determine whether X* is a good estimate of each team's rank distribution.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import copy
import os
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr
from scipy.stats import skew
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
from pathlib import Path
home = str(Path.home())
home

In [None]:
import sys
sys.path.insert(0,"%s/rankability_toolbox_dev"%home)
import pyrankability

In [None]:
# Make the local sensitivity_study sources importable ahead of any installed copy.
sys.path.insert(0,"%s/sensitivity_study/src"%home)
# NOTE(review): wildcard imports -- helpers used below that are not defined in
# this notebook (e.g. read_data, get_P_stats) presumably come from one of these
# modules; confirm before renaming or removing any of them.
from sensitivity_tests import *
from utilities import *
from base import *

In [None]:
# Load one games table per season, keyed by the season's year as a string.
years = [str(season) for season in range(2002, 2019)]
games = {}
for year in years:
    games[year] = read_data(
        '../data/%steams.txt' % year,
        '../data/%sgames.txt' % year,
        '../data/%sMadnessTeams.txt' % year,
    )
# Sanity check: `year` still holds the last season loaded, so show that table.
print(year)
games[year]

In [None]:
# Note to future self: Parameters from FODS paper but might need to be optimized
# These three values are forwarded as keyword arguments to
# pyrankability.construct.support_map_vectorized_direct_indirect_weighted
# when the per-season D matrices are built.
direct_thres = 2
spread_thres = 2
weight_indirect = 0.5
# Per-season D matrices keyed by year string; filled in by the loop that follows.
Ds = {}

In [None]:
def map_func(linked):
    """Support map passed to V_count_vectorized.

    Thin wrapper around
    pyrankability.construct.support_map_vectorized_direct_indirect_weighted
    that binds the module-level direct_thres, spread_thres and
    weight_indirect settings (looked up at call time).
    """
    return pyrankability.construct.support_map_vectorized_direct_indirect_weighted(
        linked,
        direct_thres=direct_thres,
        spread_thres=spread_thres,
        weight_indirect=weight_indirect,
    )


# Build one D matrix per season, restricted to that season's tournament teams.
for year in tqdm(games):
    season = games[year]

    # Every team flagged as a tournament participant on either side of a game.
    madness_teams = np.unique(
        list(season.team1_name.loc[season.team1_madness == 1])
        + list(season.team2_name.loc[season.team2_madness == 1])
    )

    # Rename columns to the layout V_count_vectorized is given here, ordered
    # chronologically; the date column is only needed for the sort.
    game_df = pd.DataFrame({"team1_name": season['team1_name'],
                            "team1_score": season['points1'],
                            "team1_H_A_N": season['H_A_N1'],
                            "team2_name": season['team2_name'],
                            "team2_score": season['points2'],
                            # NOTE(review): the original read 'H_A_N1' for both
                            # teams, which looks like a copy-paste slip.
                            # 'H_A_N2' is assumed to be the matching team-2
                            # column (by analogy with points1/points2) --
                            # confirm against read_data.
                            "team2_H_A_N": season['H_A_N2'],
                            "date": season['date']
                           }).sort_values(by='date').drop('date', axis=1)

    Ds[year] = pyrankability.construct.V_count_vectorized(game_df, map_func).loc[madness_teams, madness_teams]

In [None]:
# Solver `details` for each season, keyed by year; written by
# get_rankability_results() and read by later cells.
df_details = {}


def get_rankability_results(n_restarts=200):
    """Solve every season's D matrix and tabulate the outcome.

    For each year in ``games`` the matching entry of ``Ds`` (NaNs replaced by
    0) is handed to ``pyrankability.rank.solve`` with ``method='lop'``. The
    returned ``k`` and the statistics from ``get_P_stats(details["P"])`` form
    one row of the result, and ``details`` itself is stored in the
    module-level ``df_details`` under that year.

    Parameters
    ----------
    n_restarts : int
        Forwarded to the solver as ``num_random_restarts``.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Year", with a "k" column followed by one column per
        statistic name returned by ``get_P_stats``.

    Raises
    ------
    ValueError
        If a statistic name collides with the "k" or "Year" columns.
    """
    columns = {"k": [], "Year": []}
    stat_columns = {}

    for season in tqdm(games.keys()):
        dominance = Ds[season].fillna(0)
        k, details = pyrankability.rank.solve(
            dominance,
            method='lop',
            num_random_restarts=n_restarts,
            lazy=False,
            cont=True,
        )
        # Collect each statistic into its own column, in first-seen order.
        for stat_name, stat_value in get_P_stats(details["P"]).items():
            stat_columns.setdefault(stat_name, []).append(stat_value)
        columns["k"].append(k)
        columns["Year"].append(season)
        df_details[season] = details

    # Merge the statistic columns, refusing to overwrite "k" or "Year".
    for stat_name, stat_values in stat_columns.items():
        if stat_name in columns:
            raise ValueError("Duplicate column name! Fix collision before moving on!")
        columns[stat_name] = stat_values

    return pd.DataFrame(columns).set_index("Year")

In [None]:
# Solve every season with the default n_restarts=200 and show the summary
# table; this call also fills df_details as a side effect.
rankability_results = get_rankability_results()
rankability_results

In [None]:
def _pairwise_order_frequencies(rankings, template):
    """Return the share of rankings that place item a ahead of item b.

    Parameters
    ----------
    rankings : iterable of sequences of int
        Each ranking lists item indices from first to last; the indices are
        used directly as row/column positions of the result.
    template : array-like
        Square array whose shape the result copies (its values are ignored).

    Returns
    -------
    numpy.ndarray of float
        Entry [a, b] is count(a before b) / (count(a before b) +
        count(b before a)). Pairs that never occur in any ranking, and the
        diagonal, are left at their raw count (0 for valid permutations).
    """
    freq = np.zeros_like(template, dtype=float)
    for ranking in rankings:
        order = np.asarray(ranking, dtype=int)
        # All position pairs (i, j) with i < j, i.e. order[i] is ranked ahead
        # of order[j].
        ahead, behind = np.triu_indices(len(order), k=1)
        # np.add.at accumulates correctly even if an index pair repeats.
        np.add.at(freq, (order[ahead], order[behind]), 1.0)

    totals = freq + freq.T
    comparable = totals > 0
    # The original loop only normalised off-diagonal pairs.
    np.fill_diagonal(comparable, False)
    np.divide(freq, totals, out=freq, where=comparable)
    return freq


# For each season, plot the mean of the solver's X* matrices next to an X*
# rebuilt from the distinct rankings in P, both permuted by the first ranking.
for year in Ds:
    details = df_details[year]

    # Ordering used to permute the rows and columns of both heatmaps.
    opt_rank = list(details["Pfirst"][0])

    # Distinct rankings only; duplicates in details["P"] are counted once.
    P_set = set(details["P"])
    x_star_from_P = _pairwise_order_frequencies(P_set, details["x"])

    # Element-wise mean of the X* matrices. A float accumulator is forced so
    # an integer-typed details["x"] cannot break the in-place division.
    mean_x_star = np.zeros_like(details["x"], dtype=float)
    for x_star in details["xs"]:
        mean_x_star += x_star
    mean_x_star /= len(details["xs"])

    plt.imshow(mean_x_star[opt_rank, :][:, opt_rank])
    plt.title("Mean X_star for " + year)
    plt.show()

    plt.imshow(x_star_from_P[opt_rank, :][:, opt_rank])
    plt.title("Constructed X_star for " + year + " (" + str(len(P_set)) + ")")
    plt.show()