In [1]:
import numpy as np
import pandas as pd
import sys, os, time
from scipy.stats import norm
import matplotlib.pyplot as plt
import datetime
%matplotlib inline
import ray
sys.path.append('../')


from src.data_structures import FactorGraph, PriorityQueue
from src.sampling_algorithms.factor_components import (gaussian_bounce, chain_bounce_fn, 
                                                       gaussian_grad_potential_fn, 
                                                       gaussian_chain_grad_potential_fn,
                                                           logistic, 
                                                       generate_logistic_bounce, 
                                                       alias_sample, 
                                                       lambda_r, 
                                                       grad_logistic, 
                                                       lambda_bound
                                                      )
from src.utils import (interp, get_xtv, get_first_moment, get_second_moment, get_var)
from src.utils.serialize import pickle_obj, unpickle_obj
from src.sampling_algorithms import MaskedLocalBPS, LocalBPS
from src.mcmc_diagnostics.diagnostic import MCMCDiagnostic
from src.sampling_algorithms.masked_bps.masked_bps_output import OutputReader

from src.utils.serialize import pickle_obj, unpickle_obj, load_json, save_json
from src.utils.params import hash_dict
from src.plots.arrow_plot import arrow_plot
from arviz.stats import ess

from matplotlib import rc
rc('text', usetex=False)
import shutil

parent_dir = '../'
os.environ["PYTHONPATH"] = parent_dir + ":" + os.environ.get("PYTHONPATH", "")

In [2]:
tag = 'football'
sampler = 'masked'
refresh_rate = 0.01
out_dir = 'masked_bps_tmp'
run_time = 60
num_workers = 12

In [3]:
for tag in ['football', 'nfl']:
    for sampler in ['local', 'masked']:
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
            
        if tag == 'nfl':

            num_cuts = 3
            data_dir = '/home/james/Dynamic_BT/NFL_data'

            paths = os.listdir(data_dir)

            dfs = []
            for fp in paths:
                path = os.path.join(data_dir, fp)
                year = fp.strip(".csv")
                df = pd.read_csv(path)
                df['year'] = year
                df['score'] = 0. + df['PtsW']>df['PtsL']
                dfs.append(df)


            winners = df['Winner/tie'].unique()

            losers = df['Loser/tie'].unique()

            def intersect(sets):
                result = sets[0]
                for s in sets:
                    result = result.intersection(s)
                return result

            sets =  [set(d['Winner/tie'].unique()) for d in dfs]
            teams = list(intersect(sets))
            d = len(teams)

            team_map = {team : i for i, team in enumerate(teams)}

            T = len(paths)

            cov_list = []
            dfs = []
            ys = []
            for fp in paths:
                covs = []
                path = os.path.join(data_dir, fp)

                df = pd.read_csv(path)
                rview_1 = (df['PtsW']>df['PtsL'])
                rview_2 = df['Winner/tie'].isin(teams)
                rview_3 = df['Loser/tie'].isin(teams)
                df = df[rview_1 & rview_2 & rview_3]
                y = 0. + (df['PtsW']>df['PtsL']).values
                ys.append(y)

                for i, row in df.to_dict('I').items():
                    home = row['Winner/tie']
                    home_i = team_map[home]
                    away  = row['Loser/tie']
                    away_i = team_map[away]

                    cov = np.zeros(d)
                    cov[home_i] = 1
                    cov[away_i] = -1
                    covs.append(cov.copy())

                cov_list.append(np.array(covs.copy()))

            cov_list = np.array(cov_list)




            cov_p = [np.maximum(0., covs) for covs in cov_list]
            cov_n = [-np.minimum(0., covs) for covs in cov_list]
            sign = [np.expand_dims(-1.**(y),-1) for y in ys]



            mu0 = np.array([0. for _ in range(d)])
            sig0 = np.diag([1. for _ in range(d)])
            mu1 = np.array([0. for _ in range(d)])
            sig1 = 2.*sig0

            global_event_samplers = []
            local_event_samplers = []
            grad_fns = []
            sigs = []

            for t in range(T-1):
                if t == 0:
                    sigs.append(2*sig0)
                    global_event_samplers.append(chain_bounce_fn(mu1, mu0, sig1, sig0, sig0))
                    grad_fns.append(gaussian_chain_grad_potential_fn(mu1, mu0, sig1, sig0, sig0))
                else:
                    sigs.append(sigs[t-1] + sig0)
                    global_event_samplers.append(chain_bounce_fn(mu1, mu0, sigs[t], sigs[t-1], sigs[t-1]))
                    grad_fns.append(gaussian_chain_grad_potential_fn(mu1, mu0, sigs[t], sigs[t-1], sigs[t-1]))

            for t in range(T):        
                local_event_samplers.append(alias_sample(t, cov_n, cov_p, cov_list, sign, ys))



            bounce_fns = [gaussian_bounce(mu0, sig0)] + global_event_samplers + local_event_samplers

            grad_factor_potential_fns = [gaussian_grad_potential_fn(mu0, sig0)] + \
            grad_fns + \
            [grad_logistic(cov_list[g], ys[g]) for g in range(T)]


            factor_indices = [[i for i in range(d)]] + \
                             [[t*d+i for i in range(d)]+[(t+1)*d+i for i in range(d)] for t in range(T-1)] + \
                             [[d*t+i for i in range(d)] for t in range(T)] 

            factor_potential_fns = [lambda x: x for _ in grad_factor_potential_fns]

            nodes = list(set(n for f in factor_indices for n in f ))


            model = FactorGraph(dim_x=len(nodes),
                              factor_indices=factor_indices,
                              factor_potential_fns=factor_potential_fns,
                              grad_factor_potential_fns=grad_factor_potential_fns)



        if tag == 'football':
            num_cuts = 12
            df = pd.read_csv('https://raw.githubusercontent.com/jalapic/engsoccerdata/master/data-raw/england.csv')

            teams = df[df.Season>1950].home.unique()
            dfs = [df[df.Season == yr] for yr in df.Season.unique() if yr > 1950]

            common_teams = []
            for team in teams:
                if all(team in d.home.tolist() for d in dfs):
                    common_teams.append(team)

            rview1 = df.home.isin(common_teams)
            rview2 = df.visitor.isin(common_teams)
            rview3 = df.result != 'D'
            rview4 = df.Season > 1950

            df['score'] = df.result.map({'A':0, 'H':1})
            rdf = df[rview1 & rview2 & rview3 & rview4]

            d = len(common_teams)

            years = rdf['Season'].unique()
            years.sort()
            team_map = {team : i for i, team in enumerate(common_teams)}

            all_years = rdf.Season.astype(int)

            T = len(years)

            cov_list = []
            ys = []
            for year in years:
                year_rdf = rdf[all_years== year.item()]
                covs = []
                ys.append(year_rdf.score.values)
                for i, row in year_rdf.to_dict('I').items():
                    home = row['home']
                    home_i = team_map[home]
                    away  = row['visitor']
                    away_i = team_map[away]

                    cov = np.zeros(d)
                    cov[home_i] = 1
                    cov[away_i] = -1
                    covs.append(cov.copy())

                cov_list.append(np.array(covs.copy()))

            cov_list = np.array(cov_list)

            cov_p = [np.maximum(0., covs) for covs in cov_list]
            cov_n = [-np.minimum(0., covs) for covs in cov_list]
            sign = [np.expand_dims(-1.**(y),-1) for y in ys]



            mu0 = np.array([0. for _ in range(d)])
            sig0 = np.diag([1. for _ in range(d)])
            mu1 = np.array([0. for _ in range(d)])
            sig1 = 2.*sig0

            global_event_samplers = []
            local_event_samplers = []
            grad_fns = []
            sigs = []

            for t in range(T-1):
                if t == 0:
                    sigs.append(2*sig0)
                    global_event_samplers.append(chain_bounce_fn(mu1, mu0, sig1, sig0, sig0))
                    grad_fns.append(gaussian_chain_grad_potential_fn(mu1, mu0, sig1, sig0, sig0))
                else:
                    sigs.append(sigs[t-1] + sig0)
                    global_event_samplers.append(chain_bounce_fn(mu1, mu0, sigs[t], sigs[t-1], sigs[t-1]))
                    grad_fns.append(gaussian_chain_grad_potential_fn(mu1, mu0, sigs[t], sigs[t-1], sigs[t-1]))

            for t in range(T):        
                local_event_samplers.append(alias_sample(t, cov_n, cov_p, cov_list, sign, ys))



            bounce_fns = [gaussian_bounce(mu0, sig0)] + global_event_samplers + local_event_samplers

            grad_factor_potential_fns = [gaussian_grad_potential_fn(mu0, sig0)] + \
            grad_fns + \
            [grad_logistic(cov_list[g], ys[g]) for g in range(T)]


            factor_indices = [[i for i in range(d)]] + \
                             [[t*d+i for i in range(d)]+[(t+1)*d+i for i in range(d)] for t in range(T-1)] + \
                             [[d*t+i for i in range(d)] for t in range(T)] 

            factor_potential_fns = [lambda x: x for _ in grad_factor_potential_fns]

            nodes = list(set(n for f in factor_indices for n in f ))


            model = FactorGraph(dim_x=len(nodes),
                              factor_indices=factor_indices,
                              factor_potential_fns=factor_potential_fns,
                              grad_factor_potential_fns=grad_factor_potential_fns)


        init_x = np.random.randn(len(nodes))
        init_v = np.random.randn(len(nodes))


        if sampler == 'local':
            local_bps = LocalBPS(init_x = init_x,
                     init_v = init_v,
                     factor_graph = model,
                     bounce_fns=bounce_fns,
                     refresh_rate=0.001)

            start = time.time()
            nsim= 2*10**5
            results = local_bps.simulate(nsim)
            res = results
            duration = time.time()-start
            print(duration)
            print('Finished run')


            m = min(len(nodes),100)
            esses = []
            for i in range(m):
                x1,v1,t1=get_xtv(res,i)
                x = interp(x1,t1,v1, num_intervals=nsim*5)
                e = ess(x, method='bulk')
                esses.append(e)

            ess_speed = np.mean(esses)/duration
            del res


        if sampler == 'masked':

            def make_split_fn(d, T):
                d = d
                T = T
                def split_mask_into_groups(factor_indices, mask):
                    blocks = []
                    current_block = []
                    current_block.append(0)
                    current_block.append(T)
                    for f in range(1, T):
                        prev_f = f - 1
                        if np.sum(mask[np.intersect1d(factor_indices[prev_f], factor_indices[f])]) > 0:
                            current_block.append(f)
                            current_block.append(f+T)
                        else:
                            blocks.append(current_block)
                            current_block = []
                            current_block.append(f)
                            current_block.append(f+T)
                    blocks.append(current_block)
                    return blocks
                return split_mask_into_groups

            def sample_mask(num_cuts, num_params, factor_indices, d):
                    mask = np.repeat(1, num_params)
                    f_is = np.random.choice(len(factor_indices),num_cuts)

                    for i in f_is:
                        cuts = factor_indices[i][-d:]
                        mask[cuts]=0
                        mask[:d] = 1
                        mask[-d:] = 1
                    return mask

            num_params = len(nodes)

            #-----------------------------------------------------------------------------
            sample_mask_fn = lambda: sample_mask(num_cuts, num_params, factor_indices, d)
            init_mask = sample_mask_fn()

            # Shutdown and init ray
            if ray.is_initialized():
                ray.shutdown()

            ray.init()


            # init sampler
            #--------------------------------------------------------------------------------------
            mlbps = MaskedLocalBPS(init_x = init_x,
                                   init_v = init_v,
                                   init_mask = init_mask,
                                   factor_graph=model,
                                   bounce_fns=bounce_fns,
                                   refresh_rate= refresh_rate,
                                   split_mask_fn = make_split_fn(d, T),
                                   sample_mask_fn=sample_mask_fn,
                                   max_number_sub_samplers = num_workers)

            # run sampler
            start = datetime.datetime.now()
            res = mlbps.simulate_for_time(run_time, output_dir=out_dir)
            results, groups, masks = res
            stop = datetime.datetime.now()
            time_delta = (stop-start).seconds
            print('Finished run')
            print(time_delta)

            # Shutdown and init ray
            if ray.is_initialized():
                ray.shutdown()

            output_reader = OutputReader(mlbps)
            output, num_iterations = output_reader.read_output(out_dir, verbose = False, inplace =False)

            m = min(100,mlbps.d)
            chains = {}
            esses = []
            for i in range(m):
                x, v, t, mask = output[i]['x'], output[i]['v'], output[i]['t'], output[i]['mask']
                x = np.array(x)
                v = np.array(v)
                t = np.array(t)
                e = ess(x, method='bulk')
                esses.append(e)
        #         mask = np.array(mask)
        #         nsim = len(x)
        #         xs = interp(x, t, v*mask, num_intervals= nsim*5)
        #         chains["x_{0}".format(i)] = xs
            ess_speed = np.mean(esses)/time_delta
            del res
            shutil.rmtree(out_dir)
        #     mcmc_diagnostic_obj = MCMCDiagnostic(chains)
        #     esses = [mcmc_diagnostic_obj.ess('x_{0}'.format(i)) for i in range(d)]

        #     iteration_speed = num_iterations/time_delta
        #     ess_speed  = np.mean(esses)/time_delta



        results = {
            "ess_speed" : ess_speed.item(),
            'T' : T,
            'd' : d,
            'method': sampler,
            'data': tag
        }
        print(results)
        pickle_obj(results, os.path.join('./', 'dbt_{0}_{1}.pkl'.format(tag, sampler)))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


  cov_list = np.array(cov_list)


100%|██████████| 200000/200000 [02:33<00:00, 1300.80it/s]


163.9191119670868
Finished run
{'ess_speed': 0.010008268245397548, 'T': 69, 'd': 61, 'method': 'local', 'data': 'football'}


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


  cov_list = np.array(cov_list)
2021-02-18 04:15:51,863	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=32259)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32259)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32262)[0m   e = np.exp(covariates.dot(x+t*v))
[2m[36m(pid=32262)[0m   return np.maximum(0.,(covariates*(e/(1+e)-y)).dot(v))
[2m[36m(pid=32262)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32262)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32265)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32265)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32269)[0m   e = np.exp(covariates.dot(x+t*v))
[2m[36m(pid=32269)[0m   return np.maximum(0.,(covariates*(e/(1+e)-y)).dot(v))
[2m[36m(pid=32264)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32264)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32272)[0m   e = np.exp(covariates.dot(x+t*v))
[2m[36m(pid=32272)[0m   return np.maximum(0.,(covariates*(e/(1+e)-y)).dot(v))
[2m[36m(pid=32272)[0m   e =

[2m[36m(pid=32259)[0m   e = np.exp(covariates.dot(x+t*v))
[2m[36m(pid=32259)[0m   return np.maximum(0.,(covariates*(e/(1+e)-y)).dot(v))
[2m[36m(pid=32270)[0m   e = np.exp(covariates.dot(x+t*v))
[2m[36m(pid=32270)[0m   return np.maximum(0.,(covariates*(e/(1+e)-y)).dot(v))
[2m[36m(pid=32267)[0m   e = np.exp(covariates.dot(x+t*v))
[2m[36m(pid=32267)[0m   return np.maximum(0.,(covariates*(e/(1+e)-y)).dot(v))


Finished run
70


  cov_list = np.array(cov_list)
  0%|          | 0/200000 [00:00<?, ?it/s]

{'ess_speed': 0.0909632710621263, 'T': 69, 'd': 61, 'method': 'masked', 'data': 'football'}


100%|██████████| 200000/200000 [00:50<00:00, 3987.78it/s]


51.444275856018066
Finished run
{'ess_speed': 0.3783762921385651, 'T': 9, 'd': 31, 'method': 'local', 'data': 'nfl'}


  cov_list = np.array(cov_list)
2021-02-18 04:21:10,445	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=32675)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32675)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32675)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32675)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32684)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32684)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32684)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32684)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32683)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32683)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32683)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32683)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32682)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32682)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32682)[0m   e = np.exp(cov.dot(x))
[2m[36m(pid=32682)[0m   out = cov*(e/(1+e)-y)
[2m[36m(pid=32682)[0m   e = np.exp(covariates.dot(x+t*v))

Finished run
83
{'ess_speed': 0.13570349940121632, 'T': 9, 'd': 31, 'method': 'masked', 'data': 'nfl'}
