In [1]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm 

from model.mma_features import DataPreprocessor, SimpleFeatureExtractor, EloFeatureExtractor, BioFeatureExtractor

stats_df = pd.read_csv("data/clean_stats3.csv")
# run thru preprocessor to get _opp stats for each fight
DP = DataPreprocessor(stats_df)
pp_df = DP.get_preprocessed_df()
pp_df.head()

Unnamed: 0,Date,Opponent,Event,FighterResult,TSL,TSA,SSL,SSA,TSL-TSA,KD,...,SDHA_opp,SDLL_opp,SDLA_opp,TD_fails_opp,submission_rate_opp,distance_strikes_landed_opp,clinch_strikes_landed_opp,standing_strikes_opp,KD_power_opp,ground_strikes_landed_opp
0,1991-09-26,Murilo Bustamante,Desafio,L,,,,,,,...,,,,,,,,,,
1,1992-01-01,Renzo Gracie,Desafio,L,,,,,,,...,,,,,,,,,,
2,1993-08-29,Moura Moura,CP X CB,W,,,,,,,...,,,,,,,,,,
3,1993-08-29,Jose Landi-Jons,CP X CB,L,,,,,,,...,,,,,,,,,,
4,1993-11-08,Ken Shamrock,Pancrase,L,,,,,,,...,,,,,,,,,,


In [3]:
simple_fe = SimpleFeatureExtractor(pp_df)
simple_fe.fit_transform_all()
simple_fe.trans_df.head()

100%|██████████| 4450/4450 [00:43<00:00, 101.46it/s]


Unnamed: 0,FighterID,OpponentID,Date,total_fights,total_ufc_fights,t_since_last_fight,total_fights_opp,total_ufc_fights_opp,t_since_last_fight_opp,t_since_last_fight_diff,t_since_last_fight_log_diff,total_fights_diff,total_fights_sqrt_diff,total_ufc_fights_diff,total_ufc_fights_sqrt_diff
0,2558095/marcelo-mendes,2354059/murilo-bustamante,1991-09-26,0,0,730,0,0,730,0,0.0,0,0.0,0,0.0
1,2501396/luiz-augusto-alvareda,2354119/renzo-gracie,1992-01-01,0,0,730,0,0,730,0,0.0,0,0.0,0,0.0
2,2354132/jose-landi-jons,3107994/moura-moura,1993-08-29,0,0,730,0,0,730,0,0.0,0,0.0,0,0.0
4,2557847/takaku-fuke,2335653/ken-shamrock,1993-11-08,0,0,730,0,0,730,0,0.0,0,0.0,0,0.0
8,2335738/gerard-gordeau,2504082/kevin-rosier,1993-11-12,0,0,730,0,0,730,0,0.0,0,0.0,0,0.0


In [4]:
# real_elo_target_cols = ["SSL", "TSL", "TDL"]
real_elo_target_cols = [
#     "fighter_result_time_left", 
    "ordinal_fighter_result",
    "submission_fighter_result",
    "tko_ko_fighter_result",
    "decision_fighter_result",
    "finish_fighter_result",
]
diff_elo_target_cols = [
    'TSL',
    'TDL',
    'TDS',
    'SSL',
    'SM',
    'RV',
    'KD',
    'SGHL',
    'SGBL',
    'SCBL',
    'SCHL',
    'ADTB',
    'ADTM',
    'AD',
    'TD_fails',
    'distance_strikes_landed',
    'clinch_strikes_landed',
    'standing_strikes',
    'ground_strikes_landed',
]
binary_elo_target_cols = ["Win"]

elo_fe = EloFeatureExtractor(pp_df.assign(Win=pp_df["FighterResult"].map({"W":1.,"L":0.,"D":np.nan})), 
                             elo_alpha=0.4,
                             real_elo_target_cols = real_elo_target_cols, 
                             diff_elo_target_cols = diff_elo_target_cols,
                             binary_elo_target_cols = binary_elo_target_cols)
elo_fe.fit_transform_all()
elo_fe.elo_df.head()

3959it [00:00, 6872.04it/s]
3959it [00:00, 6093.56it/s]
3959it [00:00, 6744.59it/s]
3962it [00:00, 6529.07it/s]
3959it [00:00, 6298.96it/s]
3959it [00:00, 6408.03it/s]
3959it [00:00, 6760.29it/s]
3959it [00:00, 6233.30it/s]
3959it [00:00, 6292.59it/s]
3959it [00:00, 6622.54it/s]
3962it [00:00, 5250.75it/s]
3959it [00:00, 5728.86it/s]
3959it [00:00, 6542.50it/s]
3959it [00:00, 6880.49it/s]
3962it [00:00, 5412.27it/s]
39390it [00:07, 5603.79it/s]
3959it [00:00, 5375.66it/s]
40119it [00:06, 5801.43it/s]
3962it [00:00, 6991.84it/s]
40119it [00:06, 6377.43it/s]
3959it [00:00, 6308.96it/s]
40119it [00:05, 7183.42it/s]
3959it [00:00, 7209.01it/s]
40119it [00:05, 6702.49it/s]
40119it [00:06, 5856.30it/s]


Unnamed: 0,FighterID,OpponentID,oldFighterEloAD,oldOpponentEloAD,predTargetAD,targetAD,Date,newFighterEloAD,newOpponentEloAD,oldEloDiffAD,...,newFighterElosubmission_fighter_result,newOpponentElosubmission_fighter_result,oldEloDiffsubmission_fighter_result,oldFighterElotko_ko_fighter_result,oldOpponentElotko_ko_fighter_result,predTargettko_ko_fighter_result,targettko_ko_fighter_result,newFighterElotko_ko_fighter_result,newOpponentElotko_ko_fighter_result,oldEloDifftko_ko_fighter_result
0,2335635/jason-delucia,2335757/trent-jenkins,0.0,0.0,0.0,1.0,1993-11-12,0.2,-0.2,0.0,...,0.2,-0.2,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
1,2335738/gerard-gordeau,2504081/teila-tuli,0.0,0.0,0.0,0.0,1993-11-12,0.0,0.0,0.0,...,-0.128,-0.032,-0.16,0.0,0.0,0.0,1,0.2,-0.2,0.0
2,2335728/zane-frazier,2504082/kevin-rosier,0.0,0.0,0.0,0.0,1993-11-12,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1,-0.2,0.2,0.0
3,2335738/gerard-gordeau,2504082/kevin-rosier,0.0,0.0,0.0,0.0,1993-11-12,0.0,0.0,0.0,...,-0.1024,-0.0256,-0.128,0.2,0.2,0.0,1,0.4,0.0,0.0
4,2335635/jason-delucia,2951141/scott-baker,0.2,0.0,0.2,0.267949,1994-03-11,0.21359,-0.01359,0.2,...,0.2256,-0.1936,0.032,0.0,0.0,0.0,0,0.0,0.0,0.0


In [5]:
feat_df = elo_fe.elo_df.merge(
    simple_fe.trans_df,
    on=["FighterID", "OpponentID", "Date"],
    how="inner"
)

bio_df = pd.read_csv("data/clean_bios.csv")
bio_fe = BioFeatureExtractor(bio_df)
feat_df = bio_fe.fit_transform_all(feat_df)
feat_df.head()

Unnamed: 0,FighterID,OpponentID,oldFighterEloAD,oldOpponentEloAD,predTargetAD,targetAD,Date,newFighterEloAD,newOpponentEloAD,oldEloDiffAD,...,imp_reach_opp,imp_weight_opp,imp_height_opp,age,age_opp,age_diff,reach_diff,weight_diff,log_weight_diff,height_diff
0,2335635/jason-delucia,2335757/trent-jenkins,0.0,0.0,0.0,1.0,1993-11-12,0.2,-0.2,0.0,...,78.426958,250.0,75.854283,,,0.0,-5.273051,-64.4375,-0.298069,-4.64658
1,2335738/gerard-gordeau,2504081/teila-tuli,0.0,0.0,0.0,0.0,1993-11-12,0.0,0.0,0.0,...,78.426958,250.0,75.854283,34.646575,,0.0,0.0,0.0,0.0,0.0
2,2335728/zane-frazier,2504082/kevin-rosier,0.0,0.0,0.0,0.0,1993-11-12,0.0,0.0,0.0,...,71.634058,167.003261,69.868379,27.345205,,0.0,0.0,0.0,0.0,0.0
3,2335738/gerard-gordeau,2504082/kevin-rosier,0.0,0.0,0.0,0.0,1993-11-12,0.0,0.0,0.0,...,71.634058,167.003261,69.868379,34.646575,,0.0,0.0,0.0,0.0,0.0
4,2335635/jason-delucia,2951141/scott-baker,0.2,0.0,0.2,0.267949,1994-03-11,0.21359,-0.01359,0.2,...,78.426958,250.0,75.854283,,,0.0,-5.273051,-64.4375,-0.298069,-4.64658


In [6]:
# okay i need the opening moneyline...
ml_df = pd.read_csv("data/clean_stats_plus_ml.csv")[[
    "Date", "FighterID", "OpponentID", 
    'FighterOpen', 'OpponentOpen',
    'FighterCloseLeft', 'FighterCloseRight', 'OpponentCloseLeft',
    'OpponentCloseRight',
]]
ml_df["Date"] = pd.to_datetime(ml_df["Date"])

def parse_american_odds(x:pd.Series):
    fav_inds = x <= 0
    dog_inds = x > 0
    y = pd.Series(0, index=x.index)
    y.loc[fav_inds] = -1 * x / (100 - x)
    y.loc[dog_inds] = 100 / (100 + x)
    return y

ml_df["p_fighter"] = parse_american_odds(ml_df["FighterOpen"])
ml_df["p_opponent"] = parse_american_odds(ml_df["OpponentOpen"])
ml_df["p_fighter_midpoint"] = (ml_df["p_fighter"] + 1 - ml_df["p_opponent"]) / 2
ml_df["p_fighter_implied"] = ml_df["p_fighter"] / (ml_df["p_fighter"] + ml_df["p_opponent"])
# print(feat_df.shape, ml_df.shape)
ml_df.columns

Index(['Date', 'FighterID', 'OpponentID', 'FighterOpen', 'OpponentOpen',
       'FighterCloseLeft', 'FighterCloseRight', 'OpponentCloseLeft',
       'OpponentCloseRight', 'p_fighter', 'p_opponent', 'p_fighter_midpoint',
       'p_fighter_implied'],
      dtype='object')

In [7]:
feat_ml_df = feat_df.merge(
    ml_df, 
    on=["Date", "FighterID", "OpponentID"],
    how="inner"
)
print(feat_ml_df.shape)
feat_ml_df.head()

(3220, 217)


Unnamed: 0,FighterID,OpponentID,oldFighterEloAD,oldOpponentEloAD,predTargetAD,targetAD,Date,newFighterEloAD,newOpponentEloAD,oldEloDiffAD,...,FighterOpen,OpponentOpen,FighterCloseLeft,FighterCloseRight,OpponentCloseLeft,OpponentCloseRight,p_fighter,p_opponent,p_fighter_midpoint,p_fighter_implied
0,2335629/bj-penn,2335885/jens-pulver,0.842355,0.364097,0.478257,2.645751,2007-06-23,1.275853,-0.069401,0.478257,...,-490.0,340.0,-357.0,-330.0,225.0,280.0,0.830508,0.227273,0.801618,0.785142
1,2335676/joe-lauzon,2354360/brandon-melendez,0.091024,0.0,0.091024,2.0,2007-06-23,0.472819,-0.381795,0.091024,...,-430.0,330.0,-700.0,-475.0,355.0,425.0,0.811321,0.232558,0.789381,0.777217
2,2335447/anderson-silva,2335475/nate-marquardt,0.227196,0.736307,-0.509111,-1.0,2007-07-07,0.129018,0.834485,-0.509111,...,-170.0,150.0,-145.0,-145.0,125.0,125.0,0.62963,0.4,0.614815,0.611511
3,2335302/heath-herring,2335521/antonio-rodrigo-nogueira,0.10641,0.0,0.10641,-2.645751,2007-07-07,-0.444022,0.550432,0.10641,...,470.0,-810.0,500.0,500.0,-700.0,-700.0,0.175439,0.89011,0.142664,0.164646
4,2335694/frankie-edgar,2335717/mark-bocek,0.0,0.0,0.0,1.414214,2007-07-07,0.282843,-0.282843,0.0,...,-260.0,180.0,-230.0,-230.0,190.0,190.0,0.722222,0.357143,0.68254,0.669118


In [8]:
feat_cols = [
    "oldEloDiffordinal_fighter_result", 
    "oldEloDiffsubmission_fighter_result",
    "oldEloDifftko_ko_fighter_result",
    "oldEloDiffdecision_fighter_result",
#     "oldEloDifffinish_fighter_result",
#     "oldEloDifffighter_result_time_left",
    
#     'oldEloDiffAD', 
    'oldEloDiffADTB', 
    'oldEloDiffADTM', 
    'oldEloDiffKD',
    'oldEloDiffRV', 
#     'oldEloDiffSCBL', 
#     'oldEloDiffSCHL', 
    'oldEloDiffSGBL',
    'oldEloDiffSGHL', 
    'oldEloDiffSM', 'oldEloDiffSSL', 'oldEloDiffTDL',
    'oldEloDiffTDS', 
#     'oldEloDiffTD_fails', 
    'oldEloDiffTSL',
    'oldEloDiffclinch_strikes_landed', 
    'oldEloDiffdistance_strikes_landed',
    'oldEloDiffground_strikes_landed', 
    'oldEloDiffstanding_strikes',
    
    'oldEloDiffWin',
    
    "t_since_last_fight_log_diff", 
#     "t_since_last_fight_diff",
    "total_fights_sqrt_diff", 
    "total_ufc_fights_diff",
    
    "age_diff", "reach_diff", 
#     "weight_diff", 
    "log_weight_diff",
    "height_diff",
]

max_train_dt = pd.to_datetime("2021-07-01")

train_df = feat_ml_df.loc[feat_ml_df["Date"] <= max_train_dt].dropna(subset=[*feat_cols, "targetWin"])
test_df = feat_ml_df.loc[feat_ml_df["Date"] > max_train_dt].dropna(subset=[*feat_cols, "targetWin"])

In [9]:
from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler(with_mean=False)
scale_ = (train_df[feat_cols]**2).mean(0)
X_train = train_df[feat_cols] / scale_
X_test = test_df[feat_cols] / scale_

y_train = train_df["targetWin"]
y_test = test_df["targetWin"]

In [10]:
def logit(x):
    return np.log(x) - np.log(1-x)

ml_train = logit(train_df["p_fighter_implied"])
ml_test = logit(test_df["p_fighter_implied"])

In [16]:
# okay uhh
import stan

code = """

data {
    int<lower=0> n;                     // number of data points in training data
    int<lower=0> n2;                    // number of data points in test data
    int<lower=1> d;                     // explanatory variable dimension
    int<lower=0,upper=1> y[n];          // response variable
    real<lower=0> beta_prior_std;       // prior scale on beta

    matrix[n, d] X;                     // explanatory variable
    vector[n] ml_logit;                   // logit of the opening money line

    matrix[n2, d] X2;                   // test data
    vector[n2] ml_logit2;                 // test data

}

parameters {
    vector[d] beta;
}

transformed parameters {
    vector[n] eta;
    vector[n2] eta2;
    eta = ml_logit + (X * beta);      // linear predictor
    eta2 = ml_logit2 + (X2 * beta);   // linear predictor for test data
}

model {
    for(i in 1:d){
        beta[i] ~ normal(0, beta_prior_std);
        //beta[i] ~ cauchy(0, beta_prior_std); //prior for slopes following gelman 2008
    }

    // observation model
    y ~ bernoulli_logit(eta);
}

generated quantities {
    vector[n2] y_pred;
    y_pred = inv_logit(eta2);  // y values predicted for test data
}
"""

class SimpleSymmetricModel(object):
    
    def __init__(self, beta_prior_std=0.1, num_chains=4, num_samples=1000):
        self.beta_prior_std = float(beta_prior_std)
        self.code = code
        self.scale_ = None
        self.fit = None
        self.num_chains = num_chains
        self.num_samples = num_samples
        
    def _fit(self, data):
        posterior = stan.build(self.code, data=data, random_seed=1)
        fit = posterior.sample(num_chains=self.num_chains, num_samples=self.num_samples)
        self.fit = fit
        return fit
        
    def fit_predict(self, train_df, test_df, feat_cols):
        scale_ = (train_df[feat_cols]**2).mean(0)
        self.scale_ = scale_
        X_train = train_df[feat_cols] / scale_
        X_test = test_df[feat_cols] / scale_

        y_train = train_df["targetWin"]
        y_test = test_df["targetWin"]

        ml_train = logit(train_df["p_fighter_implied"])
        ml_test = logit(test_df["p_fighter_implied"])
        
        data = {
            "n": train_df.shape[0],
            "n2": test_df.shape[0],
            "d": len(feat_cols),
            "y": y_train.astype(int).values,
            "beta_prior_std": self.beta_prior_std,
            "X": X_train.values,
            "ml_logit": ml_train.values,
            "X2": X_test.values,
            "ml_logit2": ml_test.values,
        }

        fit = self._fit(data)
        return fit["y_pred"].mean(1)

sym_model = SimpleSymmetricModel(beta_prior_std=2.0, num_samples=500)
y_pred = sym_model.fit_predict(train_df, test_df, feat_cols)
(y_pred.round() == test_df["targetWin"]).mean()

Building...



Building: found in cache, done.Messages from stanc:
    of arrays by placing brackets after a variable name is deprecated and
    will be removed in Stan 2.32.0. Instead use the array keyword before the
    type. This can be changed automatically using the auto-format flag to
    stanc
Sampling:   0%
Sampling:   0% (1/6000)
Sampling:   0% (2/6000)
Sampling:   0% (3/6000)
Sampling:   0% (4/6000)
Sampling:   2% (103/6000)
Sampling:   3% (202/6000)
Sampling:   5% (301/6000)
Sampling:   7% (400/6000)
Sampling:   8% (500/6000)
Sampling:  10% (600/6000)
Sampling:  12% (700/6000)
Sampling:  13% (800/6000)
Sampling:  15% (900/6000)
Sampling:  17% (1000/6000)
Sampling:  18% (1100/6000)
Sampling:  20% (1200/6000)
Sampling:  22% (1300/6000)
Sampling:  23% (1400/6000)
Sampling:  25% (1500/6000)
Sampling:  27% (1600/6000)
Sampling:  28% (1700/6000)
Sampling:  30% (1800/6000)
Sampling:  32% (1900/6000)
Sampling:  33% (2000/6000)
Sampling:  35% (2100/6000)
Sampling:  37% (2200/6000)
Sampling:  38% (

0.6555891238670695

In [17]:
from sklearn.metrics import log_loss

log_loss(y_pred=y_pred, y_true=y_test)

0.6099921107636985

In [18]:
pd.DataFrame(list(zip(feat_cols, sym_model.fit["beta"].mean(1)))).sort_values(1)

Unnamed: 0,0,1
0,oldEloDiffordinal_fighter_result,-1.522528
22,total_ufc_fights_diff,-0.406048
17,oldEloDiffground_strikes_landed,-0.168887
14,oldEloDiffTSL,-0.067233
26,height_diff,-0.051841
20,t_since_last_fight_log_diff,-0.036082
6,oldEloDiffKD,-0.018691
5,oldEloDiffADTM,-0.008985
7,oldEloDiffRV,-0.000976
19,oldEloDiffWin,0.001228


In [64]:
from sklearn.decomposition import PCA, KernelPCA

class PcaSymmetricModel(object):
    
    def __init__(self, beta_prior_std=0.1, n_pca=8, num_chains=4, num_samples=1000):
        self.beta_prior_std = float(beta_prior_std)
        self.n_pca = n_pca
        self.code = code
        self.scale_ = None
        self.pca = PCA(n_components=n_pca, whiten=True)
        self.fit = None
        self.num_chains = num_chains
        self.num_samples = num_samples
        
    def _fit(self, data):
        posterior = stan.build(self.code, data=data, random_seed=1)
        fit = posterior.sample(num_chains=self.num_chains, num_samples=self.num_samples)
        self.fit = fit
        return fit
        
    def fit_predict(self, train_df, test_df, feat_cols):
        scale_ = (train_df[feat_cols]**2).mean(0)
        self.scale_ = scale_
        X_train = train_df[feat_cols] / scale_
        X_test = test_df[feat_cols] / scale_
        
        # pca happens here
        X_pca_train = self.pca.fit_transform(X_train)
        X_pca_test = self.pca.transform(X_test)

        y_train = train_df["targetWin"]
        y_test = test_df["targetWin"]

        ml_train = logit(train_df["p_fighter_implied"])
        ml_test = logit(test_df["p_fighter_implied"])
        
        data = {
            "n": train_df.shape[0],
            "n2": test_df.shape[0],
            "d": X_pca_train.shape[1],
            "y": y_train.astype(int).values,
            "beta_prior_std": self.beta_prior_std,
            "X": X_pca_train,
            "ml_logit": ml_train.values,
            "X2": X_pca_test,
            "ml_logit2": ml_test.values,
        }

        fit = self._fit(data)
        return fit["y_pred"].mean(1)

pca_model = PcaSymmetricModel(n_pca=16, beta_prior_std=1.0, num_samples=500)
y_pred = pca_model.fit_predict(train_df, test_df, feat_cols)
acc = (y_pred.round() == test_df["targetWin"]).mean()
print(f"accuracy: {acc}")
print(f"log loss: {log_loss(y_pred=y_pred, y_true=y_test)}")

Building...



Building: found in cache, done.Messages from stanc:
    of arrays by placing brackets after a variable name is deprecated and
    will be removed in Stan 2.32.0. Instead use the array keyword before the
    type. This can be changed automatically using the auto-format flag to
    stanc
Sampling:   0%
Sampling:   8% (500/6000)
Sampling:  17% (1000/6000)
Sampling:  23% (1400/6000)
Sampling:  27% (1600/6000)
Sampling:  33% (2000/6000)
Sampling:  50% (3000/6000)
Sampling:  68% (4100/6000)
Sampling:  90% (5400/6000)
Sampling: 100% (6000/6000)
Sampling: 100% (6000/6000), done.
Messages received during sampling:
  Gradient evaluation took 0.000132 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.32 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 0.00016 seconds
  1000 transitions using 10 leapfrog steps per transition would take 1.6 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 0.000133 seconds
  1000 tra

accuracy: 0.649546827794562
log loss: 0.6088821271992869


In [19]:
gender_df = pd.read_csv("data/fighter_genders.csv")
# gender_map = gender_df.set_index("FighterID").to_dict()["gender"]

train_gender_df = train_df.merge(gender_df, on="FighterID", how="left")
test_gender_df = test_df.merge(gender_df, on="FighterID", how="left")

train_gender_df["gender"].isnull().any(), test_gender_df["gender"].isnull().any()

(False, False)

In [30]:
# okay uhh
import stan

hier_code = """

data {
    int<lower=0> n;                     // number of data points in training data
    int<lower=0> n2;                    // number of data points in test data
    int<lower=1> d;                     // explanatory variable dimension
    int<lower=0,upper=1> y[n];          // response variable
    real<lower=0> beta_prior_std;       // prior scale on beta mean across groups
    real<lower=0> intra_group_std;      // prior scale on beta, std dev of group's beta around mean
    
    int<lower=1> L;                     // number of levels (if this is gender, then it's just 2)
    int<lower=1,upper=L> ll[n];         // level of each fight in train
    int<lower=1,upper=L> ll2[n2];       // level of each fight in test

    matrix[n, d] X;                     // explanatory variable
    vector[n] ml_logit;                   // logit of the opening money line

    matrix[n2, d] X2;                   // test data
    vector[n2] ml_logit2;                 // test data

}

parameters {
    vector[d] beta[L];    // coefficient values for each level
    real mu[d];           // mean of coefficient values for each level
}

transformed parameters {
    //vector[n] eta;
    //vector[n2] eta2;
    //eta = ml_logit + (X * beta);      // linear predictor
    //eta2 = ml_logit2 + (X2 * beta);   // linear predictor for test data
}

model {

    mu ~ normal(0, beta_prior_std);
    for (l in 1:L)
        beta[l] ~ normal(mu, intra_group_std); // TODO magic number - more or less defines covariance btw genders

    vector[n] x_beta_ll;
    for (i in 1:n)
        x_beta_ll[i] = X[i] * beta[ll[i]];
    y ~ bernoulli_logit(ml_logit + x_beta_ll);
}

generated quantities {
    vector[n2] y_pred;
    
    for(i in 1:n2){
        y_pred[i] = inv_logit(ml_logit2[i] + X[i] * beta[ll2[i]]);
    }
    //y_pred = inv_logit(eta2);  // y values predicted for test data
}
"""

class HierarchicalSymmetricModel(SimpleSymmetricModel):
    
    def __init__(self, beta_prior_std=1.0, intra_group_std=0.1, num_chains=4, num_samples=100):
        self.beta_prior_std = float(beta_prior_std)
        self.intra_group_std = float(intra_group_std)
        self.code = hier_code
        self.scale_ = None
        self.fit = None
        self.num_chains = num_chains
        self.num_samples = num_samples
        
    def fit_predict(self, train_df, test_df, feat_cols):
        scale_ = (train_df[feat_cols]**2).mean(0)
        self.scale_ = scale_
        X_train = train_df[feat_cols] / scale_
        X_test = test_df[feat_cols] / scale_

        y_train = train_df["targetWin"]
        y_test = test_df["targetWin"]

        ml_train = logit(train_df["p_fighter_implied"])
        ml_test = logit(test_df["p_fighter_implied"])
        
        gender_int_train = train_df["gender"].map({"M":1, "W":2})
        gender_int_test = test_df["gender"].map({"M":1, "W":2})
        
        data = {
            "n": train_df.shape[0],
            "n2": test_df.shape[0],
            "d": len(feat_cols),
            "y": y_train.astype(int).values,
            "beta_prior_std": self.beta_prior_std,
            "intra_group_std": self.intra_group_std,
            "L": 2,
            "ll": gender_int_train.values,
            "ll2": gender_int_test.values,
            "X": X_train.values,
            "ml_logit": ml_train.values,
            "X2": X_test.values,
            "ml_logit2": ml_test.values,
        }
        fit = self._fit(data)
        return fit["y_pred"].mean(1)

hier_model = HierarchicalSymmetricModel(beta_prior_std=1.0, intra_group_std=0.5)
y_pred = hier_model.fit_predict(train_gender_df, test_gender_df, feat_cols)
(y_pred.round() == test_gender_df["targetWin"]).mean()

Building...



Building: 15.1s, done.Messages from stanc:
    of arrays by placing brackets after a variable name is deprecated and
    will be removed in Stan 2.32.0. Instead use the array keyword before the
    type. This can be changed automatically using the auto-format flag to
    stanc
    of arrays by placing brackets after a variable name is deprecated and
    will be removed in Stan 2.32.0. Instead use the array keyword before the
    type. This can be changed automatically using the auto-format flag to
    stanc
    of arrays by placing brackets after a variable name is deprecated and
    will be removed in Stan 2.32.0. Instead use the array keyword before the
    type. This can be changed automatically using the auto-format flag to
    stanc
    of arrays by placing brackets after a variable name is deprecated and
    will be removed in Stan 2.32.0. Instead use the array keyword before the
    type. This can be changed automatically using the auto-format flag to
    stanc
    of arrays by

0.6586102719033232

In [31]:
# since i chose an aggressively tight spread btw m and w, 
# this should be close to 0.6087682255527661
log_loss(y_pred=y_pred, y_true=y_test)

0.6389669085580539

In [32]:
m_beta = hier_model.fit["beta"][0].mean(1)
w_beta = hier_model.fit["beta"][1].mean(1)

beta_df = pd.DataFrame({
    "m_beta": m_beta,
    "w_beta": w_beta,
    "diff": m_beta - w_beta,
    "feat": feat_cols,
})
beta_df.sort_values("diff")

Unnamed: 0,m_beta,w_beta,diff,feat
26,-0.060313,0.24869,-0.309003,height_diff
20,-0.078592,0.194258,-0.27285,t_since_last_fight_log_diff
17,-0.147215,-0.011044,-0.136171,oldEloDiffground_strikes_landed
14,-0.087859,0.035567,-0.123426,oldEloDiffTSL
1,0.375854,0.474594,-0.098739,oldEloDiffsubmission_fighter_result
21,0.067404,0.144301,-0.076897,total_fights_sqrt_diff
2,0.432484,0.468904,-0.03642,oldEloDifftko_ko_fighter_result
6,-0.021657,0.01362,-0.035277,oldEloDiffKD
12,0.039987,0.074489,-0.034502,oldEloDiffTDL
22,-0.421128,-0.393234,-0.027894,total_ufc_fights_diff


# Trying some weird indexing

In [60]:
# okay uhh
import stan

hier_code = """

data {
    int<lower=0> n;                     // number of data points in training data
    int<lower=0> n2;                    // number of data points in test data
    int<lower=1> d;                     // explanatory variable dimension
    int<lower=0,upper=1> y[n];          // response variable
    real<lower=0> beta_prior_std;       // prior scale on beta mean across groups
    real<lower=0> intra_group_std;      // prior scale on beta, std dev of group's beta around mean
    
    vector[n] is_m;      // 0 if woman, 1 if man
    vector[n2] is_m2;    // 0 if woman, 1 if man
    
    matrix[n, d] X;                     // explanatory variable
    vector[n] ml_logit;                   // logit of the opening money line

    matrix[n2, d] X2;                   // test data
    vector[n2] ml_logit2;                 // test data

}

parameters {
    vector[d] beta_m;
    vector[d] beta_w;
}

transformed parameters {
    vector[n] eta;
    vector[n2] eta2;
    eta = (
        ml_logit + 
        ((X * beta_m) .* is_m) + 
        ((X * beta_w) .* (1 - is_m))
    );      // linear predictor
    eta2 = (
        ml_logit2 + 
        ((X2 * beta_m) .* is_m2) + 
        ((X2 * beta_w) .* (1 - is_m2))
    );   // linear predictor for test data
}

model {
    beta_m ~ normal(0, beta_prior_std);
    beta_w ~ normal(beta_m, intra_group_std); // damn i hope this works

    y ~ bernoulli_logit(eta);
}

generated quantities {
    vector[n2] y_pred;
    
    y_pred = inv_logit(eta2);  // y values predicted for test data
}
"""

class HierarchicalSymmetricModel(SimpleSymmetricModel):
    
    def __init__(self, beta_prior_std=1.0, intra_group_std=0.1, num_chains=4, num_samples=100):
        self.beta_prior_std = float(beta_prior_std)
        self.intra_group_std = float(intra_group_std)
        self.code = hier_code
        self.scale_ = None
        self.fit = None
        self.num_chains = num_chains
        self.num_samples = num_samples
        
    def fit_predict(self, train_df, test_df, feat_cols):
        scale_ = (train_df[feat_cols]**2).mean(0)
        self.scale_ = scale_
        X_train = train_df[feat_cols] / scale_
        X_test = test_df[feat_cols] / scale_

        y_train = train_df["targetWin"]
        y_test = test_df["targetWin"]

        ml_train = logit(train_df["p_fighter_implied"])
        ml_test = logit(test_df["p_fighter_implied"])
        
        is_m_train = train_df["gender"].map({"M":1, "W":0})
        is_m_test = test_df["gender"].map({"M":1, "W":0})
        
        data = {
            "n": train_df.shape[0],
            "n2": test_df.shape[0],
            "d": len(feat_cols),
            "y": y_train.astype(int).values,
            "beta_prior_std": self.beta_prior_std,
            "intra_group_std": self.intra_group_std,
            "is_m": is_m_train.values,
            "is_m2": is_m_test.values,
            "X": X_train.values,
            "ml_logit": ml_train.values,
            "X2": X_test.values,
            "ml_logit2": ml_test.values,
        }
        fit = self._fit(data)
        return fit["y_pred"].mean(1)

hier_model = HierarchicalSymmetricModel(beta_prior_std=1.0, intra_group_std=0.1, num_samples=500)
y_pred = hier_model.fit_predict(train_gender_df, test_gender_df, feat_cols)
acc = (y_pred.round() == test_df["targetWin"]).mean()
print(f"accuracy: {acc}")
print(f"log loss: {log_loss(y_pred=y_pred, y_true=y_test)}")

Building...



Building: found in cache, done.Messages from stanc:
    of arrays by placing brackets after a variable name is deprecated and
    will be removed in Stan 2.32.0. Instead use the array keyword before the
    type. This can be changed automatically using the auto-format flag to
    stanc
Sampling:   0%
Sampling:   0% (1/6000)
Sampling:   0% (2/6000)
Sampling:   0% (3/6000)
Sampling:   0% (4/6000)
Sampling:   2% (103/6000)
Sampling:   3% (202/6000)
Sampling:   5% (301/6000)
Sampling:   7% (400/6000)
Sampling:   8% (500/6000)
Sampling:  10% (600/6000)
Sampling:  12% (700/6000)
Sampling:  13% (800/6000)
Sampling:  15% (900/6000)
Sampling:  17% (1000/6000)
Sampling:  18% (1100/6000)
Sampling:  20% (1200/6000)
Sampling:  22% (1300/6000)
Sampling:  23% (1400/6000)
Sampling:  25% (1500/6000)
Sampling:  27% (1600/6000)
Sampling:  28% (1700/6000)
Sampling:  30% (1800/6000)
Sampling:  32% (1900/6000)
Sampling:  33% (2000/6000)
Sampling:  35% (2100/6000)
Sampling:  37% (2200/6000)
Sampling:  38% (

accuracy: 0.6435045317220544
log loss: 0.6105770307773669


In [63]:
class HierPcaSymmetricModel(HierarchicalSymmetricModel):
    
    def __init__(self, n_pca=8, beta_prior_std=1.0, intra_group_std=0.1, num_chains=4, num_samples=100):
        self.beta_prior_std = float(beta_prior_std)
        self.intra_group_std = float(intra_group_std)
        self.code = hier_code
        self.scale_ = None
        self.n_pca = n_pca
        self.pca = PCA(n_components=n_pca, whiten=True)
        self.fit = None
        self.num_chains = num_chains
        self.num_samples = num_samples
        
    def fit_predict(self, train_df, test_df, feat_cols):
        scale_ = (train_df[feat_cols]**2).mean(0)
        self.scale_ = scale_
        X_train = train_df[feat_cols] / scale_
        X_test = test_df[feat_cols] / scale_
        
        X_pca_train = self.pca.fit_transform(X_train)
        X_pca_test = self.pca.transform(X_test)

        y_train = train_df["targetWin"]
        y_test = test_df["targetWin"]

        ml_train = logit(train_df["p_fighter_implied"])
        ml_test = logit(test_df["p_fighter_implied"])
        
        is_m_train = train_df["gender"].map({"M":1, "W":0})
        is_m_test = test_df["gender"].map({"M":1, "W":0})
        
        data = {
            "n": train_df.shape[0],
            "n2": test_df.shape[0],
            "d": self.n_pca,
            "y": y_train.astype(int).values,
            "beta_prior_std": self.beta_prior_std,
            "intra_group_std": self.intra_group_std,
            "is_m": is_m_train.values,
            "is_m2": is_m_test.values,
            "X": X_pca_train,
            "ml_logit": ml_train.values,
            "X2": X_pca_test,
            "ml_logit2": ml_test.values,
        }
        fit = self._fit(data)
        return fit["y_pred"].mean(1)

hier_model = HierPcaSymmetricModel(n_pca=16, beta_prior_std=1.0, intra_group_std=0.1, num_samples=200)
y_pred = hier_model.fit_predict(train_gender_df, test_gender_df, feat_cols)
acc = (y_pred.round() == test_df["targetWin"]).mean()
print(f"accuracy: {acc}")
print(f"log loss: {log_loss(y_pred=y_pred, y_true=y_test)}")

Building...



Building: found in cache, done.Messages from stanc:
    of arrays by placing brackets after a variable name is deprecated and
    will be removed in Stan 2.32.0. Instead use the array keyword before the
    type. This can be changed automatically using the auto-format flag to
    stanc
Sampling:   0%
Sampling:   2% (100/4800)
Sampling:   4% (200/4800)
Sampling:   6% (300/4800)
Sampling:   6% (301/4800)
Sampling:   8% (401/4800)
Sampling:  10% (501/4800)
Sampling:  13% (601/4800)
Sampling:  15% (700/4800)
Sampling:  19% (900/4800)
Sampling:  21% (1000/4800)
Sampling:  23% (1100/4800)
Sampling:  27% (1300/4800)
Sampling:  29% (1400/4800)
Sampling:  33% (1600/4800)
Sampling:  38% (1800/4800)
Sampling:  40% (1900/4800)
Sampling:  42% (2000/4800)
Sampling:  44% (2100/4800)
Sampling:  46% (2200/4800)
Sampling:  48% (2300/4800)
Sampling:  52% (2500/4800)
Sampling:  56% (2700/4800)
Sampling:  60% (2900/4800)
Sampling:  65% (3100/4800)
Sampling:  73% (3500/4800)
Sampling:  81% (3900/4800)
Samp

accuracy: 0.6435045317220544
log loss: 0.6095273331402441


1