In [24]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize


# Raw fight table; data_extractor() reads this module-level frame, filtering on its
# "Fighter1"/"Fighter2" columns and addressing all other fields by column position.
df = pd.read_csv("fightData.txt")

### Extract Data
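Here the fight statistics of a chosen fighter (and of their opponents) are extracted from the raw table and converted into per-second rates.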

In [25]:
def _clock_to_seconds(clock):
    """Convert an ``"M:SS"`` clock string into a whole number of seconds."""
    parts = clock.split(":")
    return int(parts[0]) * 60 + int(parts[1])


def _landed(stat):
    """Return the leading count of an ``"X of Y"`` string as a float."""
    return float(stat.split(" ")[0])


def _corner_features(values, offset, duration):
    """Build the 13 per-fight features for one corner of a single fight row.

    ``offset`` is 0 to read the fighter-1 columns (9, 11, 15, ...) and 1 to read
    the fighter-2 columns (10, 12, 16, ...). ``duration`` is the fight length in
    seconds. The returned list is ordered like result slots 1..13.
    """
    sig_parts = values[11 + offset].split(" ")  # "landed of attempted"
    sig_landed = float(sig_parts[0])
    sig_attempted = float(sig_parts[2])
    sig_rate = sig_landed / duration

    if sig_attempted == 0:
        # No attempts: accuracy is undefined, so both derived features are 0.
        accuracy = 0
        sig_hit = 0
    else:
        accuracy = sig_landed / sig_attempted
        sig_hit = accuracy * sig_rate

    features = [
        float(values[9 + offset]) / duration,               # 1: KD/second
        accuracy,                                            # 2: SIG_STR%
        sig_rate,                                            # 3: SIG_STR/second
        _landed(values[15 + offset]) / duration,             # 4: TD/second
        float(values[17 + offset]) / duration,               # 5: SUB_ATT/second
        sig_hit,                                             # 6: SIG_HIT/s
        _clock_to_seconds(values[21 + offset]) / duration,   # 7: CTRL%
    ]
    # 8..13: Head, Body, Leg, Distance, Clinch, Ground strikes per second.
    for column in range(23, 35, 2):
        features.append(_landed(values[column + offset]) / duration)

    return features


def data_extractor(fighter, data=None):
    """Collect per-fight statistics for ``fighter`` and for each opponent.

    Parameters
    ----------
    fighter : str
        Name matched against the "Fighter1" and "Fighter2" columns.
    data : pandas.DataFrame, optional
        Fight table to read. Defaults to the module-level ``df``.

    Returns
    -------
    (results1, results2) : tuple of list of lists
        Each is a list of 14 lists with one entry per fight, in table order.
        Slot 0 is the outcome (1 = win, 0 = loss); slots 1..13 are KD/s,
        SIG_STR%, SIG_STR/s, TD/s, SUB_ATT/s, SIG_HIT/s, CTRL%, Head/s, Body/s,
        Leg/s, Distance/s, Clinch/s and Ground/s. ``results1`` describes the
        chosen fighter, ``results2`` the opponent in the same fight.

    Notes
    -----
    Fields are read by column position: 1 = fighter 1, 3 = winner, 6 = round,
    7 = "M:SS" time within that round, 9..34 = paired fighter-1/fighter-2 stats.
    The fight duration assumes every earlier round lasted five minutes.
    """
    if data is None:
        data = df

    # Keep fights where the chosen fighter appears in either corner.
    in_either_corner = (data["Fighter1"] == fighter) | (data["Fighter2"] == fighter)
    fight_rows = data.loc[in_either_corner].values  # materialise the array once

    results1 = [[] for _ in range(14)]  # Fighter's results
    results2 = [[] for _ in range(14)]  # Opponent's results

    for values in fight_rows:
        duration = (int(values[6]) - 1) * 5 * 60 + _clock_to_seconds(values[7])

        won = 1 if values[3] == fighter else 0
        results1[0].append(won)
        results2[0].append(1 - won)

        # Offset 0 selects the fighter-1 columns, offset 1 the fighter-2 columns.
        own_offset = 0 if values[1] == fighter else 1
        own = _corner_features(values, own_offset, duration)
        opponent = _corner_features(values, 1 - own_offset, duration)

        for slot, (own_value, opp_value) in enumerate(zip(own, opponent), start=1):
            results1[slot].append(own_value)
            results2[slot].append(opp_value)

    return results1, results2

In [26]:
def all_variable_creator(fighter, s, skip):
    """Build rolling-average features for a fighter and for their opponents.

    ``fighter`` is the fighter's name, ``s`` is the number of matches in the
    rolling average and ``skip`` is the number of leading matches to ignore.
    Skip up to, but not including, the match you want to predict.

    Returns four lists of 14 lists each: the fighter's and the opponents'
    training series (slot 0 = win/loss, slots 1..13 = averaged features), then
    the fighter's and the opponents' test features. Slot 0 of the test lists
    stays empty; slots 1..13 hold one value each, averaged over ``s - 1`` fights.
    """
    fighter_stats, opponent_stats = data_extractor(fighter)

    feature_slots = range(1, 14)

    full_weights = np.full(s, 1/s)            # uniform weights over s fights
    test_weights = np.full(s - 1, 1/(s - 1))  # uniform weights over s-1 fights

    train_fighter = [[] for _ in range(14)]
    train_opps = [[] for _ in range(14)]
    test_fighter = [[] for _ in range(14)]
    test_opps = [[] for _ in range(14)]

    def window_mean(series, start, stop, weights):
        # Weighted sum of series[start:stop]; indexes element by element so an
        # out-of-range window raises IndexError instead of silently shrinking.
        return np.array([series[pos] for pos in range(start, stop)]) @ weights

    n_fights = len(fighter_stats[0])

    for i in range(n_fights):  # Loop over all the fights

        if i < skip:
            continue

        if i == skip:  # The final data point
            # NOTE(review): this window starts at index `skip` itself, so it appears
            # to include the fight being predicted. Confirm this is intended.
            for j in feature_slots:
                test_fighter[j].append(window_mean(fighter_stats[j], i, i + s - 1, test_weights))
                test_opps[j].append(window_mean(opponent_stats[j], i, i + s - 1, test_weights))

        train_fighter[0].append(fighter_stats[0][i])  # Win or Loss
        train_opps[0].append(opponent_stats[0][i])    # Win or Loss

        if i < n_fights - (s - 1):
            # A full window of s fights is available starting at fight i.
            for j in feature_slots:
                train_fighter[j].append(window_mean(fighter_stats[j], i, i + s, full_weights))
                train_opps[j].append(window_mean(opponent_stats[j], i, i + s, full_weights))
        else:
            # Fewer than s fights remain: average over whatever is left.
            remaining = n_fights - i
            tail_weights = np.full(remaining, 1/remaining)

            for j in feature_slots:
                train_fighter[j].append(window_mean(fighter_stats[j], i, n_fights, tail_weights))
                train_opps[j].append(window_mean(opponent_stats[j], i, n_fights, tail_weights))

    return train_fighter, train_opps, test_fighter, test_opps

In [27]:
def variable_selector(fighter, index_used, s, skip):
    """Pick the requested feature series and assemble training and test inputs.

    ``index_used`` lists the feature slots to keep; entries outside 1..13 are
    ignored. Returns ``(rows, test_data)``. ``rows[0]`` is the fighter's win/loss
    series, followed by the selected fighter features and then the selected
    opponent features, each with its first entry dropped. ``test_data`` is a
    placeholder 0 followed by the matching fighter and opponent test features.
    """
    # Extracts and processes the data
    train_f, train_o, holdout_f, holdout_o = all_variable_creator(fighter, s, skip)

    selectable = tuple(range(1, 14))
    chosen = [idx for idx in index_used if idx in selectable]

    # Win/Loss always comes first; every series drops its first entry.
    rows = [train_f[0][1:]]
    rows += [train_f[idx][1:] for idx in chosen]
    rows += [train_o[idx][1:] for idx in chosen]

    fighter_part = [holdout_f[idx][0] for idx in chosen]
    opponent_part = [holdout_o[idx][0] for idx in chosen]
    test_data = [0] + fighter_part + opponent_part

    return rows, test_data

### Logistic regression
Here we implement our logistic regression model


In [28]:
def mod(data, betas):
    """Linear predictor: ``betas[0]`` plus the dot product of ``data[1:]`` and ``betas[1:]``.

    The first entry of ``data`` is skipped (callers store the outcome there).
    """
    intercept, slopes = betas[0], betas[1:]
    return intercept + np.dot(data[1:], slopes)

In [29]:
def grad(vars, betas):
    """Gradient of the negative log-likelihood of the logistic model.

    Parameters
    ----------
    vars : sequence of equal-length sequences
        ``vars[0]`` holds the 0/1 outcomes; ``vars[1:]`` holds one row per feature.
    betas : sequence of float
        Intercept followed by one coefficient per feature row.

    Returns
    -------
    list of float
        One partial derivative per coefficient, intercept first.
    """
    design = np.asarray(vars, dtype=float)
    coefs = np.asarray(betas, dtype=float)
    outcomes, features = design[0], design[1:]

    # Linear predictor for every fight at once.
    eta = coefs[0] + coefs[1:] @ features

    # sigmoid(x) == 0.5 * (1 + tanh(x / 2)); this form cannot overflow, unlike
    # 1 / (1 + exp(-x)), which raises OverflowError for very negative x.
    prob = 0.5 * (1.0 + np.tanh(0.5 * eta))

    residual = outcomes - prob
    gradient = -np.concatenate(([residual.sum()], features @ residual))
    return gradient.tolist()

In [30]:
def hess(vars, betas):
    """Hessian of the negative log-likelihood of the logistic model.

    Parameters
    ----------
    vars : sequence of equal-length sequences
        ``vars[0]`` holds the outcomes (unused here); ``vars[1:]`` holds one row
        per feature.
    betas : sequence of float
        Intercept followed by one coefficient per feature row.

    Returns
    -------
    list of list of float
        Square matrix with one row and column per coefficient, intercept first.
    """
    design = np.asarray(vars, dtype=float)
    coefs = np.asarray(betas, dtype=float)
    features = design[1:]

    eta = coefs[0] + coefs[1:] @ features

    # 1 / (e^x + 2 + e^-x) == p * (1 - p) == 0.25 * (1 - tanh(x / 2)^2);
    # the tanh form stays finite and avoids overflow warnings from exp.
    weight = 0.25 * (1.0 - np.tanh(0.5 * eta) ** 2)

    # A leading row of ones stands in for the intercept, so one matrix product
    # covers the intercept/intercept, intercept/feature and feature/feature cases.
    augmented = np.vstack([np.ones(features.shape[1]), features])

    return ((augmented * weight) @ augmented.T).tolist()

In [31]:
def newtons(vars, betas, tol=10**-5, max_iter=10000):
    """Fit the logistic coefficients with a damped Newton iteration.

    Each step moves half a Newton step (``0.5 * H^-1 g``) and the loop stops once
    the largest coefficient change is at most ``tol``.

    Parameters
    ----------
    vars : sequence of sequences
        Outcomes in ``vars[0]``, feature rows in ``vars[1:]``.
    betas : array-like
        Starting coefficients, intercept first.
    tol : float, optional
        Convergence threshold on the largest absolute coefficient change.
    max_iter : int, optional
        Upper bound on the number of steps, so a non-converging fit cannot loop
        forever.

    Returns
    -------
    numpy.ndarray
        The last coefficients reached. On a numerical failure or when
        ``max_iter`` is hit, a message is printed and the current best-effort
        coefficients are returned.
    """
    betas = np.asarray(betas, dtype=float)
    error = 1
    iterations = 0

    while error > tol:
        if iterations >= max_iter:
            print("Newton's method did not converge within", max_iter, "iterations")
            break
        iterations += 1

        try:
            betasprev = betas
            hessi = hess(vars, betas)
            gradi = grad(vars, betas)
            betas = betas - 0.5*np.linalg.solve(hessi, gradi)
            error = max(abs(betasprev - betas))
        except np.linalg.LinAlgError:
            print("Singular matrix encountered. Stopped the simulation")
            return betas
        except Exception as exc:
            # Catch Exception, not everything, so KeyboardInterrupt still stops the notebook.
            print("An unexpected error occured:", repr(exc))
            return betas

    return betas

In [32]:
def neg_log_lkh(betas, vars):
    """Negative log-likelihood of the logistic model.

    Parameters
    ----------
    betas : sequence of float
        Intercept followed by one coefficient per feature row.
    vars : sequence of equal-length sequences
        ``vars[0]`` holds the 0/1 outcomes; ``vars[1:]`` holds one row per feature.

    Returns
    -------
    float
        Sum over fights of ``log(1 + exp(eta)) - y * eta``.
    """
    design = np.asarray(vars, dtype=float)
    coefs = np.asarray(betas, dtype=float)
    outcomes, features = design[0], design[1:]

    eta = coefs[0] + coefs[1:] @ features

    # logaddexp(0, x) == log(1 + exp(x)) but stays finite for large x, where
    # exp(x) would overflow to inf and break the optimiser.
    return float(np.sum(np.logaddexp(0.0, eta) - outcomes * eta))

In [33]:
def lkh_solver(vars):
    """Estimate the logistic coefficients with two independent optimisers.

    Both start from the same random initial guess, drawn uniformly from
    ``[-0.1, 0.1]`` for every coefficient. The guess uses NumPy's global random
    state, so results vary between runs unless that state is seeded.

    Returns
    -------
    (betas, betas_scipy)
        Coefficients from the hand-written Newton's method and from SciPy's
        BFGS minimisation of ``neg_log_lkh``.
    """
    # Half-width of the interval the initial guess is drawn from.
    guess_scale = 0.1
    betas_0 = guess_scale*np.random.uniform(-1, 1, len(vars))

    # Use Newton's method to find the betas that minimize the function
    betas = newtons(vars, betas_0)

    # Use scipy's minimize function to find the betas that minimize the function.
    # `args` must be a tuple: `(vars)` is just `vars`, which only worked because
    # SciPy wraps non-tuples and would be mis-unpacked if `vars` were a tuple.
    res = minimize(neg_log_lkh, betas_0, args=(vars,), method='BFGS')
    betas_scipy = res.x

    return betas, betas_scipy

In [34]:
def proba_reg(fighter, used_vars, s, skip):
    """Win probability for ``fighter`` from the hand-written logistic regression.

    Returns ``(pred, pred_scipy)``: the probability computed with the Newton's
    method coefficients and with the SciPy coefficients.
    """
    rows, test_data = variable_selector(fighter, used_vars, s, skip)

    print("Calculating W/L betas for fighter:", fighter)
    newton_betas, scipy_betas = lkh_solver(rows)
    print("Done for fighter:", fighter)

    def win_probability(coefs):
        # Logistic function applied to the linear predictor of the test point.
        return 1/(1+np.exp(-mod(test_data, coefs)))

    return win_probability(newton_betas), win_probability(scipy_betas)

### scikit-learn

Here we use scikit-learn's built-in logistic regression model for comparison

In [35]:
def proba_scikit(fighter, used_vars, s, skip):
    """Win probability for ``fighter`` from scikit-learn's ``LogisticRegression``.

    Fits on the selected feature rows and returns the second column of
    ``predict_proba`` for the test point (presumably the probability of label 1;
    this relies on both outcomes being present in the training data).
    """
    rows, test_data = variable_selector(fighter, used_vars, s, skip)

    outcomes = rows[0]
    features = np.transpose(rows[1:])  # one row per fight, one column per feature

    classifier = LogisticRegression()
    classifier.fit(features, outcomes)

    # Drop the placeholder in position 0 of the test data before predicting.
    class_probabilities = classifier.predict_proba([test_data[1:]])

    return class_probabilities[0][1]

### Bayesian regression
Here we implement our Bayesian regression model

In [36]:
def generate_sigma(X, Y, beta_hat):
    """Draw one posterior sample of the noise standard deviation sigma.

    Under the standard non-informative prior, ``sigma^2 | y`` follows a scaled
    inverse chi-square distribution with ``n - k`` degrees of freedom and scale
    ``s^2``, so a draw is ``(n - k) * s^2 / chi2`` with ``chi2 ~ ChiSquare(n - k)``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n, m)
        Design matrix.
    Y : array-like, length n
        Observed outcomes.
    beta_hat : array-like, length m
        Least-squares coefficient estimate.

    Returns
    -------
    numpy.ndarray, shape (1,)
        One sampled standard deviation.

    Raises
    ------
    ValueError
        If there are not more observations than coefficients.
    """
    n, m = X.shape
    dof = n - m
    if dof <= 0:
        raise ValueError(
            "generate_sigma needs more observations than coefficients "
            f"(got n={n}, m={m})"
        )

    # Draw from a Chi-square distribution
    chi_square_sample = np.random.chisquare(df=dof, size=1)

    # Residual variance estimate s^2
    residuals = np.asarray(Y, dtype=float) - X @ beta_hat
    s_squared = np.transpose(residuals) @ residuals / dof

    # Scaled inverse chi-square draw for sigma^2. The factor `dof` is required:
    # without it the sampled sigma is too small by sqrt(n - k).
    sigma_squared = dof * s_squared / chi_square_sample

    return np.sqrt(sigma_squared)

In [37]:
def generate_betas(X, Y, beta_hat):
    """Draw one posterior sample of the regression coefficients.

    Samples sigma with ``generate_sigma`` and then draws beta from a multivariate
    normal centred on ``beta_hat`` with covariance ``sigma^2 * (X^T X)^-1``.

    Returns
    -------
    (beta_sample, sigmas)
        ``beta_sample`` has shape ``(1, m)``; ``sigmas`` is the sampled standard
        deviation as returned by ``generate_sigma``.
    """
    sigmas = generate_sigma(X, Y, beta_hat)

    # `sigmas` is a standard deviation (it is also used as the `scale` of the
    # predictive normal), so the covariance needs its square, the variance.
    covariance = sigmas**2 * np.linalg.inv(X.T @ X)

    # Draw from a multivariate normal distribution
    beta_sample = np.random.multivariate_normal(mean=beta_hat, cov=covariance, size=1)

    return beta_sample, sigmas

In [38]:
def simulate_bayes(X, Y, test, n_samples):
    """Draw ``n_samples`` posterior-predictive values for the test point.

    Computes the least-squares estimate ``(X^T X)^-1 X^T Y``. For each sample it
    then draws coefficients and sigma with ``generate_betas`` and draws one normal
    value centred on ``test @ beta`` with that sigma as scale.

    Returns the list of draws, or ``None`` (after printing a message) when
    ``X^T X`` cannot be inverted.
    """
    try:
        beta_hat = np.linalg.inv(X.T @ X) @ X.T @ Y
    except np.linalg.LinAlgError:
        print("Singular matrix error encountered.")
        return None

    def draw_prediction():
        # Coefficients are sampled before the predictive draw; keep this order.
        beta_sample, sigmas = generate_betas(X, Y, beta_hat)
        return np.random.normal(loc=test @ beta_sample.T, scale=sigmas)

    return [draw_prediction() for _ in range(n_samples)]

In [39]:
def proba_bayes(fighter, used_vars, s, skip, n_samples):
    """Win probability for ``fighter`` from the Bayesian linear regression.

    Generates ``n_samples`` posterior-predictive draws for the test point and
    returns their mean, limited to the interval [0, 1].

    Returns
    -------
    float
        The clipped mean prediction, or ``nan`` when the model could not be
        fitted because ``simulate_bayes`` reported a singular matrix.
    """
    vars, test_data = variable_selector(fighter, used_vars, s, skip)

    # One row per fight, with a leading column of ones for the intercept.
    X = np.transpose(vars[1:])
    X = np.insert(X, 0, 1, axis=1)

    # Test features with the placeholder dropped and the intercept term prepended.
    test = np.transpose(test_data[1:])
    test = np.insert(test, 0, 1)

    Y = vars[0]
    predictions = simulate_bayes(X, Y, test, n_samples)

    if predictions is None:
        # simulate_bayes already printed why; do not crash in np.mean(None).
        return float("nan")

    # The linear model is unbounded, so make sure the reported probability is
    # between 0 and 1.
    mean_pred = float(np.clip(np.mean(predictions), 0.0, 1.0))

    return mean_pred

### Result Table


In [40]:
# Bouts to predict. Selection criteria noted by the author: >19 matches, 0.15 < winrate < 0.85
fights = [
    ["Bobby Green", "Jim Miller"], ["Vicente Luque", "Rafael Dos Anjos"],
    ["Tony Ferguson", "Bobby Green"], ["Charles Oliveira", "Beneil Dariush"],
    ["Matt Brown", "Court McGee"], ["Gilbert Burns", "Jorge Masvidal"],
    ["Gilbert Burns", "Neil Magny"], ["Drew Dober", "Bobby Green"],
    ["Nate Diaz", "Tony Ferguson"], ["Jim Miller", "Donald Cerrone"],
    ["Mauricio Rua", "Ovince Saint Preux"], ["Cub Swanson", "Darren Elkins"],
    ["Michael Johnson", "Clay Guida"], ["Anthony Pettis", "Donald Cerrone"],
    ["BJ Penn", "Clay Guida"], ["Matt Brown", "Diego Sanchez"],
    ["Tim Boetsch", "Johny Hendricks"], ["BJ Penn", "Dennis Siver"],
    ["Thiago Alves", "Patrick Cote"], ["Matt Hughes", "Josh Koscheck"],
]

# Both corners of every bout, in bout order. Derived from `fights` so the two
# lists cannot drift apart.
fighters = [name for bout in fights for name in bout]

# Number of leading matches to skip for each entry of `fighters` (two per bout).
skips = [
    0, 0,  1, 1,  1, 3,  1, 1,  0, 1,
    2, 0,  3, 3,  3, 4,  0, 2,  4, 0,
    1, 2,  2, 3,  4, 6,  1, 2,  0, 9,
    6, 5,  2, 1,  2, 0,  5, 0,  0, 6,
]
assert len(skips) == len(fighters), "need exactly one skip value per fighter entry"

s = 5              # number of matches in the rolling average
used_vars = [6,7]  # feature indices fed to the models; see the legend below
#1[KD/s] 2[SIG_STR%] 3[SIG/s] 4[TD/s] 5[SUB_ATT/s] 6[SIG_HIT/s] 7[CTRL%] 8[HEAD/s] 9[BODY/s] 10[LEG/s] 11[Dist/s] 12[Clinch/s] 13[Ground/s]

Logistic regression results

In [None]:
# Hand-written logistic regression: one win probability per fighter entry,
# once from the Newton's method coefficients and once from the SciPy ones.
logistic_predictions = []
logistic_predictions_scipy = []

for idx, fighter in enumerate(fighters):
    skip = skips[idx]

    newton_prob, scipy_prob = proba_reg(fighter, used_vars, s, skip)
    logistic_predictions.append(newton_prob)
    logistic_predictions_scipy.append(scipy_prob)

Scikit-learn results

In [42]:
# scikit-learn logistic regression: one win probability per fighter entry.
sklearn_predictions = []

for idx, fighter in enumerate(fighters):
    skip = skips[idx]
    sklearn_predictions.append(proba_scikit(fighter, used_vars, s, skip))


Bayesian regression results

In [43]:
# Bayesian regression: one win probability per fighter entry, 1000 draws each.
bayes_predictions = []

for idx, fighter in enumerate(fighters):
    skip = skips[idx]
    bayes_predictions.append(proba_bayes(fighter, used_vars, s, skip, 1000))


Results table

In [None]:
# Print one small table per bout: both fighters with each model's prediction.
# Entries of `fighters` come in pairs, so step through them two at a time.
for first in range(0, len(fighters), 2):
    second = first + 1
    bout = fights[first // 2]

    columns = {
        'Fighter': [fighters[first], fighters[second]],
        'Log Reg': [logistic_predictions[first], logistic_predictions[second]],
        'Bay': [bayes_predictions[first], bayes_predictions[second]],
        'Skl': [sklearn_predictions[first], sklearn_predictions[second]],
    }
    df_fight = pd.DataFrame(columns)

    print(f"Fight: {bout}")
    print(df_fight)
    print("\n")