In [1]:
import pandas as pd

data = pd.read_csv("Documents/ufc_master.csv")

#Drop betting odds, these will be predicted later
data = data.drop(columns=['R_odds','B_odds'])

#Drop missing values
data = data.dropna()

#Switch Winner column to binary values
data['Winner'].replace(['Red','Blue'],[1,0], inplace=True)

#Separate results from predictor variables
Winner = data.iloc[:,0]
Variables = data.iloc[:,1:]

#Transform categorical variables to dummy columns
Variables = pd.get_dummies(Variables)

#data.info()
#data.head()
#Variables.head()

In [2]:
#Scaling the data and Fitting the Model
#Using PCA since several of the variables will be highly correlated with each other
#Score the first time was .601
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

Variables_Train, Variables_Test, Winner_Train, Winner_Test = train_test_split(Variables, Winner)

pipe = Pipeline([('pca', PCA()),('scale', StandardScaler()),('log_reg', LogisticRegression())])

pipe.fit(Variables_Train, Winner_Train).score(Variables_Test, Winner_Test)

0.6161825726141079

In [3]:
#Data has a lot of multicollinearity. This new dataset focuses on differentials between opponents, which should have less
#multicollinearity. This logistic regression model can show the statistical signficance of individual variables
import statsmodels.api as sm
import numpy as np

data = pd.read_csv("Documents/ufc_master_differential.csv")

#Drop missing values
data = data.dropna()

#Switch Winner column to binary values
data['Winner'].replace(['Red','Blue'],[1,0], inplace=True)

#Separate results from predictor variables, and don't include betting odds
Winner = data.iloc[:,0]
Variables = data.iloc[:,3:]

#Transform categorical variables to dummy columns
Variables = pd.get_dummies(Variables)

Variables_Train, Variables_Test, Winner_Train, Winner_Test = train_test_split(Variables, Winner)

Winner_Train = np.asarray(Winner_Train)
Variables_Train=np.asarray(Variables_Train)

logit_model=sm.Logit(Winner_Train, Variables_Train)
result = logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.656014
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 3671
Model:                          Logit   Df Residuals:                     3653
Method:                           MLE   Df Model:                           17
Date:                Tue, 24 Oct 2023   Pseudo R-squ.:                 0.03245
Time:                        12:08:50   Log-Likelihood:                -2408.2
converged:                       True   LL-Null:                       -2489.0
Covariance Type:            nonrobust   LLR p-value:                 1.323e-25
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -0.0241      0.040     -0.601      0.548      -0.103       0.054
x2            -0.0845      0.

In [4]:
#This time using a custom dataset that focuses on statistically significant variables

data = pd.read_csv("Documents/ufc_master_custom_vers1.csv")

#Drop missing values
data = data.dropna()

#Switch Winner column to binary values
data['Winner'].replace(['Red','Blue'],[1,0], inplace=True)

#Separate results from predictor variables
Winner = data.iloc[:,0]
Variables = data.iloc[:,1:]

#No PCA this time, variables should not have multicollinearity
Variables_Train, Variables_Test, Winner_Train, Winner_Test = train_test_split(Variables, Winner)

pipe = Pipeline([('scale', StandardScaler()),('log_reg', LogisticRegression())])

pipe.fit(Variables_Train, Winner_Train).score(Variables_Test, Winner_Test)


0.6021241830065359