In [2]:
import pandas as pd
# We import pandas because we have a very large data set. It's a csv file with all data for all columns present (al large table). Pandas is ideal for this scenario.

import numpy as np
# We import numpy because we might want to use the built in functions to see the correlation of multiple columns in the data frame.

import statsmodels.api as sm
# We use that statsmodels library to validate the np and sklearn libraries

import matplotlib.pyplot as plt
# We might not need this because pandas has its own .plot() function, but just in case we want to plot something specific.

from sklearn.linear_model import LinearRegression
# Because we want to use regression on the data set we will use the science kit linear_model and import LinearRegression.

from sklearn import metrics
# We might want to see how well variables correlate with eachother, we will use the the metrics library as well.


In [3]:
df = pd.read_csv("data/preprocessed_data.csv")
df.columns.tolist()

['Winner',
 'title_bout',
 'no_of_rounds',
 'B_current_lose_streak',
 'B_current_win_streak',
 'B_draw',
 'B_avg_BODY_att',
 'B_avg_BODY_landed',
 'B_avg_CLINCH_att',
 'B_avg_CLINCH_landed',
 'B_avg_DISTANCE_att',
 'B_avg_DISTANCE_landed',
 'B_avg_GROUND_att',
 'B_avg_GROUND_landed',
 'B_avg_HEAD_att',
 'B_avg_HEAD_landed',
 'B_avg_KD',
 'B_avg_LEG_att',
 'B_avg_LEG_landed',
 'B_avg_PASS',
 'B_avg_REV',
 'B_avg_SIG_STR_att',
 'B_avg_SIG_STR_landed',
 'B_avg_SIG_STR_pct',
 'B_avg_SUB_ATT',
 'B_avg_TD_att',
 'B_avg_TD_landed',
 'B_avg_TD_pct',
 'B_avg_TOTAL_STR_att',
 'B_avg_TOTAL_STR_landed',
 'B_longest_win_streak',
 'B_losses',
 'B_avg_opp_BODY_att',
 'B_avg_opp_BODY_landed',
 'B_avg_opp_CLINCH_att',
 'B_avg_opp_CLINCH_landed',
 'B_avg_opp_DISTANCE_att',
 'B_avg_opp_DISTANCE_landed',
 'B_avg_opp_GROUND_att',
 'B_avg_opp_GROUND_landed',
 'B_avg_opp_HEAD_att',
 'B_avg_opp_HEAD_landed',
 'B_avg_opp_KD',
 'B_avg_opp_LEG_att',
 'B_avg_opp_LEG_landed',
 'B_avg_opp_PASS',
 'B_avg_opp_REV',
 

In [4]:
# We will start with a simple linear regression. We will predict `B_avg_DISTANCE_landed` by avg by `B_avg_DISTANCE_att`.
# This way we can see if or what impact the B_avg_DISTANCE_landed has on the B_win_by_KO/TKO.

# Independent variable x
x = df["B_avg_DISTANCE_att"].to_numpy().reshape((-1, 1))

# Dependent variable y
y = df["B_avg_DISTANCE_landed"].to_numpy().reshape((-1, 1))

model = LinearRegression().fit(x, y)

y_predict = model.predict(x)

print(f"b0: {model.intercept_}")
print(f"b1: {model.coef_}")
# print(f"y_predict: {y_predict}")

det = metrics.r2_score(y, y_predict)
print(f"The determination coefficient is: {det}")

b0: [-0.08531672]
b1: [[0.36629796]]
The determination coefficient is: 0.8864090479926107


In [5]:
# Another way to do this is using the statmodels api
x = sm.add_constant(x)
model = sm.OLS(y, x)
results = model.fit()
results.summary()

# We can observe that the r-sqared with both methods is about 0.886.

0,1,2,3
Dep. Variable:,y,R-squared:,0.886
Model:,OLS,Adj. R-squared:,0.886
Method:,Least Squares,F-statistic:,28010.0
Date:,"Sun, 07 Jun 2020",Prob (F-statistic):,0.0
Time:,17:36:55,Log-Likelihood:,-11155.0
No. Observations:,3592,AIC:,22310.0
Df Residuals:,3590,BIC:,22330.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0853,0.149,-0.575,0.566,-0.376,0.206
x1,0.3663,0.002,167.376,0.000,0.362,0.371

0,1,2,3
Omnibus:,686.963,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7460.474
Skew:,0.585,Prob(JB):,0.0
Kurtosis:,9.963,Cond. No.,112.0


In [6]:
# Now let's do the basic multivariate linear regression.
# We want to predict if either red or blue wins based on the fight data of that particular fight. This means we will use:
# Winner as red = 1, and blue = 0
# We will use the statsmodels api becuase it gives both the r^2 and the adjusted r^2

def set_y(outcomes):
    y = []

    for i in outcomes:
        if i == "Red":
            y.append(1)
        elif i == "Blue":
            y.append(0)
        else:
            raise Exception(f"Can't parse value {i}")

    return y
    

outcomes = df["Winner"].to_numpy().reshape((-1, 1))

y = set_y(outcomes)

x = df[["B_avg_BODY_att",
"B_avg_BODY_landed",
"B_avg_CLINCH_att",
"B_avg_CLINCH_landed",
"B_avg_DISTANCE_att",
"B_avg_DISTANCE_landed", 
"B_avg_GROUND_att",
"B_avg_GROUND_landed",
"B_avg_HEAD_att",
"B_avg_HEAD_landed",
"B_avg_LEG_att",
"B_avg_LEG_landed",
"B_avg_PASS",
"B_avg_REV", 
"B_avg_SIG_STR_att",
"B_avg_SIG_STR_landed", 
"B_avg_SIG_STR_pct",
"B_avg_SUB_ATT",
"B_avg_TD_att",
"B_avg_TD_landed",
"B_avg_TD_pct", 
"B_avg_TOTAL_STR_att",
"B_avg_TOTAL_STR_landed", 
"R_avg_BODY_att",
"R_avg_BODY_landed",
"R_avg_CLINCH_att",
"R_avg_CLINCH_landed",
"R_avg_DISTANCE_att",
"R_avg_DISTANCE_landed", 
"R_avg_GROUND_att",
"R_avg_GROUND_landed",
"R_avg_HEAD_att",
"R_avg_HEAD_landed",
"R_avg_LEG_att",
"R_avg_LEG_landed",
"R_avg_PASS",
"R_avg_REV", 
"R_avg_SIG_STR_att",
"R_avg_SIG_STR_landed", 
"R_avg_SIG_STR_pct",
"R_avg_SUB_ATT",
"R_avg_TD_att",
"R_avg_TD_landed",
"R_avg_TD_pct", 
"R_avg_TOTAL_STR_att",
"R_avg_TOTAL_STR_landed"]]

x = sm.add_constant(x)
model = sm.OLS(y, x)
res = model.fit()
res.summary()

# prediction = model.predict(x)
# print(prediction)

0,1,2,3
Dep. Variable:,y,R-squared:,0.073
Model:,OLS,Adj. R-squared:,0.063
Method:,Least Squares,F-statistic:,7.328
Date:,"Sun, 07 Jun 2020",Prob (F-statistic):,1.2e-36
Time:,17:36:55,Log-Likelihood:,-2270.8
No. Observations:,3592,AIC:,4620.0
Df Residuals:,3553,BIC:,4861.0
Df Model:,38,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6324,0.063,10.028,0.000,0.509,0.756
B_avg_BODY_att,-0.0018,0.004,-0.456,0.648,-0.010,0.006
B_avg_BODY_landed,-0.0030,0.005,-0.582,0.560,-0.013,0.007
B_avg_CLINCH_att,0.0034,0.003,1.000,0.318,-0.003,0.010
B_avg_CLINCH_landed,-0.0023,0.005,-0.472,0.637,-0.012,0.007
B_avg_DISTANCE_att,-0.0032,0.002,-1.768,0.077,-0.007,0.000
B_avg_DISTANCE_landed,0.0035,0.003,1.288,0.198,-0.002,0.009
B_avg_GROUND_att,0.0014,0.003,0.436,0.663,-0.005,0.008
B_avg_GROUND_landed,-0.0062,0.005,-1.306,0.191,-0.015,0.003

0,1,2,3
Omnibus:,95774.268,Durbin-Watson:,1.844
Prob(Omnibus):,0.0,Jarque-Bera (JB):,479.118
Skew:,-0.586,Prob(JB):,9.14e-105
Kurtosis:,1.648,Cond. No.,1.09e+16


In [7]:
# We can check the results using the sklearn library
model = LinearRegression().fit(x, y)
y_predict = model.predict(x)
det = metrics.r2_score(y, y_predict)


print(f"b0: {model.intercept_}")
print(f"b1: {model.coef_}")
print(f"det: {det}")

b0: 0.6324133523206238
b1: [ 7.14465628e-17 -1.79520610e-03 -3.04255827e-03  3.41739701e-03
 -2.27970459e-03 -3.20844396e-03  3.54960973e-03  1.38659213e-03
 -6.17562943e-03  6.07811877e-03 -8.60053612e-03 -2.68736748e-03
  6.73737010e-03  3.45136482e-04  1.46065395e-02  1.59554518e-03
 -4.90572429e-03  3.06055355e-02  3.75418442e-04 -8.10322784e-03
 -2.91202072e-02  1.13399794e-01 -4.62896712e-03  7.15138298e-03
 -5.36734656e-03  3.75649202e-03  4.15072182e-03 -3.95596644e-03
 -2.97542680e-03  4.66260970e-03  1.22536480e-02 -1.50075463e-02
  3.96978615e-03 -3.42586173e-03  1.48265034e-02 -1.46315333e-02
  7.04445450e-03 -7.39198234e-02  1.34289430e-02 -1.43009031e-02
  1.30212874e-01  2.60470690e-02 -1.01451924e-03  1.69737035e-02
  8.71176290e-02 -1.55521088e-02  1.58258546e-02]
det: 0.0726772006341736


In [None]:
# Out statsmodels summary tells us that there may be colineatiry problems between some of the constants. We can use the variance inflation factor to determine which n of X correlate with eachother

from statsmodels.stats.outliers_influence import variance_inflation_factor



In [None]:
# Because our independent variables are not yet scaled we might get wierd results. Lets see if scaling helps bring back the determination coefficient.


In [None]:
# We can also look for multicolinearity.
# This allows us to see which independent variable has the most inpact on the prediction.
