In [None]:
import pandas as pd
# We import pandas because we have a very large data set: a CSV file with data present for every column (a large table). Pandas is ideal for this scenario.

import numpy as np
# We import numpy because we might want to use the built in functions to see the correlation of multiple columns in the data frame.

import statsmodels.api as sm
# We use the statsmodels library to validate the results of the numpy and sklearn libraries.

import matplotlib.pyplot as plt
# We might not need this because pandas has its own .plot() function, but just in case we want to plot something specific.

from sklearn.linear_model import LinearRegression
# Because we want to use regression on the data set, we import LinearRegression from scikit-learn's linear_model module.

from sklearn import metrics
# We might want to see how well variables correlate with each other, so we will use the metrics module as well.


In [None]:
# Load the preprocessed fight data and list the available columns.
df = pd.read_csv("data/preprocessed_data.csv")
list(df.columns)

In [None]:
# Simple linear regression: predict `B_avg_DISTANCE_landed` from `B_avg_DISTANCE_att`.
# (This is a first step towards looking at the impact of B_avg_DISTANCE_landed on B_win_by_KO/TKO.)

# Independent variable x, selected as a single-column frame so it is already an (n, 1) array
x = df[["B_avg_DISTANCE_att"]].to_numpy()

# Dependent variable y, same (n, 1) layout
y = df[["B_avg_DISTANCE_landed"]].to_numpy()

model = LinearRegression()
model.fit(x, y)

y_predict = model.predict(x)

print(f"b0: {model.intercept_}")
print(f"b1: {model.coef_}")

# Coefficient of determination (R^2) of the fitted line on the data it was fitted to
det = metrics.r2_score(y, y_predict)
print(f"The determination coefficient is: {det}")

In [None]:
# The same regression can be done with the statsmodels API.
# add_constant adds a column of ones so that OLS also estimates an intercept.
x = sm.add_constant(x)
model = sm.OLS(endog=y, exog=x)
results = model.fit()
results.summary()

# We can observe that the R-squared with both methods is about 0.886.

In [None]:
# Now let's do a basic multivariate linear regression.
# We want to predict whether red or blue wins based on the fight data of that particular fight. This means we will encode:
# Winner as red = 1, and blue = 0
# We will use the statsmodels API because it gives both the r^2 and the adjusted r^2.

def set_y(outcomes):
    """Encode fight winners as binary labels.

    Args:
        outcomes: iterable of winner values; each value must compare equal
            to "Red" or to "Blue".

    Returns:
        A list with 1 for every "Red" entry and 0 for every "Blue" entry,
        in the order of the input.

    Raises:
        Exception: if a value is neither "Red" nor "Blue".
    """

    def encode(winner):
        # Explicit == comparisons (rather than a dict lookup) so that values
        # which are not hashable can still be compared.
        if winner == "Red":
            return 1
        if winner == "Blue":
            return 0
        raise Exception(f"Can't parse value {winner}")

    return [encode(winner) for winner in outcomes]
    

outcomes = df["Winner"].to_numpy().reshape(-1, 1)

y = set_y(outcomes)

# Per-fighter statistics used as features; each one is taken for both the
# blue ("B") and the red ("R") corner.
stat_names = [
    "BODY_att",
    "BODY_landed",
    "CLINCH_att",
    "CLINCH_landed",
    "DISTANCE_att",
    "DISTANCE_landed",
    "GROUND_att",
    "GROUND_landed",
    "HEAD_att",
    "HEAD_landed",
    "LEG_att",
    "LEG_landed",
    "PASS",
    "REV",
    "SIG_STR_att",
    "SIG_STR_landed",
    "SIG_STR_pct",
    "SUB_ATT",
    "TD_att",
    "TD_landed",
    "TD_pct",
    "TOTAL_STR_att",
    "TOTAL_STR_landed",
]

# All blue columns first, then all red columns, each in the order listed above.
feature_columns = [f"{corner}_avg_{stat}" for corner in ("B", "R") for stat in stat_names]

x = df[feature_columns]

# Add the intercept column, then fit ordinary least squares on the 0/1 winner labels.
x = sm.add_constant(x)
model = sm.OLS(y, x)
res = model.fit()
res.summary()

# prediction = model.predict(x)
# print(prediction)

In [None]:
# Cross-check the statsmodels results with the sklearn library.
model = LinearRegression()
model.fit(x, y)
y_predict = model.predict(x)
det = metrics.r2_score(y, y_predict)

print(f"b0: {model.intercept_}", f"b1: {model.coef_}", f"det: {det}", sep="\n")

In [None]:
# We can test if normalizing the data helps with the determination coefficient.
# Min-max scaling maps every column onto [0, 1]. The column minima and maxima
# are computed once and applied to the whole frame, instead of being
# recomputed for every single cell inside an element-wise lambda.
# A column whose values are all equal (such as `const`) has a range of 0 and
# therefore becomes NaN, exactly as with the element-wise formula.
col_min = x.min()
col_max = x.max()
x = (x - col_min) / (col_max - col_min)

x.head(10)

In [None]:
# Let's see what the determination coefficient is now.
# But first we have to remove the const column, because it contains NaNs and it is not relevant to y.
x = x.drop(columns="const")

model = LinearRegression()
model.fit(x, y)
y_predict = model.predict(x)
det = metrics.r2_score(y, y_predict)

print(f"b0: {model.intercept_}", f"b1: {model.coef_}", f"det: {det}", sep="\n")

In [None]:
# Normalizing does not appear to change the determination coefficient. I think that is because, even though
# we changed the numbers to a different scale, the proportions are still the same.
# We can also try the z-score, and see how that affects the determination coefficient.
from scipy.stats.mstats import zscore

# Standardise the features and the target before fitting.
zx = zscore(x)
zy = zscore(y)

z_model = sm.OLS(zy, zx)
z_model.fit().summary()

In [None]:
# We can also look for multicollinearity, i.e. independent variables that are
# largely explained by the other independent variables. A high variance
# inflation factor (VIF) marks such a variable.

from statsmodels.stats.outliers_influence import variance_inflation_factor

# We will need a column of constants to set the intercept term in the equation.
x["const"] = 1
print(x.head(10))

# Extract the design matrix once instead of on every loop iteration.
design = x.values
vif = pd.DataFrame({
    "VarianceInflationFactor": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    "Variables": x.columns,
})

# A bare `vif.head(10)` in the middle of a cell is evaluated and discarded,
# so the table is printed explicitly to make it show up in the output.
print(vif.head(10))
print(x.columns[7])
print(x.columns[19])


In [None]:
# There appears to be a strong collinearity between average takedowns landed and average ground strikes landed.
# This is pretty logical, considering the chances of hitting the opponent on the ground are very high if you
# have taken your opponent down, and pretty low if you have not.
# This is why I will remove average ground landed, since it is much more important to know how good a fighter
# is at taking the opponent down.

x = x.drop(columns=["B_avg_GROUND_landed", "R_avg_GROUND_landed"])

reduced_fit = sm.OLS(y, x).fit()
reduced_fit.summary()

In [None]:
# This does not appear to have an effect on the predictability of the match. This can also be explained: if you view takedowns and ground attacks as one, it does not make much difference in the outcome of the match.

# For this reason we will try a different technique.
# We will attempt to make this more predictable by using machine learning to solve our problem.

# Step 1: Create test and training datasets
# We already have x and y, so we just need to split those.
def preprocess(x, y, test_size=0.25, random_state=1337, out_dir="models/datasets"):
    """Split x and y into train/test sets and store them as .npy files.

    The four arrays are written to `out_dir` as x_train.npy, x_test.npy,
    y_train.npy and y_test.npy, and their shapes are printed in that order.

    Args:
        x: feature table; must provide `.shape` and, after splitting,
            `.to_numpy()` (e.g. a pandas DataFrame).
        y: sequence of labels, one per row of x.
        test_size: fraction of the rows that goes into the test set.
        random_state: seed for the split, so that it is reproducible.
        out_dir: directory the arrays are written to; it is created when it
            does not exist yet. Code that reads the files back from a fixed
            path relies on the default value.

    Returns:
        None. The results are the files on disk.
    """
    import os

    from sklearn.model_selection import train_test_split

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    n_features = x.shape[1]
    # Features are stored as (rows, n_features), labels as a single (rows, 1) column.
    splits = {
        "x_train": x_train.to_numpy().reshape((-1, n_features)),
        "x_test": x_test.to_numpy().reshape((-1, n_features)),
        "y_train": np.array(y_train).reshape((-1, 1)),
        "y_test": np.array(y_test).reshape((-1, 1)),
    }

    # np.save does not create missing directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    for name, array in splits.items():
        np.save(os.path.join(out_dir, name), array)

    for array in splits.values():
        print(array.shape)

# Split the current x and y and write the resulting train/test arrays to disk.
preprocess(x, y)

In [None]:
# Step 2: Create a neural network that can process this data and predict the outcomes of the match
def model():
    """Train, evaluate and save a small dense network on the stored splits.

    Reads x_train/x_test/y_train/y_test from models/datasets/*.npy, trains a
    binary classifier for 30 epochs, evaluates it on the test arrays, saves
    it to models/networks/small1.h5, prints the network summary and the
    evaluation result, and plots accuracy and loss per epoch.

    Returns:
        None.
    """
    import os

    import matplotlib.pyplot as plt
    from keras.models import Sequential
    from keras.layers import Dense, Dropout

    x_train = np.load("models/datasets/x_train.npy")
    x_test = np.load("models/datasets/x_test.npy")
    y_train = np.load("models/datasets/y_train.npy")
    y_test = np.load("models/datasets/y_test.npy")

    # The local names differ from this function's own name and from the
    # sklearn `metrics` module, so neither of them is shadowed in here.
    network = Sequential([
        Dense(512, input_shape=(x_train.shape[1],), activation="relu"),
        Dropout(0.8),
        Dense(256, activation="relu"),
        Dense(128, activation="relu"),
        Dense(64, activation="relu"),
        Dropout(0.4),
        # Single sigmoid unit: the output is a value between 0 and 1 for the 0/1 label.
        Dense(1, activation="sigmoid"),
    ])

    network.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # NOTE(review): the test arrays double as validation data here, so the
    # "val" curves and the final evaluation are computed on the same rows.
    history = network.fit(x_train, y_train, batch_size=32, epochs=30, validation_data=(x_test, y_test))

    evaluation = network.evaluate(x_test, y_test)

    # Make sure the target directory exists before saving the network.
    os.makedirs("models/networks", exist_ok=True)
    network.save("models/networks/small1.h5")

    network.summary()
    print(f"Evaluation: {evaluation}")

    # One figure per metric, each with the training and the validation curve.
    for key, label in (("accuracy", "Accuracy"), ("loss", "Loss")):
        plt.plot(history.history[key])
        plt.plot(history.history[f"val_{key}"])
        plt.ylabel(label)
        plt.xlabel("Epoch")
        plt.legend(["Train", "Val"])
        plt.show()

# Train and evaluate the network on the arrays stored under models/datasets/.
model()

In [None]:
# As you can see, this model performs poorly. This is probably because there is still multicollinearity in the data.

# We will try to remove more columns from x, and see what happens.
# We will eliminate how much fighters attack, so that we are left with the quality of each type of attack.
# NOTE(review): the suffix match is case-sensitive, so the *_SUB_ATT columns are kept - confirm this is intended.

dropcol = [column for column in x.columns if column.endswith("att")]

print(dropcol)
x = x.drop(columns=dropcol)

preprocess(x, y)
model()

In [None]:
# So I added more dropout and the model is actually much better. It's still not reliable, but after adjusting the network we can see improvements in performance.