# UFC Chapter - Saturn Prototype

## Dependency Center

In [1]:
# Import General Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Import scikit-learn Utilities and Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    classification_report, 
    mean_squared_error
)
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier

# Import Modeling Tools
from sklearn.svm import SVC
import xgboost as xgb

# Import Keras modules
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


## Data Loading and Cleansing

In [2]:
# Provenance: Kaggle page the dataset comes from. It is kept for reference only;
# nothing below downloads from it -- the notebook reads a local copy of the CSV.
url = 'https://www.kaggle.com/mdabbert/ultimate-ufc-dataset?select=ufc-master.csv'
data = pd.read_csv('Data/ufc-master.csv')

# Working frame used by every later cell.
ufcdata = pd.DataFrame(data)

# Quick structural summary: dimensions and column labels.
print(f"The shape of the DataFrame is {ufcdata.shape}")
print(f"Columns in the DataFrame: {ufcdata.columns}")

# Remove the column-count cap for rich display so that later display() calls
# render every column of this wide frame (this line itself shows nothing).
pd.set_option('display.max_columns', None)

# Author's rationale for the blanket zero-fill: the dataset is wide and many
# columns do not apply to each fight. For example, there are columns for the
# official UFC rankings, but a fight takes place in one division where only
# 15 fighters are ranked, so most ranking cells are empty and are represented
# here by 0.
# Note that fillna(0) is applied to every column of the frame, not only the
# ranking columns.
ufcdata.fillna(value=0, inplace=True)


The shape of the DataFrame is (4896, 119)
Columns in the DataFrame: Index(['R_fighter', 'B_fighter', 'R_odds', 'B_odds', 'R_ev', 'B_ev', 'date',
       'location', 'country', 'Winner',
       ...
       'finish_details', 'finish_round', 'finish_round_time',
       'total_fight_time_secs', 'r_dec_odds', 'b_dec_odds', 'r_sub_odds',
       'b_sub_odds', 'r_ko_odds', 'b_ko_odds'],
      dtype='object', length=119)


In [3]:
# Further cleaning: discard the trailing block of columns, from "better_rank"
# through the last column of the frame. Per the author's note these are only
# filled out for about half the fights, and this is a separate issue from the
# NaN handling because filling them with 0 would lead to summation errors.

# Positional slice of the column index, starting at 'better_rank'.
columns_to_drop = ufcdata.columns[ufcdata.columns.get_loc("better_rank"):]

# Remove those columns from the working frame in place.
ufcdata.drop(columns_to_drop, axis=1, inplace=True)

# Preview the first rows of the trimmed frame.
display(ufcdata.head())


Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,title_bout,weight_class,gender,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_SIG_STR_landed,R_avg_SIG_STR_pct,R_avg_SUB_ATT,R_avg_TD_landed,R_avg_TD_pct,R_longest_win_streak,R_losses,R_total_rounds_fought,R_total_title_bouts,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age,B_age,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,B_match_weightclass_rank,R_match_weightclass_rank,R_Women's Flyweight_rank,R_Women's Featherweight_rank,R_Women's Strawweight_rank,R_Women's Bantamweight_rank,R_Heavyweight_rank,R_Light Heavyweight_rank,R_Middleweight_rank,R_Welterweight_rank,R_Lightweight_rank,R_Featherweight_rank,R_Bantamweight_rank,R_Flyweight_rank,R_Pound-for-Pound_rank,B_Women's Flyweight_rank,B_Women's Featherweight_rank,B_Women's Strawweight_rank,B_Women's Bantamweight_rank,B_Heavyweight_rank,B_Light Heavyweight_rank,B_Middleweight_rank,B_Welterweight_rank,B_Lightweight_rank,B_Featherweight_rank,B_Bantamweight_rank,B_Flyweight_rank,B_Pound-for-Pound_rank
0,Thiago Santos,Johnny Walker,-150.0,130,66.666667,130.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Light Heavyweight,MALE,5,0,1,0,3.42,0.59,0.7,0.24,1.0,4,2,11,0,0,0,1,4,0,0,5,Orthodox,198.12,208.28,205,3,0,0,3.95,0.48,0.1,0.61,0.36,4,8,43,1,0,0,2,11,0,0,13,Orthodox,187.96,193.04,205,37,29,-3,1,0,-8,-6,-32,-1,-7,0,10.16,15.24,-8,-0.53,0.6,-0.37,1,1,10.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alex Oliveira,Niko Price,170.0,-200,170.0,50.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Welterweight,MALE,3,2,0,0,5.16,0.42,0.8,0.79,0.22,2,5,24,0,0,0,0,4,2,0,6,Orthodox,182.88,193.04,170,2,0,0,2.97,0.51,0.5,2.27,0.37,4,8,44,0,0,1,3,4,3,0,11,Orthodox,180.34,193.04,170,33,32,0,0,-2,-5,-3,-20,0,0,-1,2.54,0.0,-1,2.19,0.3,-1.48,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Misha Cirkunov,Krzysztof Jotko,110.0,-130,110.0,76.923077,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Middleweight,MALE,3,1,0,0,2.92,0.41,0.1,1.15,0.34,5,5,38,0,0,2,6,1,0,0,9,Southpaw,185.42,195.58,185,1,0,0,3.77,0.49,1.7,4.48,0.44,4,4,13,0,0,0,0,1,5,0,6,Orthodox,190.5,195.58,205,34,32,0,0,1,3,1,25,0,0,-5,-5.08,0.0,-2,-0.85,-1.6,-3.33,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alexander Hernandez,Mike Breeden,-675.0,475,14.814815,475.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Lightweight,MALE,3,1,0,0,4.04,0.34,0.0,0.0,0.0,0,1,3,0,0,0,0,0,0,0,0,Orthodox,177.8,177.8,155,1,0,0,3.79,0.36,0.0,1.57,0.35,2,3,15,0,0,0,2,2,0,0,4,Orthodox,175.26,182.88,155,29,32,0,0,-2,-4,-2,-12,0,-2,0,2.54,-5.08,3,0.25,0.0,-1.57,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Joe Solecki,Jared Gordon,-135.0,115,74.074074,115.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Lightweight,MALE,3,0,2,0,5.22,0.56,0.0,2.54,0.39,2,3,19,0,0,0,4,1,0,0,5,Orthodox,175.26,172.72,145,0,4,0,2.64,0.62,0.6,2.85,0.52,4,0,8,0,0,0,2,0,2,0,4,Orthodox,175.26,177.8,155,28,33,0,-2,-2,1,3,11,0,1,-2,0.0,-5.08,5,2.58,-0.6,-0.31,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Now our data is cleaned and ready for our analysis questions!
# Integer (label) encoding for categorical variables. This is NOT one-hot
# encoding: each category is replaced by a single integer code, in the same
# column.
GenderMap = {'MALE': 1, 'FEMALE': 2}
StanceMap = {'Orthodox': 1, 'Southpaw': 2}

# Series.map() with a dict turns every value that is not a key of the dict into
# NaN. If this cell were re-run on columns that are already encoded (numeric),
# every value would become NaN and then be zeroed by the fillna below, silently
# destroying the encoding. Only map columns that still hold raw labels so the
# cell is safe to re-run.
for column_name, code_map in (
    ('gender', GenderMap),
    ('B_Stance', StanceMap),
    ('R_Stance', StanceMap),
):
    if not pd.api.types.is_numeric_dtype(ufcdata[column_name]):
        ufcdata[column_name] = ufcdata[column_name].map(code_map)

# Null injections: any value not covered by the maps above (it became NaN in
# the mapping step) is encoded as 0.
ufcdata = ufcdata.fillna(0)

# Display dataframe portion
display(ufcdata.head())

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,title_bout,weight_class,gender,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_SIG_STR_landed,R_avg_SIG_STR_pct,R_avg_SUB_ATT,R_avg_TD_landed,R_avg_TD_pct,R_longest_win_streak,R_losses,R_total_rounds_fought,R_total_title_bouts,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age,B_age,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,B_match_weightclass_rank,R_match_weightclass_rank,R_Women's Flyweight_rank,R_Women's Featherweight_rank,R_Women's Strawweight_rank,R_Women's Bantamweight_rank,R_Heavyweight_rank,R_Light Heavyweight_rank,R_Middleweight_rank,R_Welterweight_rank,R_Lightweight_rank,R_Featherweight_rank,R_Bantamweight_rank,R_Flyweight_rank,R_Pound-for-Pound_rank,B_Women's Flyweight_rank,B_Women's Featherweight_rank,B_Women's Strawweight_rank,B_Women's Bantamweight_rank,B_Heavyweight_rank,B_Light Heavyweight_rank,B_Middleweight_rank,B_Welterweight_rank,B_Lightweight_rank,B_Featherweight_rank,B_Bantamweight_rank,B_Flyweight_rank,B_Pound-for-Pound_rank
0,Thiago Santos,Johnny Walker,-150.0,130,66.666667,130.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Light Heavyweight,1,5,0,1,0,3.42,0.59,0.7,0.24,1.0,4,2,11,0,0,0,1,4,0,0,5,1.0,198.12,208.28,205,3,0,0,3.95,0.48,0.1,0.61,0.36,4,8,43,1,0,0,2,11,0,0,13,1.0,187.96,193.04,205,37,29,-3,1,0,-8,-6,-32,-1,-7,0,10.16,15.24,-8,-0.53,0.6,-0.37,1,1,10.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alex Oliveira,Niko Price,170.0,-200,170.0,50.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Welterweight,1,3,2,0,0,5.16,0.42,0.8,0.79,0.22,2,5,24,0,0,0,0,4,2,0,6,1.0,182.88,193.04,170,2,0,0,2.97,0.51,0.5,2.27,0.37,4,8,44,0,0,1,3,4,3,0,11,1.0,180.34,193.04,170,33,32,0,0,-2,-5,-3,-20,0,0,-1,2.54,0.0,-1,2.19,0.3,-1.48,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Misha Cirkunov,Krzysztof Jotko,110.0,-130,110.0,76.923077,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Middleweight,1,3,1,0,0,2.92,0.41,0.1,1.15,0.34,5,5,38,0,0,2,6,1,0,0,9,2.0,185.42,195.58,185,1,0,0,3.77,0.49,1.7,4.48,0.44,4,4,13,0,0,0,0,1,5,0,6,1.0,190.5,195.58,205,34,32,0,0,1,3,1,25,0,0,-5,-5.08,0.0,-2,-0.85,-1.6,-3.33,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alexander Hernandez,Mike Breeden,-675.0,475,14.814815,475.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Red,False,Lightweight,1,3,1,0,0,4.04,0.34,0.0,0.0,0.0,0,1,3,0,0,0,0,0,0,0,0,1.0,177.8,177.8,155,1,0,0,3.79,0.36,0.0,1.57,0.35,2,3,15,0,0,0,2,2,0,0,4,1.0,175.26,182.88,155,29,32,0,0,-2,-4,-2,-12,0,-2,0,2.54,-5.08,3,0.25,0.0,-1.57,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Joe Solecki,Jared Gordon,-135.0,115,74.074074,115.0,2021-10-02,"Las Vegas, Nevada, USA",USA,Blue,False,Lightweight,1,3,0,2,0,5.22,0.56,0.0,2.54,0.39,2,3,19,0,0,0,4,1,0,0,5,1.0,175.26,172.72,145,0,4,0,2.64,0.62,0.6,2.85,0.52,4,0,8,0,0,0,2,0,2,0,4,1.0,175.26,177.8,155,28,33,0,-2,-2,1,3,11,0,1,-2,0.0,-5.08,5,2.58,-0.6,-0.31,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Derive two 0/1 indicator columns from the textual 'Winner' column:
# 'Red Winner' is 1 when the value is 'Red', 'Blue Winner' is 1 when it is 'Blue'.
winner_side = ufcdata['Winner']
ufcdata['Red Winner'] = winner_side.eq('Red').astype(int)
ufcdata['Blue Winner'] = winner_side.eq('Blue').astype(int)

# The textual column is no longer needed once the indicators exist.
ufcdata.drop('Winner', axis=1, inplace=True)

# Preview the updated frame.
display(ufcdata.head())


Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,title_bout,weight_class,gender,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_SIG_STR_landed,R_avg_SIG_STR_pct,R_avg_SUB_ATT,R_avg_TD_landed,R_avg_TD_pct,R_longest_win_streak,R_losses,R_total_rounds_fought,R_total_title_bouts,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age,B_age,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,B_match_weightclass_rank,R_match_weightclass_rank,R_Women's Flyweight_rank,R_Women's Featherweight_rank,R_Women's Strawweight_rank,R_Women's Bantamweight_rank,R_Heavyweight_rank,R_Light Heavyweight_rank,R_Middleweight_rank,R_Welterweight_rank,R_Lightweight_rank,R_Featherweight_rank,R_Bantamweight_rank,R_Flyweight_rank,R_Pound-for-Pound_rank,B_Women's Flyweight_rank,B_Women's Featherweight_rank,B_Women's Strawweight_rank,B_Women's Bantamweight_rank,B_Heavyweight_rank,B_Light Heavyweight_rank,B_Middleweight_rank,B_Welterweight_rank,B_Lightweight_rank,B_Featherweight_rank,B_Bantamweight_rank,B_Flyweight_rank,B_Pound-for-Pound_rank,Red Winner,Blue Winner
0,Thiago Santos,Johnny Walker,-150.0,130,66.666667,130.0,2021-10-02,"Las Vegas, Nevada, USA",USA,False,Light Heavyweight,1,5,0,1,0,3.42,0.59,0.7,0.24,1.0,4,2,11,0,0,0,1,4,0,0,5,1.0,198.12,208.28,205,3,0,0,3.95,0.48,0.1,0.61,0.36,4,8,43,1,0,0,2,11,0,0,13,1.0,187.96,193.04,205,37,29,-3,1,0,-8,-6,-32,-1,-7,0,10.16,15.24,-8,-0.53,0.6,-0.37,1,1,10.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
1,Alex Oliveira,Niko Price,170.0,-200,170.0,50.0,2021-10-02,"Las Vegas, Nevada, USA",USA,False,Welterweight,1,3,2,0,0,5.16,0.42,0.8,0.79,0.22,2,5,24,0,0,0,0,4,2,0,6,1.0,182.88,193.04,170,2,0,0,2.97,0.51,0.5,2.27,0.37,4,8,44,0,0,1,3,4,3,0,11,1.0,180.34,193.04,170,33,32,0,0,-2,-5,-3,-20,0,0,-1,2.54,0.0,-1,2.19,0.3,-1.48,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
2,Misha Cirkunov,Krzysztof Jotko,110.0,-130,110.0,76.923077,2021-10-02,"Las Vegas, Nevada, USA",USA,False,Middleweight,1,3,1,0,0,2.92,0.41,0.1,1.15,0.34,5,5,38,0,0,2,6,1,0,0,9,2.0,185.42,195.58,185,1,0,0,3.77,0.49,1.7,4.48,0.44,4,4,13,0,0,0,0,1,5,0,6,1.0,190.5,195.58,205,34,32,0,0,1,3,1,25,0,0,-5,-5.08,0.0,-2,-0.85,-1.6,-3.33,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
3,Alexander Hernandez,Mike Breeden,-675.0,475,14.814815,475.0,2021-10-02,"Las Vegas, Nevada, USA",USA,False,Lightweight,1,3,1,0,0,4.04,0.34,0.0,0.0,0.0,0,1,3,0,0,0,0,0,0,0,0,1.0,177.8,177.8,155,1,0,0,3.79,0.36,0.0,1.57,0.35,2,3,15,0,0,0,2,2,0,0,4,1.0,175.26,182.88,155,29,32,0,0,-2,-4,-2,-12,0,-2,0,2.54,-5.08,3,0.25,0.0,-1.57,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
4,Joe Solecki,Jared Gordon,-135.0,115,74.074074,115.0,2021-10-02,"Las Vegas, Nevada, USA",USA,False,Lightweight,1,3,0,2,0,5.22,0.56,0.0,2.54,0.39,2,3,19,0,0,0,4,1,0,0,5,1.0,175.26,172.72,145,0,4,0,2.64,0.62,0.6,2.85,0.52,4,0,8,0,0,0,2,0,2,0,4,1.0,175.26,177.8,155,28,33,0,-2,-2,1,3,11,0,1,-2,0.0,-5.08,5,2.58,-0.6,-0.31,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1


## Machine Learning Model Workshops

In [6]:
# Feature columns fed to every model below.
columns = [
    'R_odds',
    'gender',
    'lose_streak_dif',
    'win_streak_dif',
    'longest_win_streak_dif',
    'win_dif',
    'loss_dif',
    'total_round_dif',
    'ko_dif',
    'sub_dif',
    'height_dif',
    'reach_dif',
    'age_dif',
    'avg_sub_att_dif',
    'avg_td_dif',
]

# Feature matrix and binary target (1 when the red corner won).
X, y = ufcdata[columns], ufcdata["Red Winner"]

# 70/30 train/test split, stratified on the target so both parts keep the same
# class balance; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=50,
    stratify=y,
)


### XGBoost Model

In [7]:
# Train, Test, Eval: XGBoost Model
# Hyperparameter candidates explored exhaustively by the grid search
# (3 values for each of 5 parameters, evaluated with 3-fold cross-validation).
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.1, 0.2],
    n_estimators=[50, 100, 200],
)

# Search the grid on the training split only, selecting by accuracy.
xgb_clf = xgb.XGBClassifier(objective='binary:logistic')
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination.
best_params = grid_search.best_params_

# Build a fresh classifier from the winning parameters, fit it on the whole
# training split and predict the held-out test split.
xgb_clf_best = xgb.XGBClassifier(objective='binary:logistic', **best_params)
xgb_clf_best.fit(X_train, y_train)
y_pred = xgb_clf_best.predict(X_test)

# Test-set metrics; precision / recall / F1 are class-support-weighted averages.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report
print('Optimized XGBoost Model Metrics:')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Optimized XGBoost Model Metrics:
Accuracy: 0.6528250510551395
Precision: 0.6468183074190802
Recall: 0.6528250510551395
F1 Score: 0.6457786138922916


### Support Vector Machine Model

In [8]:
# Train, Test, Eval: Support Vector Machine Model
# Deliberately small grid to keep the search cheap: two values of C and the
# linear kernel only (chosen by the author as generally faster than 'rbf' or
# 'poly').
param_grid_svm = {
    'C': [0.1, 1],
    'kernel': ['linear'],
}

# Tune on a random 50% subsample of the training rows to speed up the search.
# The labels are selected by the sampled rows' index, so features and labels
# are aligned by construction rather than by relying on two independent
# .sample() calls happening to draw the same positions.
X_tune = X_train.sample(frac=0.5, random_state=42)
y_tune = y_train.loc[X_tune.index]

# Initialize SVM classifier and perform GridSearch for hyperparameter tuning
svm_clf = SVC()
grid_search_svm = GridSearchCV(svm_clf, param_grid_svm, cv=2, scoring='accuracy')  # Kept CV at 2
grid_search_svm.fit(X_tune, y_tune)

# Get the best parameters from the Grid Search
best_params_svm = grid_search_svm.best_params_

# Train the final classifier with the best parameters on the FULL training
# split (not the tuning subsample) and predict the held-out test split.
svm_clf_best = SVC(**best_params_svm)
svm_clf_best.fit(X_train, y_train)
y_pred_svm = svm_clf_best.predict(X_test)

# Test-set metrics; precision / recall / F1 are class-support-weighted averages.
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test, y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')

# Print evaluation metrics
print('Optimized SVM Model Metrics:')
print(f'Accuracy: {accuracy_svm}')
print(f'Precision: {precision_svm}')
print(f'Recall: {recall_svm}')
print(f'F1 Score: {f1_svm}')


Optimized SVM Model Metrics:
Accuracy: 0.6596324029952348
Precision: 0.6538446732428276
Recall: 0.6596324029952348
F1 Score: 0.651708714286105


In [None]:
# Support Vector Machine Origin Code Block
#svm_clf = SVC(kernel='linear')
#svm_clf.fit(X_train, y_train)
#y_pred_svm = svm_clf.predict(X_test)
#accuracy_svm = accuracy_score(y_test, y_pred_svm)
#print(f'SVM Accuracy: {accuracy_svm}')


In [14]:
# Support Vector Machine Alternate Code Block
# LinearSVC is not part of the notebook's import cell, so it is imported here.
# The metric functions used below are already imported in the import cell at
# the top of the notebook and are not re-imported.
from sklearn.svm import LinearSVC

# Build the linear SVM (primal formulation, fixed seed) and fit it on the
# training split; fit() returns the estimator itself.
linear_svc_clf = LinearSVC(dual=False, random_state=42).fit(X_train, y_train)

# Predict the held-out test split.
y_pred_linear_svc = linear_svc_clf.predict(X_test)

# Test-set metrics; precision / recall / F1 are class-support-weighted averages.
accuracy_linear_svc = accuracy_score(y_test, y_pred_linear_svc)
precision_linear_svc, recall_linear_svc, f1_linear_svc = (
    metric(y_test, y_pred_linear_svc, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report
print('LinearSVC Model Metrics:')
print(f'Accuracy: {accuracy_linear_svc}')
print(f'Precision: {precision_linear_svc}')
print(f'Recall: {recall_linear_svc}')
print(f'F1 Score: {f1_linear_svc}')


LinearSVC Model Metrics:
Accuracy: 0.6582709326072158
Precision: 0.6523386356778194
Recall: 0.6582709326072158
F1 Score: 0.649783690369872


### Random Forest Model

In [9]:
# Train, Test, Eval: Random Forest Classifier Model
# 100 shallow trees (depth <= 5, at least 5 samples per leaf), each split
# considering 7 features; seeded so the run is repeatable.
model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    max_features=7,
    min_samples_leaf=5,
    random_state=42,
)
model_rf.fit(X_train, y_train)

# Score the held-out test split.
predict_rf = model_rf.predict(X_test)
acc_rf = accuracy_score(y_test, predict_rf)
print(f'Random Forest Accuracy: {acc_rf}')


Random Forest Accuracy: 0.6541865214431586


In [10]:
# Train, Test, Eval: AdaBoost Classifier Model
# The max_depth=4 on the base tree is only a starting value: the grid below
# overrides it through 'estimator__max_depth'.
base_est = DecisionTreeClassifier(max_depth=4)
# Seeded like the other models in this notebook so repeated runs give the same
# ensemble.
ada = AdaBoostClassifier(estimator=base_est, random_state=42)

# NOTE(review): `param_grid` and `grid_search` reuse the names assigned in the
# XGBoost cell, so after this cell they refer to the AdaBoost search.
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1, 1],
    'estimator__max_depth': [3, 5]
}

grid_search = GridSearchCV(ada, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# GridSearchCV refits the winning configuration on the whole training split
# by default (refit=True), so best_estimator_ is ready to predict; fitting it
# again would only repeat that work.
best_ada = grid_search.best_estimator_
predict_ada = best_ada.predict(X_test)

acc_ada = accuracy_score(y_test, predict_ada)
print(f'Ada Boost Accuracy: {acc_ada}')

# Additional metrics for AdaBoost (class-support-weighted averages)
precision_ada = precision_score(y_test, predict_ada, average='weighted')
recall_ada = recall_score(y_test, predict_ada, average='weighted')
f1_ada = f1_score(y_test, predict_ada, average='weighted')

print(f'Precision: {precision_ada}')
print(f'Recall: {recall_ada}')
print(f'F1 Score: {f1_ada}')


Ada Boost Accuracy: 0.6501021102791015
Precision: 0.6441005481430112
Recall: 0.6501021102791015
F1 Score: 0.6434999903836425


In [11]:
# Train, Test, Eval: Dummy Classifier Model
# Baseline that always predicts the most frequent class seen in training; its
# accuracy is the floor the real models have to beat.
dummy_clf = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

# Score the held-out test split.
y_pred_dummy = dummy_clf.predict(X_test)
accuracy_dummy = accuracy_score(y_test, y_pred_dummy)
print(f'Dummy Classifier Accuracy: {accuracy_dummy}')


Dummy Classifier Accuracy: 0.584070796460177


In [12]:
# Train, Test, Eval: Decision Tree Classifier Model
# Single tree with default hyperparameters (no depth limit is set here);
# seeded so the run is repeatable.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out test split.
y_pred_dt = dt_clf.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Accuracy: {accuracy_dt}')


Decision Tree Accuracy: 0.585432266848196


In [13]:
# Train, Test, Eval: Keras Model
# Convert labels to consecutive integer class ids (fit on train, reuse on test)
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# One-hot encode labels to match the softmax output layer
y_train_one_hot = to_categorical(y_train_encoded)
y_test_one_hot = to_categorical(y_test_encoded)

# Feed-forward network: two hidden ReLU layers (128 and 64 units) and a softmax
# output with one unit per class.
keras_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(y_train_one_hot.shape[1], activation='softmax')
])

# Compile the model (notice the change from 'lr' to 'learning_rate')
keras_model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model; 20% of the training rows are held out for validation.
history = keras_model.fit(X_train, y_train_one_hot, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model.
# The metrics carry a `_keras` suffix, matching the per-model naming used by
# the other cells (e.g. accuracy_svm, acc_ada), so this cell no longer
# overwrites the unsuffixed accuracy / precision / recall / f1 values computed
# by the XGBoost cell.
loss, accuracy_keras = keras_model.evaluate(X_test, y_test_one_hot)
print(f'Keras Model Accuracy: {accuracy_keras}')

# Additional Evaluation Metrics: take the most probable class per row, then
# compute class-support-weighted precision / recall / F1.
y_pred_one_hot = keras_model.predict(X_test)
y_pred_encoded = np.argmax(y_pred_one_hot, axis=1)

precision_keras = precision_score(y_test_encoded, y_pred_encoded, average='weighted')
recall_keras = recall_score(y_test_encoded, y_pred_encoded, average='weighted')
f1_keras = f1_score(y_test_encoded, y_pred_encoded, average='weighted')

print(f'Precision: {precision_keras}')
print(f'Recall: {recall_keras}')
print(f'F1 Score: {f1_keras}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Keras Model Accuracy: 0.6324030160903931
Precision: 0.6259241070342676
Recall: 0.6324029952348537
F1 Score: 0.626307436212349
