# Formatação dos dados

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
import re
import pickle
import scienceplots
warnings.filterwarnings("ignore")

plt.style.use(['science','grid','notebook'])

pd.set_option('display.max_columns',100)

info_lutador = pd.read_csv('fighter_details.csv')
ufc = pd.read_csv('total_fight_data.csv', sep=';')

In [2]:
columns = ['R_SIG_STR.', 'B_SIG_STR.', 'R_TOTAL_STR.', 'B_TOTAL_STR.',
       'R_TD', 'B_TD', 'R_HEAD', 'B_HEAD', 'R_BODY','B_BODY', 'R_LEG', 'B_LEG', 
        'R_DISTANCE', 'B_DISTANCE', 'R_CLINCH','B_CLINCH', 'R_GROUND', 'B_GROUND']

attemp = '_att'
landed = '_landed'

for column in columns:
    ufc[column+attemp] = ufc[column].apply(lambda X:int(X.split('of')[1]))
    ufc[column+landed] = ufc[column].apply(lambda X:int(X.split('of')[0]))

ufc.drop(columns, axis=1, inplace=True)
ufc['Winner'].fillna('Draw', inplace=True)

In [3]:
def replace_all(text, dic):
    for i, j in dic.items():
        text = text.replace(i, j)
    return text

In [4]:
def seconder(x):
    mins, secs = map(float, x.split(':'))
    td = timedelta(minutes=mins, seconds=secs)
    return td.total_seconds()

In [5]:
pct_columns = ['R_SIG_STR_pct','B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct']

for column in pct_columns:
    ufc[column].replace('---','0.0%', regex=True, inplace=True)

In [6]:
rev_columns = ['R_REV', 'B_REV']

for column in rev_columns:
    ufc[column].replace('--','0:0', regex=True, inplace=True)

In [7]:
ufc['R_REV'] = ufc['R_REV'].apply(seconder)
ufc['B_REV'] = ufc['B_REV'].apply(seconder)

In [8]:
for column in pct_columns:
    ufc[column] = ufc[column].apply(lambda X:float(X.replace('%', ''))/100)

In [9]:
def Division(X):
    for division in weight_classes:
        if division in X:
            return division
    if X == 'Catch Weight Bout' or 'Catchweight Bout':
        return 'Catch Weight'
    else:
        return 'Open Weight'

In [10]:
weight_classes = ['Women\'s Strawweight', 'Women\'s Bantamweight', 
                  'Women\'s Featherweight', 'Women\'s Flyweight', 'Lightweight', 
                  'Welterweight', 'Middleweight','Light Heavyweight', 
                  'Heavyweight', 'Featherweight','Bantamweight', 'Flyweight', 'Open Weight']

ufc['Weight_class'] = ufc['Fight_type'].apply(Division)

In [11]:
ufc['Winner'].fillna('Draw', inplace=True)

In [12]:
def get_renamed_winner(row):
    if row['R_fighter'] == row['Winner']:
        return 'Red'
    if row['B_fighter'] == row['Winner']:
        return 'Blue'
    elif row['Winner'] == 'Draw':
        return 'Draw'
    
ufc['Winner'] = ufc[['R_fighter', 'B_fighter', 'Winner']].apply(get_renamed_winner, axis=1)

In [13]:
def inch_to_cm(X):
    if X is np.NaN:
        return X
    elif len(X.split("'")) == 2:
        feet = float(X.split("'")[0])
        inches = int(X.split("'")[1].replace(' ','').replace('"', ''))
        return (feet*30.48) + (inches * 2.54)
    else:
        return float(X.replace('"',''))*2.54

In [14]:
info_lutador['Height'] = info_lutador['Height'].apply(inch_to_cm)
info_lutador['Reach'] = info_lutador['Reach'].apply(inch_to_cm)

In [15]:
info_lutador['Weight'] = info_lutador['Weight'].apply(lambda X: float(X.replace(' lbs', '')) if X is not np.NaN else X)

In [16]:
ufc_final = ufc.merge(info_lutador, left_on='B_fighter', right_on='fighter_name', how='inner')

In [17]:
fighters = []
for j in ufc_final['R_fighter']:
    if j not in fighters:
        fighters.append(j)
for i in ufc_final['B_fighter']:
    if i not in fighters:
        fighters.append(i)

In [18]:
len(fighters)

2416

In [19]:
ufc_final = ufc_final.drop('fighter_name', axis=1)

In [20]:
ufc_final.rename(columns={'Height':'R_Height',
                          'Weight':'R_Weight',
                          'Reach':'R_Reach',
                          'Stance':'R_Stance',
                          'DOB':'R_DOB'}, inplace=True)

In [21]:
ufc_final = ufc_final.merge(info_lutador, left_on='B_fighter', right_on='fighter_name', how='left')

In [22]:
ufc_final = ufc_final.drop('fighter_name', axis=1)

In [23]:
ufc_final.rename(columns={'Height':'B_Height',
                          'Weight':'B_Weight',
                          'Reach':'B_Reach',
                          'Stance':'B_Stance',
                          'DOB':'B_DOB'}, inplace=True)

In [24]:
ufc_final['R_DOB'] = pd.to_datetime(ufc_final['R_DOB'])
ufc_final['B_DOB'] = pd.to_datetime(ufc_final['B_DOB'])
ufc_final['date'] = pd.to_datetime(ufc_final['date'])

In [25]:
ufc_final['R_year'] = ufc_final['R_DOB'].apply(lambda X: X.year)
ufc_final['B_year'] = ufc_final['B_DOB']. apply(lambda X: X.year)
ufc_final['date_year'] = ufc_final['date'].apply(lambda X: X.year)


def get_age(row):
    B_age = (row['date_year'] - row['B_year'])
    R_age = (row['date_year'] - row['R_year'])
    if np.isnan(B_age) != True:
        B_age = B_age
    if np.isnan(R_age) != True:
        R_age = R_age
    return pd.Series([B_age, R_age], index = ['B_age', 'R_age'])

In [26]:
ufc_final[['B_age', 'R_age']] = ufc_final[['date_year', 'R_year', 'B_year']].apply(get_age, axis=1)

In [27]:
ufc_final.drop(['R_DOB', 'B_DOB', 'B_year', 'R_year'],axis=1, inplace=True)

In [28]:
ufc_final['R_age'] = ufc_final['R_age'].fillna(ufc_final['R_age'].median())
ufc_final['B_age'] = ufc_final['B_age'].fillna(ufc_final['B_age'].median())

In [29]:
ufc_final['R_Height'] = ufc_final['R_Height'].fillna(ufc_final['R_Height'].mean())
ufc_final['B_Height'] = ufc_final['B_Height'].fillna(ufc_final['B_Height'].mean())

In [30]:
from statistics import mode
ufc_final['B_Stance'] = ufc_final['B_Stance'].fillna(ufc_final['B_Stance'].mode()[0])
ufc_final['R_Stance'] = ufc_final['R_Stance'].fillna(ufc_final['R_Stance'].mode()[0])

In [31]:
enc = LabelEncoder()

In [32]:
data_enc1 = ufc_final['Weight_class']
data_enc1 = enc.fit_transform(data_enc1)

data_enc2 = ufc_final['R_Stance']
data_enc2 = enc.fit_transform(data_enc2)

data_enc3 = ufc_final['B_Stance']
data_enc3 = enc.fit_transform(data_enc3)

data_enc2 = pd.DataFrame(data_enc2, columns=['R_Stance'])
data_enc3 = pd.DataFrame(data_enc3, columns=['B_Stance'])

ufc_final[['R_Stance']] = data_enc2[['R_Stance']]
ufc_final[['B_Stance']] = data_enc3[['B_Stance']]

R_S = pd.get_dummies(ufc_final['R_Stance'])
B_S = pd.get_dummies(ufc_final['B_Stance'])

ufc_final = pd.concat([ufc_final,pd.get_dummies(ufc_final['win_by'], prefix='win_by')], axis=1)
ufc_final.drop(['win_by'], axis=1, inplace=True)

ufc_final['Winner_num'] = ufc_final.Winner.map({'Red':0, 'Blue':1, 'Draw':2})

In [33]:
B_S.drop([0,2], axis=1, inplace=True)
R_S.drop([0,2], axis=1, inplace=True)

In [34]:
B_S.columns = ['B_Southpaw','B_Orthodox','B_Switch']
R_S.columns = ['R_Southpaw','R_Orthodox','R_Switch']

In [35]:
ufc_final['B_Southpaw'] = B_S['B_Southpaw']
ufc_final['B_Orthodox']= B_S['B_Orthodox']
ufc_final['B_Switch'] = B_S['B_Switch']
ufc_final['R_Southpaw'] = R_S['R_Southpaw']
ufc_final['R_Orthodox']= R_S['R_Orthodox']
ufc_final['R_Switch'] = R_S['R_Switch']

ufc_final.drop(['R_Stance','B_Stance'], axis=1, inplace=True)

In [36]:
ufc_final = ufc_final[ufc_final['Winner_num']!=2]

In [37]:
ufc_final.drop(['date', 'location', 'Referee', 'last_round', 'last_round_time', 'Format', 'Winner','date_year','Fight_type','Weight_class'], axis=1, inplace=True)

In [38]:
df = ufc_final.copy()

In [39]:
df['R_SUB._ATT'] = df['R_SUB_ATT']
df['R_SIG_STR._pct'] = df['R_SIG_STR_pct']
df['B_SUB._ATT'] = df['B_SUB_ATT']
df['B_SIG_STR._pct'] = df['B_SIG_STR_pct']
df.drop(['B_SUB_ATT','B_SIG_STR_pct', 'R_SUB_ATT', 'R_SIG_STR_pct'], axis=1, inplace=True)

In [40]:
ufc_final.drop(['R_SIG_STR._att',
                'B_SIG_STR._att',
                'R_SIG_STR._landed',
                'B_SIG_STR._landed',
                'win_by_Could Not Continue',
                'win_by_DQ',
                'win_by_Decision - Majority',
                'win_by_Decision - Split',
                'win_by_Decision - Unanimous',
                'win_by_KO/TKO',
                'win_by_Other',
                'win_by_Overturned',
                'win_by_Submission',
                "win_by_TKO - Doctor's Stoppage"
               ], axis=1, inplace=True)

In [41]:
ufc_final['R_TOTAL_STR._pct'] = ufc_final['R_TOTAL_STR._landed']/ufc_final['R_TOTAL_STR._att']
ufc_final['R_BODY_pct'] = ufc_final['R_BODY_landed']/ufc_final['R_BODY_att']
ufc_final['R_CLINCH_pct'] = ufc_final['R_CLINCH_landed']/ufc_final['R_CLINCH_att']
ufc_final['R_DISTANCE_pct'] = ufc_final['R_DISTANCE_landed']/ufc_final['R_DISTANCE_att']
ufc_final['R_GROUND_pct'] = ufc_final['R_GROUND_landed']/ufc_final['R_GROUND_att']
ufc_final['R_HEAD_pct'] = ufc_final['R_HEAD_landed']/ufc_final['R_HEAD_att']
ufc_final['R_LEG_pct'] = ufc_final['R_LEG_landed']/ufc_final['R_LEG_att']
ufc_final['R_TD_pct'] = ufc_final['R_TD_landed']/ufc_final['R_TD_att']
ufc_final['R_SUB._pct'] = ufc_final['R_SUB_ATT']
ufc_final['R_SIG_STR._pct'] = ufc_final['R_SIG_STR_pct']

In [42]:
ufc_final['B_TOTAL_STR._pct'] = ufc_final['B_TOTAL_STR._landed']/ufc_final['B_TOTAL_STR._att']
ufc_final['B_BODY_pct'] = ufc_final['B_BODY_landed']/ufc_final['B_BODY_att']
ufc_final['B_CLINCH_pct'] = ufc_final['B_CLINCH_landed']/ufc_final['B_CLINCH_att']
ufc_final['B_DISTANCE_pct'] = ufc_final['B_DISTANCE_landed']/ufc_final['B_DISTANCE_att']
ufc_final['B_GROUND_pct'] = ufc_final['B_GROUND_landed']/ufc_final['B_GROUND_att']
ufc_final['B_HEAD_pct'] = ufc_final['B_HEAD_landed']/ufc_final['B_HEAD_att']
ufc_final['B_LEG_pct'] = ufc_final['B_LEG_landed']/ufc_final['B_LEG_att']
ufc_final['B_TD_pct'] = ufc_final['B_TD_landed']/ufc_final['B_TD_att']
ufc_final['B_SUB._pct'] = ufc_final['B_SUB_ATT']
ufc_final['B_SIG_STR._pct'] = ufc_final['B_SIG_STR_pct']

In [43]:
ufc_final.drop(['R_TOTAL_STR._landed',
                'R_TOTAL_STR._att',
                'B_TOTAL_STR._landed',
                'B_TOTAL_STR._att',
                'R_BODY_att',
                'R_BODY_landed',
                'R_CLINCH_att',
                'R_CLINCH_landed',
                'R_DISTANCE_att',
                'R_DISTANCE_landed',
                'R_GROUND_att',
                'R_GROUND_landed',
                'R_HEAD_att',
                'R_HEAD_landed',
                'R_LEG_att',
                'R_LEG_landed',
                'R_BODY_att',
                'R_BODY_landed',
                'R_TD_att',
                'R_TD_landed',
                'B_BODY_att',
                'B_BODY_landed',
                'B_CLINCH_att',
                'B_CLINCH_landed',
                'B_DISTANCE_att',
                'B_DISTANCE_landed',
                'B_GROUND_att',
                'B_GROUND_landed',
                'B_HEAD_att',
                'B_HEAD_landed',
                'B_LEG_att',
                'B_LEG_landed',
                'B_BODY_att',
                'B_BODY_landed',
                'B_TD_att',
                'B_TD_landed',
                'B_SUB_ATT',
                'R_SUB_ATT',
                'R_SIG_STR_pct',
                'B_SIG_STR_pct'], axis=1, inplace=True)

In [44]:
def get_pct(att, landed):
    pct = (landed / att) * 100
    return pct

In [45]:
def average_pct(r_pct, b_pct):
    average_pct = (r_pct.median() + b_pct.median())/2
    return average_pct

# Aumentar quantidade de dados

In [46]:
# def data_enhancement(df):
    
#     columns = df.columns
#     df.columns = df.columns.str.replace('R_','B_')
#     df['Winner_num'] = df['Winner_num'].replace(1,'blue')
#     df['Winner_num'] = df['Winner_num'].replace(0,'red')
#     df['Winner_num'] = df['Winner_num'].replace('red',0)
#     df['Winner_num'] = df['Winner_num'].replace('blue',1)
#     df.columns = columns
    
#     return df

# ufc_final2 = data_enhancement(ufc_final)
# ufc_final = pd.concat([ufc_final, ufc_final2],ignore_index = True)

In [47]:
columns  = ['KD', 'TD_pct', 'PASS', 'REV', 'Height', 'Weight',
            'Reach', 'age', 'Southpaw', 'Orthodox', 'Switch',
            'TOTAL_STR._pct', 'BODY_pct', 'CLINCH_pct', 'DISTANCE_pct',
            'GROUND_pct', 'HEAD_pct', 'LEG_pct', 'SUB._pct',
            'SIG_STR._pct', 'Wins']
R_columns = ufc_final.filter(regex=('R_')).columns[1:]
B_columns = ufc_final.filter(regex=('B_')).columns[1:]

# Funções para pegar dados dos lutadores

In [48]:
def get_fighter_df(fighter_R, fighter_B):
    
    f_r = df_fighters.loc[fighter_R]
    f_b = df_fighters.loc[fighter_B]

    R_columns = ufc_final.filter(regex=('R_')).columns[1:]
    B_columns = ufc_final.filter(regex=('B_')).columns[1:]
    
    f_r.index = R_columns
    f_b.index = B_columns
    
    fighter_r = f_r.to_dict()
    fighter_b = f_b.to_dict()

    fighter_r.update(fighter_b)
    
    fighter_test = pd.DataFrame(fighter_r, columns=fighter_r.keys() , index=[0])
    
    
    return fighter_test

In [49]:
def get_fighter_hist(name):
    
    r_f = ufc_final[ufc_final['R_fighter']==name][R_columns]
    b_f = ufc_final[ufc_final['B_fighter']==name][B_columns]

    
    r_f = r_f.rename(columns={'R_fighter':'Fighter'})
    b_f = b_f.rename(columns={'B_fighter':'Fighter'})
    
    r_f.columns = r_f.columns.str.replace("R_",'')
    b_f.columns = b_f.columns.str.replace("B_",'')
    
    fighter_history = pd.concat([r_f, b_f],axis=0)    

    return fighter_history.sort_values('age')

In [50]:
d = {}
for fighter in fighters:
    d["{0}".format(fighter)] = get_fighter_hist(fighter)

In [51]:
for fighter in fighters:
    d[fighter].mean()
    d[fighter] = d[fighter].apply(lambda x: x.fillna(x.mean(), axis=0))

In [52]:
def get_fighter_info(name):
    a = d[name].mean(axis=0)
    b = d[name].mean(axis=0)
    c = (a.values + b.values)/2
    k = {key: value for key, value in zip(columns, c.round(2))}
    return k

In [53]:
lista = []
for i in fighters:
    lista.append(get_fighter_info(i))


df_fighters = pd.DataFrame(data=lista, index=fighters)

In [54]:
empty_fighters = []
for fighter in fighters:
    if d[fighter].empty:
        empty_fighters.append(fighter)

df_fighters.dropna(axis=0, how='all', inplace=True)

In [55]:
df_fighters.to_excel('fighter_info.xlsx')

# Mudar ufc_final

In [56]:
cols = ['R_KD', 'R_TD_pct', 'R_PASS', 'R_REV', 'R_Height', 'R_Weight',
       'R_Reach', 'R_age', 'R_Southpaw', 'R_Orthodox', 'R_Switch',
       'R_TOTAL_STR._pct', 'R_BODY_pct', 'R_CLINCH_pct', 'R_DISTANCE_pct',
       'R_GROUND_pct', 'R_HEAD_pct', 'R_LEG_pct', 'R_SUB._pct',
       'R_SIG_STR._pct', 'B_KD', 'B_TD_pct', 'B_PASS', 'B_REV', 'B_Height',
       'B_Weight', 'B_Reach', 'B_age', 'B_Southpaw', 'B_Orthodox', 'B_Switch',
       'B_TOTAL_STR._pct', 'B_BODY_pct', 'B_CLINCH_pct', 'B_DISTANCE_pct',
       'B_GROUND_pct', 'B_HEAD_pct', 'B_LEG_pct', 'B_SUB._pct',
       'B_SIG_STR._pct']

In [57]:
def generate_samples(df, num_samples):
    new_rows = []

    for _ in range(num_samples):
        new_row = {}
        for column in df.columns:
            new_row[column] = np.random.choice(df[column])
        new_rows.append(new_row)

    return pd.DataFrame(new_rows)

In [58]:
def get_fights_df(fighter_R, fighter_B):
    
    f_r = df_fighters.loc[fighter_R]
    f_b = df_fighters.loc[fighter_B]

    R_columns = ufc_final.filter(regex=('R_')).columns[1:]
    B_columns = ufc_final.filter(regex=('B_')).columns[1:]
    
    f_r.index = R_columns
    f_b.index = B_columns
    
    fighter_r = f_r.to_dict()
    fighter_b = f_b.to_dict()

    fighter_r.update(fighter_b)
        
    
    return fighter_r

In [59]:
fights = ufc_final[['R_fighter','B_fighter']]
fights = list(fights.values)
f = [get_fights_df(fight[0],fight[1]) for fight in fights]

In [60]:
ufc_means = pd.DataFrame(f)

In [61]:
ufc_means[['R_fighter','B_fighter']] = ufc_final[['R_fighter','B_fighter']]
ufc_means['Winner_num'] = ufc_final['Winner_num']

In [62]:
ufc_means['R_Reach'].fillna(ufc_means['R_Height'],inplace=True)
ufc_means['B_Reach'].fillna(ufc_means['B_Height'],inplace=True)

In [63]:
# cols = ufc_means.columns
# ufc_means[cols].fillna(ufc_means[cols].median(),inplace=True)

In [64]:
def replace_nan(df, series):
    df[series].fillna(df[series].median(),inplace = True)
    return df

In [65]:
cols = ['R_KD', 'R_TD_pct', 'R_PASS', 'R_REV', 'R_Height', 'R_Weight',
       'R_Reach', 'R_age', 'R_Southpaw', 'R_Orthodox', 'R_Switch',
       'R_TOTAL_STR._pct', 'R_BODY_pct', 'R_CLINCH_pct', 'R_DISTANCE_pct',
       'R_GROUND_pct', 'R_HEAD_pct', 'R_LEG_pct', 'R_SUB._pct',
       'R_SIG_STR._pct', 'B_KD', 'B_TD_pct', 'B_PASS', 'B_REV', 'B_Height',
       'B_Weight', 'B_Reach', 'B_age', 'B_Southpaw', 'B_Orthodox', 'B_Switch',
       'B_TOTAL_STR._pct', 'B_BODY_pct', 'B_CLINCH_pct', 'B_DISTANCE_pct',
       'B_GROUND_pct', 'B_HEAD_pct', 'B_LEG_pct', 'B_SUB._pct',
       'B_SIG_STR._pct']

In [66]:
for col in cols:
    replace_nan(ufc_means, col)

In [67]:
ufc_final['R_Reach'].fillna(ufc_final['R_Height'],inplace=True)
ufc_final['B_Reach'].fillna(ufc_final['B_Height'],inplace=True)

In [68]:
for col in cols:
    replace_nan(ufc_final, col)

# Models

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from scipy import stats


import warnings
warnings.filterwarnings("ignore")

# Preparar dados

In [70]:
ufc_final = ufc_final.dropna(axis=0)
X = ufc_final.drop('Winner_num', axis=1)
y = ufc_final['Winner_num']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)
X_test, X_mc_test, y_test, y_mc_test = train_test_split(X_temp, y_temp, test_size=0.5)

# Logistic regression

In [71]:
from sklearn import linear_model
log = linear_model.LogisticRegression()
log.fit(X_train.iloc[:,2:],y_train)

In [72]:
pred = log.predict(X_test.iloc[:,2:])

In [73]:
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average = 'macro')
recall = recall_score(y_test, pred, average = 'macro')

print('accuracy= %.2f%%, precision= %.2f%%, recall= %.2f%%, mse= ' % (accuracy*100, precision*100, recall*100), mean_squared_error(y_test, pred).round(2))

accuracy= 82.82%, precision= 80.93%, recall= 80.18%, mse=  0.17


In [74]:
roc_auc_score(y_test, pred).round(2)

0.8

In [75]:
logistic_model = log.fit(X.iloc[:,2:], y)

In [76]:
# save the model to disk
filename = 'logistic_raw_model.sav'
pickle.dump(logistic_model, open(filename, 'wb'))

# XGBClassifier

In [77]:
#from sklearn.model_selection import cross_val_predict
#from sklearn.metrics import make_scorer

#randmodel=XGBClassifier()

#randparams={'subsample':[0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
#            'reg_alpha':[10e-5, 10e-3, 0.01, 0.1, 1, 10, 100],
#            'min_child_weight':[2,3,4,5,6,7,8,9,10,15,20,30],
#            'max_depth':[4,5,6,7,8,9,10,15,20,25,30,35,40,45,50,55,100],
#            'gamma':[0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,2],
#            'eta':[0.001,0.003,0.005,0.007,0.01,0.03,0.05,0.07,0.1,0.3,0.5,0.7,0.9], 
#            'colsample_bytree':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}

#clf_model = RandomizedSearchCV(randmodel, param_distributions=randparams, n_iter=500, cv=7, scoring='roc_auc', n_jobs=5, verbose=3)

#search = clf_model.fit(X_train, y_train)

In [78]:
#print(search.best_params_, '\n', '\n', search.best_score_)

In [79]:
model_xgb = XGBClassifier(objective='binary:logistic',
                      subsample='0.9',
                      reg_alpha='0.1',
                      min_child_weight='2',
                      max_depth='25',
                      gamma='0.3',
                      eta='0.07',
                      colsample_bytree='0.6')

model_xgb.fit(X_train.iloc[:,2:], y_train)
pred = model_xgb.predict(X_test.iloc[:,2:])

accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average = 'macro')
recall = recall_score(y_test, pred, average = 'macro')

print('accuracy= %.2f%%, precision= %.2f%%, recall= %.2f%%, mse= ' % (accuracy*100, precision*100, recall*100), mean_squared_error(y_test, pred).round(2))

accuracy= 85.70%, precision= 84.15%, recall= 83.57%, mse=  0.14


In [80]:
roc_auc_score(y_test, pred).round(2)

0.84

In [81]:
# save the model to disk
filename = 'xgb_raw_model.sav'
pickle.dump(model_xgb, open(filename, 'wb'))

In [82]:
# roc_auc_score(y_test, yhat).round(2)

# Naive-Bayes Classifier

In [83]:
import sklearn.naive_bayes

model_bayes = sklearn.naive_bayes.GaussianNB()

model_bayes.fit(X_train.iloc[:,2:], y_train)
pred = model_bayes.predict(X_test.iloc[:,2:])

accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average = 'macro')
recall = recall_score(y_test, pred, average = 'macro')

print('accuracy= %.2f%%, precision= %.2f%%, recall= %.2f%%, mse= ' % (accuracy*100, precision*100, recall*100), mean_squared_error(y_test, pred).round(2))

accuracy= 80.04%, precision= 77.90%, recall= 80.06%, mse=  0.2


In [84]:
roc_auc_score(y_test, pred).round(2)

0.8

In [85]:
# save the model to disk
filename = 'nb_raw_model.sav'
pickle.dump(model_bayes, open(filename, 'wb'))

# SVM-SVC

In [86]:
from sklearn import svm
svm_model = svm.SVC(kernel='linear', gamma='auto', decision_function_shape='ovr', probability=True)

svm_model.fit(X_train.iloc[:,2:], y_train)
pred = svm_model.predict(X_test.iloc[:,2:])

accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average = 'macro')
recall = recall_score(y_test, pred, average = 'macro')

print('accuracy= %.2f%%, precision= %.2f%%, recall= %.2f%%, mse= ' % (accuracy*100, precision*100, recall*100), mean_squared_error(y_test, pred).round(2))

accuracy= 83.10%, precision= 81.07%, recall= 81.07%, mse=  0.17


In [87]:
roc_auc_score(y_test, pred).round(2)

0.81

In [88]:
# save the model to disk
filename = 'svc_raw_model.sav'
pickle.dump(svm_model, open(filename, 'wb'))

In [89]:
from sklearn.ensemble import VotingClassifier

In [90]:
voting_clf = VotingClassifier(
estimators=[
('xgb', model_xgb),
('svc', svm_model),
('nb', model_bayes),   
('lr', logistic_model)])
voting_clf.voting = "soft"
voting_clf.fit(X_train.iloc[:,2:], y_train)
pred = voting_clf.predict(X_test.iloc[:,2:])

In [91]:
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test.iloc[:,2:], y_test))

xgb = 0.8570102135561746
svc = 0.8310120705663882
nb = 0.8003714020427113
lr = 0.8282265552460538


In [92]:
voting_clf.score(X_test.iloc[:,2:], y_test)

0.8347260909935005

In [93]:
filename = 'ensemble_model_combined.sav'
pickle.dump(voting_clf, open(filename, 'wb'))

# Predictions

In [94]:
model_cols = X_train.iloc[:,2:].columns
cols_ = X_train.columns

In [95]:
combined_list = [[item1, item2] for item1, item2 in zip(X_mc_test.iloc[:,:2]['R_fighter'],
                                                        X_mc_test.iloc[:,:2]['B_fighter'])]

In [96]:
most_common_log_prediction = []
most_common_xgb_prediction = []
most_common_nb_prediction = []
most_common_svc_prediction = []
most_common_voting_clf_pred = []

for i in combined_list:
    a = generate_samples(get_fighter_hist(i[0]),1000)
    R_a = a.add_prefix('R_')
    R_a['R_fighter'] = i[0]
    b = generate_samples(get_fighter_hist(i[1]),1000)
    B_b = b.add_prefix('B_')
    B_b['B_fighter'] = i[1]
    samples = pd.concat([R_a, B_b], axis=1)
    samples = samples[model_cols]
    predictions_log = logistic_model.predict(samples)
    predictions_xgb = model_xgb.predict(samples)
    predictions_nb = model_bayes.predict(samples)
    predictions_svm = svm_model.predict(samples)
    voting_clf_pred = voting_clf.predict(samples)
    most_common_log_prediction.append(stats.mode(predictions_log)[0])
    most_common_xgb_prediction.append(stats.mode(predictions_xgb)[0])
    most_common_nb_prediction.append(stats.mode(predictions_nb)[0])
    most_common_svc_prediction.append(stats.mode(predictions_svm)[0])
    most_common_voting_clf_pred.append(stats.mode(voting_clf_pred)[0])

In [97]:
accuracy_log = accuracy_score(y_mc_test, most_common_log_prediction)
accuracy_xgb = accuracy_score(y_mc_test, most_common_xgb_prediction)
accuracy_nb = accuracy_score(y_mc_test, most_common_nb_prediction)
accuracy_svc = accuracy_score(y_mc_test, most_common_svc_prediction)
accuracy_voting = accuracy_score(y_mc_test, most_common_voting_clf_pred)

In [98]:
print(accuracy_log, accuracy_xgb, accuracy_nb, accuracy_svc, accuracy_voting)

0.6898792943361188 0.7084493964716806 0.6768802228412256 0.6945218198700093 0.6926648096564532


In [99]:
f=open('future_fights','r')
lutas = f.read().replace("\"\"", "").split('\n')[:-1]
f.close()

lutas = [x.replace('[','') for x in lutas]
lutas = [x.replace(']','') for x in lutas]
lutas = [x.replace("'",'') for x in lutas]
lutas = [x.split(',') for x in lutas]

In [100]:
fighters = []
most_common_log_prediction = []
most_common_xgb_prediction = []
most_common_nb_prediction = []
most_common_svc_prediction = []
most_common_voting_clf_pred = []

for i in lutas:
    try:
        a = generate_samples(get_fighter_hist(i[0]),500)
        R_a = a.add_prefix('R_')
        R_a['R_fighter'] = i[0]
        b = generate_samples(get_fighter_hist(i[1]),500)
        B_b = b.add_prefix('B_')
        B_b['B_fighter'] = i[1]
        samples = pd.concat([R_a, B_b], axis=1)
        samples = samples[model_cols]

        fighters.append([i[0],i[1]])

        predictions_log = logistic_model.predict(samples)
        predictions_xgb = model_xgb.predict(samples)
        predictions_nb = model_bayes.predict(samples)
        predictions_svm = svm_model.predict(samples)
        voting_clf_pred = voting_clf.predict(samples)
        most_common_log_prediction.append(stats.mode(predictions_log)[0])
        most_common_xgb_prediction.append(stats.mode(predictions_xgb)[0])
        most_common_nb_prediction.append(stats.mode(predictions_nb)[0])
        most_common_svc_prediction.append(stats.mode(predictions_svm)[0])
        most_common_voting_clf_pred.append(stats.mode(voting_clf_pred)[0])
    
    except:
        fighters.append('nan')
        most_common_log_prediction.append('nan')
        most_common_xgb_prediction.append('nan')
        most_common_nb_prediction.append('nan')
        most_common_svc_prediction.append('nan')
        most_common_voting_clf_pred.append('nan')

In [101]:
for i in range(len(lutas)):
    print(fighters[i],
          most_common_voting_clf_pred[i])

['Beneil Dariush', 'Arman Tsarukyan'] 0.0
['Jalin Turner', 'Bobby Green'] 0.0
['Rob Font', 'Deiveson Figueiredo'] 1.0
['Sean Brady', 'Kelvin Gastelum'] 0.0
['Clay Guida', 'Joaquim Silva'] 0.0
['Punahele Soriano', 'Dustin Stoltzfus'] 0.0
['Miesha Tate', 'Julia Avila'] 1.0
nan nan
['Drakkar Klose', 'Joe Solecki'] 1.0
nan nan
['Wellington Turman', 'Jared Gooden'] 0.0
['Veronica Hardy', 'Jamey-Lyn Horth'] 0.0
['Song Yadong', 'Chris Gutierrez'] 0.0
['Anthony Smith', 'Khalil Rountree Jr.'] 1.0
['Sumudaerji', 'Allan Nascimento'] 0.0
['Nasrat Haqparast', 'Jamie Mullarkey'] 1.0
['JunYong Park', 'Andre Muniz'] 0.0
['Song Kenan', 'Kevin Jousset'] 0.0
['Tatsuro Taira', 'Carlos Hernandez'] 0.0
['HyunSung Park', 'Shannon Ross'] 0.0
['Luana Santos', 'Stephanie Egger'] 1.0
nan nan
['Leon Edwards', 'Colby Covington'] 1.0
['Alexandre Pantoja', 'Brandon Royval'] 0.0
['Shavkat Rakhmonov', 'Stephen Thompson'] 0.0
['Tony Ferguson', 'Paddy Pimblett'] 1.0
['Vicente Luque', 'Ian Garry'] 1.0
['Josh Emmett', 'Gi