# Import Data and Split Rows for Win/Loss Analysis

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

fight_data_file = '../combined_data/combined_fight_data.csv'

def import_and_merge():
    fight_data = pd.read_csv(fight_data_file)
    
    #Add Blue and Red win columns
    fight_data['B_Winner'] = [1 if x=='Blue' else 0 for x in fight_data['Winner']]
    fight_data['R_Winner'] = [1 if x=='Red' else 0 for x in fight_data['Winner']]
    
    #Drop columns irrelevant to this prediction
    fight_data = fight_data.drop(columns=['Referee', 'no_of_rounds', 'Winner', 'date', 'end_method', 'end_how', 
                                          'end_round', 'attendance'])
    print('Fight Data Stats: ')
    print('Shape: ', fight_data.shape)
    #display(fight_data)
    
    #Separate fight data into individual fighter stats
    blue_fighters = fight_data.loc[:, [col for col in fight_data.columns if re.search('^R_', col)==None]]
    blue_fighters = blue_fighters.rename(columns=lambda x: re.sub('^B_', '', x))
    print('\nBlue fighter Stats: ')
    print('Shape: ', blue_fighters.shape)
    #display(blue_fighters)
    
    red_fighters = fight_data.loc[:, [col for col in fight_data.columns if re.search('^B_', col)==None]]
    red_fighters = red_fighters.rename(columns=lambda x: re.sub('^R_', '', x))
    print('\nRed fighter Stats: ')
    print('Shape: ', red_fighters.shape)
    #display(red_fighters)
    
    #Concatenate blue and red fighter stats
    fighters_data = pd.concat([blue_fighters, red_fighters], ignore_index=True)
    fighters_data = fighters_data.rename(columns={'total_time_fought(seconds)':'total_time_fought_seconds'})
    fighters_data = fighters_data.drop(columns='fighter')
    print('\nTotal fighter Stats: ')
    print('Shape: ', fighters_data.shape)
    
    # Create df without offence stats
    fighters_no_offence_stats = fighters_data.drop(columns=[col for col in fighters_data.columns if re.search('^avg_', col) != None])
    fighters_no_offence_stats = fighters_no_offence_stats.drop(columns='total_time_fought_seconds')
    print('\nFighters no offensive stats: ')
    print('Shape: ', fighters_no_offence_stats.shape)
    
    return (fighters_no_offence_stats, fighters_data)
    
data_no_offence_stats, data  = import_and_merge()

Fight Data Stats: 
Shape:  (5062, 147)

Blue fighter Stats: 
Shape:  (5062, 76)

Red fighter Stats: 
Shape:  (5062, 76)

Total fighter Stats: 
Shape:  (10124, 75)

Fighters no offensive stats: 
Shape:  (10124, 26)


# Clean and Structure Data

In [2]:
def clean_data(fighter_df):    
   #Fill missing values for height, reach, weight, age
    weight_class_means = {}
    weight_classes = np.unique(fighter_df['weight_class'])
    for weight_class in weight_classes:
        weight_class_idx = fighter_df['weight_class'] == weight_class
        
        null_idx = np.logical_and(pd.isnull(fighter_df['Height_cms']), weight_class_idx)
        fighter_df.loc[null_idx, 'Height_cms'] = np.nanmean(fighter_df.loc[weight_class_idx, 'Height_cms'])
        
        null_idx = np.logical_and(pd.isnull(fighter_df['Reach_cms']), weight_class_idx)
        fighter_df.loc[null_idx, 'Reach_cms'] = np.nanmean(fighter_df.loc[weight_class_idx, 'Reach_cms'])
        
        null_idx = np.logical_and(pd.isnull(fighter_df['Weight_lbs']), weight_class_idx)
        fighter_df.loc[null_idx, 'Weight_lbs'] = np.nanmean(fighter_df.loc[weight_class_idx, 'Weight_lbs'])
        
        null_idx = np.logical_and(pd.isnull(fighter_df['age']), weight_class_idx)
        fighter_df.loc[null_idx, 'age'] = np.nanmean(fighter_df.loc[weight_class_idx, 'age'])
    
    # Fill out missing stance
    fighter_df.loc[pd.isnull(fighter_df['Stance']), 'Stance'] = 'Orthodox'
    
    # Ger rid of fights without location
    fighter_df = fighter_df.loc[~pd.isnull(fighter_df['city']), :]
    
    # Fill missing elevations with 0
    fighter_df.loc[pd.isnull(fighter_df['location_elevation']), 'location_elevation'] = 0
    fighter_df.loc[pd.isnull(fighter_df['home_elevation']), 'home_elevation'] = 0
    
    # Replacet title bout with actual numbers
    fighter_df.loc[fighter_df['title_bout'] == 'True', 'title_bout'] = 1
    fighter_df.loc[fighter_df['title_bout'] == 'False', 'title_bout'] = 0
    
    #Drop rows with too many missing values
    if 'avg_BODY_att' in fighter_df.columns:
        fighter_df = fighter_df.loc[~pd.isnull(fighter_df['avg_BODY_att']), :]    
        
    return fighter_df

def structure_data(fighter_df):   
    # Split Locations  and hometowns into city and country
    if 'location' in fighter_df.columns:
        fighter_df['city'] = [str.lower(location.split(', ')[0]) for location in fighter_df['location']]
        fighter_df['country'] = [str.lower(location.split(', ')[-1]) for location in fighter_df['location']]
        fighter_df = fighter_df.drop(columns='location')
    
    if 'hometown' in fighter_df.columns:
        #First get rid of data with nan hometowns
        fighter_df['hometown_city'] = [str.lower(location.split(', ')[0]) for location in fighter_df['hometown']]
        fighter_df['hometown_country'] = [str.lower(location.split(', ')[-1]) for location in fighter_df['hometown']]
        fighter_df.drop(columns='hometown')
        
    return fighter_df
    

# Model Selection and Comparison

In [15]:
#Parameters for data with offence stats
def compare_models(knn_params, lr_params, svc_params, nbayes_params, rforest_params, features, labels): # Receives already scales
    train_accuracy_df = pd.DataFrame(columns=['Logistic Regression'])
    accuracy_df = pd.DataFrame(columns=['Logistic Regression'])
    precision_df = pd.DataFrame(columns=['Logistic Regression'])
    recall_df = pd.DataFrame(columns=['Logistic Regression'])
    f1_df = pd.DataFrame(columns=['Logistic Regression'])
    
    idx = 0;
    kf = KFold(n_splits=7)
    
    for train_index, test_index in kf.split(features):
        X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=0)
        X_train_scaled, X_test_scaled = features.iloc[train_index, :], features.iloc[test_index, :]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]
        
        #Models   
        lr = LogisticRegression(**lr_params).fit(X_train_scaled, y_train)  
        
        #Update tables
        train_accuracy_df.loc[idx] = lr.score(X_train_scaled, y_train)       
        accuracy_df.loc[idx] = lr.score(X_test_scaled, y_test)
        precision_df.loc[idx] = precision_score(y_test, lr.predict(X_test_scaled))
        recall_df.loc[idx] = recall_score(y_test, lr.predict(X_test_scaled))
        
        f1_df.loc[idx] = f1_score(y_test, lr.predict(X_test_scaled))
        idx += 1
        
    #Display results
    display('------Train accuracy score:-------', train_accuracy_df.median())
    display('------Median accuracy score:-------', accuracy_df.median())
    display('------Median precision score:------', precision_df.median())
    display('------Median recall score:---------', recall_df.median())
    display('------Median F1 score:-------------', f1_df.median())
    
def evaluate_models_best_params(clean_data):
    clean_data['Winner'] = data['Winner'].copy()
    knn_params = {'algorithm': 'auto', 'n_neighbors': 15}
    lr_params = {'C': 0.001, 'solver': 'liblinear'}
    svc_params = {'C': 0.05204081639183673}
    nbayes_params = {'alpha': 14.571428571459185}
    rforest_params = {'max_depth': 10.46938775510204, 'max_features': 'auto', 'min_samples_split': 18}
    dummy_df = pd.get_dummies(clean_data)

    labels = dummy_df['Winner']
    features = dummy_df.drop(columns=['Winner'])
    compare_models(knn_params, lr_params, svc_params, nbayes_params, rforest_params, features, labels)

# Evaluate

In [17]:
data = clean_data(data)
data = structure_data(data)

print('\nPERFORMANCE')
evaluate_models_best_params(data.copy())


PERFORMANCE


'------Train accuracy score:-------'

Logistic Regression    0.586551
dtype: float64

'------Median accuracy score:-------'

Logistic Regression    0.553463
dtype: float64

'------Median precision score:------'

Logistic Regression    0.634878
dtype: float64

'------Median recall score:---------'

Logistic Regression    0.547658
dtype: float64

'------Median F1 score:-------------'

Logistic Regression    0.552897
dtype: float64