## Import Fight Data and Merge with Hometown Data

In [38]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Locations of the input CSV files, relative to the notebook's working directory.
# Raw fight data, read by import_and_merge().
fight_data_file = "../ufcdata/data.csv"
# Pre-processed fight data and scorecard data (not read by the cells shown here).
fight_data_file_prep = "../ufcdata/preprocessed_data.csv"
scorecard_data_file = "../scorecard/scorecard.csv"
# Fighter hometowns with elevation data, read by import_and_merge().
hometown_data_file = "../elevations/fighters_elevation.csv"

def import_and_merge(fight_data_path=None, hometown_data_path=None):
    """Load the fight and hometown CSVs and reshape them into per-fighter frames.

    Every fight row is split into one row for the blue-corner fighter and one
    row for the red-corner fighter. The ``B_`` / ``R_`` column prefixes are
    stripped so both corners share one schema, and ``Winner`` is 1 when the
    fighter on that row won the fight and 0 otherwise.

    Parameters
    ----------
    fight_data_path : str or path-like, optional
        CSV of fights. Defaults to the module-level ``fight_data_file``.
    hometown_data_path : str or path-like, optional
        CSV of fighter hometowns (needs ``Fighter Name`` and ``Location``
        columns). Defaults to the module-level ``hometown_data_file``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(fighters_no_offence_stats, fighters_data, merged_df)``:
        the per-fighter rows without the ``avg_*`` and fight-time columns,
        the full per-fighter rows, and the full rows left-joined with the
        hometown data (``Location`` renamed to ``hometown``).
    """
    if fight_data_path is None:
        fight_data_path = fight_data_file
    if hometown_data_path is None:
        hometown_data_path = hometown_data_file

    fight_data = pd.read_csv(fight_data_path)
    hometown_data = pd.read_csv(hometown_data_path)

    def _name_key(name):
        # Lower-cased, token-sorted name so "Jon Jones" and "Jones Jon" share a key
        return ' '.join(sorted(name.lower().split()))

    def _corner_frame(own_prefix, other_prefix):
        # Columns of one corner plus the shared fight columns, with the prefix stripped
        own_columns = [col for col in fight_data.columns if not col.startswith(other_prefix)]
        return fight_data.loc[:, own_columns].rename(
            columns=lambda col: re.sub('^' + own_prefix, '', col))

    #Add Blue and Red win columns
    fight_data['B_Winner'] = (fight_data['Winner'] == 'Blue').astype(int)
    fight_data['R_Winner'] = (fight_data['Winner'] == 'Red').astype(int)

    #Drop columns irrelevant to this prediction
    fight_data = fight_data.drop(columns=['Referee', 'no_of_rounds', 'Winner', 'date', 'R_Stance', 'B_Stance'])
    print('Fight Data Stats: ')
    print('Shape: ', fight_data.shape)

    #Separate fight data into individual fighter stats
    blue_fighters = _corner_frame('B_', 'R_')
    print('\nBlue fighter Stats: ')
    print('Shape: ', blue_fighters.shape)

    red_fighters = _corner_frame('R_', 'B_')
    print('\nRed fighter Stats: ')
    print('Shape: ', red_fighters.shape)

    #Concatenate blue and red fighter stats.
    # ignore_index gives every fighter row its own label instead of repeating
    # the fight row labels once per corner.
    fighters_data = pd.concat([blue_fighters, red_fighters], ignore_index=True)
    fighters_data = fighters_data.rename(columns={'total_time_fought(seconds)':'total_time_fought_seconds'})
    print('\nTotal fighter Stats: ')
    print('Shape: ', fighters_data.shape)

    # Create df without offence stats
    offence_columns = [col for col in fighters_data.columns if col.startswith('avg_')]
    fighters_no_offence_stats = fighters_data.drop(columns=offence_columns)
    fighters_no_offence_stats = fighters_no_offence_stats.drop(columns='total_time_fought_seconds')
    print('\nFighters no offensive stats: ')
    print('Shape: ', fighters_no_offence_stats.shape)

    #Merge with home town data
    # Rows without a fighter name cannot be matched and would break the key function
    hometown_data = hometown_data.dropna(subset=['Fighter Name'])
    hometown_data = hometown_data.assign(merge_key=hometown_data['Fighter Name'].map(_name_key))
    hometown_data = hometown_data.drop_duplicates('merge_key', keep='first')
    print('\nHometown Stats: ')
    print('Shape: ', hometown_data.shape)

    merged_df = fighters_data.copy()
    merged_df['merge_key'] = merged_df['fighter'].map(_name_key, na_action='ignore')
    merged_df = pd.merge(left=merged_df, right=hometown_data, how='left', left_on='merge_key', right_on='merge_key')
    merged_df = merged_df.drop(columns=['Fighter Name', 'merge_key'])
    merged_df = merged_df.rename(columns={'Location':'hometown'})
    print('\nFighter + offensive + hometown data stats: ')
    print('Shape: ', merged_df.shape)

    return (fighters_no_offence_stats, fighters_data, merged_df)
    
# Build the three per-fighter frames returned by import_and_merge():
# without offence stats, with offence stats, and with offence stats plus hometown.
(data_no_offence_stats, data, data_w_hometown) = import_and_merge()

Fight Data Stats: 
Shape:  (5144, 141)

Blue fighter Stats: 
Shape:  (5144, 72)

Red fighter Stats: 
Shape:  (5144, 72)

Total fighter Stats: 
Shape:  (10288, 72)

Fighters no offensive stats: 
Shape:  (10288, 23)

Hometown Stats: 
Shape:  (1331, 4)

Fighter + offensive + hometown data stats: 
Shape:  (10288, 74)



# Model Selection
- Separate location and home town into city and country
- Categorize weight_class, title_bout, Stance, and locations
- Convert elevation data into numeric data
- Tune parameters
- Visualize model performance

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score


def plot_missing_vals(df):
    """Print the frame's shape and draw a bar chart of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    # One vectorised pass over the frame instead of a Python-level sum per column
    nans_per_col = df.isnull().sum()

    print('Shape: ', df.shape)
    plt.figure(figsize=(30, 13))
    ax = plt.axes()
    ax.bar(list(nans_per_col.index), nans_per_col.tolist())
    plt.xticks(rotation='vertical')
    plt.ylabel('# NaNs')
    plt.title('Number of missing data per feature')
    
def inspect_data(fighter_df):
    """Draw box plots of the physical attributes and age of fighters that have stats.

    Rows with a missing ``avg_BODY_att`` (or, when that column is absent, a
    missing ``total_time_fought_seconds``) are excluded before plotting.
    The filtering only affects the local view: nothing is returned and the
    caller's frame is not modified.

    Parameters
    ----------
    fighter_df : pandas.DataFrame
        Per-fighter rows with ``Height_cms``, ``Reach_cms``, ``Weight_lbs``
        and ``age`` columns.
    """
    # Display missing value numbers in fighter data
    print('\nBefore dropping fighters with no offence/defence stats')
    #plot_missing_vals(fighter_df)

    #Remove rows with a lot of missing information
    if 'avg_BODY_att' in fighter_df.columns:
        fighter_df = fighter_df[fighter_df['avg_BODY_att'].notna()]
    elif 'total_time_fought_seconds' in fighter_df.columns:
        fighter_df = fighter_df[fighter_df['total_time_fought_seconds'].notna()]
    print('\nAfter dropping fighters with no offence/defence stats')
    #plot_missing_vals(fighter_df)

    #Fighter height, reach and weight distribution
    hrw_attrs_df = pd.DataFrame(fighter_df[['Height_cms', 'Reach_cms', 'Weight_lbs']])
    plt.figure(figsize=(30, 13))
    ax = plt.axes()
    hrw_attrs_df.boxplot(ax=ax)
    plt.title('Height, Reach and weight distributions')
    # Fixed window shared by the three attributes; values outside it are clipped from view
    plt.ylim([100, 250])

    #Fighter age distribution
    age_df = pd.DataFrame(fighter_df[['age']])
    plt.figure(figsize=(30, 13))
    ax = plt.axes()
    age_df.boxplot(ax=ax)
    plt.title('Age distributions')
       

def clean_data(fighter_df):
    """Impute physical attributes, encode ``title_bout`` and drop stat-less rows.

    * Missing ``Height_cms``, ``Reach_cms``, ``Weight_lbs`` and ``age`` values
      are replaced by the mean of the same column within the row's
      ``weight_class``.
    * ``title_bout`` values ``True``/``'True'`` become 1 and
      ``False``/``'False'`` become 0; any other value is left as it is.
    * When an ``avg_BODY_att`` column exists, rows where it is missing are dropped.

    Parameters
    ----------
    fighter_df : pandas.DataFrame
        Per-fighter rows. It is copied first, so the caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    # Work on a copy so repeated calls on the same frame give the same result
    fighter_df = fighter_df.copy()

    #Fill missing values for height, reach, weight, age
    fill_columns = ['Height_cms', 'Reach_cms', 'Weight_lbs', 'age']
    # dropna(): rows without a weight class have no class mean to borrow
    for weight_class in fighter_df['weight_class'].dropna().unique():
        # Positional boolean masks keep this independent of the row labels
        in_class = (fighter_df['weight_class'] == weight_class).to_numpy()
        for col in fill_columns:
            missing = in_class & fighter_df[col].isnull().to_numpy()
            if missing.any():
                # Series.mean() skips NaN, matching np.nanmean
                fighter_df.loc[missing, col] = fighter_df.loc[in_class, col].mean()

    # Replace title bout with actual numbers, whether it was read as bool or as text
    title_codes = {True: 1, False: 0, 'True': 1, 'False': 0}
    fighter_df['title_bout'] = fighter_df['title_bout'].map(lambda flag: title_codes.get(flag, flag))

    #Drop rows with too many missing values
    if 'avg_BODY_att' in fighter_df.columns:
        fighter_df = fighter_df[fighter_df['avg_BODY_att'].notna().to_numpy()]

    return fighter_df

def structure_data(fighter_df):
    """Split ``location`` and ``hometown`` strings into lower-cased city/country columns.

    A place string is split on ``', '``; the first part becomes the city and
    the last part the country (a single-part string fills both).

    * ``location`` -> ``city`` and ``country``; ``location`` is dropped.
    * ``hometown`` -> ``hometown_city`` and ``hometown_country``; rows with a
      missing hometown are removed first and ``hometown`` is dropped.

    Either column may be absent, in which case that step is skipped.

    Parameters
    ----------
    fighter_df : pandas.DataFrame
        Per-fighter rows. It is copied first, so the caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The restructured copy.
    """
    def _city_and_country(places):
        # Non-string entries (e.g. NaN) stay missing instead of raising on .split
        cities, countries = [], []
        for place in places:
            if isinstance(place, str):
                parts = place.lower().split(', ')
                cities.append(parts[0])
                countries.append(parts[-1])
            else:
                cities.append(np.nan)
                countries.append(np.nan)
        return cities, countries

    fighter_df = fighter_df.copy()

    # Split Locations and hometowns into city and country
    if 'location' in fighter_df.columns:
        fighter_df['city'], fighter_df['country'] = _city_and_country(fighter_df['location'])
        fighter_df = fighter_df.drop(columns='location')

    if 'hometown' in fighter_df.columns:
        #First get rid of data with nan hometowns
        fighter_df = fighter_df[fighter_df['hometown'].notna().to_numpy()].copy()
        fighter_df['hometown_city'], fighter_df['hometown_country'] = _city_and_country(fighter_df['hometown'])
        # The result of drop() must be kept; it does not work in place
        fighter_df = fighter_df.drop(columns='hometown')

    return fighter_df
    

def compare_models(knn_params, lr_params, svc_params, nbayes_params, rforest_params, features, labels): # Receives already scaled features
    """Compare five classifiers over repeated stratified train/test splits.

    For each seed in 1..19 the data is split 75/25 (stratified on ``labels``),
    every model is fitted on the training part and scored on the test part.
    The median accuracy, precision, recall and F1 per model are displayed and
    one box plot per metric is drawn.

    Parameters
    ----------
    knn_params, lr_params, svc_params, nbayes_params, rforest_params : dict
        Keyword arguments for ``KNeighborsClassifier``, ``LogisticRegression``,
        ``LinearSVC``, ``BernoulliNB`` and ``RandomForestClassifier``.
        The forest defaults to 50 trees and is seeded with the split seed;
        both can be overridden through ``rforest_params``.
    features : pandas.DataFrame or array-like
        Feature matrix; no scaling is applied here.
    labels : pandas.Series or array-like
        Binary target (precision/recall/F1 use their default positive label).

    Returns
    -------
    None
    """
    model_names = ['KNN', 'Logistic Regression', 'SVC', 'Naive Bayes', 'Random Forest']
    accuracy_rows, precision_rows, recall_rows, f1_rows = [], [], [], []

    for seed in range(1, 20):
        X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
            features, labels, train_size=0.75, test_size=0.25, random_state=seed, stratify=labels)

        #Models
        # Defaults first so an explicit n_estimators/random_state in rforest_params wins
        rforest_kwargs = {'n_estimators': 50, 'random_state': seed, **rforest_params}
        models = [
            KNeighborsClassifier(**knn_params),
            LogisticRegression(**lr_params),
            LinearSVC(**svc_params),
            BernoulliNB(**nbayes_params),
            RandomForestClassifier(**rforest_kwargs),
        ]

        # Predict once per model and reuse the predictions for all four metrics
        predictions = [model.fit(X_train_scaled, y_train).predict(X_test_scaled) for model in models]
        y_true = np.asarray(y_test)

        #Update tables
        accuracy_rows.append([float(np.mean(y_true == y_pred)) for y_pred in predictions])
        precision_rows.append([precision_score(y_test, y_pred) for y_pred in predictions])
        recall_rows.append([recall_score(y_test, y_pred) for y_pred in predictions])
        f1_rows.append([f1_score(y_test, y_pred) for y_pred in predictions])

    # Build each table in one go rather than growing it row by row
    accuracy_df = pd.DataFrame(accuracy_rows, columns=model_names)
    precision_df = pd.DataFrame(precision_rows, columns=model_names)
    recall_df = pd.DataFrame(recall_rows, columns=model_names)
    f1_df = pd.DataFrame(f1_rows, columns=model_names)

    #Display results
    display('\nMedian accuracy score:', accuracy_df.median())
    display('\nMedian precision score:', precision_df.median())
    display('\nMedian recall score:', recall_df.median())
    display('\nMedian F1 score:', f1_df.median())

    for scores, title in [(accuracy_df, 'Accuracy scores'), (precision_df, 'Precision score'),
                          (recall_df, 'Recall score'), (f1_df, 'F1 score')]:
        plt.figure(figsize=(15, 7))
        scores.boxplot()
        plt.title(title)
    
    
def parameter_tuning(df, scale):
    """Grid-search hyper-parameters for the five classifiers, then compare them.

    The frame is structured, cleaned and one-hot encoded; ``Winner`` is the
    target. Parameters are searched on a stratified 75 % training split
    (``random_state=0``) and the best ones are passed to ``compare_models``.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-fighter rows including ``fighter`` and ``Winner``. A copy is
        processed, so the caller's frame is not modified.
    scale : bool
        When true, a ``StandardScaler`` fitted on the training split is applied
        both to the data used for the search and to the features handed to
        ``compare_models`` (and returned), so tuning and comparison see the
        same representation.

    Returns
    -------
    tuple
        ``(knn_params, lr_params, svc_params, nbayes_params, rforest_params,
        features, labels)``.
    """
    #inspect_data(df.copy())
    df = structure_data(df.copy())
    df = clean_data(df)
    df = df.drop(columns=['fighter'])
    dummy_df = pd.get_dummies(df)

    labels = dummy_df['Winner']
    features = dummy_df.drop(columns=['Winner'])
    X_train, _, y_train, _ = train_test_split(
        features, labels, train_size=0.75, test_size=0.25, random_state=0, stratify=labels)

    #Scale
    if scale:
        scaler = StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        # compare_models expects already-scaled features, so scale the full matrix too
        features = pd.DataFrame(scaler.transform(features), index=features.index, columns=features.columns)
    else:
        X_train_scaled = X_train

    # Search for best parameters (GridSearchCV clones the estimator, so no pre-fit is needed)
    knn_tuner = GridSearchCV(
        KNeighborsClassifier(),
        param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'n_neighbors': list(range(1, 51))},
    ).fit(X_train_scaled, y_train)
    print('KNN Best params:', knn_tuner.best_params_ )

    lr_tuner = GridSearchCV(
        LogisticRegression(),
        param_grid={'solver': ['newton-cg', 'lbfgs', 'liblinear'], 'C': list(np.linspace(0.001, 20, 40))},
    ).fit(X_train_scaled, y_train)
    print('LR Best params:', lr_tuner.best_params_)

    svc_tuner = GridSearchCV(
        LinearSVC(),
        param_grid={'C': list(np.linspace(0.0000000001, 0.15, 50))},
    ).fit(X_train_scaled, y_train)
    print('SVC Best params:', svc_tuner.best_params_)

    nbayes_tuner = GridSearchCV(
        BernoulliNB(),
        param_grid={'alpha': list(np.linspace(0.0000000001, 21, 50))},
    ).fit(X_train_scaled, y_train)
    print('NaiveBayes Best params:', nbayes_tuner.best_params_)

    # max_depth must be an integer, and 'auto' (an alias of 'sqrt' for classifiers)
    # is no longer accepted by current scikit-learn releases.
    rforest_tuner = GridSearchCV(
        RandomForestClassifier(n_estimators=100, random_state=0),
        param_grid={'max_features': ['sqrt', 'log2'], 'max_depth': list(range(1, 31)),
                    'min_samples_split': list(range(2, 30))},
    ).fit(X_train_scaled, y_train)
    print('RandomForest Best params:', rforest_tuner.best_params_)

    # Compare models with tuned parameters
    compare_models( knn_tuner.best_params_, lr_tuner.best_params_, svc_tuner.best_params_, nbayes_tuner.best_params_, rforest_tuner.best_params_, features, labels)
    return(knn_tuner.best_params_, lr_tuner.best_params_, svc_tuner.best_params_, nbayes_tuner.best_params_, rforest_tuner.best_params_, features, labels)
    


In [None]:
# Compare models for data with no offensive stats.
# A copy is passed so the shared frame is not modified by the preparation steps.
knn_params, lr_params, svc_params, nbayes_params, rforest_params, features, labels = parameter_tuning(data_no_offence_stats.copy(), False)

In [None]:
# Compare models for data with offensive stats.
# A copy is passed so the shared frame is not modified by the preparation steps.
knn_params_offence, lr_params_offence, svc_params_offence, nbayes_params_offence, rforest_params_offence, features_offence, labels_offence = parameter_tuning(data.copy(), False)

KNN Best params: {'algorithm': 'auto', 'n_neighbors': 15}
LR Best params: {'C': 0.001, 'solver': 'liblinear'}
SVC Best params: {'C': 0.05204081639183673}
NaiveBayes Best params: {'alpha': 14.571428571459185}


In [66]:
# Evaluation with previously tuned parameters (tuned on unscaled data)
def _evaluate_tuned_models(data, knn_params, lr_params, svc_params, nbayes_params, rforest_params):
    """Prepare ``data`` and pass it, with the given parameters, to ``compare_models``.

    ``data`` is copied, structured, cleaned, stripped of ``fighter`` and
    one-hot encoded; ``Winner`` becomes the label vector.
    """
    df = structure_data(data.copy())
    df = clean_data(df)
    df = df.drop(columns=['fighter'])
    dummy_df = pd.get_dummies(df)

    labels = dummy_df['Winner']
    features = dummy_df.drop(columns=['Winner'])
    compare_models(knn_params, lr_params, svc_params, nbayes_params, rforest_params, features, labels)


# Tuned parameters for unscaled data. Data with no offensive stats
def evaluate_models_best_params_no_offence(data):
    """Compare the models on ``data`` using the parameters tuned without offence stats.

    Parameters
    ----------
    data : pandas.DataFrame
        Per-fighter rows without offence stats; it is not modified.
    """
    # max_depth must be an integer for current scikit-learn; the tuned value was
    # 7.5102040816326525 and is truncated here — TODO confirm against a re-tune.
    # 'sqrt' replaces 'auto', its former alias for classifiers.
    _evaluate_tuned_models(
        data,
        knn_params={'algorithm': 'auto', 'n_neighbors': 19},
        lr_params={'C': 0.001, 'solver': 'newton-cg'},
        svc_params={'C': 0.11326530614693876},
        nbayes_params={'alpha': 12.428571428612246},
        rforest_params={'max_depth': 7, 'max_features': 'sqrt', 'min_samples_split': 4},
    )

#Parameters for data with offence stats
def evaluate_models_best_params(data):
    """Compare the models on ``data`` using the parameters tuned with offence stats.

    Parameters
    ----------
    data : pandas.DataFrame
        Per-fighter rows including offence stats; it is not modified.
    """
    # max_depth truncated from the tuned 10.46938775510204 — TODO confirm against a re-tune.
    _evaluate_tuned_models(
        data,
        knn_params={'algorithm': 'auto', 'n_neighbors': 15},
        lr_params={'C': 0.001, 'solver': 'liblinear'},
        svc_params={'C': 0.05204081639183673},
        nbayes_params={'alpha': 14.571428571459185},
        rforest_params={'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 18},
    )
    
#Second pair of parameters
# knn_params = {'algorithm': 'auto', 'n_neighbors': 49}
# lr_params = {'C': 9.23130769230769, 'solver': 'lbfgs'}
# svc_params = {'C': 0.003061224587755102}
# nbayes_params = {'alpha': 18.000000000014285}
# rforest_params = {'max_depth': 10.46938775510204, 'max_features': 'auto', 'min_samples_split': 18}
# df = structure_data(data_no_offence_stats.copy())
# df = clean_data(df)
# df = df.drop(columns=['fighter'])
# dummy_df = pd.get_dummies(df)

# labels = dummy_df['Winner']
# features = dummy_df.drop(columns=['Winner'])
# compare_models(knn_params, lr_params, svc_params, nbayes_params, rforest_params, features, labels)

In [None]:
# Explore correlation and find best correlated features
def plot_correlation(data_no_offence_stats):
    """Plot each feature's absolute correlation with ``Winner`` and return the top half.

    The frame is copied and cleaned, ``fighter``, ``location`` and
    ``weight_class`` are dropped, and the rest is one-hot encoded. Correlations
    are expressed in percent, undefined ones (NaN) are discarded, and the
    absolute values are sorted in ascending order for the bar chart.

    Parameters
    ----------
    data_no_offence_stats : pandas.DataFrame
        Per-fighter rows containing ``Winner``; it is not modified.

    Returns
    -------
    pandas.Index
        The ``len(corr) // 2`` most correlated feature names, weakest first.
        Empty when fewer than two features have a defined correlation.
    """
    df = clean_data(data_no_offence_stats.copy())
    df = df.drop(columns=['fighter','location', 'weight_class'])
    dummy_df = pd.get_dummies(df)

    corr = dummy_df.corr()['Winner'].drop(['Winner']) * 100
    corr = corr.dropna().abs().sort_values()

    plt.figure(figsize=(30, 13))
    ax = plt.axes()
    ax.bar(corr.index, corr)
    plt.xticks(rotation='vertical')
    plt.ylabel('Absolute correlation with Winner (%)')

    # Slice from an explicit start: a negative slice of size 0 (``[-0:]``)
    # would return every feature instead of none.
    n_top = len(corr) // 2
    most_corr_features = corr.index[len(corr) - n_top:]
    return most_corr_features

# Rank features by absolute correlation with the outcome, for both datasets
most_corr_features_no_offence = plot_correlation(data_no_offence_stats)
most_corr_features = plot_correlation(data)


def _keep_selected_features(frame, selected_features):
    """Return a copy of ``frame`` restricted to the selected feature columns.

    The columns the evaluation pipeline itself reads are always kept:
    ``fighter`` and ``Winner`` (used by the evaluate functions) and
    ``weight_class``, ``title_bout``, ``Height_cms``, ``Reach_cms``,
    ``Weight_lbs`` and ``age`` (used by ``clean_data``). The subset may
    therefore retain a few columns that were not selected. Names in
    ``selected_features`` that are not columns of ``frame`` are ignored.
    """
    pipeline_columns = {'fighter', 'Winner', 'weight_class', 'title_bout',
                        'Height_cms', 'Reach_cms', 'Weight_lbs', 'age'}
    wanted = pipeline_columns.union(selected_features)
    return frame.loc[:, [col for col in frame.columns if col in wanted]].copy()


# See if models improve with most correlated features
print('*****ALL FEATURES********')
print('\nNo offence stats')
evaluate_models_best_params_no_offence(data_no_offence_stats.copy())
print('\n\nOffence stats')
evaluate_models_best_params(data.copy())

print('*****TOP CORRELATED FEATURES ONLY********')
print('\nNo offence stats')
evaluate_models_best_params_no_offence(_keep_selected_features(data_no_offence_stats, most_corr_features_no_offence))
print('\n\nOffence stats')
evaluate_models_best_params(_keep_selected_features(data, most_corr_features))

*****ALL FEATURES********

No offence stats
KNN, LogisticR, SVC, NB, RandomF


'Median accuracy score:'

KNN                    0.529549
Logistic Regression    0.550933
SVC                    0.508165
Naive Bayes            0.516330
Random Forest          0.540824
dtype: float64

'Median precision score:'

KNN                    0.520408
Logistic Regression    0.547535
SVC                    0.491835
Naive Bayes            0.506446
Random Forest          0.538124
dtype: float64

'Median recall score:'

KNN                    0.524111
Logistic Regression    0.491700
SVC                    0.007115
Naive Bayes            0.656917
Random Forest          0.471146
dtype: float64

'Median F1 score:'

KNN                    0.525903
Logistic Regression    0.517471
SVC                    0.014073
Naive Bayes            0.571624
Random Forest          0.503129
dtype: float64



Offence stats
KNN, LogisticR, SVC, NB, RandomF


'Median accuracy score:'

KNN                    0.515759
Logistic Regression    0.571156
SVC                    0.505253
Naive Bayes            0.508118
Random Forest          0.546323
dtype: float64

'Median precision score:'

KNN                    0.519658
Logistic Regression    0.574586
SVC                    0.514460
Naive Bayes            0.511203
Random Forest          0.550459
dtype: float64

'Median recall score:'

KNN                    0.544423
Logistic Regression    0.581285
SVC                    0.201323
Naive Bayes            0.621928
Random Forest          0.562382
dtype: float64

'Median F1 score:'

KNN                    0.531494
Logistic Regression    0.579615
SVC                    0.297486
Naive Bayes            0.563597
Random Forest          0.553231
dtype: float64

*****TOP CORRELATED FEATURES ONLY********

No offence stats


In [None]:
# Median of every numeric column in the data without offence stats.
# numeric_only=True skips text columns such as the fighter name, which have no median.
data_no_offence_stats.median(numeric_only=True)

In [None]:
# Seek to improve logistic Regression with feature selection