## 1. Reading Data File

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline

# Use a larger default font so labels stay readable on the large figures drawn below.
plt.rcParams.update({'font.size': 18})
# Relative path to the combined fight CSV.
data_location = '../generated_data/combined_fight_data.csv'

# Load the fight table and preview its first rows.
data = pd.read_csv(data_location)
data.head()

## 2. Basic Data Stats

In [None]:
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0; `show_counts`
# is the supported keyword and still prints the non-null count of every column.
data.info(verbose=True, show_counts=True)

In [None]:
# Descriptive summary statistics of the loaded fight table.
data.describe()

## 3. Exploring Categorical Columns

### 3.1 Weight Class

In [None]:
# Number of fights per weight class: a frame indexed by 'weight_class' with a
# single 'count' column, drawn as a horizontal bar chart.
weight_class = (
    data['weight_class']
    .value_counts()
    .rename_axis('weight_class')
    .to_frame('count')
)
weight_class.plot.barh(figsize=(14, 12)).set_ylabel('Weight Class', fontsize=18)
plt.show()

### 3.2 Fighter Stance

In [None]:
# Red fighter stance: occurrences of each stance, indexed by 'R_Stance' with a
# single 'count' column, drawn as a horizontal bar chart.
R_Stance = (
    data['R_Stance']
    .value_counts()
    .rename_axis('R_Stance')
    .to_frame('count')
)
R_Stance.plot.barh(figsize=(10, 8)).set_ylabel('Red Fighter Stance', fontsize=16)

In [None]:
# Blue fighter stance: occurrences of each stance, indexed by 'B_Stance' with a
# single 'count' column, drawn as a horizontal bar chart.
B_Stance = (
    data['B_Stance']
    .value_counts()
    .rename_axis('B_Stance')
    .to_frame('count')
)
# Binding the label to `plot` keeps the cell from echoing it as output.
plot = B_Stance.plot.barh(figsize=(10, 8)).set_ylabel('Blue Fighter Stance', fontsize=16)

### 3.3 End Method

In [None]:
# Occurrences of each fight-ending method, indexed by 'end_method' with a
# single 'count' column, drawn as a horizontal bar chart.
end_method = (
    data['end_method']
    .value_counts()
    .rename_axis('end_method')
    .to_frame('count')
)
end_method.plot.barh(figsize=(12, 10)).set_ylabel('End Method', fontsize=16)

### 3.4 End How

In [None]:
# Frequency of each distinct value in the 'end_how' column.
data['end_how'].value_counts()

## 4.  Exploring Target Column Winner

In [None]:
# Collapse the winner label to a boolean: True when the Red fighter won, False
# for every other outcome. Work on a copy so `data` keeps its original labels.
data_copy = data.copy()
data_copy['Winner'] = data_copy['Winner'] == 'Red'

# Count both outcomes and draw them as a horizontal bar chart.
winner = (
    data_copy['Winner']
    .value_counts()
    .rename_axis('Winner')
    .to_frame('count')
)
winner.plot.barh(figsize=(12, 10)).set_ylabel('Red Fighter Won', fontsize=16)

# Explore Null values and feature correlations

In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline

import warnings
# NOTE(review): this silences every warning raised after this point, including
# deprecation notices from the plotting and data libraries.
warnings.filterwarnings('ignore')

# Use a larger default font so labels stay readable on the large figures drawn below.
plt.rcParams.update({'font.size': 18})

def _corner_view(fight_data, own_prefix, other_prefix):
    """Return one corner's view of the fight table with its prefix stripped.

    Keeps every column that does not start with ``other_prefix`` (the corner's
    own columns plus the columns shared by both corners) and removes a leading
    ``own_prefix`` from the names that carry it.
    """
    kept = [col for col in fight_data.columns if not col.startswith(other_prefix)]
    return fight_data.loc[:, kept].rename(
        columns=lambda name: name[len(own_prefix):] if name.startswith(own_prefix) else name
    )


def import_and_merge():
    """Load the fight CSV and reshape it into per-fighter rows.

    Reads the file named by the module-level ``data_location``. Each input row
    contributes one Blue-corner row and one Red-corner row; the ``B_``/``R_``
    prefixes are stripped so both corners share the same column names. The
    shape of every intermediate table is printed along the way.

    Returns:
        tuple: ``(fighters_no_offence_stats, fighters_data)`` where
        ``fighters_data`` is the stacked per-fighter table and
        ``fighters_no_offence_stats`` is the same table without the ``avg_*``
        columns and without ``total_time_fought_seconds``.

    Raises:
        KeyError: if one of the columns dropped below is missing from the CSV.
    """
    fight_data = pd.read_csv(data_location)

    # Per-corner outcome flags. Any label other than 'Blue'/'Red' yields 0 in
    # both columns.
    fight_data['B_Winner'] = [1 if x == 'Blue' else 0 for x in fight_data['Winner']]
    fight_data['R_Winner'] = [1 if x == 'Red' else 0 for x in fight_data['Winner']]

    # Drop columns irrelevant to this prediction
    fight_data = fight_data.drop(columns=['Referee', 'no_of_rounds', 'Winner', 'date', 'end_method', 'end_how',
                                          'end_round', 'attendance'])
    print('Fight Data Stats: ')
    print('Shape: ', fight_data.shape)

    # Separate fight data into individual fighter stats. After the prefix is
    # stripped, 'B_Winner' / 'R_Winner' both become a plain 'Winner' column.
    blue_fighters = _corner_view(fight_data, 'B_', 'R_')
    print('\nBlue fighter Stats: ')
    print('Shape: ', blue_fighters.shape)

    red_fighters = _corner_view(fight_data, 'R_', 'B_')
    print('\nRed fighter Stats: ')
    print('Shape: ', red_fighters.shape)

    # Concatenate blue and red fighter stats.
    # NOTE(review): the original row labels are kept, so every label appears
    # twice (once per corner) — confirm nothing downstream needs a unique index.
    fighters_data = pd.concat([blue_fighters, red_fighters])
    fighters_data = fighters_data.rename(columns={'total_time_fought(seconds)': 'total_time_fought_seconds'})
    print('\nTotal fighter Stats: ')
    print('Shape: ', fighters_data.shape)

    # Create df without offence stats
    offence_columns = [col for col in fighters_data.columns if col.startswith('avg_')]
    fighters_no_offence_stats = fighters_data.drop(columns=offence_columns)
    fighters_no_offence_stats = fighters_no_offence_stats.drop(columns='total_time_fought_seconds')
    print('\nFighters no offensive stats: ')
    print('Shape: ', fighters_no_offence_stats.shape)

    return (fighters_no_offence_stats, fighters_data)
    
# NOTE: this rebinds `data` — from here on it is the per-fighter table returned
# by import_and_merge(), not the raw fight table loaded at the top of the notebook.
data_no_offence_stats, data  = import_and_merge()

In [None]:
def plot_correlation(df):
    """Plot absolute feature/outcome correlations and return the strongest half.

    Drops the 'fighter', 'city', 'country' and 'weight_class' columns,
    one-hot encodes what is left, and correlates every resulting column with
    'Winner'. Correlations are scaled to percent, NaN results are discarded,
    and absolute values are drawn as a bar chart in ascending order.

    Returns:
        Index holding the last ``len // 2`` feature names of that ascending
        ordering. With fewer than two features the slice start is 0, so the
        whole index is returned.
    """
    features = df.drop(columns=['fighter', 'city', 'country', 'weight_class'])
    encoded = pd.get_dummies(features)

    winner_corr = encoded.corr()['Winner']
    strength = (winner_corr.drop(['Winner']) * 100).dropna().abs().sort_values()

    plt.figure(figsize=(30, 10))
    axes = plt.axes()
    axes.bar(strength.index, strength)
    plt.xticks(rotation='vertical')
    plt.ylabel('Correlation (%)')
    plt.title('Correlation between features and fight outcomes')

    top_half = len(strength) // 2
    return strength.index[-top_half:]

def plot_missing_vals(df):
    """Print the frame's shape and draw a bar chart of NaN counts per column."""
    labels = list(df.columns)
    missing_counts = [df[label].isnull().sum() for label in labels]

    print('Shape: ', df.shape)
    plt.figure(figsize=(30, 10))
    axes = plt.axes()
    axes.bar(labels, missing_counts)
    plt.xticks(rotation='vertical')
    plt.ylabel('# NaNs')
    plt.title('Number of missing data per feature')

# Chart the missing values per column, then keep the names of the features
# that plot_correlation returns as most correlated with the outcome.
plot_missing_vals(data)
top_features = plot_correlation(data)

# Explore overall distribution (Not split by weight-class)
Draw box and point plots of every feature in the dataset against the fight outcome. These plots were then examined in more detail to draw insights.

In [None]:
import seaborn as sns

def inspect_data(df, kind, columns):
    """Draw one seaborn categorical plot per column, grouped by 'Winner'.

    Args:
        df: DataFrame holding a 'Winner' column plus the columns to plot.
        kind: Plot kind forwarded to ``sns.catplot`` (e.g. 'box' or 'point').
        columns: Iterable of column names; each becomes the y-axis of its own
            plot.

    Plotting is best-effort: a column seaborn cannot draw is reported and
    skipped so it does not stop the remaining plots.
    """
    for column in columns:
        try:
            # catplot is figure-level and builds its own figure, so no
            # plt.figure() is opened here; doing so only left an empty,
            # unused figure behind for every column.
            sns.catplot(x='Winner', y=column, data=df, kind=kind)
        except Exception as err:
            # `Exception` rather than a bare except, so KeyboardInterrupt can
            # still stop a long plotting run.
            print('Skipped', kind, 'plot for column', repr(column), '-', err)
            continue
        plt.show()

# Draw a box plot and a point plot of every remaining column against 'Winner'
# (identifier-like columns 'fighter', 'city' and 'country' are excluded).
kinds = ['box', 'point']
df = data.drop(columns=['fighter', 'city', 'country']).copy()
for kind in kinds:
    inspect_data(df, kind, df.columns)
    

# Explore distributions on a per weight-class basis
Explore box and point plots for features in the dataset, split by weight class.

In [None]:
def inspect_data_wc(df, kind, columns):
    """Draw one seaborn categorical plot per column, split by weight class.

    Args:
        df: DataFrame holding 'Winner' and 'weight_class' columns plus the
            columns to plot.
        kind: Plot kind forwarded to ``sns.catplot`` (e.g. 'box' or 'point').
        columns: Iterable of column names; each becomes the y-axis of its own
            plot, grouped by 'Winner' and coloured by 'weight_class'.

    Plotting is best-effort: a column seaborn cannot draw is reported and
    skipped so it does not stop the remaining plots.
    """
    for column in columns:
        try:
            # catplot is figure-level and builds its own figure, so no
            # plt.figure() is opened here; doing so only left an empty,
            # unused figure behind for every column.
            sns.catplot(x='Winner', y=column, data=df, kind=kind, hue='weight_class')
        except Exception as err:
            # `Exception` rather than a bare except, so KeyboardInterrupt can
            # still stop a long plotting run.
            print('Skipped', kind, 'plot for column', repr(column), '-', err)
            continue
        plt.show()

# Inspect the hand-picked physical and record features, with weight_class as hue.
# The selected `columns` list is what gets plotted; previously it was defined
# but `df.columns` was passed instead, so every column was drawn.
# NOTE(review): 'current_lose_streak' fixes the 'current_lose_strak' typo and
# assumes the CSV columns are B_/R_current_lose_streak — confirm against the data.
kinds = ['box', 'point']
columns = ['age', 'Height_cms', 'Reach_cms', 'current_win_streak', 'current_lose_streak', 'total_rounds_fought']
df = data.drop(columns=['fighter', 'city', 'country'])
for kind in kinds:
    inspect_data_wc(df, kind, columns)