In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from scipy.stats import pearsonr

#reading from csv and overview
#https://www.kaggle.com/datasets/aaronfriasr/ufc-fighters-statistics?rvi=1

import os
# Directory that holds the dataset. It can be overridden with the UFC_DATA_DIR
# environment variable so the notebook is not tied to one machine; the default
# is the original author's local folder.
DATA_DIR = os.environ.get("UFC_DATA_DIR", "C://Users//john8/Year 4/Data Analysis")

cwd = os.getcwd()
print(cwd)

# Switch into the data directory only when it actually exists. On any other
# machine an unconditional chdir raises before the CSV is even attempted, so
# fall back to reading the file from the current working directory instead.
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)
else:
    print(f"Data directory {DATA_DIR!r} not found; reading the CSV from {cwd}")

data = pd.read_csv("ufc-fighters-statistics.csv")

# Overview of the raw frame. info() prints by itself; head() has to be printed
# because only the last expression of a cell is displayed automatically.
data.info()
print(data.head())
data.describe()

In [None]:
#Data cleaning

#Check for null values per column.
#Printed explicitly: a bare expression in the middle of a cell is evaluated but never displayed.
print(data.isnull().sum())

#height_cm                                        298
#weight_in_kg                                      87
#reach_in_cm                                     1927

#decide to drop rows where above are null
#NOTE(review): weight_in_kg is itself dropped as a column later in this cell, yet rows with a
#missing weight are still removed here - confirm this is intended.
#Assignment instead of inplace=True so the result of the step is explicit.
data = data.dropna(subset=['height_cm', 'weight_in_kg', 'reach_in_cm'])

#Confirm the three columns no longer contain nulls
print(data.isnull().sum())

#Keep or drop variables

#name - keep for reference but exclude from analysis
#nickname - keep for reference but exclude from analysis
#wins, losses - keep as important factor for analysis
#draws - drop to remove ambiguity and make model more interpretable and easier to understand
#height_cm, reach_in_cm - physical attributes important for analysis - keep
#weight_in_kg - drop as irrelevant, different weight classes in ufc
#stance - based on data overhead, 60% orthodox 20% null 19% other - decide to drop - not important
#date_of_birth - drop as not relevant

#significant_strikes_landed_per_minute, significant_striking_accuracy, significant_strikes_absorbed_per_minute, significant_strike_defence
#average_takedowns_landed_per_15_minutes, takedown_accuracy, takedown_defense, average_submissions_attempted_per_15_minutes
#after careful consideration only keep what i think is top 3 most important factors to make a top 5 ufc fighters list
#keep significant_strikes_landed_per_minute, significant_strikes_absorbed_per_minute, takedown_accuracy
#above variables most relevant to a fighter's performance in terms of offensive and defensive capabilities

#drop columns
columns_to_drop = ['draws', 'weight_in_kg', 'stance', 'date_of_birth', 'significant_striking_accuracy', 'significant_strike_defence',
                   'average_takedowns_landed_per_15_minutes', 'takedown_defense',
                   'average_submissions_attempted_per_15_minutes']
#errors='ignore' keeps this cell re-runnable: on a second run the columns are already gone and a
#strict drop would raise KeyError. The first run behaves exactly like a strict drop.
data = data.drop(columns=columns_to_drop, errors='ignore')

#round all values in the dataframe to two decimal places
data = data.round(2)

#check updated dataframe
#info() prints by itself; head() and describe() must be printed because only the last
#expression of a cell is displayed automatically.
data.info()
print(data.head())
print(data.describe())

In [None]:
#OUTLIERS ############################################################

#Thresholds chosen by eye from the boxplots below
WINS_OUTLIER_THRESHOLD = 80                 #one person with 80+ wins
ABSORBED_STRIKES_OUTLIER_THRESHOLD = 40     #one person with 40+ absorbed strikes per minute


def show_boxplot(frame, column):
    """Draw a boxplot of ``frame[column]``, titled and labelled with the column name."""
    figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.boxplot(x=frame[column])
    plt.title(f"Boxplot of {column}")
    plt.ylabel(column)
    plt.show()


def drop_rows_above(frame, column, threshold):
    """Return ``frame`` without the rows whose ``column`` value is greater than ``threshold``.

    The row count is printed before and after the drop so the effect can be confirmed.
    The input frame is not modified.
    """
    outlier_index = frame[frame[column] > threshold].index
    print(f"Number of rows before dropping {column} outlier:", len(frame))
    trimmed = frame.drop(index=outlier_index)
    print(f"Number of rows after dropping {column} outlier:", len(trimmed))
    return trimmed


#Wins
#See if theres outliers for the Wins
show_boxplot(data, 'wins')

#one person with 80+ wins decide to drop
data = drop_rows_above(data, 'wins', WINS_OUTLIER_THRESHOLD)

#Losses, height_cm, reach_in_cm, significant_strikes_landed_per_minute,
#significant_strikes_absorbed_per_minute
#See if theres outliers for each of these - plotted after the wins outlier has been removed
for column in ['losses',
               'height_cm',
               'reach_in_cm',
               'significant_strikes_landed_per_minute',
               'significant_strikes_absorbed_per_minute']:
    show_boxplot(data, column)

#one person with 40+ absorbed strikes per minute - decide to drop as outlier
data = drop_rows_above(data, 'significant_strikes_absorbed_per_minute',
                       ABSORBED_STRIKES_OUTLIER_THRESHOLD)

#takedown_accuracy
#See if theres outliers for the takedown_accuracy - its ok
show_boxplot(data, 'takedown_accuracy')

In [None]:
#Multivariate Analysis

#  #   Column                                   Non-Null Count  Dtype
#---  ------                                   --------------  -----
# 0   name                                     2181 non-null   object - not used
# 1   nickname                                 1514 non-null   object - not used
# 2   wins                                     2181 non-null   int64 - predictor
# 3   losses                                   2181 non-null   int64 - response
# 4   height_cm                                2181 non-null   float64 - response
# 5   reach_in_cm                              2181 non-null   float64 - response
# 6   significant_strikes_landed_per_minute    2181 non-null   float64 - response
# 7   significant_strikes_absorbed_per_minute  2181 non-null   float64 - response
# 8   takedown_accuracy                        2181 non-null   float64 - response

#we are using wins as our independent (predictor) variable and the other variables as our
#dependent (response) variables: wins is on the x-axis of every plot and each trend line
#is a fit of the other variable on wins


def plot_wins_against(frame, column):
    """Scatter ``frame[column]`` against ``frame['wins']`` with a straight-line fit in red.

    The line is a degree-1 least-squares polynomial of ``column`` on wins, evaluated at
    the sorted unique win counts.
    """
    wins = frame['wins']
    plt.figure(figsize=(8, 6))
    plt.title(f"Wins vs {column}")
    plt.xlabel("Wins")
    plt.ylabel(column)
    plt.scatter(wins, frame[column], alpha=0.5)
    trend_line = np.poly1d(np.polyfit(wins, frame[column], 1))
    win_values = np.unique(wins)
    plt.plot(win_values, trend_line(win_values), color='red')
    #show() renders the figure and keeps the Line2D repr out of the cell output
    plt.show()


# Scatter plot of wins vs each response variable
for column in ['losses',
               'height_cm',
               'reach_in_cm',
               'significant_strikes_landed_per_minute',
               'significant_strikes_absorbed_per_minute',
               'takedown_accuracy']:
    plot_wins_against(data, column)

<b> Scatterplot summary </b>
<ol>
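    <li> Only losses looks strongly correlated with wins; the other response variables show at most a weak relationship with wins (compare the correlation coefficients below)</li>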
    <li> wins vs takedown_accuracy shows potential outliers of 0% and 100% accuracy </li>
</ol>

<b>Correlations</b>

In [40]:
# Columns to correlate: every predictor is paired with every response column.
predictor_variables = ['wins']
response_variables = [
    'losses',
    'height_cm',
    'reach_in_cm',
    'significant_strikes_landed_per_minute',
    'significant_strikes_absorbed_per_minute',
    'takedown_accuracy',
]

# Build the (predictor, response) pairs up front, then report Pearson's r for each
# pair rounded to two decimals. The p-value is computed but not reported.
variable_pairs = [(pred, resp) for pred in predictor_variables for resp in response_variables]
for predictor_var, response_var in variable_pairs:
    corr_coef, p_value = pearsonr(data[predictor_var], data[response_var])
    print(f"Pearson correlation coefficient between '{predictor_var}' and '{response_var}': {corr_coef:.2f}")

Pearson correlation coefficient between 'wins' and 'losses': 0.71
Pearson correlation coefficient between 'wins' and 'height_cm': 0.10
Pearson correlation coefficient between 'wins' and 'reach_in_cm': 0.12
Pearson correlation coefficient between 'wins' and 'significant_strikes_landed_per_minute': -0.05
Pearson correlation coefficient between 'wins' and 'significant_strikes_absorbed_per_minute': -0.21
Pearson correlation coefficient between 'wins' and 'takedown_accuracy': 0.15


<b> Correlations summary </b>
<ol>
    <li> The correlation between wins and losses is 0.71, which is strong. This is an orange flag for multicollinearity, as wins and losses are complementary outcomes, so only one of them should be included in the regression.</li>
    <li> Correlation analysis confirms what we have seen in the scatterplots. </li>
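    <li> The low coefficients indicate weak linear relationships: there is a lot of variability in the data, which makes it more challenging to detect strong relationships between variables. Note that the large size of the dataset does not by itself lower the coefficients; it only makes the estimates more precise.</li>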
    <li> Scaling and normalising must be properly handled before regression: they will not change the low correlation values (Pearson's r is unaffected by linear rescaling), but they standardize the variables and improve the interpretability of coefficients in regression analysis. </li>
</ol>