# Data Preparation Stage

In [1]:
# importing necessary modules

import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import math
import scipy.stats
from scipy.stats import beta

In [2]:
# setting up directions

direct = 'data'
curr = os.getcwd()
PATH = os.path.join(curr, direct)
os.chdir(PATH)

In [3]:
files = os.listdir()

In [4]:
def extract_csvs(files_list):
    """
    helps extracting data to one, concatenated DF from multiple csv files.
    """
    frames = []
    for file in files_list:
        df = pd.read_csv(file, header=0, sep=";", engine='python')
        frames.append(df)
    if len(frames) == len(files):
        print('Extracting validated')
        final_df = pd.concat(frames, ignore_index=True)
        return final_df
    else:
        print('Number of DataFrames is not matching number of files passed as an argument!')
        return None

In [5]:
main_df = extract_csvs(files)

Extracting validated


In [6]:
# dropping columns that we are not going to use in further analysis

cropped_df = main_df.drop(['Event', 'Method', 'Referee', 'Round', 'Time'], axis=1)

In [7]:
# changing values in event_date column to datetypes

cropped_df['Event_date'] = pd.to_datetime(cropped_df['Event_date'])

In [8]:
cropped_df

Unnamed: 0,Fighter,Opponent,Result,Event_date
0,Shamil Abdurakhimov,Curtis Blaydes,loss,2019-09-07
1,Shamil Abdurakhimov,Marcin Tybura,win,2019-04-20
2,Shamil Abdurakhimov,Andrei Arlovski,win,2018-09-15
3,Shamil Abdurakhimov,Chase Sherman,win,2017-11-25
4,Shamil Abdurakhimov,Derrick Lewis,loss,2016-12-09
...,...,...,...,...
12091,Julianna Pena,Sarah Moras,loss,2012-04-19
12092,Julianna Pena,Rachael Swatez,win,2011-12-15
12093,Julianna Pena,Stephanie Webber,win,2009-12-05
12094,Julianna Pena,Robyn Dunne,win,2009-08-15


In [9]:
cropped_df = cropped_df[cropped_df['Result'] != 'NC']

In [10]:
# creating dataframe that contains pairs of current UFC fighters that have fought against each other

def get_pairs(df):
    """
    function that enables to create pairs containing only ufc fighters that have fought against each other.
    """
    ufc_fighters = df['Fighter'].unique()
    df_pairs = df.copy()
    for index, row in df_pairs.iterrows():
        if row['Opponent'] not in ufc_fighters:
            df_pairs.drop(index, inplace=True)
    return df_pairs

In [11]:
pairs_df = get_pairs(cropped_df)

In [12]:
pairs_df

Unnamed: 0,Fighter,Opponent,Result,Event_date
0,Shamil Abdurakhimov,Curtis Blaydes,loss,2019-09-07
1,Shamil Abdurakhimov,Marcin Tybura,win,2019-04-20
2,Shamil Abdurakhimov,Andrei Arlovski,win,2018-09-15
3,Shamil Abdurakhimov,Chase Sherman,win,2017-11-25
4,Shamil Abdurakhimov,Derrick Lewis,loss,2016-12-09
...,...,...,...,...
12064,Weili Zhang,Tecia Torres,win,2019-03-02
12084,Julianna Pena,Nicco Montano,win,2019-07-13
12085,Julianna Pena,Valentina Shevchenko,loss,2017-01-28
12087,Julianna Pena,Jessica Eye,win,2015-10-03


In [13]:
def filter_unique_pairs(df):
    """
    function that filters through previously created pairs to filter out duplicates;
    for instance: fighter_A vs fighter_B is a duplicate of fighter_B vs fighter_A
    and out initial dataframe contains both fights, since they were collected from the perspective
    of two fighters.
    """
    df_unique = df.copy()
    uniques = []
    for index, row in df_unique.iterrows():
        if (row['Fighter'], row['Opponent']) in uniques:
            df_unique.drop(index, inplace=True)
        else:
            if (row['Opponent'], row['Fighter']) in uniques:
                df_unique.drop(index, inplace=True)
            else:
                uniques.append((row['Fighter'], row['Opponent']))
    return df_unique

In [14]:
unique_pairs_df = filter_unique_pairs(pairs_df)

In [15]:
unique_pairs_df

Unnamed: 0,Fighter,Opponent,Result,Event_date
0,Shamil Abdurakhimov,Curtis Blaydes,loss,2019-09-07
1,Shamil Abdurakhimov,Marcin Tybura,win,2019-04-20
2,Shamil Abdurakhimov,Andrei Arlovski,win,2018-09-15
3,Shamil Abdurakhimov,Chase Sherman,win,2017-11-25
4,Shamil Abdurakhimov,Derrick Lewis,loss,2016-12-09
...,...,...,...,...
11884,Maia Kahaunaele-Stevenson,Polyana Viana,loss,2018-02-03
11918,Tecia Torres,Weili Zhang,loss,2019-03-02
11921,Tecia Torres,Michelle Waterson,win,2017-12-02
11930,Tecia Torres,Paige VanZant,win,2013-01-05


In [16]:
def extract_common_opps(unique_pairs_df, df):
    """
    function that matches common opponents for each pair and creates separated dataframes based on that.
    """
    temp = []
    for index, row in unique_pairs_df.iterrows():
        condition_A = df['Fighter'] == row['Fighter']
        condition_B = df['Fighter'] == row['Opponent']
        A_frame = df[condition_A]
        B_frame = df[condition_B]
        opps_B = B_frame['Opponent'].tolist()
        opps_A = A_frame['Opponent'].tolist()
        query1 = (A_frame['Opponent'].isin(opps_B))
        query2 = (B_frame['Opponent'].isin(opps_A))
        fights_A = A_frame[query1]
        fights_B = B_frame[query2]
        head_pair = A_frame[A_frame['Opponent'] == row['Opponent']]
        final = pd.concat([head_pair, fights_A, fights_B], ignore_index=True)
        temp.append(final)
    return temp

In [17]:
temp = extract_common_opps(unique_pairs_df, cropped_df)

In [18]:
def discard_frames_without_common_ops(temp_list):
    """
    function that discards groups which are not containing any common opponents.
    """
    product = []
    for group in temp_list:
        if len(group) > 1:
            product.append(group)
    return product

In [19]:
groups = discard_frames_without_common_ops(temp)

In [20]:
groups

[               Fighter         Opponent Result Event_date
 0  Shamil Abdurakhimov    Marcin Tybura    win 2019-04-20
 1  Shamil Abdurakhimov  Andrei Arlovski    win 2018-09-15
 2  Shamil Abdurakhimov    Derrick Lewis   loss 2016-12-09
 3  Shamil Abdurakhimov  Timothy Johnson   loss 2015-04-04
 4        Marcin Tybura    Derrick Lewis   loss 2018-02-18
 5        Marcin Tybura  Andrei Arlovski    win 2017-06-17
 6        Marcin Tybura  Timothy Johnson   loss 2016-04-10,
                Fighter         Opponent Result Event_date
 0  Shamil Abdurakhimov  Andrei Arlovski    win 2018-09-15
 1  Shamil Abdurakhimov    Marcin Tybura    win 2019-04-20
 2      Andrei Arlovski    Marcin Tybura   loss 2017-06-17,
                Fighter       Opponent Result Event_date
 0  Shamil Abdurakhimov  Chase Sherman    win 2017-11-25
 1  Shamil Abdurakhimov    Walt Harris    win 2016-10-01
 2        Chase Sherman    Walt Harris   loss 2017-01-15,
                Fighter       Opponent Result Event_date
 0  

In [21]:
def create_triples(groups):
    """
    function that creates triples; each triple contains information about the fight between fighter A and fighter B
    and information about fights with common opponent.
    """
    triples = []
    for group in groups:
        fighter_A = group['Fighter'][0]
        fighter_B = group['Opponent'][0]
        uniques = []
        for index, row in group.iterrows():
            base = group.head(1)
            if (row['Fighter'] == fighter_A) and (row['Opponent'] != fighter_B):
                if row['Opponent'] not in uniques:
                    opp = row['Opponent']
                    condition = (group['Opponent'] == opp)
                    legs = group[condition]
                    t = pd.concat([base, legs], ignore_index=True)
                    triples.append(t)
    return triples

In [22]:
triples = create_triples(groups)

In [23]:
def add_triple_rating(triples):
    """
    function that will add Rating column based on the Result for convenience in further calculations
    """
    mapping = {
        'loss': 0,
        'draw': 1,
        'win': 2,
    }
    for triple in triples:
        triple['Rating'] = triple['Result'].map(mapping)

In [24]:
add_triple_rating(triples)

In [25]:
def triple_value_validation(triple):
    """
    helper function that will assign 1 if triple is containing valuable information and 0 otherwise
    """
    results = triple['Result']
    if results[0] == 'win' or results[0] == 'loss':
        if results[1] != results[2]:
            return 1
        else:
            return 0
    else:
        return 0
    
def triple_date_validation(triple):
    """
    helper function that will assign 1 if triple is valid in terms of chronology and 0 otherwise.
    """
    head_date = triple['Event_date'][0]
    if head_date > triple['Event_date'][1] and head_date > triple['Event_date'][2]:
        return 1
    else:
        return 0

def multiple_date_validation(triple):
    """
    helper function that will return a valid triple in terms of dates if possible, otherwise it will return None.
    """
    head_date = triple['Event_date'][0]
    fighter_A = triple['Fighter'][0]
    fighter_B = triple['Opponent'][0]
    A_cond = triple[triple['Fighter'] == fighter_A]
    B_cond = triple[triple['Fighter'] == fighter_B]
    A_slice = A_cond[A_cond['Event_date'] < head_date]
    B_slice = B_cond[B_cond['Event_date'] < head_date]
    if len(A_slice) != 0 and len(B_slice) != 0:
        base = triple.head(1)
        legs = pd.concat([A_slice, B_slice], ignore_index=True)
        valid_triple = pd.concat([base, legs], ignore_index=True)
        return valid_triple
    else:
        return None

def multiple_value_validation(triple):
    """
    helper function that will assign 1 if extended triple is containing valuable information and 0 otherwise.
    """
    if triple['Result'][0] == 'win' or triple['Result'][0] == 'loss':
        truncated_triple = triple[1:]
        fighter_A = triple['Fighter'][0]
        fighter_B = triple['Opponent'][0]
        A_slice = truncated_triple[truncated_triple['Fighter'] == fighter_A]
        B_slice = truncated_triple[truncated_triple['Fighter'] == fighter_B]
    
        if len(A_slice['Result'].value_counts()) > 1 or len(B_slice['Result'].value_counts() > 1):
            if A_slice['Rating'].mean() != B_slice['Rating'].mean():
                return 1
            else:
                return 0
        else:
            if A_slice['Result'][0] != B_slice['Result'][0]:
                return 1
            else:
                return 0
    else:
        return 0

In [26]:
def triple_analysis(triples):
    """
    function that does triple analysis and extracts only valid triples, either they are standard - 3 elements triples
    or extended ones.
    """
    std_valid_triples = []
    ext_valid_triples = []
    for triple in triples:
        if len(triple) == 3:
            if triple_date_validation(triple) == 1:
                if triple_value_validation(triple) == 1:
                    std_valid_triples.append(triple)
        else:
            validated_triple = multiple_date_validation(triple)
            if validated_triple is not None:
                if multiple_value_validation(validated_triple) == 1:
                    if len(validated_triple) == 3:
                        std_valid_triples.append(validated_triple)
                    else:
                        ext_valid_triples.append(validated_triple)
    return std_valid_triples, ext_valid_triples

In [27]:
std_triples, ext_triples = triple_analysis(triples)

In [28]:
def triple_mma_math(triples):
    """
    function that applies logic to separate triples that are supporting MMA Math deduction process and these which are not. 
    Moreover function is also calculating the score for each group.
    """
    valid_math = 0
    invalid_math = 0
    valid_math_triples = []
    invalid_math_triples = []
    for triple in triples:
        if triple['Rating'][0] == 2:
            if triple['Rating'][1] > triple['Rating'][2]:
                valid_math += 1
                valid_math_triples.append(triple)
            else:
                invalid_math += 1
                invalid_math_triples.append(triple)
        elif triple['Rating'][0] == 0:
            if triple['Rating'][1] < triple['Rating'][2]:
                valid_math += 1
                valid_math_triples.append(triple)
            else:
                invalid_math += 1
                invalid_math_triples.append(triple)
        else:
            print('WARNING: Invalid triple!')
            print(triple)
    
    if len(triples) == valid_math + invalid_math:
        print('Applying MMA math logic was successful')
    return valid_math, invalid_math, valid_math_triples, invalid_math_triples

def multiple_mma_math(triples):
    """
    function which is analogy to triple_mma_math function but applies to extended triples.
    """
    valid_math = 0
    invalid_math = 0
    valid_math_triples = []
    invalid_math_triples = []
    for triple in triples:
        truncated_triple = triple[1:]
        fighter_A = triple['Fighter'][0]
        fighter_B = triple['Opponent'][0]
        A_slice = truncated_triple[truncated_triple['Fighter'] == fighter_A]
        B_slice = truncated_triple[truncated_triple['Fighter'] == fighter_B]
        if triple['Rating'][0] == 2:
            if A_slice['Rating'].mean() > B_slice['Rating'].mean():
                if A_slice['Rating'].mean() >= 1:
                    valid_math += 1
                    valid_math_triples.append(triple)
            else:
                if B_slice['Rating'].mean() >= 1:
                    invalid_math += 1
                    invalid_math_triples.append(triple)
        elif triple['Rating'][0] == 0:
            if A_slice['Rating'].mean() < B_slice['Rating'].mean():
                if B_slice['Rating'].mean() >= 1:
                    valid_math += 1
                    valid_math_triples.append(triple)
            else:
                if A_slice['Rating'].mean() >= 1:
                    invalid_math += 1
                    invalid_math_triples.append(triple)
        else:
            print('WARNING: Invalid triple!')
            print(triple)
    
    if len(triples) == valid_math + invalid_math:
        print('Applying MMA math logic was successful')
    return valid_math, invalid_math, valid_math_triples, invalid_math_triples

In [29]:
valid_triple_score, invalid_triple_score, valid_math_triples, invalid_math_triples = triple_mma_math(std_triples)

Applying MMA math logic was successful


In [30]:
valid_triple_score

152

In [31]:
invalid_triple_score

135

In [32]:
add_triple_rating(ext_triples)

In [33]:
valid_multiple_score, invalid_multiple_score, valid_math_multiples, invalid_math_multiples =  multiple_mma_math(ext_triples)

In [34]:
valid_multiple_score

75

In [35]:
invalid_multiple_score

41

In [36]:
def test_random_triples():
    """
    additional function that is randomly showing triples in both categories - can be used for additional form of validation.
    """
    n = len(valid_math_triples)
    n2 = len(invalid_math_triples)
    ind1 = np.random.randint(1, n, size=5)
    ind2 = np.random.randint(1, n2, size=5)
    print('VALID MATH:')
    for indice in ind1:
        print(valid_math_triples[indice])
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
    print('------------------------')
    print('------------------------')
    print('------------------------')
    print('INVALID MATH:')
    for indice in ind2:
        print(invalid_math_triples[indice])
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')

        
def test_random_multiples():
    """
    additional function that is randomly showing multiples in both categories - can be used for additional form of validation.
    """
    n = len(valid_math_multiples)
    n2 = len(invalid_math_multiples)
    ind1 = np.random.randint(1, n, size=5)
    ind2 = np.random.randint(1, n2, size=5)
    print('VALID MATH:')
    for indice in ind1:
        print(valid_math_multiples[indice])
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
    print('------------------------')
    print('------------------------')
    print('------------------------')
    print('INVALID MATH:')
    for indice in ind2:
        print(invalid_math_multiples[indice])
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')

In [37]:
test_random_triples()

VALID MATH:
         Fighter       Opponent Result Event_date  Rating
0    Demian Maia  Gunnar Nelson    win 2015-12-12       2
1    Demian Maia     Rick Story    win 2012-10-13       2
2  Gunnar Nelson     Rick Story   loss 2014-10-04       0
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
                Fighter              Opponent Result Event_date  Rating
0        Jan Blachowicz  Alexander Gustafsson   loss 2016-09-03       0
1        Jan Blachowicz           Jimi Manuwa   loss 2015-04-11       0
2  Alexander Gustafsson           Jimi Manuwa    win 2014-03-08       2
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
                Fighter              Opponent Result Event_date  Rating
0        Daniel Cormier  Alexander Gustafsson    win 2015-10-03       2
1        Daniel Cormier       Anthony Johnson    win 2015-05-23       2
2  Alexander Gustafsson       Anthony Johnson   loss 2015-01-24       0
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [38]:
test_random_multiples()

VALID MATH:
                Fighter              Opponent Result Event_date  Rating
0     Katlyn Chookagian  Valentina Shevchenko   loss 2020-02-08       0
1     Katlyn Chookagian         Liz Carmouche   loss 2016-11-12       0
2  Valentina Shevchenko         Liz Carmouche    win 2019-08-10       2
3  Valentina Shevchenko         Liz Carmouche   loss 2010-09-30       0
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
                Fighter              Opponent Result Event_date  Rating
0           Jessica Eye  Valentina Shevchenko   loss 2019-06-08       0
1           Jessica Eye         Julianna Pena   loss 2015-10-03       0
2           Jessica Eye         Julianna Pena   loss 2015-10-03       0
3  Valentina Shevchenko         Julianna Pena    win 2017-01-28       2
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
            Fighter           Opponent Result Event_date  Rating
0   Andrei Arlovski   Alistair Overeem   loss 2016-05-08       0
1   Andrei

# Bayesian Analysis Part

**We will use Bayesian Reasoning by calculating Cumulative distribution function (CDF) for BETA distribution given the probability intervals.**

Equation: 

# Beta(p; α,β) = (p^(α-1)) * ((1-p)^(β-1))/beta(α,β)

where:

### p = the probability of an event
### α = how many times we observe an event we care about
### β = represent how many times event we didn't care about happen

In [39]:
def calculate_CDF(interval, a, b):
    """
    simple function that will calculate CDF given both variables alpha and beta + interval range
    """
    return beta.cdf(x=interval, a=a, b=b)[1] - beta.cdf(x=interval, a=a, b=b)[0]

### Only Triples

In [99]:
valid_triple_score

152

In [100]:
invalid_triple_score

135

In [40]:
# probability of true rate lying in beetween 0.45 and 0.55
calculate_CDF([0.45, 0.55], valid_triple_score, invalid_triple_score)

0.7515604629104752

In [41]:
# probability of true rate lying in beetween 0.40 and 0.60
calculate_CDF([0.40, 0.60], valid_triple_score, invalid_triple_score)

0.9920237832019733

In [96]:
# finding 95% CI Interval

lower_bound = beta.ppf(0.025, valid_triple_score, invalid_triple_score)
upper_bound = beta.ppf(0.975, valid_triple_score, invalid_triple_score)

In [97]:
lower_bound

0.47181944022812683

In [98]:
upper_bound

0.5870229720578248

### Triples + Extended Triples

In [42]:
all_triples_valid_score = valid_triple_score + valid_multiple_score
all_triples_invalid_score = invalid_triple_score + invalid_multiple_score

In [43]:
all_triples_valid_score

227

In [44]:
all_triples_invalid_score

176

In [45]:
# probability of true rate lying in beetween 0.45 and 0.55
calculate_CDF([0.45, 0.55], all_triples_valid_score, all_triples_invalid_score)

0.29455961533343183

In [46]:
# probability of true rate lying in beetween 0.40 and 0.60
calculate_CDF([0.40, 0.60], all_triples_valid_score, all_triples_invalid_score)

0.932257626514387

In [47]:
# finding 95% CI Interval

lower_bound = beta.ppf(0.025, all_triples_valid_score, all_triples_invalid_score)
upper_bound = beta.ppf(0.975, all_triples_valid_score, all_triples_invalid_score)

In [48]:
lower_bound

0.5146401052815123

In [49]:
upper_bound

0.6113158328668202

# Data Visualization Stage

In [53]:
from graphviz import Graph
from graphviz import Digraph

### Using graphviz to create Decission Diagraph for Data Processing in therein project

In [92]:
dot = Digraph('process', filename='process.gv', node_attr={'color': 'lightblue2', 'style': 'filled'})

In [93]:
dot.node('A', 'List of all the UFC Fighters in the current roster:    Has this fighter fought any other fighter on the UFC roster?')
dot.node('x', 'YES')
dot.node('y', 'NO')
dot.edges(['Ax', 'Ay'])

dot.node('B', 'Pairs of fighters from UFC roster that have fought each other:    Do they share common opponent?')
dot.edges(['xB'])
dot.node('z', 'YES')
dot.node('w', 'NO')
dot.edges(['Bz', 'Bw'])

dot.node('C', 'Pairs of UFC fighters that share a common opponent:    Have they fought this opponent before they have fought each other?')
dot.edges(['zC'])
dot.node('o', 'YES')
dot.node('p', 'NO')
dot.edges(['Co', 'Cp'])

dot.node('D', 'Include the result in the analysis!')
dot.node('E', 'Exclude the result from the analysis!')
dot.edges(['oD'])
dot.edges(['pE'])

In [94]:
dot.render()

'process.gv.pdf'

### Generating Grap Visualization for Triples

In [102]:
def create_triple_graph(t, n):
    """
    function that creates diagraph from triple.
    """
    dot = Digraph(f'triple{n}', filename=f'triple{n}.gv', node_attr={'color': 'lightblue2', 'style': 'filled'})
    dot.node('A', t['Fighter'][0])
    dot.node('B', t['Opponent'][0])
    dot.node('C', t['Opponent'][1])
    dot.edge('A', 'B', label=t['Result'][0], constraint='false')
    dot.edge('A', 'C', label=t['Result'][1])
    dot.edge('B', 'C', label=t['Result'][2])
    dot.render()

In [90]:
n = 1
for triple in valid_math_triples:
    create_triple_graph(triple, n)
    n += 1

In [91]:
n = 1
for triple in invalid_math_triples:
    create_triple_graph(triple, n)
    n += 1

# Conclusion

With a high degree of confidence we can say that the true rate of MMA math deduction accuracy lies between **48% - 58%** percent for data containing only triples and between **51%-61%** for combined data, which is either hardly at all, or just slightly above random outcome.