# Parse UFCData, Locations and Scorecard

### Import libraries

In [1]:
from pandas import read_csv, DataFrame, merge, concat
from datetime import datetime
from collections import Counter
from unidecode import unidecode
import numpy as np
import re, string

### Parse the csvs and create panda models

In [2]:
fight_data_file = '../ufcdata/data.csv'
scorecard_file = '../scorecard/scorecard.csv'
location_file = '../elevations/locations_elevation.csv'
fighter_location_file = '../elevations/fighters_elevation.csv'

fights = read_csv(fight_data_file)
scorecards = read_csv(scorecard_file)
locations = read_csv(location_file)

def create_key(row):   
    f1 = row['R_fighter'] if 'R_fighter' in row.index else row['winner']
    f2 = row['B_fighter'] if 'B_fighter' in row.index else row['loser']
    dt = row['date']

    f1 = sorted(f1.split())
    f2 = sorted(f2.split())

    fighters = sorted([' '.join(f1), ' '.join(f2)])
    return ' '.join(fighters), dt


fights.sort_values(by='date', inplace=True)
fights['date'] = fights['date'].apply(lambda dt: datetime.strptime(dt.strip(), '%Y-%m-%d'))
fights['R_fighter'] = fights['R_fighter'].apply(lambda x: unidecode(x.translate(str.maketrans('', '', string.punctuation)).lower().strip()))
fights['B_fighter'] = fights['B_fighter'].apply(lambda x: unidecode(x.translate(str.maketrans('', '', string.punctuation)).lower().strip()))
fights['key'] = fights.apply(create_key, axis=1)

scorecards.sort_values(by='date', inplace=True)
scorecards['date'] = scorecards['date'].apply(lambda dt: datetime.strptime(dt.strip(), '%Y-%m-%d'))
scorecards['winner'] = scorecards['winner'].apply(lambda x: unidecode(x.translate(str.maketrans('', '', string.punctuation)).lower().strip()))
scorecards['loser'] = scorecards['loser'].apply(lambda x: unidecode(x.translate(str.maketrans('', '', string.punctuation)).lower().strip()))
scorecards['how'] = scorecards['how'].astype('category')
scorecards['key'] = scorecards.apply(create_key, axis=1)

print('Fights: {}'.format(fights.shape))
print('Scorecards: {}'.format(scorecards.shape))

Fights: (5144, 146)
Scorecards: (5491, 13)


### Combine fights and scorecards

In [3]:
def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
    """ levenshtein_ratio_and_distance:
        Calculates levenshtein distance between two strings.
        If ratio_calc = True, the function computes the
        levenshtein distance ratio of similarity between two strings
        For all i and j, distance[i,j] will contain the Levenshtein
        distance between the first i characters of s and the
        first j characters of t
    """
    # Initialize matrix of zeros
    rows = len(s)+1
    cols = len(t)+1
    distance = np.zeros((rows,cols),dtype = int)

    # Populate matrix of zeros with the indeces of each character of both strings
    for i in range(1, rows):
        for k in range(1,cols):
            distance[i][0] = i
            distance[0][k] = k

    # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions    
    for col in range(1, cols):
        for row in range(1, rows):
            if s[row-1] == t[col-1]:
                cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0
            else:
                # In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio
                # the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1.
                if ratio_calc == True:
                    cost = 2
                else:
                    cost = 1
            distance[row][col] = min(distance[row-1][col] + 1,      # Cost of deletions
                                 distance[row][col-1] + 1,          # Cost of insertions
                                 distance[row-1][col-1] + cost)     # Cost of substitutions
    if ratio_calc == True:
        # Computation of the Levenshtein Distance Ratio
        Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))
        return Ratio
    else:
        # print(distance) # Uncomment if you want to see the matrix showing how the algorithm computes the cost of deletions,
        # insertions and/or substitutions
        # This is the minimum number of edits needed to convert string a to string b
        return distance[row][col]
        ##return "The strings are {} edits away".format(distance[row][col])

In [4]:
sub_scorecards = scorecards.loc[:, ['key', 'method', 'how', 'round', 'event_name', 'venue', 'city', 'country', 'attendance', 'winner', 'loser']]
sub_scorecards.rename(columns={'method': 'end_method', 'how': 'end_how', 'round': 'end_round'}, inplace=True)

fights_scorecard = merge(left=fights, right=sub_scorecards, how='left', on='key')
fights_scorecard.drop(columns=['location', 'key'], inplace=True)
fights_scorecard.sort_values(by='date', inplace=True)
print('Fights Scorecards: {}'.format(fights_scorecard.shape))

Fights Scorecards: (5146, 154)


In [5]:
test = fights_scorecard.loc[fights_scorecard['city'].isnull()].tail(10)
test = test.loc[:, fights.columns]
test.drop(columns=['location', 'key'], inplace=True)
display(test.shape)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


(10, 144)

In [6]:
missin_fights_scorecard = None
events = test.groupby('date')

sub_scorecards = scorecards.loc[:, ['method', 'how', 'round', 'event_name', 'venue', 'city', 'country', 'attendance', 'winner', 'loser', 'date']]
sub_scorecards.rename(columns={'method': 'end_method', 'how': 'end_how', 'round': 'end_round', 'date': 'date2'}, inplace=True)

for date in events.groups:
    fights = events.get_group(date)
    scorecard_fights = sub_scorecards[sub_scorecards['date2'] == date]
    
    for fight_index, fight in fights.iterrows():
        winner_x = fight['R_fighter'] if fight['Winner'] == 'Red' else fight['B_fighter']
        loser_x = fight['R_fighter'] if fight['Winner'] == 'Blue' else fight['B_fighter']
        
        index_one = 0
        index_two = 0
        score = 10000000000
        
        for score_index, score_row in scorecard_fights.iterrows():
            winner_y_tmp = score_row['winner']
            loser_y_tmp = score_row['loser']
            
            tmp_score = np.mean([
                levenshtein_ratio_and_distance(winner_y_tmp, winner_x), 
                levenshtein_ratio_and_distance(loser_y_tmp, loser_x)
            ])

            if (tmp_score < score):
                index_one = fight_index
                index_two = score_index
                score = tmp_score
        
        
        actual_fight = fights.loc[index_one]
        actual_scorecard = scorecard_fights.loc[index_two]
        
        data_list = [[]]
        col_list = []
        
        for data in list(actual_fight.values):
            data_list[0].append(data)
        for data in list(actual_scorecard.values):
            data_list[0].append(data)
            
        for data in actual_fight.index:
            col_list.append(data)
        for data in actual_scorecard.index:
            col_list.append(data)
        
        actual_row = DataFrame(data=data_list, columns=col_list)
        actual_row.drop(columns=['date2'], inplace=True)
        display(actual_row)


NameError: name 'tmp_score' is not defined