# Parse UFCData, Altitudes and Scorecard

### Import libraries

In [1]:
from pandas import read_csv, DataFrame, merge, concat
from datetime import datetime, timedelta
from collections import Counter
from unidecode import unidecode
import numpy as np
import re, string

### Parse the CSV files and create pandas DataFrames

In [2]:
# Relative paths of the three input CSV files.
fight_data_file = '../data_joining/data_with_location_elevation.csv'
scorecard_file = '../scorecard/scorecard.csv'
fighter_location_file = '../elevations/fighters_elevation.csv'

# Load each CSV into its own DataFrame, in the same order as the paths above.
fights, scorecards, locations = (
    read_csv(csv_path)
    for csv_path in (fight_data_file, scorecard_file, fighter_location_file)
)

# Show the (rows, columns) shape of every frame on one line.
print(*(frame.shape for frame in (fights, scorecards, locations)))

(5144, 146) (5491, 12) (1335, 3)


### Clean up the data

In [3]:
def create_key(row):
    """Build an order-independent ``(names, date)`` join key for one bout.

    The two fighter names are read from ``R_fighter_parsed`` and
    ``B_fighter_parsed`` when the row carries those labels, otherwise from
    ``winner_parsed`` and ``loser_parsed``.  The words inside each name are
    sorted, then the two names are sorted, so neither the word order nor
    which fighter is listed first affects the key.

    Returns:
        A 2-tuple: the space-joined sorted names, and ``row['date']`` as is.
    """
    labels = row.index
    first_label = 'R_fighter_parsed' if 'R_fighter_parsed' in labels else 'winner_parsed'
    second_label = 'B_fighter_parsed' if 'B_fighter_parsed' in labels else 'loser_parsed'

    first_name = row[first_label]
    second_name = row[second_label]
    bout_date = row['date']

    # Canonical form of a name: its words in sorted order.
    canonical_names = [
        ' '.join(sorted(name.split()))
        for name in (first_name, second_name)
    ]
    canonical_names.sort()
    return ' '.join(canonical_names), bout_date

def _normalise_name(raw_name):
    """Remove punctuation, lower-case, trim, then transliterate with unidecode."""
    without_punctuation = raw_name.translate(str.maketrans('', '', string.punctuation))
    return unidecode(without_punctuation.lower().strip())


def _parse_date(raw_date):
    """Parse a '%Y-%m-%d' date string after trimming surrounding whitespace."""
    return datetime.strptime(raw_date.strip(), '%Y-%m-%d')


# --- fights ------------------------------------------------------------
# Remove the bouts whose Winner is 'Draw'.
is_draw = fights['Winner'] == 'Draw'
fights.drop(fights.loc[is_draw].index, inplace=True)

fights['R_fighter_parsed'] = fights['R_fighter'].apply(_normalise_name)
fights['B_fighter_parsed'] = fights['B_fighter'].apply(_normalise_name)
fights['date'] = fights['date'].apply(_parse_date)
fights['key'] = fights.apply(create_key, axis=1)
fights.sort_values(by='date', inplace=True)

# --- scorecards --------------------------------------------------------
# NOTE(review): this sort runs on the raw date strings, before they are parsed.
scorecards.sort_values(by='date', inplace=True)
scorecards['date'] = scorecards['date'].apply(_parse_date)
scorecards['winner_parsed'] = scorecards['winner'].apply(_normalise_name)
scorecards['loser_parsed'] = scorecards['loser'].apply(_normalise_name)
for categorical_col in ('how', 'city', 'country'):
    scorecards[categorical_col] = scorecards[categorical_col].astype('category')
scorecards['key'] = scorecards.apply(create_key, axis=1)

print('Fights: {}'.format(fights.shape))
print('Scorecards: {}'.format(scorecards.shape))

Fights: (5061, 149)
Scorecards: (5491, 15)


### Combine fights and scorecards

In [4]:
# Left join on the bout key: every row of `fights` is kept, and fights with no
# scorecard under the same key get missing values in the scorecard columns.
fights_scorecard = fights.merge(scorecards, how='left', on='key')
print('Fights Scorecards: {}'.format(fights_scorecard.shape))

Fights Scorecards: (5062, 163)


### Fuzzy string matcher

In [5]:
def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
    """Compute the Levenshtein distance, or similarity ratio, of two strings.

    ``distance[i, j]`` holds the edit distance between the first ``i``
    characters of ``s`` and the first ``j`` characters of ``t``.

    Args:
        s: First string.
        t: Second string.
        ratio_calc: When true, a substitution costs 2 instead of 1 and the
            function returns the similarity ratio
            ``(len(s) + len(t) - distance) / (len(s) + len(t))``.

    Returns:
        The edit distance (an integer) when ``ratio_calc`` is false,
        otherwise the similarity ratio as a float in ``[0, 1]``.  Two empty
        strings are treated as identical: distance 0, ratio 1.0.
    """
    rows = len(s) + 1
    cols = len(t) + 1
    distance = np.zeros((rows, cols), dtype=int)

    # Base cases: turning a prefix into the empty string (or back) costs one
    # edit per character.  Filling the first column and first row separately
    # keeps them correct when either string is empty.
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    # A substitution costs 2 in ratio mode so the result lines up with the
    # python-Levenshtein package; plain distance uses a cost of 1.
    substitution_cost = 2 if ratio_calc else 1

    for col in range(1, cols):
        for row in range(1, rows):
            cost = 0 if s[row - 1] == t[col - 1] else substitution_cost
            distance[row][col] = min(distance[row - 1][col] + 1,         # deletion
                                     distance[row][col - 1] + 1,         # insertion
                                     distance[row - 1][col - 1] + cost)  # substitution

    # Read the bottom-right cell by explicit index rather than through the
    # loop variables, which are never bound when a string is empty.
    edit_distance = distance[rows - 1][cols - 1]

    if ratio_calc:
        total_length = len(s) + len(t)
        if total_length == 0:
            # Avoid 0/0: two empty strings are a perfect match.
            return 1.0
        return (total_length - edit_distance) / total_length

    # Minimum number of edits needed to convert s into t.
    return edit_distance

### Rows without scorecards

In [6]:
# Rows with no 'city' value are the fights that have no scorecard attached.
scorecard_missing = fights_scorecard['city'].isna()
invalid_rows = fights_scorecard[scorecard_missing]
print('Invalid Rows: {}'.format(invalid_rows.shape))

Invalid Rows: (540, 163)


### Find the missing scorecards

In [7]:
# Group the fights that still lack a scorecard by their event date.
events = invalid_rows.groupby('date_x')

cols = None
new_data = []

# Iterate the groups directly so each per-event frame has its own name and the
# module-level `fights` DataFrame is not overwritten by the loop.
for date, event_fights in events:
    # Scorecards recorded for the same date as the event.
    scorecard_fights = scorecards[scorecards['date'] == date]

    if (scorecard_fights.shape[0] < 1):
        # No scorecards on that date: retry once with the following day.
        print('Date not found: {}'.format(date))
        new_date = date + timedelta(days=1)

        scorecard_fights = scorecards[scorecards['date'] == new_date]
        print('Trying Date {} - {}'.format(new_date, 'Found' if scorecard_fights.shape[0] > 0 else 'Not found'))

        if scorecard_fights.shape[0] < 1:
            continue

    for _, fight in event_fights.iterrows():
        winner_x = fight['R_fighter'] if fight['Winner'] == 'Red' else fight['B_fighter']
        loser_x = fight['R_fighter'] if fight['Winner'] == 'Blue' else fight['B_fighter']

        # Choose the scorecard row whose winner/loser names have the smallest
        # mean edit distance to this fight's names; the first row wins a tie.
        best_label = None
        best_score = None

        for score_label, score_row in scorecard_fights.iterrows():
            tmp_score = np.mean([
                levenshtein_ratio_and_distance(score_row['winner'], winner_x),
                levenshtein_ratio_and_distance(score_row['loser'], loser_x)
            ])

            if best_score is None or tmp_score < best_score:
                best_label = score_label
                best_score = tmp_score

        actual_scorecard = scorecard_fights.loc[best_label]

        # Start from the fight's own values, then overlay the scorecard values.
        obj = {label: fight[label] for label in fight.index}

        for label in actual_scorecard.index:
            if label == 'date':
                # Stored under 'date_y' so it lines up with the merged frame.
                obj['date_y'] = actual_scorecard[label]
            elif label != 'key':
                # The fight's own 'key' is kept; the scorecard's is skipped.
                obj[label] = actual_scorecard[label]

        cols = list(obj.keys())
        new_data.append(obj)

missing_fights_scorecard = DataFrame(data=new_data, columns=cols)

Date not found: 2005-08-06 00:00:00
Trying Date 2005-08-07 00:00:00 - Not found
Date not found: 2010-02-20 00:00:00
Trying Date 2010-02-21 00:00:00 - Found
Date not found: 2011-02-26 00:00:00
Trying Date 2011-02-27 00:00:00 - Found
Date not found: 2012-02-25 00:00:00
Trying Date 2012-02-26 00:00:00 - Found
Date not found: 2012-03-02 00:00:00
Trying Date 2012-03-03 00:00:00 - Found
Date not found: 2012-12-14 00:00:00
Trying Date 2012-12-15 00:00:00 - Found
Date not found: 2013-03-02 00:00:00
Trying Date 2013-03-03 00:00:00 - Found
Date not found: 2013-12-06 00:00:00
Trying Date 2013-12-07 00:00:00 - Found
Date not found: 2014-11-07 00:00:00
Trying Date 2014-11-08 00:00:00 - Found
Date not found: 2015-05-09 00:00:00
Trying Date 2015-05-10 00:00:00 - Found
Date not found: 2015-09-26 00:00:00
Trying Date 2015-09-27 00:00:00 - Found
Date not found: 2015-11-14 00:00:00
Trying Date 2015-11-15 00:00:00 - Found
Date not found: 2016-03-19 00:00:00
Trying Date 2016-03-20 00:00:00 - Found
Date not

### Update the fights with the new scorecards found

In [8]:
# The columns to overwrite are the last 14 columns of the merged frame.  The
# list does not change between iterations, so it is computed once.
cols_to_modify = list(fights_scorecard.columns[-14:])

for index, row in missing_fights_scorecard.iterrows():
    # Every row of the merged frame that shares this bout key.
    original_row = fights_scorecard[(fights_scorecard['key'] == row['key'])]
    original_index = original_row.index

    for col in cols_to_modify:
        # `original_index` is an Index of row labels, not one scalar label, so
        # use `.loc`; `.at` is the accessor for a single row/column label pair.
        fights_scorecard.loc[original_index, col] = row[col]

### Final stats for scorecards

In [9]:
# Recount the fights that still have no 'city' value after the update.
still_missing = fights_scorecard['city'].isna()
invalid_rows = fights_scorecard[still_missing]
print('New Invalid Rows: {}'.format(invalid_rows.shape))

New Invalid Rows: (8, 163)


### Clean up the model

In [10]:
# Columns dropped from the combined frame.
columns_to_drop = [
    'R_fighter_parsed', 'B_fighter_parsed', 'location', 'key',
    'winner', 'loser', 'event_name', 'date_y', 'venue', 'url',
    'winner_parsed', 'loser_parsed',
]
fights_scorecard.drop(columns=columns_to_drop, inplace=True)

# New names for the remaining date and fight-ending columns.
renamed_columns = {
    'date_x': 'date',
    'method': 'end_method',
    'how': 'end_how',
    'round': 'end_round',
}
fights_scorecard.rename(columns=renamed_columns, inplace=True)

# Normalise both fighter-name columns in place: remove punctuation,
# lower-case, trim, then transliterate with unidecode.
punctuation_table = str.maketrans('', '', string.punctuation)
for fighter_col in ('R_fighter', 'B_fighter'):
    fights_scorecard[fighter_col] = fights_scorecard[fighter_col].apply(
        lambda name: unidecode(name.translate(punctuation_table).lower().strip())
    )
fights_scorecard.sort_values(by='date', inplace=True)

print(fights_scorecard.shape)
fights_scorecard.tail()

(5062, 151)


Unnamed: 0,R_fighter,B_fighter,Referee,date,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,B_current_win_streak,...,R_Weight_lbs,B_age,R_age,location_elevation,end_method,end_how,end_round,city,country,attendance
5050,henry cejudo,marlon moraes,Marc Goddard,2019-06-08,Red,True,Bantamweight,5,0.0,4.0,...,135.0,31.0,32.0,182.02,tko,punches,3.0,chicago,usa,16083.0
5049,tai tuivasa,blagoy ivanov,Dan Miragliotta,2019-06-08,Blue,False,Heavyweight,3,0.0,1.0,...,264.0,32.0,26.0,182.02,decision,"29–28, 30–27, 30–27",3.0,chicago,usa,16083.0
5060,eddie wineland,grigorii popov,Kevin MacDonald,2019-06-08,Red,False,Bantamweight,3,0.0,0.0,...,135.0,35.0,34.0,182.02,ko,punch,2.0,chicago,usa,16083.0
5054,ricardo lamas,calvin kattar,Dan Miragliotta,2019-06-08,Blue,False,Featherweight,3,0.0,1.0,...,145.0,31.0,37.0,182.02,ko,punches,1.0,chicago,usa,16083.0
5061,jimmie rivera,petr yan,Kevin MacDonald,2019-06-08,Blue,False,Bantamweight,3,0.0,4.0,...,135.0,26.0,29.0,182.02,decision,"29–28, 29–28, 30–27",3.0,chicago,usa,16083.0


In [11]:
# Rename the location columns to lower-case, code-friendly names.
location_column_names = {
    'Elevation': 'elevation',
    'Fighter Name': 'fighter',
    'Location': 'location',
}
locations.rename(columns=location_column_names, inplace=True)

# Keep only the digit characters of each elevation string.
# NOTE(review): this also strips '-' and '.', so a negative or fractional
# elevation would lose its sign or decimal point. Confirm the input has neither.
non_digit = re.compile('[^0-9]')
locations['elevation'] = locations['elevation'].apply(lambda raw: non_digit.sub('', raw))

# Normalise fighter names: remove punctuation, lower-case, trim, transliterate.
punctuation_table = str.maketrans('', '', string.punctuation)
locations['fighter'] = locations['fighter'].apply(
    lambda name: unidecode(name.translate(punctuation_table).lower().strip())
)

locations.tail()

Unnamed: 0,fighter,location,elevation
1330,kevin rosier,"Tampa, Florida",146
1331,emmanuel yarborough,"September 5, 1964Rahway, New Jersey, USA",7
1332,gerard gordeau,"Den Haag, Netherlands",1
1333,art jimmerson,"St. Louis, Missouri",142
1334,teila tuli,"Honolulu, Hawaii",6


In [12]:
# Every distinct fighter name across both corners (np.unique returns them sorted).
red_names = fights_scorecard['R_fighter'].values
blue_names = fights_scorecard['B_fighter'].values
all_fighter_names = np.unique(np.concatenate((red_names, blue_names)))

# Keep a single location row per fighter name.
locations.drop_duplicates(subset='fighter', inplace=True)

In [13]:
def get_elevation(series):
    """Return the elevation of the first row of *series* as an ``int``.

    Args:
        series: Rows selected from the locations table; must provide an
            ``'elevation'`` column.  Despite the parameter name, the caller
            passes a filtered DataFrame.

    Returns:
        The first row's elevation converted with ``int``, or ``None`` when
        there are no rows or the value cannot be converted (for example an
        empty string or NaN).
    """
    if series.shape[0] < 1:
        return None

    raw_value = series['elevation'].values[0]
    try:
        return int(raw_value)
    except (TypeError, ValueError, OverflowError):
        # An unparseable elevation is treated like a missing one instead of
        # aborting the whole column computation.
        return None

# For each corner, look up the fighter's row(s) in `locations` by name and
# store the elevation that get_elevation returns for them.
corner_columns = (
    ('R_fighter', 'R_home_elevation'),
    ('B_fighter', 'B_home_elevation'),
)
for fighter_col, elevation_col in corner_columns:
    fights_scorecard[elevation_col] = fights_scorecard[fighter_col].apply(
        lambda name: get_elevation(locations[locations['fighter'] == name])
    )

# Write the combined data set to disk without the index column.
fights_scorecard.to_csv('combined_fight_data.csv', index=False)
fights_scorecard.tail()

Unnamed: 0,R_fighter,B_fighter,Referee,date,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,B_current_win_streak,...,R_age,location_elevation,end_method,end_how,end_round,city,country,attendance,R_home_elevation,B_home_elevation
5050,henry cejudo,marlon moraes,Marc Goddard,2019-06-08,Red,True,Bantamweight,5,0.0,4.0,...,32.0,182.02,tko,punches,3.0,chicago,usa,16083.0,331.0,
5049,tai tuivasa,blagoy ivanov,Dan Miragliotta,2019-06-08,Blue,False,Heavyweight,3,0.0,1.0,...,26.0,182.02,decision,"29–28, 30–27, 30–27",3.0,chicago,usa,16083.0,,2290.0
5060,eddie wineland,grigorii popov,Kevin MacDonald,2019-06-08,Red,False,Bantamweight,3,0.0,0.0,...,34.0,182.02,ko,punch,2.0,chicago,usa,16083.0,195.0,
5054,ricardo lamas,calvin kattar,Dan Miragliotta,2019-06-08,Blue,False,Featherweight,3,0.0,1.0,...,37.0,182.02,ko,punches,1.0,chicago,usa,16083.0,2.0,35.0
5061,jimmie rivera,petr yan,Kevin MacDonald,2019-06-08,Blue,False,Bantamweight,3,0.0,4.0,...,29.0,182.02,decision,"29–28, 29–28, 30–27",3.0,chicago,usa,16083.0,,90.0


In [14]:
# A fight counts as invalid when either corner has no home elevation.
red_missing = fights_scorecard['R_home_elevation'].isna()
blue_missing = fights_scorecard['B_home_elevation'].isna()
invalid_rows = fights_scorecard[red_missing | blue_missing]

print('Fights with no elevation data {}'.format(invalid_rows.shape))

Fights with no elevation data (1824, 153)


In [None]:
## This will print out stats for closest fighter name matches
from spark_score_fighter_name_matches import create_fighter_scores

# Names of fighters (either corner) that have no home elevation.
invalid_rows_x = fights_scorecard.loc[fights_scorecard['R_home_elevation'].isnull()]['R_fighter'].values
invalid_rows_y = fights_scorecard.loc[fights_scorecard['B_home_elevation'].isnull()]['B_fighter'].values

missing_fighter_names = np.concatenate((invalid_rows_x, invalid_rows_y))
missing_fighter_names = np.unique(missing_fighter_names)

# Names that do have a row in the locations table.
names_avail = np.array(locations['fighter'].values)

missing_file = 'missing_fighter_names.txt'
avail_file = 'available_fighter_names.txt'
output_file = 'fighter_name_location_scores.csv'

# Write one name per line.  The context managers close each file even if the
# write raises.
with open(missing_file, 'w') as mising_f_write:
    mising_f_write.write('\n'.join(missing_fighter_names))

with open(avail_file, 'w') as names_f_write:
    names_f_write.write('\n'.join(names_avail))

# create_fighter_scores is given both name files and the output path; its
# output file is read back just below.
create_fighter_scores(missing_file, avail_file, output_file)
scores = read_csv(output_file)

# For each 'x' value keep the row with the lowest score, order those rows by
# score (highest first) and renumber them.  The chain builds a new frame
# instead of mutating the `.loc` selection in place.
min_scores = (
    scores.loc[scores.groupby('x')['score'].idxmin()]
    .sort_values(by='score', ascending=False)
    .reset_index(drop=True)
)
min_scores.tail(30)