# Get and parse scorecards

### Import libraries

In [1]:
from pandas import read_csv, Series, DataFrame, concat
from datetime import datetime
from collections import Counter
from bs4 import BeautifulSoup
from requests import get
import re

### Parse the csv and create panda models

In [2]:
def parse_location(location):
    words = location.strip().lower().split(', ')
    length = len(words)
    return Series((words[0], words[1])) if length == 2 else Series((words[0], words[-1]))


data = read_csv('../raw_data/data.csv')
data['date'] = data['date'].apply(lambda dt: datetime.strptime(dt, '%Y-%m-%d'))
data[['city', 'country']] = data.apply(lambda row: Series(parse_location(row['location'])), axis=1)

counter = Counter(data['R_fighter'].append(data['B_fighter']))
fighter_counts = DataFrame(zip(counter.keys(), counter.values()), columns=['fighter', 'count'])
fighter_counts.sort_values(by='count', inplace=True, ascending=False)

locations = data.groupby(['date', 'city', 'country']).size().reset_index(name='no_fights_in_card')
locations.sort_values(by='date', inplace=True)

display(data.head(), fighter_counts.head(), locations.head())

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,city,country
0,Henry Cejudo,Marlon Moraes,Marc Goddard,2019-06-08,"Chicago, Illinois, USA",Red,True,Bantamweight,5,0.0,...,0.0,8.0,Orthodox,162.56,162.56,135.0,31.0,32.0,chicago,usa
1,Valentina Shevchenko,Jessica Eye,Robert Madrigal,2019-06-08,"Chicago, Illinois, USA",Red,True,Women's Flyweight,5,0.0,...,0.0,5.0,Southpaw,165.1,167.64,125.0,32.0,31.0,chicago,usa
2,Tony Ferguson,Donald Cerrone,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Red,False,Lightweight,3,0.0,...,1.0,14.0,Orthodox,180.34,193.04,155.0,36.0,35.0,chicago,usa
3,Jimmie Rivera,Petr Yan,Kevin MacDonald,2019-06-08,"Chicago, Illinois, USA",Blue,False,Bantamweight,3,0.0,...,0.0,6.0,Orthodox,162.56,172.72,135.0,26.0,29.0,chicago,usa
4,Tai Tuivasa,Blagoy Ivanov,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Blue,False,Heavyweight,3,0.0,...,0.0,3.0,Southpaw,187.96,190.5,264.0,32.0,26.0,chicago,usa


Unnamed: 0,fighter,count
272,Donald Cerrone,32
72,Jim Miller,32
152,Jeremy Stephens,30
476,Michael Bisping,29
155,Diego Sanchez,29


Unnamed: 0,date,city,country,no_fights_in_card
0,1993-11-12,denver,usa,8
1,1994-03-11,denver,usa,15
2,1994-09-09,charlotte,usa,6
3,1994-12-16,tulsa,usa,8
4,1995-04-07,charlotte,usa,9


### Save the html for list of UFC events

In [3]:
list_ufc_url = 'https://en.wikipedia.org/wiki/List_of_UFC_events'
past_events_list_parser = BeautifulSoup(get(list_ufc_url).text, 'html.parser')

In [4]:
def past_event_parse(event):
    tds = event.findAll('td')
    locs = tds[4].text.strip().lower().split(', ')
    attendance = re.sub('\D', '', tds[5].text.strip())
    
    country = locs[-1].replace('.', '')
    city = locs[0].replace('.', '')
    
    return {
        'event_name': tds[1].text.strip(),
        'url': 'https://en.wikipedia.org' + tds[1].find('a')['href'] if tds[1].find('a') is not None else '',
        'date': datetime.strptime(tds[2].text.strip(), '%b %d, %Y'),
        'venue': tds[3].text.strip(),
        'city': locs[0].replace('.', '').strip(),
        'country': country.strip() if country != 'us' else 'usa',
        'attendance': int(attendance) if len(attendance) > 0 else None,
    }

past_events = DataFrame(map(past_event_parse, past_events_list_parser.find('table', {'id': 'Past_events'}).findAll('tr')[1:][::-1]))
past_events.drop(past_events.loc[past_events['url'] == ''].index, inplace=True)
past_events.reset_index(inplace=True, drop=True)

display(past_events.tail(), past_events.shape)

Unnamed: 0,event_name,url,date,venue,city,country,attendance
501,UFC Fight Night: Maia vs. Askren,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,2019-10-26,Singapore Indoor Stadium,kallang,singapore,7155.0
502,UFC 244: Masvidal vs. Diaz,https://en.wikipedia.org/wiki/UFC_244,2019-11-02,Madison Square Garden,new york city,usa,20143.0
503,UFC Fight Night: Zabit vs. Kattar,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,2019-11-09,CSKA Arena,moscow,russia,11305.0
504,UFC Fight Night: Błachowicz vs. Jacaré,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,2019-11-16,Ginásio do Ibirapuera,são paulo,brazil,10344.0
505,UFC on ESPN: Overeem vs. Rozenstruik,https://en.wikipedia.org/wiki/UFC_on_ESPN:_Ove...,2019-12-07,Capital One Arena,washington,usa,10816.0


(506, 7)

### Save the htmls for each UFC night

In [5]:
past_events['html'] = past_events['url'].apply(lambda url: BeautifulSoup(get(url).text, 'html.parser') if len(url) > 0 else '')

### Parse and split each UFC night into different fights

In [6]:
def parse_event(event):
    if len(event) == 0:
        return []
    
    fights = []
    rows = event.find('table', {'class': 'toccolours'}).findAll('tr')

    for fight in rows:
        if fight.find('th') is None:
            cols = fight.findAll('td')
            
            winner = cols[1].text.strip()
            loser = cols[3].text.strip()
            
            win_condition = cols[4].text.strip().lower()
            method = ''
            how = ''
            if 'decision' in win_condition:
                method = 'decision'
                
                splits = win_condition.split(' (')
                if win_condition == 'decision':
                    how = ''
                elif len(splits) < 3:
                    how = re.sub('[^a-z ]+', '', re.sub('decision', '', win_condition).strip()).strip()
                else:
                    how = re.sub('[()]+', '', win_condition.split(' (')[2]).strip()
            else:
                method = win_condition.split(' (')[0] .strip()
                how = ' '.join(re.sub('[^a-z ]+', '', win_condition).split()[1:])
                
            n_round = cols[5].text.strip()
            n_round = int(n_round) if len(n_round) > 0 else None
            
            method = 'disqualification' if method == 'dq' else method
            method = method if 'submission' not in method else 'submission'
            method = method if 'referee stoppage' not in method else 'tko'
            
            how = how.replace('dq', '')
            how = how.replace('submission', '')
            how = how.replace('disqualification', '')

            fights.append([winner.strip(), loser.strip(), method.strip(), how.strip(), n_round])
    return fights


past_events['full_fight_info'] = past_events['html'].apply(lambda html: parse_event(html))
past_events = past_events.explode('full_fight_info')
past_events.reset_index(inplace=True)

fights = past_events['full_fight_info'].apply(Series)
fights.rename(columns={0: 'winner', 1: 'loser', 2: 'method', 3: 'how', 4: 'round'}, inplace=True)

all_fights = concat([past_events, fights], axis=1)
all_fights.drop(columns=['index', 'html', 'full_fight_info'], inplace=True)
all_fights = all_fights[['winner', 'loser', 'method', 'how', 'round', 'event_name', 'date', 'venue', 'city', 'country', 'attendance', 'url']]

### Cleaning up some rows

In [7]:
invalid_rows = all_fights.loc[all_fights['winner'].isnull() | all_fights['loser'].isnull() | all_fights['date'].isnull() | all_fights['url'] == ''].index
all_fights.drop(invalid_rows, inplace=True)

invalid_rows = all_fights.loc[(all_fights['winner'] == '') | (all_fights['loser'] == '')].index
all_fights.drop(invalid_rows, inplace=True)

all_fights['winner'] = all_fights['winner'].apply(lambda name: re.sub('\(.*?\)', '', name))
all_fights['loser'] = all_fights['loser'].apply(lambda name: re.sub('\(.*?\)', '', name))
all_fights.reset_index(inplace=True, drop=True)

display(all_fights.tail(), all_fights.shape)

  result = method(y)


Unnamed: 0,winner,loser,method,how,round,event_name,date,venue,city,country,attendance,url
5499,Billy Quarantillo,Jacob Kilburn,submission,triangle choke,2.0,UFC on ESPN: Overeem vs. Rozenstruik,2019-12-07,Capital One Arena,washington,usa,10816.0,https://en.wikipedia.org/wiki/UFC_on_ESPN:_Ove...
5500,Bryce Mitchell,Matt Sayles,submission,twister,1.0,UFC on ESPN: Overeem vs. Rozenstruik,2019-12-07,Capital One Arena,washington,usa,10816.0,https://en.wikipedia.org/wiki/UFC_on_ESPN:_Ove...
5501,Joe Solecki,Matt Wiman,decision,"30–26, 30–26, 30–27",3.0,UFC on ESPN: Overeem vs. Rozenstruik,2019-12-07,Capital One Arena,washington,usa,10816.0,https://en.wikipedia.org/wiki/UFC_on_ESPN:_Ove...
5502,Virna Jandiroba,Mallory Martin,submission,rearnaked choke,2.0,UFC on ESPN: Overeem vs. Rozenstruik,2019-12-07,Capital One Arena,washington,usa,10816.0,https://en.wikipedia.org/wiki/UFC_on_ESPN:_Ove...
5503,Makhmud Muradov,Trevor Smith,ko,punch,3.0,UFC on ESPN: Overeem vs. Rozenstruik,2019-12-07,Capital One Arena,washington,usa,10816.0,https://en.wikipedia.org/wiki/UFC_on_ESPN:_Ove...


(5504, 12)

### Save the scorecard data as csv

In [8]:
# Columns
# winner (string): who won
# loser (string): who lost
# method (category): how they won - knockout, submissions, decision, etc.
# how (string/category): more specifics on how they won - head kick, punch, etc.
# round (int): which round they won
# event_name (string): UFC event name
# date (date)
# venue (string)
# city (string)
# country (string)
# attendance (int): number of people at the event
# url (string): url for the event

all_fights.to_csv('../generated_data/scorecard.csv', index=False)

In [9]:
for col in ['winner', 'loser', 'method', 'how', 'round', 'city', 'country']:
    display(all_fights.groupby(col).size().reset_index(name='count').sort_values(by=['count'], ascending=False))

Unnamed: 0,winner,count
657,Jon Jones,25
363,Donald Cerrone,22
337,Demian Maia,22
606,Jim Miller,20
463,Georges St-Pierre,19
...,...,...
400,Emil Weber Meek,1
399,Elvis Sinosic,1
969,Milana Dudieva,1
970,Miles Johns,1


Unnamed: 0,loser,count
850,Jeremy Stephens,16
1900,Vitor Belfort,15
634,Frank Mir,14
287,Chael Sonnen,14
1406,Nick Diaz,14
...,...,...
656,Gavin Tucker,1
650,Galore Bofando,1
1425,Nissen Osterneck,1
1427,Noe Hernandez,1


Unnamed: 0,method,count
1,decision,2407
6,submission,1175
7,tko,1160
4,ko,614
5,no contest,51
0,,42
3,draw,39
2,disqualification,15
8,walkover,1


Unnamed: 0,how,count
392,punches,981
169,"30–27, 30–27, 30–27",436
403,rearnaked choke,398
87,"29–28, 29–28, 29–28",375
313,guillotine choke,182
...,...,...
190,"48–46, 48–47, 48–46",1
189,"48–46, 48–46, 49–46",1
187,"48–46, 48–45, 48–46",1
186,"48–45, 48–46, 49–46",1


Unnamed: 0,round,count
2,3.0,2650
0,1.0,1468
1,2.0,878
4,5.0,224
3,4.0,39


Unnamed: 0,city,count
75,las vegas,1223
120,rio de janeiro,119
78,london,114
143,são paulo,102
4,anaheim,93
...,...,...
15,bayamón,9
88,miami,8
23,bossier city,8
156,yokohama,7


Unnamed: 0,country,count
25,usa,3593
2,brazil,429
3,canada,367
22,uk,255
1,australia,173
11,japan,82
21,sweden,72
12,mexico,72
9,germany,67
5,china,62
