# Get and parse scorecards

### Import libraries

In [1]:
from pandas import read_csv, Series, DataFrame, concat
from datetime import datetime
from collections import Counter
from bs4 import BeautifulSoup
from requests import get
import re

### Parse the CSV and build the pandas DataFrames

In [2]:
def parse_location(location):
    """Extract the city and country from a comma-separated location string.

    The string is trimmed, lower-cased and split on ', '. The first piece is
    used as the city and the last piece as the country; anything in between
    is discarded. A string with a single piece yields that piece twice.

    Returns a two-element pandas Series: (city, country).
    """
    parts = location.strip().lower().split(', ')
    city, country = parts[0], parts[-1]
    return Series((city, country))


# Load the raw fight table; the 'date' column is parsed from 'YYYY-MM-DD' text.
data = read_csv('ufcdata/data.csv')
data['date'] = data['date'].apply(lambda dt: datetime.strptime(dt, '%Y-%m-%d'))
# parse_location already returns a (city, country) Series, so applying it to
# the location column yields a two-column frame that fills the new columns.
data[['city', 'country']] = data['location'].apply(parse_location)

# Count appearances per fighter across both corners. Series.append was removed
# in pandas 2.0, so the two columns are stacked with concat instead.
counter = Counter(concat([data['R_fighter'], data['B_fighter']]))
fighter_counts = DataFrame(list(counter.items()), columns=['fighter', 'count'])
fighter_counts = fighter_counts.sort_values(by='count', ascending=False)

# One row per (date, city, country) with the number of fights recorded for it.
locations = (
    data.groupby(['date', 'city', 'country'])
    .size()
    .reset_index(name='no_fights_in_card')
    .sort_values(by='date')
)

display(data.head(), fighter_counts.head(), locations.head())

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,city,country
0,Henry Cejudo,Marlon Moraes,Marc Goddard,2019-06-08,"Chicago, Illinois, USA",Red,True,Bantamweight,5,0.0,...,0.0,8.0,Orthodox,162.56,162.56,135.0,31.0,32.0,chicago,usa
1,Valentina Shevchenko,Jessica Eye,Robert Madrigal,2019-06-08,"Chicago, Illinois, USA",Red,True,Women's Flyweight,5,0.0,...,0.0,5.0,Southpaw,165.1,167.64,125.0,32.0,31.0,chicago,usa
2,Tony Ferguson,Donald Cerrone,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Red,False,Lightweight,3,0.0,...,1.0,14.0,Orthodox,180.34,193.04,155.0,36.0,35.0,chicago,usa
3,Jimmie Rivera,Petr Yan,Kevin MacDonald,2019-06-08,"Chicago, Illinois, USA",Blue,False,Bantamweight,3,0.0,...,0.0,6.0,Orthodox,162.56,172.72,135.0,26.0,29.0,chicago,usa
4,Tai Tuivasa,Blagoy Ivanov,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Blue,False,Heavyweight,3,0.0,...,0.0,3.0,Southpaw,187.96,190.5,264.0,32.0,26.0,chicago,usa


Unnamed: 0,fighter,count
272,Donald Cerrone,32
72,Jim Miller,32
152,Jeremy Stephens,30
476,Michael Bisping,29
155,Diego Sanchez,29


Unnamed: 0,date,city,country,no_fights_in_card
0,1993-11-12,denver,usa,8
1,1994-03-11,denver,usa,15
2,1994-09-09,charlotte,usa,6
3,1994-12-16,tulsa,usa,8
4,1995-04-07,charlotte,usa,9


### Save the html for list of UFC events

In [3]:
# Wikipedia page holding the list of UFC events.
list_ufc_url = 'https://en.wikipedia.org/wiki/List_of_UFC_events'
# Bound the wait and fail loudly on an HTTP error so that an error page is
# never parsed as if it were the event list.
list_ufc_response = get(list_ufc_url, timeout=30)
list_ufc_response.raise_for_status()
past_events_list_parser = BeautifulSoup(list_ufc_response.text, 'html.parser')

In [4]:
def past_event_parse(event):
    """Turn one row of the past-events table into a flat record.

    Parameters
    ----------
    event : bs4 Tag
        A table row whose ``<td>`` cells are read by position: 1 = event name
        (with an optional link), 2 = date, 3 = venue, 4 = location,
        5 = attendance.

    Returns
    -------
    dict
        Keys ``event_name``, ``url``, ``date``, ``venue``, ``city``,
        ``country`` and ``attendance``. ``url`` is ``''`` when the name cell
        holds no link; ``attendance`` is ``None`` when the cell has no digits.

    Raises
    ------
    ValueError
        If the date cell does not match ``'%b %d, %Y'``.
    IndexError
        If the row has fewer than six ``<td>`` cells.
    """
    tds = event.find_all('td')
    locs = tds[4].text.strip().lower().split(', ')

    # Drop bracketed footnote markers such as "[1]" before keeping the digits;
    # otherwise the marker's digits are glued onto the attendance figure.
    attendance_text = re.sub(r'\[[^\]]*\]', '', tds[5].text)
    attendance = re.sub(r'\D', '', attendance_text)

    # Look the anchor up once; a name cell without a link gets an empty URL.
    link = tds[1].find('a')
    url = 'https://en.wikipedia.org' + link['href'] if link is not None else ''

    return {
        'event_name': tds[1].text.strip(),
        'url': url,
        'date': datetime.strptime(tds[2].text.strip(), '%b %d, %Y'),
        'venue': tds[3].text.strip(),
        'city': locs[0],
        # Normalise the abbreviation to the 'usa' spelling.
        'country': 'usa' if locs[-1] == 'u.s.' else locs[-1],
        'attendance': int(attendance) if len(attendance) > 0 else None,
    }

# Locate the table tagged 'Past_events', drop its first row (presumably the
# header) and walk the remaining rows in reverse of their page order.
past_events_table = past_events_list_parser.find('table', {'id': 'Past_events'})
event_rows = past_events_table.findAll('tr')[1:]
past_events = DataFrame([past_event_parse(row) for row in reversed(event_rows)])

# Show the first events together with any event that has no page of its own.
events_without_url = past_events[past_events['url'] == '']
display(past_events.head(), events_without_url)

Unnamed: 0,event_name,url,date,venue,city,country,attendance
0,UFC 1: The Beginning,https://en.wikipedia.org/wiki/UFC_1,1993-11-12,McNichols Sports Arena,denver,usa,7800.0
1,UFC 2: No Way Out,https://en.wikipedia.org/wiki/UFC_2,1994-03-11,Mammoth Gardens,denver,usa,2000.0
2,UFC 3: The American Dream,https://en.wikipedia.org/wiki/UFC_3,1994-09-09,Grady Cole Center,charlotte,usa,
3,UFC 4: Revenge of the Warriors,https://en.wikipedia.org/wiki/UFC_4,1994-12-16,Expo Square Pavilion,tulsa,usa,5857.0
4,UFC 5: The Return of the Beast,https://en.wikipedia.org/wiki/UFC_5,1995-04-07,Independence Arena,charlotte,usa,6000.0


Unnamed: 0,event_name,url,date,venue,city,country,attendance
59,UFC Ultimate Fight Night,,2005-08-06,Cox Pavilion,las vegas,usa,


### Save the HTML of each UFC event page

In [5]:
def _fetch_event_html(url):
    """Download and parse one event page; return '' when the URL is empty.

    The request is bounded by a timeout and an HTTP error status raises
    immediately, so an error page is never stored as if it were an event page.
    """
    if len(url) == 0:
        return ''
    response = get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


past_events['html'] = past_events['url'].apply(_fetch_event_html)

### Parse and split each UFC night into different fights

In [6]:
def _split_win_condition(win_condition):
    """Split a lower-cased, stripped result string into ``(method, how)``."""
    if 'decision' in win_condition:
        splits = win_condition.split(' (')
        if win_condition == 'decision':
            how = ''
        elif len(splits) < 3:
            # Only one parenthesised group: keep its letters and spaces.
            remainder = win_condition.replace('decision', '').strip()
            how = re.sub('[^a-z ]+', '', remainder).strip()
        else:
            # Two or more groups: the second group is kept, minus parentheses.
            how = re.sub('[()]+', '', splits[2]).strip()
        return 'decision', how

    # Everything before the first ' (' is the method, however many words it
    # has; only what follows is the detail. Dropping just the first word
    # would leak the tail of a multi-word method into the detail.
    method, _, detail = win_condition.partition(' (')
    how = ' '.join(re.sub('[^a-z ]+', '', detail).split())
    return method.strip(), how


def parse_event(event):
    """Extract the fights listed in an event page's results table.

    Parameters
    ----------
    event : BeautifulSoup or str
        Parsed event page, or an empty value (``''``) when there is no page.

    Returns
    -------
    list of list
        One ``[winner, loser, method, how, n_round]`` entry per row of the
        ``toccolours`` table that contains no ``<th>`` cell. ``n_round`` is an
        int, or ``None`` when the round cell is empty. The list is empty when
        ``event`` is empty or the page has no such table.

    Raises
    ------
    IndexError
        If a data row has fewer than six ``<td>`` cells.
    ValueError
        If a non-empty round cell is not an integer.
    """
    if len(event) == 0:
        return []

    results_table = event.find('table', {'class': 'toccolours'})
    if results_table is None:
        # No results table on the page means there are no fights to report.
        return []

    fights = []
    for fight in results_table.find_all('tr'):
        if fight.find('th') is not None:
            continue  # rows containing a <th> are not fight rows

        cols = fight.find_all('td')
        winner = cols[1].text.strip()
        loser = cols[3].text.strip()
        method, how = _split_win_condition(cols[4].text.strip().lower())

        n_round = cols[5].text.strip()
        n_round = int(n_round) if len(n_round) > 0 else None

        fights.append([winner, loser, method, how, n_round])
    return fights


# Parse every stored page into its list of fights, then fan the events out so
# that each fight sits on a row of its own (the old index becomes 'index').
past_events['full_fight_info'] = past_events['html'].apply(parse_event)
past_events = past_events.explode('full_fight_info').reset_index()

# Spread each [winner, loser, method, how, round] list over named columns.
fight_columns = {0: 'winner', 1: 'loser', 2: 'method', 3: 'how', 4: 'round'}
fights = past_events['full_fight_info'].apply(Series).rename(columns=fight_columns)

# Join the event details with the fight details, discard the helper columns
# and put the fight fields first.
output_columns = [
    'winner', 'loser', 'method', 'how', 'round',
    'event_name', 'date', 'venue', 'city', 'country', 'attendance', 'url',
]
all_fights = (
    concat([past_events, fights], axis=1)
    .drop(columns=['index', 'html', 'full_fight_info'])
)[output_columns]
display(all_fights.tail())

Unnamed: 0,winner,loser,method,how,round,event_name,date,venue,city,country,attendance,url
5476,Karl Roberson,Roman Kopylov,submission,rearnaked choke,3.0,UFC Fight Night: Zabit vs. Kattar,2019-11-09,CSKA Arena,moscow,russia,11305.0,https://en.wikipedia.org/wiki/UFC_Fight_Night:...
5477,David Zawada,Abubakar Nurmagomedov,submission,triangle choke,1.0,UFC Fight Night: Zabit vs. Kattar,2019-11-09,CSKA Arena,moscow,russia,11305.0,https://en.wikipedia.org/wiki/UFC_Fight_Night:...
5478,Roosevelt Roberts,Alexander Yakovlev,decision,"29–28, 29–28, 29–28",3.0,UFC Fight Night: Zabit vs. Kattar,2019-11-09,CSKA Arena,moscow,russia,11305.0,https://en.wikipedia.org/wiki/UFC_Fight_Night:...
5479,Pannie Kianzad,Jessica-Rose Clark,decision,"30–27, 30–27, 30–27",3.0,UFC Fight Night: Zabit vs. Kattar,2019-11-09,CSKA Arena,moscow,russia,11305.0,https://en.wikipedia.org/wiki/UFC_Fight_Night:...
5480,Davey Grant,Grigorii Popov,decision,"29–28, 28–29, 29–28",3.0,UFC Fight Night: Zabit vs. Kattar,2019-11-09,CSKA Arena,moscow,russia,11305.0,https://en.wikipedia.org/wiki/UFC_Fight_Night:...


### Save the scorecard data as csv

In [None]:
# Write the combined per-fight table to disk as CSV.
all_fights.to_csv('ufcdata/scorecard.csv')