In [1]:
def get_page_html(url):
    try_num = 1
    response = 0
    while(response != 200):
        page = requests.get(url)
        response = page.status_code
        if(response != 200):
            print('Webpage fetching failed with error {}'.format(response))
            if(try_num == 3):
                print('Tried 3 times and failed, moving on')
                return 'Failed'
            else:
                print('Sleeping for 3 seconds before retrying')
                try_num += 1
                sleep(3)
    return page.text

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from time import sleep

list_ufc_url = 'https://en.wikipedia.org/wiki/List_of_UFC_events'

In [3]:
def get_elevation(url):
    page_html = get_page_html(url)
    parser = BeautifulSoup(page_html, 'html.parser')
    tags = parser.select('table.infobox.vcard > tbody > tr')
    elevation_str = ''
    for tag in tags:
        if('elevation' in tag.get_text().lower()):
            elevation_str = tag.get_text().lower()
            break
            
    if(elevation_str != ''):
        regex = '([\d]+|[\d\,\d]+|[\d\.\d]+|[\d\,]+[\.\d]+)(\sm)'
        try:
            elevation, elevation_unit = re.search(regex, elevation_str).group().split()
            elevation = re.sub(',', '', elevation)
            return (elevation + ' ' + elevation_unit).strip()
        except:
            return None
    return None

def get_column_value(row, col_num, get_url):
    try:
        if(get_url):
            return (row.select('td')[col_num].select('a')[0]['href']).strip()
        else:
            return (row.select('td')[col_num].get_text()).strip()
    except:
        return None

def get_past_events(url):
    page_html = get_page_html(url)
    parser = BeautifulSoup(page_html, 'html.parser')
    
    past_event_table = parser.select('table#Past_events.sortable.wikitable')[0]
    table_rows = past_event_table.select('tbody > tr')
    
    data = pd.DataFrame(columns=['Event Id', 'Event', 'Event Url', 'Date', 'Venue', 'Location', 'Location Url', 'Elevation'])

    for index in range(1, len(table_rows)):
        row = table_rows[index]
        if(get_column_value(row, 5, False) == 'Cancelled'):
            print('Skipping a cancelled event')
            continue
        event_id = get_column_value(row, 0, False)
        event_name = get_column_value(row, 1, False)
        event_url = get_column_value(row, 1, True)
        event_date = get_column_value(row, 2, False)
        event_venue = get_column_value(row, 3, False)
        event_location = get_column_value(row, 4, False)
        event_location_url = get_column_value(row, 4, True)
        elevation = get_elevation('https://en.wikipedia.org' + event_location_url)
        
        data.loc[len(data)+1] = [event_id, event_name, event_url, event_date, event_venue, event_location, event_location_url, elevation]
        print('Retrieved event:', event_id, 'with elevation:', elevation)
        
    return data

past_events_data = get_past_events(list_ufc_url)

Retrieved event: 501 with elevation: None
Retrieved event: 500 with elevation: 10 m
Retrieved event: 499 with elevation: None
Retrieved event: 498 with elevation: 43 m
Retrieved event: 497 with elevation: 14.6 m
Retrieved event: 496 with elevation: 31 m
Retrieved event: 495 with elevation: 91 m
Retrieved event: 494 with elevation: 2240 m
Retrieved event: 493 with elevation: 152 m
Retrieved event: 492 with elevation: 27 m
Retrieved event: 491 with elevation: 943.7 m
Retrieved event: 490 with elevation: 48 m
Retrieved event: 489 with elevation: 43 m
Retrieved event: 488 with elevation: 4 m
Retrieved event: 487 with elevation: 645 m
Retrieved event: 486 with elevation: 198 m
Retrieved event: 485 with elevation: 9 m
Retrieved event: 484 with elevation: 610 m
Retrieved event: 483 with elevation: 264 m
Retrieved event: 482 with elevation: 294 m
Retrieved event: 481 with elevation: 182.02 m
Retrieved event: 480 with elevation: 28 m
Retrieved event: 479 with elevation: 154 m
Retrieved event: 4

In [4]:
past_events_data.head()

Unnamed: 0,Event Id,Event,Event Url,Date,Venue,Location,Location Url,Elevation
1,501,UFC Fight Night: Magomedsharipov vs. Kattar,/wiki/UFC_Fight_Night:_Magomedsharipov_vs._Kattar,"Nov 9, 2019",CSKA Arena,"Moscow, Russia",/wiki/Moscow,
2,500,UFC 244: Masvidal vs. Diaz,/wiki/UFC_244,"Nov 2, 2019",Madison Square Garden,"New York City, New York, U.S.","/wiki/New_York_City,_New_York",10 m
3,499,UFC Fight Night: Maia vs. Askren,/wiki/UFC_Fight_Night:_Maia_vs._Askren,"Oct 26, 2019",Singapore Indoor Stadium,"Kallang, Singapore",/wiki/Kallang,
4,498,UFC on ESPN: Reyes vs. Weidman,/wiki/UFC_on_ESPN:_Reyes_vs._Weidman,"Oct 18, 2019",TD Garden,"Boston, Massachusetts, U.S.","/wiki/Boston,_Massachusetts",43 m
5,497,UFC Fight Night: Joanna vs. Waterson,/wiki/UFC_Fight_Night:_Joanna_vs._Waterson,"Oct 12, 2019",Amalie Arena,"Tampa, Florida, U.S.","/wiki/Tampa,_Florida",14.6 m


In [5]:
past_events_data.loc[past_events_data['Location Url'] == '/wiki/Las_Vegas_Valley', 'Elevation'] = '610 m'

In [6]:
empty_locations = past_events_data[past_events_data['Elevation'].isnull()]['Location']
print('Missing elevation for', len(list(set(empty_locations))), 'locations.')
print(list(set(empty_locations)))

Missing elevation for 25 locations.
['Marina Bay, Singapore', 'Rosemont, Illinois, U.S.', 'Brooklyn, New York, U.S.', 'Summerlin, Nevada, U.S.', 'Moscow, Russia', 'Uncasville, Connecticut, U.S.', 'Dublin, Ireland', 'Hamburg, Germany', 'Gold Coast, Australia', 'Fort Campbell, Kentucky, U.S.', 'Pasay, Philippines', 'Chiba, Japan', 'Newcastle upon Tyne, England, U.K.', 'Brisbane, Australia', 'Saitama, Japan', 'Perth, Australia', 'Adelaide, Australia', 'Sydney, Australia', 'Yokohama, Japan', 'Fort Hood, Texas, U.S.', 'Kallang, Singapore', 'Saint Petersburg, Russia', 'Belfast, Northern Ireland, U.K.', 'Glasgow, Scotland, U.K.', 'Mashantucket, Connecticut, U.S.']


In [7]:
missing_elevations = {
    'Saint Petersburg, Russia' : '175.9 m',
    'Adelaide, Australia': '727 m',
    'Sydney, Australia': '130 m',
    'Saitama, Japan': '20 m',
    'Dublin, Ireland': '85 m',
    'Uncasville, Connecticut, U.S.': '335 m',
    'Rosemont, Illinois, U.S.': '194.8 m',
    'Brisbane, Australia': '27 m',
    'Fort Hood, Texas, U.S.': '219 m',
    'Belfast, Northern Ireland, U.K.': '80 m',
    'Newcastle upon Tyne, England, U.K.': '56 m',
    'Gold Coast, Australia': '16 m',
    'Perth, Australia': '15 m',
    'Glasgow, Scotland, U.K.': '6 m',
    'Hamburg, Germany': '116.2 m',
    'Fort Campbell, Kentucky, U.S.': '168 m',
    'Moscow, Russia': '156 m',
    'Marina Bay, Singapore': '0 m',
    'Pasay, Philippines': '12 m',
    'Mashantucket, Connecticut, U.S.': '90 m',
    'Chiba, Japan': '0 m',
    'Kallang, Singapore': '15 m',
    'Yokohama, Japan': '43 m',
    'Summerlin, Nevada, U.S.': '822 m',
    'Brooklyn, New York, U.S.': '66 m'
}

In [8]:
for location in empty_locations:
    past_events_data.loc[past_events_data['Location'] == location, 'Elevation'] = missing_elevations[location]

In [9]:
past_events_data[past_events_data['Elevation'].isnull()]

Unnamed: 0,Event Id,Event,Event Url,Date,Venue,Location,Location Url,Elevation


In [10]:
event_urls = past_events_data['Event Url']
print(event_urls)

1      /wiki/UFC_Fight_Night:_Magomedsharipov_vs._Kattar
2                                          /wiki/UFC_244
3                 /wiki/UFC_Fight_Night:_Maia_vs._Askren
4                   /wiki/UFC_on_ESPN:_Reyes_vs._Weidman
5             /wiki/UFC_Fight_Night:_Joanna_vs._Waterson
                             ...                        
497                                          /wiki/UFC_5
498                                          /wiki/UFC_4
499                                          /wiki/UFC_3
500                                          /wiki/UFC_2
501                                          /wiki/UFC_1
Name: Event Url, Length: 501, dtype: object


In [11]:
fighters = dict()

def get_fighter_url(cols, col_num):
    try:
        return cols[col_num].text.strip(), cols[col_num].find('a')['href'].strip()
    except:
        return None, None

event = 501

for url in event_urls:
    print('Working on event:', event)
    event -= 1
    if(url == None):
        continue
    page_html = get_page_html('https://en.wikipedia.org' + url)
    if(page_html == 'Failed'):
        continue
    parser = BeautifulSoup(page_html, 'html.parser')
    
    fight_rows = parser.find('table', {'class': 'toccolours'}).findAll('tr')
    
    for fight in fight_rows:
        if fight.find('th') is None:
            cols = fight.findAll('td')
            
            fighter1_name, fighter1_url  = get_fighter_url(cols, 1)
            fighter2_name, fighter2_url = get_fighter_url(cols, 3)
            
            if(fighter1_name != None and fighter1_url != None):
                fighters[fighter1_name] = fighter1_url
                
            if(fighter2_name != None and fighter2_url != None):
                fighters[fighter2_name] = fighter2_url

Working on event: 501
Working on event: 500
Working on event: 499
Working on event: 498
Working on event: 497
Working on event: 496
Working on event: 495
Working on event: 494
Working on event: 493
Working on event: 492
Working on event: 491
Working on event: 490
Working on event: 489
Working on event: 488
Working on event: 487
Working on event: 486
Working on event: 485
Working on event: 484
Working on event: 483
Working on event: 482
Working on event: 481
Working on event: 480
Working on event: 479
Working on event: 478
Working on event: 477
Working on event: 476
Working on event: 475
Working on event: 474
Working on event: 473
Working on event: 472
Working on event: 471
Working on event: 470
Working on event: 469
Working on event: 468
Working on event: 467
Working on event: 466
Working on event: 465
Working on event: 464
Working on event: 463
Working on event: 462
Working on event: 461
Working on event: 460
Working on event: 459
Working on event: 458
Working on event: 457
Working on

In [12]:
sleep(1)
def get_location(table_rows, fighter_name):
    location_url = None
    location = None
    
    fighting_out_of = table_rows.find('th', string='Fighting out of')
    if(fighting_out_of != None):
        element = fighting_out_of.find_parent().select('td')[0].find('a')
        if(element == None):
            element = fighting_out_of.find_parent().select('td')[0]
        location = ''
        while(True):
            if((element == None) or (element.name != None and element.name == 'br')):
                break
            try:
                location += element.get_text()
            except:
                location += element
            element = element.next_sibling            
        try:
            location = re.search('([^\[]+).*', location).group(1)
            location_url = fighting_out_of.find_parent().select('td > a')[0]['href']
            return location, location_url
        except:
            print('cannot get location using fighting out of for', fighter_name)
    
    residence = table_rows.find('th', string='Residence')
    if(residence == None):
        residence = table_rows.find('th', string='Residence:')
    if(residence != None):
        try:
            location = residence.find_parent().select('td')[0].get_text().strip()
            location_url = residence.find_parent().select('td > a')[0]['href']
            return location, location_url
        except:
            print('cannot get location using residence for', fighter_name)
    
    born = table_rows.find('th', string='Born:')
    if(born == None):
        born = table_rows.find('th', string='Born')
    if(born != None):
        try:
            regex = '.*[\)|\]]([^\[]+)'
            location = re.search(regex, born.find_parent().select('td')[0].get_text().strip()).group(1)
            location_url = born.find_parent().select('td > a')[0]['href']
            return location, location_url
        except:
            print('cannot get location using born for', fighter_name)
    else:
        print('cannot get location for', fighter_name)
    
    return location, location_url

def get_fighters_location(fighters):
    items_left = len(fighters)
    data = pd.DataFrame(columns=['Fighter Name', 'Fighter Url', 'Location', 'Location Url', 'Elevation'])
    for fighter_name, fighter_url in fighters.items():
        print('Fighter:', fighter_name, 'Left:', items_left)
        items_left -= 1
        page_html = get_page_html('https://en.wikipedia.org' + fighter_url)
        if(page_html == 'Failed'):
            print('Failed to get page html')
            print()
            continue
        parser = BeautifulSoup(page_html, 'html.parser')
        try:
            table_rows = parser.select('table.infobox.vcard > tbody')[0]
        except:
            print('No info table for', fighter_name)
            print()
            continue

        location, location_url = get_location(table_rows, fighter_name)
        
        elevation = None
        if(location_url != None):
            elevation = get_elevation('https://en.wikipedia.org' + location_url)
            print('Location:', location, 'Elevation:', elevation)
        print()
        
        data.loc[len(data)+1] = [fighter_name, fighter_url, location, location_url, elevation]
    return data

fighter_locations = get_fighters_location(fighters)
fighter_locations

Fighter: Zabit Magomedsharipov Left: 1610
Location: Khasavyurt, Dagestan, Russia Elevation: 130 m

Fighter: Calvin Kattar Left: 1609
Location: Methuen, Massachusetts, United States Elevation: 35 m

Fighter: Alexander Volkov Left: 1608
Location: Moscow, Russia Elevation: None

Fighter: Greg Hardy Left: 1607
Location: Millington, Tennessee Elevation: 80 m

Fighter: Danny Roberts Left: 1606
Location: Liverpool, England, U.K. Elevation: 70 m

Fighter: Ed Herman Left: 1605
Location: Portland, Oregon, United States Elevation: 15.2 m

Fighter: Anthony Rocco Martin Left: 1604
Location: Boston, Massachusetts, United States Elevation: 43 m

Fighter: Ramazan Emeev Left: 1603
Location: Makhachkala, Dagestan, Russia Elevation: 10 m

Fighter: Shamil Gamzatov Left: 1602
Location: Makhachkala, Dagestan, Russia  Elevation: 10 m

Fighter: Magomed Ankalaev Left: 1601
Location: Makhachkala, Dagestan, Russia Elevation: 10 m

Fighter: Dalcha Lungiambula Left: 1600
No info table for Dalcha Lungiambula

Fight

Unnamed: 0,Fighter Name,Fighter Url,Location,Location Url,Elevation
1,Zabit Magomedsharipov,/wiki/Zabit_Magomedsharipov,"Khasavyurt, Dagestan, Russia",/wiki/Khasavyurt,130 m
2,Calvin Kattar,/wiki/Calvin_Kattar,"Methuen, Massachusetts, United States","/wiki/Methuen,_Massachusetts",35 m
3,Alexander Volkov,/wiki/Alexander_Volkov_(fighter),"Moscow, Russia",/wiki/Moscow,
4,Greg Hardy,/wiki/Greg_Hardy,"Millington, Tennessee","/wiki/Millington,_Tennessee",80 m
5,Danny Roberts,/wiki/Danny_Roberts_(fighter),"Liverpool, England, U.K.",/wiki/Liverpool,70 m
...,...,...,...,...,...
1586,Emmanuel Yarborough,/wiki/Emmanuel_Yarborough,"September 5, 1964Rahway, New Jersey, U.S.","/wiki/Rahway,_New_Jersey",7 m
1587,Johnny Rhodes,/wiki/Johnny_Rhodes_(fighter),,,
1588,Gerard Gordeau,/wiki/Gerard_Gordeau,"Den Haag, Netherlands",/wiki/Den_Haag,1 m
1589,Art Jimmerson,/wiki/Art_Jimmerson,"St. Louis, Missouri",/wiki/St._Louis,142 m
