In [106]:
import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Wikipedia article listing UFC events; passed to get_past_events further down.
list_ufc_url = 'https://en.wikipedia.org/wiki/List_of_UFC_events'

In [107]:
def get_page_html(url, max_tries=3, retry_delay=3, timeout=30):
    """Download a web page and return its HTML text.

    An attempt counts as failed when the server answers with a status other
    than 200 or when the request itself fails (connection error, timeout,
    ...). Failed attempts are retried after a pause.

    Args:
        url: Address of the page to fetch.
        max_tries: Total number of attempts before giving up.
        retry_delay: Seconds to wait between two attempts.
        timeout: Seconds to wait for the server before an attempt is
            considered failed; without it a stalled connection would block
            the notebook indefinitely.

    Returns:
        The response body as text, or the string 'Failed' once every attempt
        has failed.
    """
    for try_num in range(1, max_tries + 1):
        try:
            page = requests.get(url, timeout=timeout)
            failure = page.status_code
        except requests.RequestException as error:
            # Network-level problems are retried exactly like bad status codes.
            page = None
            failure = error

        if page is not None and page.status_code == 200:
            return page.text

        print('Webpage fetching failed with error {}'.format(failure))
        if try_num == max_tries:
            break
        print('Sleeping for {} seconds before retrying'.format(retry_delay))
        sleep(retry_delay)

    print('Tried {} times and failed, moving on'.format(max_tries))
    return 'Failed'

In [119]:
def get_elevation(url):
    """Scrape an elevation in metres from the infobox of a web page.

    The rows of the page's ``table.infobox.vcard`` are scanned in order; the
    first row that mentions "elevation" and contains a metre value wins.

    Args:
        url: URL of the page to inspect.

    Returns:
        A string such as '1629 m' (thousands separators removed), or None
        when the page could not be fetched, no infobox row mentions
        "elevation", or no such row contains a value in metres.
    """
    page_html = get_page_html(url)
    if page_html == 'Failed':
        # get_page_html reports a failed download with this sentinel string.
        return None

    # A number (optional thousands separators and decimal part) followed by
    # one whitespace character and the metre unit. The trailing \b stops
    # units that merely start with "m" (mi, mm, min, ...) from matching.
    metres_pattern = r'(\d[\d,]*(?:\.\d+)?|\.\d+)\s(?:m|met(?:re|er)s?)\b'

    parser = BeautifulSoup(page_html, 'html.parser')
    for tag in parser.select('table.infobox.vcard > tbody > tr'):
        row_text = tag.get_text().lower()
        if 'elevation' not in row_text:
            continue
        match = re.search(metres_pattern, row_text)
        if match is None:
            # Row mentions elevation but holds no metre value; keep looking
            # instead of failing on a missing match.
            continue
        return match.group(1).replace(',', '') + ' m'

    return None

def get_column_value(row, col_num, get_url):
    """Read one cell of a table row.

    Args:
        row: Table-row element exposing ``select`` (a BeautifulSoup tag in
            this notebook).
        col_num: Zero-based index of the ``td`` cell to read.
        get_url: If true, return the ``href`` of the first link in the cell;
            otherwise return the cell's text.

    Returns:
        The stripped text or href, or None when the row has no such cell,
        the cell contains no link, or the link has no ``href``.
    """
    try:
        cell = row.select('td')[col_num]
        if get_url:
            return cell.select('a')[0]['href'].strip()
        return cell.get_text().strip()
    except (IndexError, KeyError):
        # IndexError: missing cell or no <a> inside it; KeyError: <a> without
        # an href. Anything else (e.g. KeyboardInterrupt) must propagate.
        return None

def get_past_events(url):
    """Scrape the table with id "Past_events" into a DataFrame.

    Rows whose sixth cell reads 'Cancelled' are skipped. For every other row
    the page linked from the location cell is visited to look up its
    elevation; each distinct location page is looked up only once, and its
    result (including None) is reused for later events at the same location.

    Args:
        url: URL of the page containing the table.

    Returns:
        A DataFrame indexed from 1 with the columns 'Event Id', 'Event',
        'Event Url', 'Date', 'Venue', 'Location', 'Location Url' and
        'Elevation'. 'Elevation' is None when the location cell has no link
        or no elevation could be scraped.

    Raises:
        IndexError: If the page contains no matching table.
    """
    columns = ['Event Id', 'Event', 'Event Url', 'Date', 'Venue', 'Location', 'Location Url', 'Elevation']

    page_html = get_page_html(url)
    parser = BeautifulSoup(page_html, 'html.parser')

    past_event_table = parser.select('table#Past_events.sortable.wikitable')[0]
    table_rows = past_event_table.select('tbody > tr')

    records = []
    elevation_by_location = {}

    # The first row is skipped (presumably the table's header row).
    for row in table_rows[1:]:
        if get_column_value(row, 5, False) == 'Cancelled':
            print('Skipping a cancelled event')
            continue
        event_id = get_column_value(row, 0, False)
        event_name = get_column_value(row, 1, False)
        event_url = get_column_value(row, 1, True)
        event_date = get_column_value(row, 2, False)
        event_venue = get_column_value(row, 3, False)
        event_location = get_column_value(row, 4, False)
        event_location_url = get_column_value(row, 4, True)

        if event_location_url is None:
            # No link in the location cell, so there is no page to look up
            # (concatenating None onto the base URL would raise TypeError).
            elevation = None
        else:
            if event_location_url not in elevation_by_location:
                elevation_by_location[event_location_url] = get_elevation(
                    'https://en.wikipedia.org' + event_location_url)
            elevation = elevation_by_location[event_location_url]

        records.append([event_id, event_name, event_url, event_date, event_venue, event_location, event_location_url, elevation])
        print('Retrieved event:', event_id, 'with elevation:', elevation)

    # Build the frame once instead of appending row by row; keep the
    # 1-based row labels the row-by-row version produced.
    data = pd.DataFrame(records, columns=columns)
    data.index = range(1, len(data) + 1)
    return data

# Runs the full scrape: fetches the event list, then looks up elevations on the
# linked location pages, so this cell needs network access and prints one
# progress line per retrieved event.
past_events_data = get_past_events(list_ufc_url)

Retrieved event: 501 with elevation: None
Retrieved event: 500 with elevation: 10 m
Retrieved event: 499 with elevation: None
Retrieved event: 498 with elevation: 43 m
Retrieved event: 497 with elevation: 14.6 m
Retrieved event: 496 with elevation: 31 m
Retrieved event: 495 with elevation: 91 m
Retrieved event: 494 with elevation: 2240 m
Retrieved event: 493 with elevation: 152 m
Retrieved event: 492 with elevation: 27 m
Retrieved event: 491 with elevation: 943.7 m
Retrieved event: 490 with elevation: 48 m
Retrieved event: 489 with elevation: 43 m
Retrieved event: 488 with elevation: 4 m
Retrieved event: 487 with elevation: 645 m
Retrieved event: 486 with elevation: 198 m
Retrieved event: 485 with elevation: 9 m
Retrieved event: 484 with elevation: 610 m
Retrieved event: 483 with elevation: 264 m
Retrieved event: 482 with elevation: 294 m
Retrieved event: 481 with elevation: 182.02 m
Retrieved event: 480 with elevation: 28 m
Retrieved event: 479 with elevation: 154 m
Retrieved event: 4

Retrieved event: 310 with elevation: 93 m
Retrieved event: 309 with elevation: 10 m
Retrieved event: 308 with elevation: 1629 m
Retrieved event: 307 with elevation: 610 m
Retrieved event: 306 with elevation: 28 m
Retrieved event: 305 with elevation: 43 m
Retrieved event: 304 with elevation: 610 m
Retrieved event: 303 with elevation: 719 m
Retrieved event: 302 with elevation: 331 m
Retrieved event: 301 with elevation: None
Retrieved event: 300 with elevation: 610 m
Retrieved event: 299 with elevation: 405 m
Retrieved event: 298 with elevation: 2240 m
Retrieved event: 297 with elevation: 863 m
Retrieved event: 296 with elevation: None
Retrieved event: 295 with elevation: 1020 m
Retrieved event: 294 with elevation: 241.9 m
Retrieved event: 293 with elevation: 28 m
Retrieved event: 292 with elevation: 610 m
Retrieved event: 291 with elevation: None
Retrieved event: 290 with elevation: 1172 m
Retrieved event: 289 with elevation: None
Retrieved event: 288 with elevation: 9 m
Retrieved event:

Retrieved event: 118 with elevation: None
Retrieved event: 117 with elevation: 140 m
Retrieved event: 116 with elevation: 332 m
Retrieved event: 115 with elevation: 320 m
Retrieved event: 114 with elevation: 264 m
Retrieved event: 113 with elevation: None
Retrieved event: 112 with elevation: None
Retrieved event: 111 with elevation: None
Retrieved event: 110 with elevation: 11 m
Retrieved event: 109 with elevation: None
Retrieved event: 108 with elevation: 233 m
Retrieved event: 107 with elevation: 1629 m
Retrieved event: 106 with elevation: 275 m
Retrieved event: 105 with elevation: None
Retrieved event: 104 with elevation: None
Retrieved event: 103 with elevation: None
Retrieved event: 102 with elevation: None
Retrieved event: 101 with elevation: None
Retrieved event: 100 with elevation: 4 m
Retrieved event: 099 with elevation: 147 m
Retrieved event: 098 with elevation: 48 m
Retrieved event: 097 with elevation: None
Retrieved event: 096 with elevation: 11 m
Retrieved event: 095 with 

In [120]:
# Preview the first rows of the scraped events table.
past_events_data.head()

Unnamed: 0,Event Id,Event,Event Url,Date,Venue,Location,Location Url,Elevation
1,501,UFC Fight Night: Magomedsharipov vs. Kattar,/wiki/UFC_Fight_Night:_Magomedsharipov_vs._Kattar,"Nov 9, 2019",CSKA Arena,"Moscow, Russia",/wiki/Moscow,
2,500,UFC 244: Masvidal vs. Diaz,/wiki/UFC_244,"Nov 2, 2019",Madison Square Garden,"New York City, New York, U.S.","/wiki/New_York_City,_New_York",10 m
3,499,UFC Fight Night: Maia vs. Askren,/wiki/UFC_Fight_Night:_Maia_vs._Askren,"Oct 26, 2019",Singapore Indoor Stadium,"Kallang, Singapore",/wiki/Kallang,
4,498,UFC on ESPN: Reyes vs. Weidman,/wiki/UFC_on_ESPN:_Reyes_vs._Weidman,"Oct 18, 2019",TD Garden,"Boston, Massachusetts, U.S.","/wiki/Boston,_Massachusetts",43 m
5,497,UFC Fight Night: Joanna vs. Waterson,/wiki/UFC_Fight_Night:_Joanna_vs._Waterson,"Oct 12, 2019",Amalie Arena,"Tampa, Florida, U.S.","/wiki/Tampa,_Florida",14.6 m


In [128]:
# Manual patch: fill in the elevation for every event whose location links to
# the Las Vegas Valley article.
# NOTE(review): presumably that article's infobox yields no elevation and
# '610 m' is the value to use for Las Vegas -- confirm the source of the figure.
las_vegas_valley_rows = past_events_data['Location Url'] == '/wiki/Las_Vegas_Valley'
past_events_data.loc[las_vegas_valley_rows, 'Elevation'] = '610 m'

In [137]:
# Locations of all events that still have no elevation, then the number of
# distinct locations among them.
elevation_missing = past_events_data['Elevation'].isnull()
empty_locations = past_events_data.loc[elevation_missing, 'Location']
print('Missing elevation for', len(set(empty_locations)), 'locations.')

Missing elevation for 25 locations.
