# Fight Location Elevations

In [1]:
# method to get the webpage
def get_page_html(url, max_tries=3, delay=3, timeout=30):
    """Download ``url`` and return the response body as text.

    The request is attempted up to ``max_tries`` times, sleeping ``delay``
    seconds between attempts. Any status code other than 200, and any
    ``requests`` error (connection failure, timeout, ...), counts as a
    failed attempt.

    Returns the page text on success, or the sentinel string ``'Failed'``
    once every attempt has failed; callers compare against that value.
    """
    for attempt in range(1, max_tries + 1):
        try:
            page = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # a network-level error is retried just like a bad status code
            page = None

        if page is not None and page.status_code == 200:
            return page.text

        # no point in waiting after the final attempt
        if attempt < max_tries:
            sleep(delay)
    return 'Failed'

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from time import sleep

# the url to get the fight locations: the Wikipedia list of UFC events,
# which get_past_events() scrapes for event links and locations
list_ufc_url = 'https://en.wikipedia.org/wiki/List_of_UFC_events'

In [None]:
# helper method to get the elevations
def get_elevation(url):
    """Return the elevation shown in a page's infobox, e.g. ``'760 m'``.

    Downloads ``url``, takes the first ``table.infobox.vcard`` row whose
    lower-cased text contains "elevation", and extracts the first number in
    it that is followed by a whitespace character and ``m``. Commas are
    removed from the number (``'1,234 m'`` becomes ``'1234 m'``).

    Returns ``None`` when the page cannot be downloaded, no such row exists,
    or the row holds no usable value.
    """
    page_html = get_page_html(url)
    if page_html == 'Failed':
        return None

    parser = BeautifulSoup(page_html, 'html.parser')
    elevation_str = ''
    for tag in parser.select('table.infobox.vcard > tbody > tr'):
        row_text = tag.get_text().lower()
        if 'elevation' in row_text:
            # only the first matching row is considered
            elevation_str = row_text
            break
    if elevation_str == '':
        return None

    # same pattern as before, now a raw string so the backslashes are not
    # treated as (invalid) string escapes
    regex = r'([\d]+|[\d\,\d]+|[\d\.\d]+|[\d\,]+[\.\d]+)(\sm)'
    match = re.search(regex, elevation_str)
    if match is None:
        return None

    elevation = match.group(1).replace(',', '')
    # the pattern also accepts runs made only of ',' or '.'; those are not
    # numbers, so do not report them as an elevation
    if not any(char.isdigit() for char in elevation):
        return None
    return elevation + ' m'


# helper method to get the value of the column
def get_column_value(row, col_num, get_url):
    """Return the text or the first link target of one cell of a table row.

    ``row`` is searched for ``td`` cells and the cell at index ``col_num`` is
    used. With ``get_url`` true the ``href`` of the cell's first ``a``
    element is returned, otherwise the cell text; both are stripped.

    Returns ``None`` when the cell, the link or its ``href`` is missing.
    """
    try:
        cell = row.select('td')[col_num]
        if get_url:
            return cell.select('a')[0]['href'].strip()
        return cell.get_text().strip()
    except (IndexError, KeyError):
        # IndexError: no such cell / no link in it; KeyError: link without href
        return None

    
# method to get elevations for all the events
def get_past_events(url):
    """Scrape the past-events table for venue elevations and event links.

    For every row that is not marked 'Cancelled', the event link (column 1)
    is collected and the elevation of the row's location (column 4) is
    looked up through ``get_elevation``. Each distinct location/link pair is
    looked up at most once.

    Returns a ``(venue_elevations, event_urls)`` tuple: a dict mapping the
    location text (with 'U.S.'/'U.K.' expanded) to its elevation string, and
    the list of event link targets in table order.

    Raises ``RuntimeError`` when the page cannot be downloaded or the
    expected table is not present.
    """
    page_html = get_page_html(url)
    if page_html == 'Failed':
        raise RuntimeError('Could not download the events page: ' + url)
    parser = BeautifulSoup(page_html, 'html.parser')

    tables = parser.select('table#Past_events.sortable.wikitable')
    if not tables:
        raise RuntimeError('Past events table not found at: ' + url)
    table_rows = tables[0].select('tbody > tr')

    event_urls = []
    venue_elevations = dict()
    # location/link pairs already tried without success, so that venues
    # lacking an elevation are not downloaded again for every event
    failed_lookups = set()
    count = 0

    # the first row is skipped (presumably the header row)
    for row in table_rows[1:]:
        # if the event was cancelled, skip
        if get_column_value(row, 5, False) == 'Cancelled':
            continue

        event_url = get_column_value(row, 1, True)
        if event_url is not None:
            event_urls.append(event_url)

        location = get_column_value(row, 4, False)
        location_url = get_column_value(row, 4, True)
        if location is None or location_url is None:
            # without both the name and the link there is nothing to look up
            continue
        location = location.replace('U.S.', 'USA').replace('U.K.', 'United Kingdom')

        if location in venue_elevations or (location, location_url) in failed_lookups:
            continue

        location_elevation = get_elevation('https://en.wikipedia.org' + location_url)
        if location_elevation is None:
            failed_lookups.add((location, location_url))
            continue

        count += 1
        venue_elevations[location] = location_elevation
        print('Retrieved elevations: {:3d}'.format(count), end='\r', flush=True)

    return venue_elevations, event_urls

# scrape the events page, then tabulate the elevations with the location as index
elevations_data, event_urls = get_past_events(list_ufc_url)
elevations_df = pd.DataFrame.from_dict(
    elevations_data,
    orient='index',
    columns=['location_elevation'],
).rename_axis('location')

In [4]:
elevations_df.head()

Unnamed: 0_level_0,location_elevation
location,Unnamed: 1_level_1
"São Paulo, Brazil",760 m
"New York City, New York, USA",10 m
"Boston, Massachusetts, USA",43 m
"Tampa, Florida, USA",14.6 m
"Melbourne, Australia",31 m


In [5]:
# the missing elevations that are added manually
# keys are location strings in the same form as the elevations_df index
# (country written as 'USA' / 'United Kingdom'); values use the same
# '<number> m' text format as the scraped elevations
missing_elevations = {
    'Saint Petersburg, Russia' : '175.9 m',
    'Adelaide, Australia': '727 m',
    'Sydney, Australia': '130 m',
    'Saitama, Japan': '20 m',
    'Dublin, Ireland': '85 m',
    'Uncasville, Connecticut, USA': '335 m',
    'Rosemont, Illinois, USA': '194.8 m',
    'Brisbane, Australia': '27 m',
    'Fort Hood, Texas, USA': '219 m',
    'Belfast, Northern Ireland, United Kingdom': '80 m',
    'Newcastle, England, United Kingdom': '56 m',
    'Gold Coast, Australia': '16 m',
    'Perth, Australia': '15 m',
    'Glasgow, Scotland, United Kingdom': '6 m',
    'Hamburg, Germany': '116.2 m',
    'Fort Campbell, Kentucky, USA': '168 m',
    'Moscow, Russia': '156 m',
    'Marina Bay, Singapore': '0 m',
    'Manila, Philippines': '12 m',
    'Mashantucket, Connecticut, USA': '90 m',
    'Chiba, Japan': '0 m',
    'Kallang, Singapore': '15 m',
    'Yokohama, Japan': '43 m',
    'Summerlin, Nevada, USA': '822 m',
    'Brooklyn, New York, USA': '66 m',
    'Killeen, Texas, USA': '270 m',
    'Ledyard, Connecticut, USA': '90 m',
    'San Juan, Puerto Rico': '8 m'
}

In [6]:
# adding missing elevations: each hand-entered location becomes a row
# (an existing row with the same location is overwritten)
for location in missing_elevations:
    elevation = missing_elevations[location]
    elevations_df.loc[location] = elevation

In [7]:
# creating the csv of venue elevations, scraped and manually added
# (to_csv writes the location index as the first column by default)
elevations_df.to_csv('locations_elevation_updated.csv')

# Fighter hometowns and their elevations

In [None]:
# list the event links collected by get_past_events(), one per line
for url in event_urls:
    print(url)

In [None]:
# helper method to get the url of the fighter
def get_fighter_url(cols, col_num):
    """Return ``(name, url)`` for the fighter in cell ``col_num`` of ``cols``.

    The name is the stripped cell text and the url the stripped ``href`` of
    the first ``a`` element in the cell.

    Returns ``(None, None)`` when the cell does not exist, holds no link, or
    the link has no ``href``.
    """
    try:
        cell = cols[col_num]
        return cell.text.strip(), cell.find('a')['href'].strip()
    except (IndexError, KeyError, TypeError):
        # IndexError: no such cell; TypeError: find() returned None (no link);
        # KeyError: link without an href attribute
        return None, None

# getting url for all fighters from all events
def extract_fighter_urls(urls):
    """Collect one link per fighter name from the given event pages.

    ``urls`` are event link targets that get prefixed with
    'https://en.wikipedia.org'. On each page the first table with class
    'toccolours' is read; in every row without a ``th`` cell, columns 1 and 3
    are taken as the two fighters.

    Returns a dict mapping fighter name to link target; the first link seen
    for a name is kept. Pages that cannot be downloaded or that have no such
    table are skipped.
    """
    fighter_urls = dict()
    fighter_url_count = 0
    total_events = len(urls)

    for event_count, url in enumerate(urls, start=1):
        page_html = get_page_html('https://en.wikipedia.org' + url)
        if page_html == 'Failed':
            continue

        parser = BeautifulSoup(page_html, 'html.parser')
        fight_table = parser.find('table', {'class': 'toccolours'})
        if fight_table is None:
            # no results table on this page: nothing to collect
            continue

        for fight in fight_table.find_all('tr'):
            # rows containing a th cell are not fight rows
            if fight.find('th') is not None:
                continue
            cols = fight.find_all('td')

            for col_num in (1, 3):
                fighter_name, fighter_url = get_fighter_url(cols, col_num)
                if fighter_name is None or fighter_url is None:
                    continue
                if fighter_name in fighter_urls:
                    continue
                fighter_urls[fighter_name] = fighter_url
                fighter_url_count += 1
                print('Event: {:3d} of {}. Retrieved urls: {:3d}'.format(event_count, total_events, fighter_url_count), end='\r', flush=True)
    return fighter_urls
                    
# build the fighter name -> link mapping from every collected event page
fighter_urls = extract_fighter_urls(event_urls)

In [None]:
# helper method to get the location and its url
def get_location(table_rows, fighter_name):
    """Return ``(location, location_url)`` read from a fighter's infobox.

    Three infobox rows are tried in order; the first one that yields both a
    text and a link is returned:

    1. 'Fighting out of' - text collected from the first link (or the cell
       itself) up to the first ``br``, cut at the first ``[``;
    2. 'Residence' / 'Residence:' - stripped text of the cell;
    3. 'Born:' / 'Born' - the text following a ``)``, ``]`` or ``|``, up to
       the next ``[``.

    The link is always the ``href`` of the first ``a`` that is a direct child
    of the row's ``td``.

    ``table_rows`` is the element searched for the header cells;
    ``fighter_name`` is accepted but not used.

    When no row yields a link, ``location_url`` is ``None`` and ``location``
    holds whatever text was extracted last (possibly ``None`` or empty).
    """
    # failures that simply mean "this row does not have what we need"
    lookup_errors = (AttributeError, IndexError, KeyError, TypeError)
    location_url = None
    location = None

    fighting_out_of = table_rows.find('th', string='Fighting out of')
    if fighting_out_of is not None:
        row = fighting_out_of.find_parent()
        cells = row.select('td')
        if cells:
            # start at the first link of the cell, or at the cell itself
            element = cells[0].find('a')
            if element is None:
                element = cells[0]
            location = ''
            # concatenate the following sibling nodes up to the first line break
            while element is not None and element.name != 'br':
                try:
                    location += element.get_text()
                except (AttributeError, TypeError):
                    # node without get_text(): append it as a plain string
                    location += element
                element = element.next_sibling
            try:
                location = re.search(r'([^\[]+).*', location).group(1)
                location_url = row.select('td > a')[0]['href']
                return location, location_url
            except lookup_errors:
                # no usable text or link here: fall back to the next row
                pass

    residence = table_rows.find('th', string='Residence')
    if residence is None:
        residence = table_rows.find('th', string='Residence:')
    if residence is not None:
        try:
            row = residence.find_parent()
            location = row.select('td')[0].get_text().strip()
            location_url = row.select('td > a')[0]['href']
            return location, location_url
        except lookup_errors:
            # no usable text or link here: fall back to the next row
            pass

    born = table_rows.find('th', string='Born:')
    if born is None:
        born = table_rows.find('th', string='Born')
    if born is not None:
        try:
            row = born.find_parent()
            regex = r'.*[\)|\]]([^\[]+)'
            location = re.search(regex, row.select('td')[0].get_text().strip()).group(1)
            location_url = row.select('td > a')[0]['href']
            return location, location_url
        except lookup_errors:
            # nothing left to try; the caller checks for a missing url
            pass
    return location, location_url


# helper method to resolve one fighter to [location, elevation]
def _get_fighter_record(fighter_name, fighter_url, elevation_cache):
    """Return ``[location, elevation]`` for one fighter, or ``None``.

    ``None`` is returned when the fighter page cannot be downloaded, has no
    ``table.infobox.vcard``, yields no location or link, or the location has
    no elevation. ``elevation_cache`` maps location links to the result of
    ``get_elevation`` so each location page is downloaded only once.
    """
    page_html = get_page_html('https://en.wikipedia.org' + fighter_url)
    if page_html == 'Failed':
        return None

    parser = BeautifulSoup(page_html, 'html.parser')
    infoboxes = parser.select('table.infobox.vcard > tbody')
    if not infoboxes:
        return None

    location, location_url = get_location(infoboxes[0], fighter_name)
    if location is None or location_url is None:
        return None
    location = location.replace('U.S.', 'USA').replace('U.K.', 'United Kingdom')

    if location_url not in elevation_cache:
        elevation_cache[location_url] = get_elevation('https://en.wikipedia.org' + location_url)
    elevation = elevation_cache[location_url]
    if elevation is None:
        return None
    return [location, elevation]


# method to get the fighter's hometown and its location
def get_fighters_location(fighters):
    """Build a table of each fighter's location and its elevation.

    ``fighters`` maps fighter name to the link target of the fighter's page
    (prefixed with 'https://en.wikipedia.org' before downloading).

    Returns a DataFrame indexed by 'Fighter Name' with the columns
    'Location' and 'Elevation'. Fighters for which any lookup step fails are
    left out and counted as skipped in the progress output.
    """
    print('Total number of fighters:', len(fighters))

    count = 0
    skipped_count = 0
    # location link -> elevation (or None), shared by all fighters
    elevation_cache = dict()

    data = pd.DataFrame(columns=['Fighter Name', 'Location', 'Elevation'])
    data.set_index('Fighter Name', inplace=True)

    for fighter_name, fighter_url in fighters.items():
        record = _get_fighter_record(fighter_name, fighter_url, elevation_cache)
        if record is None:
            skipped_count += 1
        else:
            data.loc[fighter_name] = record
            count += 1

        print('Fighter locations retrieved: {:4d}, skipped: {:3d}'.format(count, skipped_count), end='\r', flush=True)

    return data

# look up the location and elevation for every collected fighter,
# then display the resulting table
fighter_locations_df = get_fighters_location(fighter_urls)
fighter_locations_df

In [None]:
# creating the csv of fighter locations and elevations
# (to_csv writes the fighter-name index as the first column by default)
fighter_locations_df.to_csv('fighters_elevation.csv')