# UFC_Stats Web Scraping

## importing pertinent libraries

In [1]:
#importing pertinent libraries
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests, re
import json
import itertools
import dask.dataframe as dd
from datetime import datetime

In [2]:
from datetime import datetime

today = datetime.today()
current_date= today.strftime("%B %d, %Y")
current_datetime = datetime.strptime(current_date, "%B %d, %Y" ) 

## Extracting UFC urls

In [2]:
#creating a list of unique fighter URLs available on website
fighter_url_list = []
response = requests.get('http://www.ufcstats.com/statistics/fighters?char=a&page=all')
soup = BeautifulSoup(response.content, 'html.parser')

#targeting URLs that contain fighter data
fighters = soup.findAll('a', attrs={'href': re.compile('http:')})
for fighter in fighters:
    fighter_url_list.append(fighter.get('href'))

#removing duplicate URLs
fighter_url_list = list(set(fighter_url_list))

In [3]:
fighter_url_list[:15]

['http://www.ufcstats.com/fighter-details/9abc648e76c4493a',
 'http://www.ufcstats.com/fighter-details/da603332ad41f165',
 'http://www.ufcstats.com/fighter-details/c487223b0289bda9',
 'http://www.ufcstats.com/fighter-details/6eaec40f724852f4',
 'http://www.ufcstats.com/statistics/fighters?char=a&page=3',
 'http://www.ufcstats.com/fighter-details/3738e68d2261e60f',
 'http://www.ufcstats.com/fighter-details/0e9869d712e81f8f',
 'http://www.ufcstats.com/fighter-details/aa6e591c2a2cdecd',
 'http://www.ufcstats.com/fighter-details/1fc64507a0cb38cf',
 'http://www.ufcstats.com/fighter-details/184b955181bdef52',
 'http://www.ufcstats.com/fighter-details/cad24459b28592ca',
 'http://www.ufcstats.com/fighter-details/d53482bef23235ba',
 'http://www.ufcstats.com/fighter-details/196ed28337adc630',
 'http://www.ufcstats.com/fighter-details/a08ddd04eaffd81d',
 'http://www.ufcstats.com/fighter-details/38b50fd1e1b5b656']

Looks like there are some URLs we don't need in out list. Let's weed them out.

In [3]:
#removing invalid URLs
for url in fighter_url_list:
    if 'fighter-details' not in url:
        fighter_url_list.remove(url)

## Creating Dataframe Building Blocks

In [3]:
def info_generator(url):
    '''Meant to parse any url via beuatiful soup
    
    Args:
        url(str)
        '''
    
    response = requests.get(url)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    return soup

In [6]:
#creating a soup list to reduce number of website requests
basic_soup_list = list(map(info_generator, fighter_url_list))

In [5]:
def fighter_history_fetcher(soup):
    '''This function is meant to extract the html information for each fighter as a list of urls. '''
    
    soup_holder = soup.findAll('a', attrs={'href': re.compile('fight-details')})
    
    list_of_fights = []
    
    for x in soup_holder:
        fight_holder = []
        
        fight_holder.append(x.get('href'))
        
        list_of_fights.append(fight_holder) 
        
    return list_of_fights

### Generating basic statistics lists

In [127]:
def base_stats_soup_generator(soup):
    '''Meant to identiy statistics in the context of UFC Stats urls.
    
    Args:
        soup(BeautifulSoup object) : must be an unaltered soup'''
    
    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    
    return base_stats

In [7]:
stats_soups = list(map(base_stats_soup_generator, basic_soup_list))

NameError: name 'basic_soup_list' is not defined

In [9]:
fighter_history_url_list = list(map(fighter_history_fetcher, basic_soup_list))
fighter_history_url_list[:3]

NameError: name 'basic_soup_list' is not defined

fighter_history_url_list generated lists 

In [10]:
def compound_flattener(fight_url_list):
    flattened_fight_urls = list(itertools.chain(*fight_url_list))
    return flattened_fight_urls

In [11]:
fighter_history_url_list = list(map(compound_flattener, fighter_history_url_list))

NameError: name 'fighter_history_url_list' is not defined

In [8]:
fighter_history_url_list[:3]

NameError: name 'fighter_history_url_list' is not defined

### Basic Statistics DataFrame Construction

In [17]:
def fighter_name(soup):
    
    name = soup.find('span', attrs={'class': re.compile('b-content__title-highlight')})
    
    return name.text.strip()#[0].text.strip()

In [18]:
def fighter_record(soup):
    
    record = soup.findChildren('span', attrs={'class': re.compile('b-content__title-record')})
    
    return record[0].text.strip().split(':')[1].strip()

In [19]:
def basic_stats_height(soup):
    '''soups run thorugh this function should be run through stats_soup function beforehand.'''
    
    #feet
    height = soup[0].text.strip().split(':')[1].strip().split(' ')[0].replace("'", "")
    
   #inches
    if height == '--':
        pass
    else:
        
        height = int(height)
        
        inches = int(soup[0].text.strip().split(':')[1].strip().split(' ')[1].replace("'", "").replace('"', '').replace("--", "0"))

        height_to_inches = (height * 12) 
    
        final_height = height_to_inches + inches
    
        return (inches + height_to_inches)

In [20]:
def basic_stats_weight(soup):
    
    weight = soup[1].text.strip().split(':')[1].strip().replace(' lbs.', '')
    
    if weight == '--':
        pass
    else:
        weight = int(weight)
    return weight 

In [21]:
def basic_stats_reach(soup):
    
    reach = soup[2].text.strip().split(':')[1].strip().replace('"', '')
    
    if reach == '--':
        pass
    else:
        reach = int(reach)
    return reach

In [22]:
def basic_stats_stance(soup):
    
    stance = soup[3].text.strip().split(':')[1].strip()
    return stance

In [23]:
def basic_stats_DOB(soup):
    
    DOB = soup[4].text.strip().split(':')[1].strip()
    return DOB

In [24]:
fighter_names = list(map(fighter_name, basic_soup_list))

fighter_records = list(map(fighter_record, basic_soup_list))

fighter_heights =  list(map(basic_stats_height, stats_soups))

fighter_weights = list(map(basic_stats_weight, stats_soups))

fighter_reachs = list(map(basic_stats_reach, stats_soups))

fighter_stances = list(map(basic_stats_stance, stats_soups))

fighter_DOBs = list(map(basic_stats_DOB, stats_soups))

In [25]:
fighter_basic_stats_df = pd.DataFrame()

fighter_basic_stats_df['name'] = fighter_names

fighter_basic_stats_df['record'] = fighter_records

fighter_basic_stats_df['height'] = fighter_heights

fighter_basic_stats_df['weight'] = fighter_weights

fighter_basic_stats_df['reach'] = fighter_reachs

fighter_basic_stats_df['stance'] = fighter_stances

fighter_basic_stats_df['DOB'] = fighter_DOBs

In [26]:
fighter_basic_stats_df.head()

Unnamed: 0,name,record,height,weight,reach,stance,DOB
0,Jim Alers,13-3-0 (1 NC),69.0,145,71,Orthodox,"Oct 14, 1986"
1,Mike Aina,12-6-1 (1 NC),69.0,155,--,Orthodox,--
2,Alex Andrade,10-5-0,71.0,200,--,Orthodox,"May 14, 1974"
3,Juan Alcain,1-2-0,,--,--,,--
4,Jose Aldo,28-6-0,67.0,135,70,Orthodox,"Sep 09, 1986"


In [27]:
fighter_basic_stats_df.to_csv('fighter_basic_stats_csv', index=False)

### Career Statistics DataFrame Construction

In [28]:
def career_stats_soup(soup):
    
    career_stats_soup = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    career_stats = career_stats_soup[5:]
    
    return career_stats_soup

In [29]:
def career_stats_SLpM(soup):

    SLpM = soup[5].text.split(':')[1].replace('%', '').strip()
    
    if SLpM == '--':
        pass
    else:
        SLpM = float(SLpM)
        
    return SLpM

In [30]:
def career_stats_StrAcc(soup):
    StrAcc =  soup[6].text.split(':')[1].replace('%', '').strip()
    
    if StrAcc == '--':
        pass
    else:
        StrAcc = float(StrAcc)
        
    return StrAcc

In [31]:
def career_stats_SApM(soup):
    SApM = soup[7].text.split(':')[1].strip()
    
    if SApM == '--':
        pass
    else:
        SApM = float(SApM)
        
    return SApM

In [32]:
def career_stats_StrDef(soup):
    StrDef = soup[8].text.split(':')[1].replace('%', '').strip()
    
    if StrDef == '--':
        pass
    else:
        StrDef = float(StrDef)
        
    return StrDef

In [33]:
def career_stats_TDAvg(soup):
    TDAvg = soup[10].text.split(':')[1].strip()
    
    if TDAvg == '--':
        pass
    else:
        TDAvg = float(TDAvg)
        
    return TDAvg

In [34]:
def career_stats_TDAcc(soup):
    TDAcc = soup[11].text.split(':')[1].strip().replace('%', '')
    
    if TDAcc == '--':
        pass
    else:
        TDAcc = float(TDAcc)
        
    return TDAcc

In [35]:
def career_stats_TDDef(soup):
    TDDef = soup[12].text.split(':')[1].replace('%', '').strip()
    
    if TDDef == '--':
        pass
    else:
        TDDef = float(TDDef)
        
    return TDDef

In [36]:
def career_stats_SubAvg(soup):
    SubAvg =  soup[13].text.split(':')[1].strip().replace('%', '')  
    
    if SubAvg == '--':
        pass
    else:
        SubAvg = float(SubAvg)
        
    return SubAvg

In [37]:
fighter_SLpMs = list(map(career_stats_SLpM, stats_soups))

fighter_StrAccs = list(map(career_stats_StrAcc, stats_soups))

fighter_SApMs = list(map(career_stats_SApM, stats_soups))

fighter_StrDefs =list(map(career_stats_StrDef, stats_soups))

fighter_TDAvgs = list(map(career_stats_TDAvg, stats_soups))

fighter_TDAccs = list(map(career_stats_TDAcc, stats_soups))

fighter_TDDefs = list(map(career_stats_TDDef, stats_soups))

fighter_SubAvgs = list(map(career_stats_SubAvg, stats_soups))

In [38]:
fighter_career_stats_df = pd.DataFrame()

fighter_career_stats_df['name'] = fighter_names

fighter_career_stats_df['SLpMs'] = fighter_SLpMs

fighter_career_stats_df['StrAccs'] = fighter_StrAccs

fighter_career_stats_df['SApMs'] = fighter_SApMs

fighter_career_stats_df['StrDefs'] = fighter_StrDefs

fighter_career_stats_df['TDAvgs'] = fighter_TDAvgs

fighter_career_stats_df['TDAccs'] = fighter_TDAccs

fighter_career_stats_df['TDDefs'] = fighter_TDDefs

fighter_career_stats_df['SubAvgs'] = fighter_SubAvgs

In [39]:
fighter_career_stats_df.head()

Unnamed: 0,name,SLpMs,StrAccs,SApMs,StrDefs,TDAvgs,TDAccs,TDDefs,SubAvgs
0,Jim Alers,2.9,40.0,4.38,56.0,2.26,41.0,80.0,0.3
1,Mike Aina,2.87,33.0,6.33,59.0,0.0,0.0,71.0,0.0
2,Alex Andrade,0.2,36.0,2.6,53.0,0.0,0.0,25.0,0.8
3,Juan Alcain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Jose Aldo,3.47,44.0,3.18,65.0,0.64,65.0,91.0,0.1


In [40]:
fighter_career_stats_df.to_csv('fighter_career_stats_csv', index=False)

### Constructing Fight Event and Fight List Database

In [12]:
#creating a list of unique fighter URLs available on website
event_url_list = []
response = requests.get('http://www.ufcstats.com/statistics/events/completed?page=all')
soup = BeautifulSoup(response.content, 'html.parser')

#targeting URLs that contain fighter data
events = soup.findAll('a', attrs={'href': re.compile('http:')})
for event in events:
    event_url_list.append(event.get('href'))

#removing duplicate URLs
event_url_list = list(set(event_url_list))

In [34]:
#removing invalid URLs
for url in event_url_list:
    if 'event-details' not in url:
        event_url_list.remove(url)
    
for url in event_url_list:  
    if len(url) <  54:
        event_url_list.remove(url)

In [14]:
event_soups = list(map(info_generator, event_url_list))

In [36]:
event_url_list, len(event_url_list)

(['http://www.ufcstats.com/event-details/83d0de122f2f9664',
  'http://www.ufcstats.com/event-details/caced97768818230',
  'http://www.ufcstats.com/event-details/4c8d6fde2dde07c4',
  'http://www.ufcstats.com/event-details/49590e0508b2c19f',
  'http://www.ufcstats.com/event-details/749685d24e2cac50',
  'http://www.ufcstats.com/event-details/d5ae8074631762fc',
  'http://www.ufcstats.com/event-details/53e533db1b8e9712',
  'http://www.ufcstats.com/event-details/4956f60b7fa57c1a',
  'http://www.ufcstats.com/event-details/a79bfbc01b2264d6',
  'http://www.ufcstats.com/event-details/2eae41f61776c60f',
  'http://www.ufcstats.com/event-details/fa8b9e6b0c2269f8',
  'http://www.ufcstats.com/event-details/b0550072e5f0afa7',
  'http://www.ufcstats.com/event-details/e7bc606d269896aa',
  'http://www.ufcstats.com/event-details/2c104b7e59a72629',
  'http://www.ufcstats.com/event-details/f341f9551ba744e2',
  'http://www.ufcstats.com/event-details/ee457ef1e1c326c1',
  'http://www.ufcstats.com/event-details

In [37]:
def event_date_fetcher(event_soup):
    date_holder = event_soup.findChildren('li', attrs={'class': re.compile('b-list__')})
    
    try:
        date = date_holder[0].text.split(':')[1].strip()
        
        return date 
        
    except:
        
        pass    

In [38]:
def event_fights_parser(event_soup):
        
    holder = event_soup.findAll('a', attrs={'href': re.compile('fight-details')})
    
    event_fights_holder = []
    
    for x in range(0, len(holder)):
        
        event_fights_holder.append(holder[x].get('href'))
        
    return event_fights_holder

In [39]:
def event_name_fetcher(event_soup):
    event_title = event_soup.find('h2', attrs={'class': re.compile('b-content__')}).text.strip()
    return event_title

In [40]:
event_database_df = pd.DataFrame()

#adding event_urls to dataframe
event_database_df['event'] = event_url_list

#adding event_dates to dataframe and converting to time series objects
event_dates_list = list(map(event_date_fetcher, event_soups))

event_database_df['date'] = event_dates_list

event_database_df['date'] = pd.to_datetime(event_database_df['date'])

#adding event_name to dataframe
event_names_list = list(map(event_name_fetcher,event_soups))

event_database_df['name'] = event_names_list

ValueError: Length of values does not match length of index

In [None]:
event_database_df

In [None]:
event_database_df.info()

In [None]:
def fight_participants_fetcher(event_soup):
    
    fight_pairs = []
    
    name_holder = event_soup.findChildren('a', attrs={'class': re.compile('b-link')})

    fighter1_list_holder, fighter2_list_holder = name_holder[0::2], name_holder[1::2]

    fighter1_list = []
    fighter2_list = []

    for fighter1 in fighter1_list_holder:
        fighter1_list.append(fighter1.text.strip())
        
    for fighter2 in fighter2_list_holder:
        fighter2_list.append(fighter2.text.strip())
      
    fighters_in_ring = zip(fighter1_list, fighter2_list)
    
    for duo in fighters_in_ring:
        fight_pairs.append(duo)
        
    return fight_pairs

--------------------------------------FIX IT FIX IT FIX IT FIX IT FIX IT FIX IT--------------------------------------------

In [125]:
event_soups[15].findChildren('td', attrs={'class': re.compile('b-fight-details__table')})[0].text.strip().split()

['draw', 'draw']

In [130]:
edge_case_number = []
for number, event_soup in enumerate(event_soups):
    try:
        fighter1_strikes = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[3].text.strip()
        fighter2_strikes = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[4].text.strip()
    except:
        print(f'Edge Case: {number}, {event_url_list[number]}')
        edge_case_number.append(number)
        
    try: 
        fighter1_strikes = 0 if ''==fighter1_strikes else int(fighter1_strikes)
    except:
        print(f'Edge Case: {number}, {event_url_list[number]}')
        edge_case_number.append(number)
        
    try: 
        fighter2_strikes = 0 if ''==fighter1_strikes else int(fighter2_strikes)
    except:
        print(f'Edge Case: {number}, {event_url_list[number]}')
        edge_case_number.append(number)

Edge Case: 15, http://www.ufcstats.com/event-details/ee457ef1e1c326c1
Edge Case: 45, http://www.ufcstats.com/event-details/47b7e4e60813b7b2
Edge Case: 54, http://www.ufcstats.com/event-details/c32eab6c2119e989
Edge Case: 54, http://www.ufcstats.com/event-details/c32eab6c2119e989
Edge Case: 162, http://www.ufcstats.com/event-details/df05aa15b2d66f57
Edge Case: 242, http://www.ufcstats.com/event-details/abcf7e55a0a9ed89
Edge Case: 295, http://www.ufcstats.com/event-details/de3ed2e152520c8d
Edge Case: 305, http://www.ufcstats.com/event-details/0aa92558424ced9e
Edge Case: 374, http://www.ufcstats.com/event-details/ef7fa30364cbe7f2
Edge Case: 431, http://www.ufcstats.com/event-details/2e04a3b4a2011b97
Edge Case: 509, http://www.ufcstats.com/event-details/322a56923b396b4d


In [131]:
sorted(set(edge_case_number))

[15, 45, 54, 162, 242, 295, 305, 374, 431, 509]

--------------------------------------FIX IT FIX IT FIX IT FIX IT FIX IT FIX IT--------------------------------------------

In [132]:
def fight_takedowns(event_soup):
        
    fighter1_takedowns = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[5].text.strip()
    fighter2_takedowns = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[6].text.strip()

    
    fighter1_takedowns = 0 if ''==fighter1_takedowns else int(fighter1_takedowns) 
    fighter2_takedowns = 0 if ''==fighter2_takedowns else int(fighter2_takedowns)
    
    return fighter1_takedowns, fighter2_takedowns

In [71]:
def fight_submissions(event_soup):
        
    fighter1_submissions = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[7].text.strip()
    fighter2_submissions = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[8].text.strip()
    
    fighter1_submissions = 0 if ''==fighter1_submissions else int(fighter1_submissions) 
    fighter2_submissions = 0 if ''==fighter2_submissions else int(fighter2_submissions)
    
    return fighter1_submissions, fighter2_submissions

In [72]:
def fight_passes(event_soup):
        
    fighter1_passes = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[9].text.strip()
    fighter2_passes = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[10].text.strip()
    
    fighter1_passes = 0 if ''==fighter1_passes else int(fighter1_passes) 
    fighter2_passes = 0 if ''==fighter2_passes else int(fighter2_passes)
    
    return fighter1_passes, fighter2_passes

In [73]:
def fight_weight_class(event_soup):
        
    weight_class = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[11]
    
    return weight_class.text.strip()

In [74]:
def fight_win_method(event_soup):
        
    win_method = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[12]
    win_method_detail = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[13]
    
    return win_method.text.strip(), win_method_detail.text.strip()

In [75]:
def fight_final_round(event_soup):
        
    ending_round = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[14].text.strip()
    
    blank = ''
    
    ending_round = 0 if blank==ending_round else int(ending_round)
    
    return ending_round

In [76]:
def fight_final_round_time(event_soup):
    
    ending_round_time = event_soup.findAll('p', attrs={'class': re.compile('b-fight-details__table')})[15].text.strip()
    
    return ending_round_time

In [133]:
event_fight_list = list(map(event_fights_parser, event_soups))

matchup_list = list(map(fight_participants_fetcher, event_soups))

fighter_strike_count_list = list(map(fight_strikes,event_soups))
    
fight_takedown_list = list(map(fight_takedowns,event_soups))

#fight_submission_list = list(map(fight_submissions,event_soups))
    
#fight_pass_list = list(map(fight_passes,event_soups))

#fight_weigth_class_list = list(map(fight_weight_class,event_soups))
    
fight_win_method_list = list(map(fight_win_method,event_soups))

#fight_final_round_list = list(map(fight_final_round,event_soups))
    
fight_final_round_time_list = list(map(fight_final_round_time,event_soups))

In [137]:
len(fighter_strike_count_list)

520

In [20]:
event_database_df['matchups'] = matchup

In [17]:
event_database_df['fight_list'] = event_fight_list

In [135]:
event_database_df.to_csv('event_database_csv', index=False)

PROTOCODE ==================================