# UFC_Stats Web Scraping

## importing pertinent libraries

In [1]:
#importing pertinent libraries
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests, re
import json
import itertools
import dask.dataframe as dd

## Extracting UFC urls

In [2]:
#creating a list of unique fighter URLs available on website
fighter_url_list = []
response = requests.get('http://www.ufcstats.com/statistics/fighters?char=a&page=all')
soup = BeautifulSoup(response.content, 'html.parser')

#targeting URLs that contain fighter data
fighters = soup.findAll('a', attrs={'href': re.compile('http:')})
for fighter in fighters:
    fighter_url_list.append(fighter.get('href'))

#removing duplicate URLs
fighter_url_list = list(set(fighter_url_list))
fighter_url_list

['http://www.ufcstats.com/fighter-details/2b074403b7c6cdb4',
 'http://www.ufcstats.com/fighter-details/bd92cf5da5413d2a',
 'http://www.ufcstats.com/fighter-details/1ffc38f67785797b',
 'http://www.ufcstats.com/fighter-details/d26934530dc5b248',
 'http://www.ufcstats.com/fighter-details/33a331684283900f',
 'http://www.ufcstats.com/fighter-details/36541f1e6c5d4955',
 'http://www.ufcstats.com/fighter-details/9b28292abe3166d5',
 'http://www.ufcstats.com/fighter-details/d0f3959b4a9747e6',
 'http://www.ufcstats.com/fighter-details/c482c8605455a213',
 'http://www.ufcstats.com/fighter-details/15df64c02b6b0fde',
 'http://www.ufcstats.com/fighter-details/6fd953151d981979',
 'http://www.ufcstats.com/fighter-details/210935fd21670f6d',
 'http://www.ufcstats.com/fighter-details/67a992d4cff22466',
 'http://www.ufcstats.com/fighter-details/2af2f2e26c4c0402',
 'http://www.ufcstats.com/fighter-details/79cb2a690b9ba5e8',
 'http://www.ufcstats.com/fighter-details/79ded75550efc139',
 'http://www.ufcstats.co

Looks like there are some URLs we don't need in out list. Let's weed them out.

In [3]:
#removing invalid URLs
for url in fighter_url_list:
    if 'fighter-details' not in url:
        fighter_url_list.remove(url)

fighter_url_list[:15]   

['http://www.ufcstats.com/fighter-details/2b074403b7c6cdb4',
 'http://www.ufcstats.com/fighter-details/bd92cf5da5413d2a',
 'http://www.ufcstats.com/fighter-details/1ffc38f67785797b',
 'http://www.ufcstats.com/fighter-details/d26934530dc5b248',
 'http://www.ufcstats.com/fighter-details/33a331684283900f',
 'http://www.ufcstats.com/fighter-details/36541f1e6c5d4955',
 'http://www.ufcstats.com/fighter-details/9b28292abe3166d5',
 'http://www.ufcstats.com/fighter-details/d0f3959b4a9747e6',
 'http://www.ufcstats.com/fighter-details/c482c8605455a213',
 'http://www.ufcstats.com/fighter-details/15df64c02b6b0fde',
 'http://www.ufcstats.com/fighter-details/6fd953151d981979',
 'http://www.ufcstats.com/fighter-details/210935fd21670f6d',
 'http://www.ufcstats.com/fighter-details/67a992d4cff22466',
 'http://www.ufcstats.com/fighter-details/2af2f2e26c4c0402',
 'http://www.ufcstats.com/fighter-details/79cb2a690b9ba5e8']

## Creating Soup Dataframes

### Information DataFrames

In [4]:
def fighter_info_generator(url):
    '''This function parses basic request to a url'''
    
    response = requests.get(url)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    return soup

In [5]:
#creating a soup list to reduce number of website requests
basic_soup_list = list(map(fighter_info_generator, fighter_url_list))

In [6]:
def base_stats_soup(soup):
    '''This function identifies all statistics for later extraction'''
    
    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    
    return base_stats

In [7]:
def fighter_history_fetcher(soup):
    '''This function is meant to extract the html information for each fighter as a list of urls. '''
    
    soup_holder = soup.findAll('a', attrs={'href': re.compile('fight-details')})
    
    list_of_fights = []
    
    for x in soup_holder:
        fight_holder = []
        
        fight_holder.append(x.get('href'))
        
        list_of_fights.append(fight_holder) 
        
    return list_of_fights

#### Construction of soup_database

In [8]:
'''soup_database is meant to hold all soups creating during the scrapping process to minimize the requests to website'''

soup_database = pd.DataFrame()

soup_database['url'] = fighter_url_list
soup_database['base_soup'] = basic_soup_list

fighter_history_urls_list = list(map(fighter_history_fetcher, basic_soup_list))
soup_database['fight_urls'] = fighter_history_urls_list

In [19]:
def stats_soup(soup):
    '''parses soups down to the level of stats for faster, fluid parsing'''
    
    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    
    return base_stats

In [20]:
stats_soups = list(map(stats_soup, basic_soup_list))

In [21]:
soup_database['stats_soup'] = stats_soups

In [22]:
def fight_url_flattener(fight_url_list):
    flattened_fight_urls = []
    holder = fight_url_list.split(',')
    
    for x in range(0, len(holder)):
        
        flattened_fight_urls.append(holder[x].replace("'", "").replace("[", "").replace("]", "").replace(" ", ""))
    
    return flattened_fight_urls

In [23]:
soup_database_df.fight_urls = list(map(fight_url_flattener, soup_database_df.fight_urls))

In [24]:
soup_database_df.to_csv('soup_database_csv', index=False)

#### Basic Statistics DataFrame Construction

In [25]:
def fighter_name(soup):
    '''soup: this soup must be parsed at the class level via the stats soup'''
    name = soup.findChildren('span', attrs={'class': re.compile('b-content__title-highlight')})
    
    return name[0].text.strip()

In [26]:
def fighter_record(soup):
    
    record = soup.findChildren('span', attrs={'class': re.compile('b-content__title-record')})
    
    return record[0].text.strip().split(':')[1].strip()

In [27]:
def basic_stats_height(soup):
    '''soups run thorugh this function should be run through stats_soup function beforehand.'''
    
    #feet
    height = soup[0].text.strip().split(':')[1].strip().split(' ')[0].replace("'", "")
    
   #inches
    if height == '--':
        pass
    else:
        
        height = int(height)
        
        inches = int(soup[0].text.strip().split(':')[1].strip().split(' ')[1].replace("'", "").replace('"', '').replace("--", "0"))

        height_to_inches = (height * 12) 
    
        final_height = height_to_inches + inches
    
        return (inches + height_to_inches)

In [28]:
def basic_stats_weight(soup):
    
    weight = soup[1].text.strip().split(':')[1].strip().replace(' lbs.', '')
    
    if weight == '--':
        pass
    else:
        weight = int(weight)
    return weight 

In [29]:
def basic_stats_reach(soup):
    
    reach = soup[2].text.strip().split(':')[1].strip().replace('"', '')
    
    if reach == '--':
        pass
    else:
        reach = int(reach)
    return reach

In [30]:
def basic_stats_stance(soup):
    
    stance = soup[3].text.strip().split(':')[1].strip()
    return stance

In [31]:
def basic_stats_DOB(soup):
    
    DOB = soup[4].text.strip().split(':')[1].strip()
    return DOB

In [32]:
fighter_names = list(map(fighter_name, basic_soup_list))

fighter_records = list(map(fighter_record, basic_soup_list))

fighter_heights =  list(map(basic_stats_height, stats_soups))

fighter_weights = list(map(basic_stats_weight, stats_soups))

fighter_reachs = list(map(basic_stats_reach, stats_soups))

fighter_stances = list(map(basic_stats_stance, stats_soups))

fighter_DOBs = list(map(basic_stats_DOB, stats_soups))

In [33]:
fighter_basic_stats_df = pd.DataFrame()

fighter_basic_stats_df['name'] = fighter_names
fighter_basic_stats_df['record'] = fighter_records
fighter_basic_stats_df['height'] = fighter_heights
fighter_basic_stats_df['weight'] = fighter_weights
fighter_basic_stats_df['reach'] = fighter_reachs
fighter_basic_stats_df['stance'] = fighter_stances
fighter_basic_stats_df['DOB'] = fighter_DOBs

In [34]:
fighter_basic_stats_df.head()

Unnamed: 0,name,record,height,weight,reach,stance,DOB
0,Thomas Almeida,22-3-0,67.0,135,70,Southpaw,"Jul 31, 1991"
1,Anthony Alves,1-3-0,,--,--,,--
2,Lowell Anderson,0-1-0,66.0,160,--,Orthodox,--
3,Gilbert Aldana,6-2-0,73.0,250,--,Orthodox,"Aug 25, 1977"
4,Eddie Alvarez,29-6-0 (1 NC),69.0,155,69,Orthodox,"Jan 11, 1984"


In [35]:
fighter_basic_stats_df.to_csv('fighter_basic_stats_csv', index=False)

#### Career Statistics DataFrame Construction

In [36]:
def career_stats_soup(soup):
    
    career_stats_soup = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    career_stats = career_stats_soup[5:]
    
    return career_stats_soup

In [37]:
def career_stats_SLpM(soup):

    SLpM = soup[5].text.split(':')[1].replace('%', '').strip()
    
    if SLpM == '--':
        pass
    else:
        SLpM = float(SLpM)
        
    return SLpM

In [38]:
def career_stats_StrAcc(soup):
    StrAcc =  soup[6].text.split(':')[1].replace('%', '').strip()
    
    if StrAcc == '--':
        pass
    else:
        StrAcc = float(StrAcc)
        
    return StrAcc

In [39]:
def career_stats_SApM(soup):
    SApM = soup[7].text.split(':')[1].strip()
    
    if SApM == '--':
        pass
    else:
        SApM = float(SApM)
        
    return SApM

In [40]:
def career_stats_StrDef(soup):
    StrDef = soup[8].text.split(':')[1].replace('%', '').strip()
    
    if StrDef == '--':
        pass
    else:
        StrDef = float(StrDef)
        
    return StrDef

In [41]:
def career_stats_TDAvg(soup):
    TDAvg = soup[10].text.split(':')[1].strip()
    
    if TDAvg == '--':
        pass
    else:
        TDAvg = float(TDAvg)
        
    return TDAvg

In [42]:
def career_stats_TDAcc(soup):
    TDAcc = soup[11].text.split(':')[1].strip().replace('%', '')
    
    if TDAcc == '--':
        pass
    else:
        TDAcc = float(TDAcc)
        
    return TDAcc

In [43]:
def career_stats_TDDef(soup):
    TDDef = soup[12].text.split(':')[1].replace('%', '').strip()
    
    if TDDef == '--':
        pass
    else:
        TDDef = float(TDDef)
        
    return TDDef

In [44]:
def career_stats_SubAvg(soup):
    SubAvg =  soup[13].text.split(':')[1].strip().replace('%', '')  
    
    if SubAvg == '--':
        pass
    else:
        SubAvg = float(SubAvg)
        
    return SubAvg

In [45]:
fighter_SLpMs = list(map(career_stats_SLpM, stats_soups))

fighter_StrAccs = list(map(career_stats_StrAcc, stats_soups))

fighter_SApMs = list(map(career_stats_SApM, stats_soups))

fighter_StrDefs =list(map(career_stats_StrDef, stats_soups))

fighter_TDAvgs = list(map(career_stats_TDAvg, stats_soups))

fighter_TDAccs = list(map(career_stats_TDAcc, stats_soups))

fighter_TDDefs = list(map(career_stats_TDDef, stats_soups))

fighter_SubAvgs = list(map(career_stats_SubAvg, stats_soups))

In [46]:
fighter_career_stats_df = pd.DataFrame()

fighter_career_stats_df['name'] = fighter_names
fighter_career_stats_df['SLpMs'] = fighter_SLpMs
fighter_career_stats_df['StrAccs'] = fighter_StrAccs
fighter_career_stats_df['SApMs'] = fighter_SApMs
fighter_career_stats_df['StrDefs'] = fighter_StrDefs
fighter_career_stats_df['TDAvgs'] = fighter_TDAvgs
fighter_career_stats_df['TDAccs'] = fighter_TDAccs
fighter_career_stats_df['TDDefs'] = fighter_TDDefs
fighter_career_stats_df['SubAvgs'] = fighter_SubAvgs

In [47]:
fighter_career_stats_df.head()

Unnamed: 0,name,SLpMs,StrAccs,SApMs,StrDefs,TDAvgs,TDAccs,TDDefs,SubAvgs
0,Thomas Almeida,5.66,45.0,4.65,64.0,0.0,0.0,75.0,0.0
1,Anthony Alves,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Lowell Anderson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Gilbert Aldana,2.91,54.0,6.26,26.0,2.57,40.0,33.0,0.0
4,Eddie Alvarez,4.32,41.0,4.39,55.0,2.92,36.0,92.0,0.5


In [48]:
fighter_career_stats_df.to_csv('fighter_career_stats_csv', index=False)

#### Constructing Fight Event and Fight List Database

In [9]:
fight_soup_urls = []

for url_list in soup_database.fight_urls:
    for url in url_list:
        if url not in fight_soup_urls:
            fight_soup_urls.append(url) 
        else:
            pass

In [10]:
#flattening url list for smoother iteration
fight_soup_urls = list(itertools.chain(*fight_soup_urls))

In [11]:
fight_soups = list(map(fighter_info_generator, fight_soup_urls))

In [188]:
fight_soups_df = pd.DataFrame()

In [None]:
#fight_soups_df['soups'] = fight_soups

In [95]:
def event_title_fetcher(fight_soup):
    holder = fight_soup.find('a', attrs={'href': re.compile('event-details')})
    return holder.text.strip()

In [178]:
event_titles = set(list(map(event_title_fetcher, fight_soups)))

461

In [190]:
fight_soups_df['event'] = list(event_titles)

In [152]:
def fight_participants(fight_soup):
    holder = fight_soup.findAll('h3', attrs={'class': re.compile('person-name')})
    return holder[0].text.strip(), holder[1].text.strip()

In [187]:
match_fighters = set(list(map(fight_participants, fight_soups)))

In [172]:
master_match_list = []
for fighters in match_fighters:
    ring = []
    for fighter in fighters:
        ring.append(fighter)
    master_match_list.append(ring) 

In [197]:
master_match_list

[['Akihiro Gono', 'Daniel Acacio'],
 ['Houston Alexander', 'Keith Jardine'],
 ['Brian Ebersole', 'Omari Akhmedov'],
 ['Matt Arroyo', 'John Kolosci'],
 ['Raphael Assuncao', 'Jameel Massouh'],
 ['Mariya Agapova', 'Hannah Cifers'],
 ['Thiago Alves', 'Matt Hughes'],
 ['Alen Amedovski', 'John Phillips'],
 ['Cristiane Justino', 'Hitomi Akano'],
 ['Eugene Jackson', 'Royce Alger'],
 ['Ricardo Abreu', 'Jake Collier'],
 ['Hidehiko Yoshida', 'David Abbott'],
 ['Scott Askham', 'Chris Dempsey'],
 ['Alex Serdyukov', 'John Alessio'],
 ['Pedro Rizzo', 'Andrei Arlovski'],
 ['Conor Heun', 'Magno Almeida'],
 ['Talita Bernardo', 'Viviane Araujo'],
 ['Tom Aaron', 'Matt Ricehouse'],
 ['Jose Aldo', 'Frankie Edgar'],
 ['Eryk Anders', 'Tim Williams'],
 ['Sam Alvey', 'Alex Nicholson'],
 ['Eddie Alvarez', 'Joachim Hansen'],
 ['Chase Sherman', 'Shamil Abdurakhimov'],
 ['Marcus Aurelio', 'Takanori Gomi'],
 ['Jared Cannonier', 'Cyril Asker'],
 ['Julia Avila', 'Gina Mazany'],
 ['Chas Skelly', 'Jim Alers'],
 ['Marcin

In [49]:
fight_soups_df.to_csv('fight_soups_csv', index=False)

PROTOCODE ==================================

In [196]:
for event in fight_soups_df.event[:5]:
    for soup in fight_soups:
        if event in soup:
            print(event)

In [180]:
for fight in fight_soups:
    

In [186]:
event_titles.head()

AttributeError: 'set' object has no attribute 'head'

In [124]:
for tester in tester_list:
    holder = tester.findAll('h3', attrs={'class': re.compile('person-name')})
    print(holder[0].text.strip(), holder[1].text.strip(), '\n') 

Thomas Almeida Rob Font 

Jimmie Rivera Thomas Almeida 

Thomas Almeida Albert Morales 

Thomas Almeida Cody Garbrandt 



In [174]:
master_match_list

[['Akihiro Gono', 'Daniel Acacio'],
 ['Houston Alexander', 'Keith Jardine'],
 ['Brian Ebersole', 'Omari Akhmedov'],
 ['Matt Arroyo', 'John Kolosci'],
 ['Raphael Assuncao', 'Jameel Massouh'],
 ['Mariya Agapova', 'Hannah Cifers'],
 ['Thiago Alves', 'Matt Hughes'],
 ['Alen Amedovski', 'John Phillips'],
 ['Cristiane Justino', 'Hitomi Akano'],
 ['Eugene Jackson', 'Royce Alger'],
 ['Ricardo Abreu', 'Jake Collier'],
 ['Hidehiko Yoshida', 'David Abbott'],
 ['Scott Askham', 'Chris Dempsey'],
 ['Alex Serdyukov', 'John Alessio'],
 ['Pedro Rizzo', 'Andrei Arlovski'],
 ['Conor Heun', 'Magno Almeida'],
 ['Talita Bernardo', 'Viviane Araujo'],
 ['Tom Aaron', 'Matt Ricehouse'],
 ['Jose Aldo', 'Frankie Edgar'],
 ['Eryk Anders', 'Tim Williams'],
 ['Sam Alvey', 'Alex Nicholson'],
 ['Eddie Alvarez', 'Joachim Hansen'],
 ['Chase Sherman', 'Shamil Abdurakhimov'],
 ['Marcus Aurelio', 'Takanori Gomi'],
 ['Jared Cannonier', 'Cyril Asker'],
 ['Julia Avila', 'Gina Mazany'],
 ['Chas Skelly', 'Jim Alers'],
 ['Marcin

In [67]:
fighter_basic_stats_df = dd.read_csv('fighter_basic_stats_csv')
fighter_career_stats_df = dd.read_csv('fighter_career_stats_csv')
soup_database_df = pd.read_csv('soup_database_csv')
fight_soups_df = pd.read_csv('fight_soups_csv')

In [18]:
for x in fight_soups_df.soups[:4]:
    print(x)

<!DOCTYPE html>

<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js ie8 lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title>
    Stats | UFC
  </title>
<meta content="" name="description"/>
<meta content="" name="viewport"/>
<link href="/blocks/main.css?ver=15923" rel="stylesheet"/>
<script src="/js/vendor/modernizr-2.6.2.min.js"></script>
<script>
    (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
    m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
    })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

    ga('create', 'UA-2855164-1', 'auto');
    ga('send'

In [52]:
len(fight_soups_df.soups[42])

34806

In [50]:
fights = set(soup_database_df.fight_urls)
print(len(fights), len(fight_soups_df))

144 709


In [161]:
target_list = []
target = tester_fight_url_list.split(',')
for x in range(0, len(target)):
    target_list.append(target[x].replace("'", "").replace("[", "").replace("]", "").replace(" ", ""))

In [182]:
soup_database_df.fight_urls[0:4]

0    [http://www.ufcstats.com/fight-details/e5908d0...
1    [http://www.ufcstats.com/fight-details/81bd1c1...
2    [http://www.ufcstats.com/fight-details/a16d8d0...
3    [http://www.ufcstats.com/fight-details/b70dfff...
Name: fight_urls, dtype: object

In [175]:
test_list = list(map(fight_url_flattener, tester_fight_url_list))
test_list

[['http://www.ufcstats.com/fight-details/671f42536454e541',
  'http://www.ufcstats.com/fight-details/b8c21a311f3f7953',
  'http://www.ufcstats.com/fight-details/4fe799b4fa1afb76',
  'http://www.ufcstats.com/fight-details/b0e16dd491bc4fea',
  'http://www.ufcstats.com/fight-details/051124b25d576906'],
 ['http://www.ufcstats.com/fight-details/7b80dd233e200dd2'],
 ['http://www.ufcstats.com/fight-details/9a27064991a9e9d4']]