# UFC_Stats Web Scraping

## importing pertinent libraries

In [1]:
#importing pertinent libraries
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests, re
import json
import itertools
import dask.dataframe as dd

## Extracting UFC urls

In [39]:
#creating a list of unique fighter URLs available on website
fighter_url_list = []
response = requests.get('http://www.ufcstats.com/statistics/fighters?char=a&page=all')
soup = BeautifulSoup(response.content, 'html.parser')

#targeting URLs that contain fighter data
fighters = soup.findAll('a', attrs={'href': re.compile('http:')})
for fighter in fighters:
    fighter_url_list.append(fighter.get('href'))

#removing duplicate URLs
fighter_url_list = list(set(fighter_url_list))
fighter_url_list

['http://www.ufcstats.com/fighter-details/9199e0735b83dd32',
 'http://www.ufcstats.com/fighter-details/1cf1310684a841f5',
 'http://www.ufcstats.com/fighter-details/0541480fbf719d86',
 'http://www.ufcstats.com/fighter-details/d26934530dc5b248',
 'http://www.ufcstats.com/fighter-details/c487223b0289bda9',
 'http://www.ufcstats.com/fighter-details/25b31165758402dd',
 'http://www.ufcstats.com/fighter-details/399afbabc02376b5',
 'http://www.ufcstats.com/fighter-details/210935fd21670f6d',
 'http://www.ufcstats.com/fighter-details/9bcfb40dbcd50568',
 'http://www.ufcstats.com/fighter-details/41b34b7f11f6d085',
 'http://www.ufcstats.com/fighter-details/78d48d3874dacafd',
 'http://www.ufcstats.com/fighter-details/6cadc0a0ba7dc015',
 'http://www.ufcstats.com/fighter-details/7bd94b60d7521e4a',
 'http://www.ufcstats.com/fighter-details/063649e21bc9d6d5',
 'http://www.ufcstats.com/fighter-details/d221ee27afc7a60e',
 'http://www.ufcstats.com/fighter-details/c4fe2e9a06ea5bcb',
 'http://www.ufcstats.co

Looks like there are some URLs we don't need in out list. Let's weed them out.

In [40]:
#removing invalid URLs
for url in fighter_url_list:
    if 'fighter-details' not in url:
        fighter_url_list.remove(url)

fighter_url_list[:15]   

['http://www.ufcstats.com/fighter-details/9199e0735b83dd32',
 'http://www.ufcstats.com/fighter-details/1cf1310684a841f5',
 'http://www.ufcstats.com/fighter-details/0541480fbf719d86',
 'http://www.ufcstats.com/fighter-details/d26934530dc5b248',
 'http://www.ufcstats.com/fighter-details/c487223b0289bda9',
 'http://www.ufcstats.com/fighter-details/25b31165758402dd',
 'http://www.ufcstats.com/fighter-details/399afbabc02376b5',
 'http://www.ufcstats.com/fighter-details/210935fd21670f6d',
 'http://www.ufcstats.com/fighter-details/9bcfb40dbcd50568',
 'http://www.ufcstats.com/fighter-details/41b34b7f11f6d085',
 'http://www.ufcstats.com/fighter-details/78d48d3874dacafd',
 'http://www.ufcstats.com/fighter-details/6cadc0a0ba7dc015',
 'http://www.ufcstats.com/fighter-details/7bd94b60d7521e4a',
 'http://www.ufcstats.com/fighter-details/063649e21bc9d6d5',
 'http://www.ufcstats.com/fighter-details/d221ee27afc7a60e']

## Creating Soup Dataframes

### Information DataFrames

In [41]:
def fighter_info_generator(url):
    '''This function parses basic request to a url'''
    
    response = requests.get(url)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    return soup

In [42]:
#creating a soup list to reduce number of website requests
basic_soup_list = list(map(fighter_info_generator, fighter_url_list))

In [43]:
def base_stats_soup(soup):
    '''This function identifies all statistics for later extraction'''
    
    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    
    return base_stats

In [6]:
def fighter_history_fetcher(soup):
    '''This function is meant to extract the html information for each fighter as a list of urls. '''
    
    soup_holder = soup.findAll('a', attrs={'href': re.compile('fight-details')})
    
    list_of_fights = []
    
    for x in soup_holder:
        fight_holder = []
        
        fight_holder.append(x.get('href'))
        
        list_of_fights.append(fight_holder) 
        
    return list_of_fights

#### Construction of soup_database

In [44]:
'''soup_database is meant to hold all soups creating during the scrapping process to minimize the requests to website'''

soup_database = pd.DataFrame()

soup_database['url'] = fighter_url_list
soup_database['base_soup'] = basic_soup_list

fighter_history_urls_list = list(map(fighter_history_fetcher, basic_soup_list))
soup_database['fight_urls'] = fighter_history_urls_list

In [45]:
fight_soup_urls = []

for url_list in soup_database.fight_urls:
    for url in url_list:
        if url not in fight_soup_urls:
            fight_soup_urls.append(url) 
        else:
            pass

In [46]:
#flattening url list for smoother iteration
fight_soup_urls = list(itertools.chain(*fight_soup_urls))

In [None]:
fight_soups = list(map(fighter_info_generator, fight_soup_urls))

In [48]:
'''fight_soups_df is meant to hold all fight soups for later parcing. Running this cells takes a lot of time.'''

fight_soups_df = pd.DataFrame()

fight_soups_df['soups'] = fight_soups

TypeError: __init__() missing 4 required positional arguments: 'dsk', 'name', 'meta', and 'divisions'

In [15]:
soup_database.head()

Unnamed: 0,url,base_soup,fight_urls
0,http://www.ufcstats.com/fighter-details/956dca...,"[html, \n, [if lt IE 7]> <html class=""no-...",[[http://www.ufcstats.com/fight-details/e5908d...
1,http://www.ufcstats.com/fighter-details/04643f...,"[html, \n, [if lt IE 7]> <html class=""no-...",[[http://www.ufcstats.com/fight-details/81bd1c...
2,http://www.ufcstats.com/fighter-details/7c6e87...,"[html, \n, [if lt IE 7]> <html class=""no-...",[[http://www.ufcstats.com/fight-details/a16d8d...
3,http://www.ufcstats.com/fighter-details/d15651...,"[html, \n, [if lt IE 7]> <html class=""no-...",[[http://www.ufcstats.com/fight-details/b70dff...
4,http://www.ufcstats.com/fighter-details/4b37a0...,"[html, \n, [if lt IE 7]> <html class=""no-...",[[http://www.ufcstats.com/fight-details/260241...


In [7]:
def stats_soup(soup):
    '''parses soups down to the level of stats for faster, fluid parsing'''
    
    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    
    return base_stats

In [17]:
stats_soups = list(map(stats_soup, basic_soup_list))

In [18]:
soup_database['stats_soup'] = stats_soups

In [8]:
def fight_url_flattener(fight_url_list):
    flattened_fight_urls = []
    holder = fight_url_list.split(',')
    
    for x in range(0, len(holder)):
        
        flattened_fight_urls.append(holder[x].replace("'", "").replace("[", "").replace("]", "").replace(" ", ""))
    
    return flattened_fight_urls

In [177]:
soup_database_df.fight_urls = list(map(fight_url_flattener, soup_database_df.fight_urls))

In [184]:
soup_database_df.to_csv('soup_database_csv', index=False)

#### Basic Statistics DataFrame Construction

In [9]:
def fighter_name(soup):
    '''soup: this soup must be parsed at the class level via the stats soup'''
    name = soup.findChildren('span', attrs={'class': re.compile('b-content__title-highlight')})
    
    return name[0].text.strip()

In [10]:
def fighter_record(soup):
    
    record = soup.findChildren('span', attrs={'class': re.compile('b-content__title-record')})
    
    return record[0].text.strip().split(':')[1].strip()

In [11]:
def basic_stats_height(soup):
    '''soups run thorugh this function should be run through stats_soup function beforehand.'''
    
    #feet
    height = soup[0].text.strip().split(':')[1].strip().split(' ')[0].replace("'", "")
    
   #inches
    if height == '--':
        pass
    else:
        
        height = int(height)
        
        inches = int(soup[0].text.strip().split(':')[1].strip().split(' ')[1].replace("'", "").replace('"', '').replace("--", "0"))

        height_to_inches = (height * 12) 
    
        final_height = height_to_inches + inches
    
        return (inches + height_to_inches)

In [12]:
def basic_stats_weight(soup):
    
    weight = soup[1].text.strip().split(':')[1].strip().replace(' lbs.', '')
    
    if weight == '--':
        pass
    else:
        weight = int(weight)
    return weight 

In [13]:
def basic_stats_reach(soup):
    
    reach = soup[2].text.strip().split(':')[1].strip().replace('"', '')
    
    if reach == '--':
        pass
    else:
        reach = int(reach)
    return reach

In [14]:
def basic_stats_stance(soup):
    
    stance = soup[3].text.strip().split(':')[1].strip()
    return stance

In [15]:
def basic_stats_DOB(soup):
    
    DOB = soup[4].text.strip().split(':')[1].strip()
    return DOB

In [48]:
fighter_names = list(map(fighter_name, basic_soup_list))

fighter_records = list(map(fighter_record, basic_soup_list))

fighter_heights =  list(map(basic_stats_height, stats_soups))

fighter_weights = list(map(basic_stats_weight, stats_soups))

fighter_reachs = list(map(basic_stats_reach, stats_soups))

fighter_stances = list(map(basic_stats_stance, stats_soups))

fighter_DOBs = list(map(basic_stats_DOB, stats_soups))

NameError: name 'basic_soup_list' is not defined

In [28]:
fighter_basic_stats_df = pd.DataFrame()

fighter_basic_stats_df['name'] = fighter_names
fighter_basic_stats_df['record'] = fighter_records
fighter_basic_stats_df['height'] = fighter_heights
fighter_basic_stats_df['weight'] = fighter_weights
fighter_basic_stats_df['reach'] = fighter_reachs
fighter_basic_stats_df['stance'] = fighter_stances
fighter_basic_stats_df['DOB'] = fighter_DOBs

In [29]:
fighter_basic_stats_df.head()

Unnamed: 0,name,record,height,weight,reach,stance,DOB
0,David Avellan,2-1-0,69.0,185,--,Orthodox,--
1,Marcus Aurelio,21-10-0,70.0,155,74,Orthodox,"Aug 18, 1973"
2,Ildemar Alcantara,21-8-0,74.0,185,78,Orthodox,"Nov 18, 1982"
3,Sam Alvey,33-14-0 (1 NC),74.0,205,75,Southpaw,"May 06, 1986"
4,Pablo Alfonso,6-5-0,68.0,145,--,,"May 09, 1983"


In [30]:
fighter_basic_stats_df.to_csv('fighter_basic_stats_csv', index=False)

#### Career Statistics DataFrame Construction

In [16]:
def career_stats_soup(soup):
    
    career_stats_soup = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    career_stats = career_stats_soup[5:]
    
    return career_stats_soup

In [17]:
def career_stats_SLpM(soup):

    SLpM = soup[5].text.split(':')[1].replace('%', '').strip()
    
    if SLpM == '--':
        pass
    else:
        SLpM = float(SLpM)
        
    return SLpM

In [18]:
def career_stats_StrAcc(soup):
    StrAcc =  soup[6].text.split(':')[1].replace('%', '').strip()
    
    if StrAcc == '--':
        pass
    else:
        StrAcc = float(StrAcc)
        
    return StrAcc

In [19]:
def career_stats_SApM(soup):
    SApM = soup[7].text.split(':')[1].strip()
    
    if SApM == '--':
        pass
    else:
        SApM = float(SApM)
        
    return SApM

In [20]:
def career_stats_StrDef(soup):
    StrDef = soup[8].text.split(':')[1].replace('%', '').strip()
    
    if StrDef == '--':
        pass
    else:
        StrDef = float(StrDef)
        
    return StrDef

In [21]:
def career_stats_TDAvg(soup):
    TDAvg = soup[10].text.split(':')[1].strip()
    
    if TDAvg == '--':
        pass
    else:
        TDAvg = float(TDAvg)
        
    return TDAvg

In [22]:
def career_stats_TDAcc(soup):
    TDAcc = soup[11].text.split(':')[1].strip().replace('%', '')
    
    if TDAcc == '--':
        pass
    else:
        TDAcc = float(TDAcc)
        
    return TDAcc

In [23]:
def career_stats_TDDef(soup):
    TDDef = soup[12].text.split(':')[1].replace('%', '').strip()
    
    if TDDef == '--':
        pass
    else:
        TDDef = float(TDDef)
        
    return TDDef

In [24]:
def career_stats_SubAvg(soup):
    SubAvg =  soup[13].text.split(':')[1].strip().replace('%', '')  
    
    if SubAvg == '--':
        pass
    else:
        SubAvg = float(SubAvg)
        
    return SubAvg

In [40]:
fighter_SLpMs = list(map(career_stats_SLpM, stats_soups))

fighter_StrAccs = list(map(career_stats_StrAcc, stats_soups))

fighter_SApMs = list(map(career_stats_SApM, stats_soups))

fighter_StrDefs =list(map(career_stats_StrDef, stats_soups))

fighter_TDAvgs = list(map(career_stats_TDAvg, stats_soups))

fighter_TDAccs = list(map(career_stats_TDAcc, stats_soups))

fighter_TDDefs = list(map(career_stats_TDDef, stats_soups))

fighter_SubAvgs = list(map(career_stats_SubAvg, stats_soups))

In [41]:
fighter_career_stats_df = pd.DataFrame()

fighter_career_stats_df['name'] = fighter_names
fighter_career_stats_df['SLpMs'] = fighter_SLpMs
fighter_career_stats_df['StrAccs'] = fighter_StrAccs
fighter_career_stats_df['SApMs'] = fighter_SApMs
fighter_career_stats_df['StrDefs'] = fighter_StrDefs
fighter_career_stats_df['TDAvgs'] = fighter_TDAvgs
fighter_career_stats_df['TDAccs'] = fighter_TDAccs
fighter_career_stats_df['TDDefs'] = fighter_TDDefs
fighter_career_stats_df['SubAvgs'] = fighter_SubAvgs

In [45]:
fighter_career_stats_df.head()

Unnamed: 0,name,SLpMs,StrAccs,SApMs,StrDefs,TDAvgs,TDAccs,TDDefs,SubAvgs
0,David Avellan,3.33,25.0,10.0,57.0,0.0,0.0,0.0,0.0
1,Marcus Aurelio,1.42,33.0,2.27,59.0,1.69,27.0,27.0,1.3
2,Ildemar Alcantara,1.93,38.0,2.63,50.0,2.0,68.0,81.0,0.9
3,Sam Alvey,3.11,45.0,3.22,55.0,0.08,16.0,81.0,0.1
4,Pablo Alfonso,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
fighter_career_stats_df.to_csv('fighter_career_stats_csv', index=False)

In [44]:
fight_soups_df.to_csv('fight_soups_csv', index=False)

PROTOCODE ==================================

In [49]:
fighter_basic_stats_df = dd.read_csv('fighter_basic_stats_csv')
fighter_career_stats_df = dd.read_csv('fighter_career_stats_csv')
soup_database_df = pd.read_csv('soup_database_csv')
fight_soups_df = pd.read_csv('fight_soups_csv')

In [None]:
test

In [52]:
len(fight_soups_df.soups[42])

34806

In [50]:
fights = set(soup_database_df.fight_urls)
print(len(fights), len(fight_soups_df))

144 709


In [161]:
target_list = []
target = tester_fight_url_list.split(',')
for x in range(0, len(target)):
    target_list.append(target[x].replace("'", "").replace("[", "").replace("]", "").replace(" ", ""))

In [182]:
soup_database_df.fight_urls[0:4]

0    [http://www.ufcstats.com/fight-details/e5908d0...
1    [http://www.ufcstats.com/fight-details/81bd1c1...
2    [http://www.ufcstats.com/fight-details/a16d8d0...
3    [http://www.ufcstats.com/fight-details/b70dfff...
Name: fight_urls, dtype: object

In [175]:
test_list = list(map(fight_url_flattener, tester_fight_url_list))
test_list

[['http://www.ufcstats.com/fight-details/671f42536454e541',
  'http://www.ufcstats.com/fight-details/b8c21a311f3f7953',
  'http://www.ufcstats.com/fight-details/4fe799b4fa1afb76',
  'http://www.ufcstats.com/fight-details/b0e16dd491bc4fea',
  'http://www.ufcstats.com/fight-details/051124b25d576906'],
 ['http://www.ufcstats.com/fight-details/7b80dd233e200dd2'],
 ['http://www.ufcstats.com/fight-details/9a27064991a9e9d4']]