# UFC_Stats Web Scraping

## importing pertinent libraries

In [1]:
#importing pertinent libraries
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests, re
import json
import itertools
import dask.dataframe as dd

## Extracting UFC urls

In [2]:
#creating a list of unique fighter URLs available on website
fighter_url_list = []
response = requests.get('http://www.ufcstats.com/statistics/fighters?char=a&page=all')
soup = BeautifulSoup(response.content, 'html.parser')

#targeting URLs that contain fighter data
fighters = soup.findAll('a', attrs={'href': re.compile('http:')})
for fighter in fighters:
    fighter_url_list.append(fighter.get('href'))

#removing duplicate URLs
fighter_url_list = list(set(fighter_url_list))

['http://www.ufcstats.com/fighter-details/c482c8605455a213',
 'http://www.ufcstats.com/fighter-details/dd1b90eea08887f6',
 'http://www.ufcstats.com/fighter-details/c4fe2e9a06ea5bcb',
 'http://www.ufcstats.com/fighter-details/33a331684283900f',
 'http://www.ufcstats.com/fighter-details/c136b2a8852da5bd',
 'http://www.ufcstats.com/fighter-details/cad24459b28592ca',
 'http://www.ufcstats.com/fighter-details/269d103c96a4c3a5',
 'http://www.ufcstats.com/fighter-details/b361180739bed4b0',
 'http://www.ufcstats.com/fighter-details/9bad58fa651d6196',
 'http://www.ufcstats.com/fighter-details/91e3388e69060e69',
 'http://www.ufcstats.com/fighter-details/aa6e591c2a2cdecd',
 'http://www.ufcstats.com/fighter-details/c487223b0289bda9',
 'http://www.ufcstats.com/fighter-details/38b50fd1e1b5b656',
 'http://www.ufcstats.com/statistics/fighters?char=a&page=6',
 'http://www.ufcstats.com/fighter-details/2144954270be834d',
 'http://www.ufcstats.com/fighter-details/87a1dc546b1c5caf',
 'http://www.ufcstats.c

In [None]:
fighter_url_list[:15]

Looks like there are some URLs we don't need in out list. Let's weed them out.

In [3]:
#removing invalid URLs
for url in fighter_url_list:
    if 'fighter-details' not in url:
        fighter_url_list.remove(url)

## Creating Soup Dataframes

### Information DataFrames

In [44]:
def info_generator(url):
    '''This function parses basic request to a url'''
    
    response = requests.get(url)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    return soup

In [5]:
#creating a soup list to reduce number of website requests
basic_soup_list = list(map(info_generator, fighter_url_list))

In [6]:
def base_stats_soup(soup):
    '''This function identifies all statistics for later extraction'''
    
    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    
    return base_stats

In [7]:
def fighter_history_fetcher(soup):
    '''This function is meant to extract the html information for each fighter as a list of urls. '''
    
    soup_holder = soup.findAll('a', attrs={'href': re.compile('fight-details')})
    
    list_of_fights = []
    
    for x in soup_holder:
        fight_holder = []
        
        fight_holder.append(x.get('href'))
        
        list_of_fights.append(fight_holder) 
        
    return list_of_fights

#### Construction of soup_database

In [68]:
'''soup_database is meant to hold all soups creating during the scrapping process to minimize the requests to website'''

soup_database = pd.DataFrame()

soup_database['url'] = fighter_url_list
soup_database['base_soup'] = basic_soup_list

fighter_history_url_list = list(map(fighter_history_fetcher, basic_soup_list))
fighter_history_url_list[:3]

[[['http://www.ufcstats.com/fight-details/9ba051e0f5674108'],
  ['http://www.ufcstats.com/fight-details/491b7d974254596d'],
  ['http://www.ufcstats.com/fight-details/3dcb1a7b081afbe1'],
  ['http://www.ufcstats.com/fight-details/603354ffe9cc0755'],
  ['http://www.ufcstats.com/fight-details/d83ae4a2e117b775']],
 [['http://www.ufcstats.com/fight-details/3638be8341b7124c']],
 [['http://www.ufcstats.com/fight-details/602e8602fe9c9121'],
  ['http://www.ufcstats.com/fight-details/8b0ff8defc1a14f4'],
  ['http://www.ufcstats.com/fight-details/e57b884785848624']]]

In [69]:
def stats_soup(soup):
    '''parses soups down to the level of stats for faster, fluid parsing'''
    
    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    
    return base_stats

In [70]:
stats_soups = list(map(stats_soup, basic_soup_list))

In [71]:
def compound_flattener(fight_url_list):
    flattened_fight_urls = list(itertools.chain(*fight_url_list))
    return flattened_fight_urls

In [72]:
soup_database['fight_urls'] = fighter_history_url_list

soup_database['stats_soup'] = stats_soups

In [61]:
soup_database.fight_urls = list(map(compound_flattener, soup_database.fight_urls))

In [14]:
soup_database.to_csv('soup_database_csv', index=False)

In [15]:
soup_database

Unnamed: 0,url,base_soup,fight_urls,stats_soup
0,http://www.ufcstats.com/fighter-details/c482c8...,"[html, \n, [if lt IE 7]> <html class=""no-...",[http://www.ufcstats.com/fight-details/9ba051e...,"[[\n, [\n Height:\n ], \n 6' ..."
1,http://www.ufcstats.com/fighter-details/dd1b90...,"[html, \n, [if lt IE 7]> <html class=""no-...",[http://www.ufcstats.com/fight-details/3638be8...,"[[\n, [\n Height:\n ], \n --\..."
2,http://www.ufcstats.com/fighter-details/c4fe2e...,"[html, \n, [if lt IE 7]> <html class=""no-...",[http://www.ufcstats.com/fight-details/602e860...,"[[\n, [\n Height:\n ], \n 5' ..."
3,http://www.ufcstats.com/fighter-details/33a331...,"[html, \n, [if lt IE 7]> <html class=""no-...",[http://www.ufcstats.com/fight-details/bd1fc4f...,"[[\n, [\n Height:\n ], \n 5' ..."
4,http://www.ufcstats.com/fighter-details/c136b2...,"[html, \n, [if lt IE 7]> <html class=""no-...",[http://www.ufcstats.com/fight-details/0f85a1d...,"[[\n, [\n Height:\n ], \n 5' ..."
...,...,...,...,...
160,http://www.ufcstats.com/fighter-details/9b2829...,"[html, \n, [if lt IE 7]> <html class=""no-...",[http://www.ufcstats.com/fight-details/b48fdc1...,"[[\n, [\n Height:\n ], \n --\..."
161,http://www.ufcstats.com/fighter-details/6fd953...,"[html, \n, [if lt IE 7]> <html class=""no-...",[http://www.ufcstats.com/fight-details/96fe84d...,"[[\n, [\n Height:\n ], \n 5' ..."
162,http://www.ufcstats.com/fighter-details/210935...,"[html, \n, [if lt IE 7]> <html class=""no-...",[http://www.ufcstats.com/fight-details/d6d8122...,"[[\n, [\n Height:\n ], \n 5' ..."
163,http://www.ufcstats.com/fighter-details/ae0716...,"[html, \n, [if lt IE 7]> <html class=""no-...",[http://www.ufcstats.com/fight-details/58e72b1...,"[[\n, [\n Height:\n ], \n 5' ..."


#### Basic Statistics DataFrame Construction

In [16]:
def fighter_name(soup):
    
    name = soup.find('span', attrs={'class': re.compile('b-content__title-highlight')})
    
    return name.text.strip()#[0].text.strip()

In [17]:
def fighter_record(soup):
    
    record = soup.findChildren('span', attrs={'class': re.compile('b-content__title-record')})
    
    return record[0].text.strip().split(':')[1].strip()

In [18]:
def basic_stats_height(soup):
    '''soups run thorugh this function should be run through stats_soup function beforehand.'''
    
    #feet
    height = soup[0].text.strip().split(':')[1].strip().split(' ')[0].replace("'", "")
    
   #inches
    if height == '--':
        pass
    else:
        
        height = int(height)
        
        inches = int(soup[0].text.strip().split(':')[1].strip().split(' ')[1].replace("'", "").replace('"', '').replace("--", "0"))

        height_to_inches = (height * 12) 
    
        final_height = height_to_inches + inches
    
        return (inches + height_to_inches)

In [19]:
def basic_stats_weight(soup):
    
    weight = soup[1].text.strip().split(':')[1].strip().replace(' lbs.', '')
    
    if weight == '--':
        pass
    else:
        weight = int(weight)
    return weight 

In [20]:
def basic_stats_reach(soup):
    
    reach = soup[2].text.strip().split(':')[1].strip().replace('"', '')
    
    if reach == '--':
        pass
    else:
        reach = int(reach)
    return reach

In [21]:
def basic_stats_stance(soup):
    
    stance = soup[3].text.strip().split(':')[1].strip()
    return stance

In [22]:
def basic_stats_DOB(soup):
    
    DOB = soup[4].text.strip().split(':')[1].strip()
    return DOB

In [23]:
fighter_names = list(map(fighter_name, basic_soup_list))

fighter_records = list(map(fighter_record, basic_soup_list))

fighter_heights =  list(map(basic_stats_height, stats_soups))

fighter_weights = list(map(basic_stats_weight, stats_soups))

fighter_reachs = list(map(basic_stats_reach, stats_soups))

fighter_stances = list(map(basic_stats_stance, stats_soups))

fighter_DOBs = list(map(basic_stats_DOB, stats_soups))

In [24]:
fighter_basic_stats_df = pd.DataFrame()

fighter_basic_stats_df['name'] = fighter_names
fighter_basic_stats_df['record'] = fighter_records
fighter_basic_stats_df['height'] = fighter_heights
fighter_basic_stats_df['weight'] = fighter_weights
fighter_basic_stats_df['reach'] = fighter_reachs
fighter_basic_stats_df['stance'] = fighter_stances
fighter_basic_stats_df['DOB'] = fighter_DOBs

In [25]:
fighter_basic_stats_df.head()

Unnamed: 0,name,record,height,weight,reach,stance,DOB
0,Junior Albini,14-6-0,75.0,264,74,Orthodox,"Mar 15, 1991"
1,Israel Albuquerque,0-3-0,,185,--,Orthodox,--
2,Benny Alloway,13-6-0,71.0,170,75,Orthodox,"Feb 19, 1981"
3,Eddie Alvarez,29-6-0 (1 NC),69.0,155,69,Orthodox,"Jan 11, 1984"
4,Akbarh Arreola,23-10-1,70.0,155,71,Southpaw,"Jan 14, 1983"


In [26]:
fighter_basic_stats_df.to_csv('fighter_basic_stats_csv', index=False)

#### Career Statistics DataFrame Construction

In [27]:
def career_stats_soup(soup):
    
    career_stats_soup = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    career_stats = career_stats_soup[5:]
    
    return career_stats_soup

In [28]:
def career_stats_SLpM(soup):

    SLpM = soup[5].text.split(':')[1].replace('%', '').strip()
    
    if SLpM == '--':
        pass
    else:
        SLpM = float(SLpM)
        
    return SLpM

In [29]:
def career_stats_StrAcc(soup):
    StrAcc =  soup[6].text.split(':')[1].replace('%', '').strip()
    
    if StrAcc == '--':
        pass
    else:
        StrAcc = float(StrAcc)
        
    return StrAcc

In [30]:
def career_stats_SApM(soup):
    SApM = soup[7].text.split(':')[1].strip()
    
    if SApM == '--':
        pass
    else:
        SApM = float(SApM)
        
    return SApM

In [31]:
def career_stats_StrDef(soup):
    StrDef = soup[8].text.split(':')[1].replace('%', '').strip()
    
    if StrDef == '--':
        pass
    else:
        StrDef = float(StrDef)
        
    return StrDef

In [32]:
def career_stats_TDAvg(soup):
    TDAvg = soup[10].text.split(':')[1].strip()
    
    if TDAvg == '--':
        pass
    else:
        TDAvg = float(TDAvg)
        
    return TDAvg

In [33]:
def career_stats_TDAcc(soup):
    TDAcc = soup[11].text.split(':')[1].strip().replace('%', '')
    
    if TDAcc == '--':
        pass
    else:
        TDAcc = float(TDAcc)
        
    return TDAcc

In [34]:
def career_stats_TDDef(soup):
    TDDef = soup[12].text.split(':')[1].replace('%', '').strip()
    
    if TDDef == '--':
        pass
    else:
        TDDef = float(TDDef)
        
    return TDDef

In [35]:
def career_stats_SubAvg(soup):
    SubAvg =  soup[13].text.split(':')[1].strip().replace('%', '')  
    
    if SubAvg == '--':
        pass
    else:
        SubAvg = float(SubAvg)
        
    return SubAvg

In [36]:
fighter_SLpMs = list(map(career_stats_SLpM, stats_soups))

fighter_StrAccs = list(map(career_stats_StrAcc, stats_soups))

fighter_SApMs = list(map(career_stats_SApM, stats_soups))

fighter_StrDefs =list(map(career_stats_StrDef, stats_soups))

fighter_TDAvgs = list(map(career_stats_TDAvg, stats_soups))

fighter_TDAccs = list(map(career_stats_TDAcc, stats_soups))

fighter_TDDefs = list(map(career_stats_TDDef, stats_soups))

fighter_SubAvgs = list(map(career_stats_SubAvg, stats_soups))

In [37]:
fighter_career_stats_df = pd.DataFrame()

fighter_career_stats_df['name'] = fighter_names
fighter_career_stats_df['SLpMs'] = fighter_SLpMs
fighter_career_stats_df['StrAccs'] = fighter_StrAccs
fighter_career_stats_df['SApMs'] = fighter_SApMs
fighter_career_stats_df['StrDefs'] = fighter_StrDefs
fighter_career_stats_df['TDAvgs'] = fighter_TDAvgs
fighter_career_stats_df['TDAccs'] = fighter_TDAccs
fighter_career_stats_df['TDDefs'] = fighter_TDDefs
fighter_career_stats_df['SubAvgs'] = fighter_SubAvgs

In [38]:
fighter_career_stats_df.head()

Unnamed: 0,name,SLpMs,StrAccs,SApMs,StrDefs,TDAvgs,TDAccs,TDDefs,SubAvgs
0,Junior Albini,3.98,51.0,6.08,52.0,1.54,37.0,100.0,0.0
1,Israel Albuquerque,0.32,13.0,3.34,25.0,0.0,0.0,0.0,0.0
2,Benny Alloway,1.28,49.0,1.9,59.0,0.62,50.0,26.0,0.0
3,Eddie Alvarez,4.32,41.0,4.39,55.0,2.92,36.0,92.0,0.5
4,Akbarh Arreola,2.32,45.0,4.01,56.0,1.07,100.0,33.0,0.4


In [39]:
fighter_career_stats_df.to_csv('fighter_career_stats_csv', index=False)

#### Constructing Fight Event and Fight List Database

In [40]:
#creating a list of unique fighter URLs available on website
event_url_list = []
response = requests.get('http://www.ufcstats.com/statistics/events/completed?page=all')
soup = BeautifulSoup(response.content, 'html.parser')

#targeting URLs that contain fighter data
events = soup.findAll('a', attrs={'href': re.compile('http:')})
for event in events:
    event_url_list.append(event.get('href'))

#removing duplicate URLs
event_url_list = list(set(event_url_list))

['http://www.ufcstats.com/event-details/821cd80aab70d5f9',
 'http://www.ufcstats.com/event-details/1652f3213655b935',
 'http://www.ufcstats.com/event-details/b4ad3a06ee4d660c',
 'http://www.ufcstats.com/event-details/1e13936d708bcff7',
 'http://www.ufcstats.com/event-details/4f732e58ed907eff',
 'http://www.ufcstats.com/event-details/a71feb7ea7592a71',
 'http://www.ufcstats.com/event-details/d6455cb4bee503ce',
 'http://www.ufcstats.com/event-details/7139cd2ae4bf6a29',
 'http://www.ufcstats.com/event-details/3144121470023e9a',
 'http://www.ufcstats.com/event-details/9e0f28d1f639ad73',
 'http://www.ufcstats.com/event-details/2ee09ec2a0695eb9',
 'http://www.ufcstats.com/event-details/222d6b547de2e035',
 'http://www.ufcstats.com/event-details/4512e46543b960ad',
 'http://www.ufcstats.com/event-details/0313bf497de9c470',
 'http://www.ufcstats.com/event-details/896c322f56b8be5a',
 'http://www.ufcstats.com/event-details/a5c53b3ddb31cc7d',
 'http://www.ufcstats.com/event-details/282fa667ff9c51ed

In [42]:
#removing invalid URLs
for url in event_url_list:
    if 'event-details' not in url:
        event_url_list.remove(url)
        
event_url_list[:15]

['http://www.ufcstats.com/event-details/821cd80aab70d5f9',
 'http://www.ufcstats.com/event-details/1652f3213655b935',
 'http://www.ufcstats.com/event-details/b4ad3a06ee4d660c',
 'http://www.ufcstats.com/event-details/1e13936d708bcff7',
 'http://www.ufcstats.com/event-details/4f732e58ed907eff',
 'http://www.ufcstats.com/event-details/a71feb7ea7592a71',
 'http://www.ufcstats.com/event-details/d6455cb4bee503ce',
 'http://www.ufcstats.com/event-details/7139cd2ae4bf6a29',
 'http://www.ufcstats.com/event-details/3144121470023e9a',
 'http://www.ufcstats.com/event-details/9e0f28d1f639ad73',
 'http://www.ufcstats.com/event-details/2ee09ec2a0695eb9',
 'http://www.ufcstats.com/event-details/222d6b547de2e035',
 'http://www.ufcstats.com/event-details/4512e46543b960ad',
 'http://www.ufcstats.com/event-details/0313bf497de9c470',
 'http://www.ufcstats.com/event-details/896c322f56b8be5a']

In [43]:
event_database_df = pd.DataFrame()

event_database_df['event'] = event_url_list

In [45]:
event_soups = list(map(info_generator, event_url_list))

In [47]:
event_database_df['event_soup'] = event_soups

In [91]:
tester = event_soups[:5]

In [None]:
for test in tester:
    test.findAll('p', attrs={'href': re.compile('http:')})

In [326]:
event_database_df.to_csv('event_database_csv', index=False)

PROTOCODE ==================================

In [297]:
fight_soup_urls = []

for url_list in soup_database.fight_urls:
    for url in url_list:
        if url not in fight_soup_urls:
            fight_soup_urls.append(url) 
        else:
            pass

In [298]:
fight_soup_urls

['http://www.ufcstats.com/fight-details/8b5130c2f2723a90',
 'http://www.ufcstats.com/fight-details/5769c08adc653958',
 'http://www.ufcstats.com/fight-details/d825023bc9f343fc',
 'http://www.ufcstats.com/fight-details/5584458cadd0a0ca',
 'http://www.ufcstats.com/fight-details/9cb8fe20b05d886e',
 'http://www.ufcstats.com/fight-details/f0cf2ca8e5c621a8',
 'http://www.ufcstats.com/fight-details/8e4439cdc98d0644',
 'http://www.ufcstats.com/fight-details/74e92a66b29674a9',
 'http://www.ufcstats.com/fight-details/72481ce1897251cd',
 'http://www.ufcstats.com/fight-details/fa3b940bd48da7e2',
 'http://www.ufcstats.com/fight-details/09c092dc766fa3f9',
 'http://www.ufcstats.com/fight-details/84d89d2f36d07dc0',
 'http://www.ufcstats.com/fight-details/3e0090669d51415f',
 'http://www.ufcstats.com/fight-details/fa77134e0d90da3b',
 'http://www.ufcstats.com/fight-details/1ed266bed8005bbe',
 'http://www.ufcstats.com/fight-details/cb2844bb15d22cc6',
 'http://www.ufcstats.com/fight-details/4804776154430aad

In [299]:
fight_soups = list(map(info_generator, fight_soup_urls))

In [300]:
fight_soups_df = pd.DataFrame()

In [286]:
#fight_soups_df['soups'] = fight_soups

In [287]:
def event_title_fetcher(fight_soup):
    holder = fight_soup.find('a', attrs={'href': re.compile('event-details')})
    return holder.text.strip()

In [305]:
event_titles = list(map(event_title_fetcher, fight_soups))

In [306]:
event_titles[:4]

['Fury FC 5: Final Conflict',
 'PRIDE Bushido 9: The Tournament',
 'PRIDE Bushido 8',
 'PRIDE Bushido 6']

In [307]:
fight_database = pd.DataFrame()

In [308]:
fight_database['event'] = event_titles

In [49]:
fight_soups_df.to_csv('fight_soups_csv', index=False)