# UFC_Stats Web Scraping

## importing pertinent libraries

In [1]:
#importing pertinent libraries
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests, re
import json
import itertools
from datetime import datetime
import pickle
import numpy as np

In [2]:
#creating datetime object to parse out future events
# The strftime/strptime round trip uses a date-only format, so
# current_datetime carries today's date with the time fields reset to zero.

today = datetime.today()
current_date= today.strftime("%B %d, %Y")
current_datetime = datetime.strptime(current_date, "%B %d, %Y" ) 

## Fetching Fighter Url List

TODO: describe the URL fetcher that produced the pickled `fighter_url_list` loaded below.

In [3]:
# Load the previously pickled list of fighter profile URLs.
# NOTE(review): pickle.load can execute arbitrary code; only load files this project wrote itself.
with open('fighter_url_list', 'rb') as f:
    fighter_url_list = pickle.load(f)

In [4]:
# Preview the first five fighter profile URLs
fighter_url_list[:5]

['http://www.ufcstats.com/fighter-details/ee0b69e307c857e5',
 'http://www.ufcstats.com/fighter-details/269d103c96a4c3a5',
 'http://www.ufcstats.com/fighter-details/8753e125f4499816',
 'http://www.ufcstats.com/fighter-details/0e9869d712e81f8f',
 'http://www.ufcstats.com/fighter-details/3738e68d2261e60f']

## Creating Dataframe Building Blocks

### Generating basic statistics lists

In [5]:
def soup_generator(url, timeout=30):
    '''Download a page and parse it with BeautifulSoup.

    Args:
        url(str): url that will be fetched and parsed using BeautifulSoup
        timeout(float): seconds to wait on the server before requests gives
            up; without a timeout a stalled connection would block forever

    Returns:
        BeautifulSoup object built from the response body with the
        'html.parser' backend
    '''

    response = requests.get(url, timeout=timeout)

    return BeautifulSoup(response.content, 'html.parser')

In [6]:
# Parse every fighter page once up front so later cells reuse the soups
# instead of requesting the website again
fighter_url_soup_list = [soup_generator(fighter_url) for fighter_url in fighter_url_list]

In [7]:
def base_stats_soup_generator(soup):
    '''Collect the <li> elements whose class matches 'b-list' from a UFC Stats page.

    Args:

        soup(BeautifulSoup object) : must be an unaltered soup

    Returns:
        the result of soup.findChildren for that query
    '''

    class_pattern = re.compile('b-list')

    return soup.findChildren('li', attrs={'class': class_pattern})

In [8]:
# One list of 'b-list' <li> elements per fighter page
fighter_stats_objects = [base_stats_soup_generator(page_soup) for page_soup in fighter_url_soup_list]

### Construction of Fighter Physical Statistics DataFrame 

In [78]:
def fighter_name(soup):
    '''Return the fighter name shown in a profile page title.

    Args:

        soup(BeautifulSoup object): BeautifulSoup object must originate from a url hosting a fighter profile

    Returns:
        str: text of the first span whose class matches
        'b-content__title-highlight', with surrounding whitespace removed
    '''

    title_span = soup.find('span', attrs={'class': re.compile('b-content__title-highlight')})

    cleaned = title_span.text.strip()

    return str(cleaned)

In [82]:
def fighter_record(soup):
    '''Return the text after the first ':' in the page's title-record span.

    Args:

        soup(BeautifulSoup object): parsed fighter profile page
    '''

    record_spans = soup.findChildren('span', attrs={'class': re.compile('b-content__title-record')})

    label_and_value = record_spans[0].text.strip().split(':')

    return label_and_value[1].strip()

In [83]:
def basic_stats_height(soup):

    '''Convert the height entry (feet and inches) into total inches.

    soups run through this function should be run through the stats soup
    function beforehand.

    Args:

        soup: list of stat elements; item 0 holds text shaped like
            "Height: 5' 10\""

    Returns:
        int total inches, or None when the height is missing ('--' or blank)
    '''

    value = soup[0].text.strip().split(':')[1].strip()

    # split on any run of whitespace so repeated spaces between feet and
    # inches cannot produce an empty token
    parts = value.split()

    feet = parts[0].replace("'", "") if parts else '--'

    if feet == '--':

        return None

    # a value with only the feet part counts as zero extra inches
    inches_text = parts[1] if len(parts) > 1 else '0'

    inches = int(inches_text.replace("'", "").replace('"', '').replace("--", "0"))

    return int(feet) * 12 + inches

In [111]:
def basic_stats_weight(soup):

    '''Read the weight entry (item 1 of the stat list).

    Returns:
        int number of pounds, or the string '--' when no weight is listed
    '''

    raw_value = soup[1].text.strip().split(':')[1].strip()

    pounds = raw_value.replace(' lbs.', '')

    return pounds if pounds == '--' else int(pounds)

In [98]:
def basic_stats_reach(soup):

    '''Read the reach entry (item 2 of the stat list).

    Returns:
        int reach with the inch mark removed, or the string '--' when no
        reach is listed
    '''

    entry_text = soup[2].text.strip()

    reach = entry_text.split(':')[1].strip().replace('"', '')

    # missing values are handed back untouched
    if reach == '--':
        return reach

    return int(reach)

In [94]:
def basic_stats_stance(soup):

    '''Read the stance entry (item 3 of the stat list) as a string.

    An entry with nothing after the ':' yields an empty string.
    '''

    pieces = soup[3].text.strip().split(':')

    return str(pieces[1].strip())

In [100]:
def basic_stats_DOB(soup):

    '''Read the date-of-birth entry (item 4 of the stat list) as displayed text.'''

    label_and_value = soup[4].text.strip().split(':')

    return label_and_value[1].strip()

In [113]:
# Name and record come from the full page soups
fighter_names = [fighter_name(page) for page in fighter_url_soup_list]

fighter_records = [fighter_record(page) for page in fighter_url_soup_list]

# The physical attributes come from the per-fighter stat element lists
fighter_heights = [basic_stats_height(stats) for stats in fighter_stats_objects]

fighter_weights = [basic_stats_weight(stats) for stats in fighter_stats_objects]

fighter_reachs = [basic_stats_reach(stats) for stats in fighter_stats_objects]

fighter_stances = [basic_stats_stance(stats) for stats in fighter_stats_objects]

fighter_DOBs = [basic_stats_DOB(stats) for stats in fighter_stats_objects]

In [114]:
# Assemble the physical statistics table, one column per parsed attribute
fighter_physical_stats_df = pd.DataFrame({
    'name': fighter_names,
    'record': fighter_records,
    'height': fighter_heights,
    'weight': fighter_weights,
    'reach': fighter_reachs,
    'stance': fighter_stances,
    'DOB': fighter_DOBs,
})

In [115]:
# Show the first rows of the physical statistics table
fighter_physical_stats_df.head()

Unnamed: 0,name,record,height,weight,reach,stance,DOB
0,Chris Amarante,2-0-0,,185,--,,--
1,Kenji Arai,15-15-5,70.0,145,--,Southpaw,"Oct 05, 1979"
2,Blas Avena,8-7-0 (1 NC),72.0,170,74,Orthodox,"Jun 30, 1983"
3,Sam Adkins,7-20-2,75.0,225,--,Orthodox,"Apr 26, 1965"
4,Andrei Arlovski,29-19-0 (2 NC),75.0,240,77,Orthodox,"Feb 04, 1979"


In [105]:
# Write the table to disk without the index column (the file name has no .csv extension)
fighter_physical_stats_df.to_csv('fighter_physical_stats_csv', index=False)

### Construction of Career Statistics DataFrame 

In [5]:
def career_stats_soup(soup):

    '''Collect the <li> elements whose class matches 'b-list'.

    The full, unsliced result is returned, so element positions are the same
    as in the list produced by base_stats_soup_generator.

    Args:

        soup(BeautifulSoup object): parsed fighter profile page

    Returns:
        the result of soup.findChildren for that query
    '''

    return soup.findChildren('li', attrs={'class': re.compile('b-list')})

In [6]:
def career_stats_SLpM(soup):

    '''Read the SLpM entry (item 5 of the stat list).

    Returns:
        float value, or the string '--' when the stat is missing
    '''

    text_after_label = soup[5].text.split(':')[1]

    SLpM = text_after_label.replace('%', '').strip()

    if SLpM == '--':
        return SLpM

    return float(SLpM)

In [7]:
def career_stats_StrAcc(soup):

    '''Read the striking accuracy entry (item 6 of the stat list).

    Returns:
        float value with the percent sign removed, or the string '--' when
        the stat is missing
    '''

    StrAcc = soup[6].text.split(':')[1].replace('%', '').strip()

    return StrAcc if StrAcc == '--' else float(StrAcc)

In [8]:
def career_stats_SApM(soup):

    '''Read the SApM entry (item 7 of the stat list).

    Returns:
        float value, or the string '--' when the stat is missing
    '''

    SApM = soup[7].text.split(':')[1].strip()

    # only real readings are converted; the placeholder passes through
    if SApM != '--':
        SApM = float(SApM)

    return SApM

In [9]:
def career_stats_StrDef(soup):

    '''Read the striking defence entry (item 8 of the stat list).

    Returns:
        float value with the percent sign removed, or the string '--' when
        the stat is missing
    '''

    entry = soup[8]

    StrDef = entry.text.split(':')[1].replace('%', '').strip()

    if StrDef == '--':
        return StrDef

    return float(StrDef)

In [10]:
def career_stats_TDAvg(soup):

    '''Read the takedown average entry (item 10 of the stat list).

    Returns:
        float value, or the string '--' when the stat is missing
    '''

    label_and_value = soup[10].text.split(':')

    TDAvg = label_and_value[1].strip()

    return TDAvg if TDAvg == '--' else float(TDAvg)

In [11]:
def career_stats_TDAcc(soup):

    '''Read the takedown accuracy entry (item 11 of the stat list).

    Returns:
        float value with the percent sign removed, or the string '--' when
        the stat is missing
    '''

    trimmed = soup[11].text.split(':')[1].strip()

    TDAcc = trimmed.replace('%', '')

    if TDAcc != '--':
        TDAcc = float(TDAcc)

    return TDAcc

In [12]:
def career_stats_TDDef(soup):

    '''Read the takedown defence entry (item 12 of the stat list).

    Returns:
        float value with the percent sign removed, or the string '--' when
        the stat is missing
    '''

    value_text = soup[12].text.split(':')[1]

    TDDef = value_text.replace('%', '').strip()

    if TDDef == '--':
        return TDDef

    return float(TDDef)

In [13]:
def career_stats_SubAvg(soup):

    '''Read the submission average entry (item 13 of the stat list).

    Returns:
        float value, or the string '--' when the stat is missing
    '''

    SubAvg = soup[13].text.split(':')[1].strip().replace('%', '')

    return SubAvg if SubAvg == '--' else float(SubAvg)

In [24]:
# Run each career-stat parser over every fighter's stat element list
fighter_SLpMs = [career_stats_SLpM(stats) for stats in fighter_stats_objects]

fighter_StrAccs = [career_stats_StrAcc(stats) for stats in fighter_stats_objects]

fighter_SApMs = [career_stats_SApM(stats) for stats in fighter_stats_objects]

fighter_StrDefs = [career_stats_StrDef(stats) for stats in fighter_stats_objects]

fighter_TDAvgs = [career_stats_TDAvg(stats) for stats in fighter_stats_objects]

fighter_TDAccs = [career_stats_TDAcc(stats) for stats in fighter_stats_objects]

fighter_TDDefs = [career_stats_TDDef(stats) for stats in fighter_stats_objects]

fighter_SubAvgs = [career_stats_SubAvg(stats) for stats in fighter_stats_objects]

In [34]:
# Assemble the career statistics table, one column per parsed stat
fighter_career_stats_df = pd.DataFrame({
    'name': fighter_names,
    'SLpMs': fighter_SLpMs,
    'StrAccs': fighter_StrAccs,
    'SApMs': fighter_SApMs,
    'StrDefs': fighter_StrDefs,
    'TDAvgs': fighter_TDAvgs,
    'TDAccs': fighter_TDAccs,
    'TDDefs': fighter_TDDefs,
    'SubAvgs': fighter_SubAvgs,
})

In [35]:
# Show the first rows of the career statistics table
fighter_career_stats_df.head()

Unnamed: 0,name,SLpMs,StrAccs,SApMs,StrDefs,TDAvgs,TDAccs,TDDefs,SubAvgs
0,Chris Amarante,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Kenji Arai,8.79,27.0,10.29,39.0,0.0,0.0,0.0,0.0
2,Blas Avena,2.34,47.0,3.36,43.0,2.37,80.0,50.0,1.8
3,Sam Adkins,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Andrei Arlovski,3.54,44.0,2.69,59.0,0.5,36.0,79.0,0.3


In [46]:
# Write the table to disk without the index column (the file name has no .csv extension)
fighter_career_stats_df.to_csv('fighter_career_stats_csv', index=False)

### Constructing Fight Event and Fight List Database

In [76]:
# Load the previously pickled list of event page URLs.
# NOTE(review): pickle.load can execute arbitrary code; only load files this project wrote itself.
with open('event_url_list', 'rb') as f:
    event_url_list = pickle.load(f)

In [77]:
# Preview the first five event URLs
event_url_list[:5]

['http://www.ufcstats.com/event-details/c0231720fe516994',
 'http://www.ufcstats.com/event-details/aa5b4eff51bdc7d1',
 'http://www.ufcstats.com/event-details/32a3025d5db456ae',
 'http://www.ufcstats.com/event-details/b3fb8d2293e17a59',
 'http://www.ufcstats.com/event-details/bad28b7b34f334de']

#### Event Database Populator

In [78]:
# Read the first HTML table of one event page into a DataFrame used as a working sample
test = pd.read_html('http://ufcstats.com/event-details/49590e0508b2c19f')[0]
test

Unnamed: 0,W/L,Fighter,Str,Td,Sub,Pass,Weight class,Method,Round,Time
0,win,Demetrious Johnson Ian McCall,90 57,2 1,0 0,0 0,Flyweight,U-DEC,3,5:00
1,win,Erick Silva Charlie Brenneman,10 3,0 1,2 0,1 0,Welterweight,SUB Rear Naked Choke,1,4:33
2,win,Mike Pyle Josh Neer,18 19,1 0,0 0,1 0,Welterweight,KO/TKO Punch,1,4:56
3,win,Eddie Wineland Scott Jorgensen,61 48,1 0,0 0,0 0,Bantamweight,KO/TKO Punch,2,4:10
4,win,Mike Pierce Carlos Eduardo Rocha,32 17,3 0,0 0,0 0,Welterweight,U-DEC,3,5:00
5,win,Seth Baczynski Lance Benoist,25 35,1 3,0 0,1 0,Welterweight,S-DEC,3,5:00
6,win,Matt Grice Leonard Garcia,61 2,4 0,1 0,8 0,Featherweight,U-DEC,3,5:00
7,win,Dustin Pague Jared Papazian,6 7,1 0,1 0,2 0,Bantamweight,SUB Rear Naked Choke,1,3:21
8,win,Tim Means Justin Salas,26 5,0 0,0 0,0 0,Lightweight,KO/TKO Punches,1,1:06
9,win,Buddy Roberts Caio Magalhaes,35 22,0 2,0 0,0 0,Middleweight,U-DEC,3,5:00


In [79]:
def event_fight_participants_fetcher(event_soup):
    '''Pair up the fighter names linked on an event page.

    Every <a> whose class matches 'b-link' is read in document order, and
    consecutive links are combined two at a time.

    Args:
        event_soup(BeautifulSoup object): parsed event page

    Returns:
        list of (first, second) name tuples; a trailing unpaired link is dropped
    '''
    links = event_soup.findChildren('a', attrs={'class': re.compile('b-link')})
    names = [link.text.strip() for link in links]

    # even positions hold the first fighter of a bout, odd positions the second
    return list(zip(names[0::2], names[1::2]))

In [80]:
# Parse the same sample event page and list its fighter pairings
test_soup = soup_generator('http://ufcstats.com/event-details/49590e0508b2c19f')
event_fight_participants_fetcher(test_soup)

[('Demetrious Johnson', 'Ian McCall'),
 ('Erick Silva', 'Charlie Brenneman'),
 ('Mike Pyle', 'Josh Neer'),
 ('Eddie Wineland', 'Scott Jorgensen'),
 ('Mike Pierce', 'Carlos Eduardo Rocha'),
 ('Seth Baczynski', 'Lance Benoist'),
 ('Matt Grice', 'Leonard Garcia'),
 ('Dustin Pague', 'Jared Papazian'),
 ('Tim Means', 'Justin Salas'),
 ('Buddy Roberts', 'Caio Magalhaes'),
 ('Henry Martinez', 'Bernardo Magalhaes'),
 ('Sean Pierson', 'Jake Hecht')]

In [98]:
def column_transformer(df, column):

    '''Split every cell of df[column] on whitespace, in place.

    A cell such as "90 57" becomes the list ['90', '57']. The pieces stay
    strings; no numeric conversion is done.

    Args:

    df: DataFrame whose `column` holds whitespace-separated strings; it is
        modified in place

    column: name of the column to turn into lists of values

    Returns:
        None
    '''

    # str.split() with no separator already discards surrounding whitespace,
    # so the pieces need no further stripping
    df[column] = [cell.split() for cell in df[column]]

In [99]:
def event_name_fetcher(event_soup):
    '''Return the stripped text of the first <h2> whose class matches 'b-content__'.'''
    heading = event_soup.find('h2', attrs={'class': re.compile('b-content__')})
    return heading.text.strip()

In [100]:
def event_date_fetcher(event_soup):
    '''Return the event date text from an event page.

    The date is taken from the first <li> whose class matches 'b-list__',
    using the text that follows the first ':'.

    Args:
        event_soup(BeautifulSoup object): parsed event page

    Returns:
        str date text, or None when no matching <li> exists or the first one
        contains no ':'
    '''
    date_holder = event_soup.findChildren('li', attrs={'class': re.compile('b-list__')})

    try:
        return date_holder[0].text.split(':')[1].strip()

    except IndexError:
        # no list item, or no ':' separator in it: there is no date to report
        return None

In [101]:
#test.Fighter = fighter_column_transformer(test)
# Show the first rows of the sample event table
test.head()

Unnamed: 0,W/L,Fighter,Str,Td,Sub,Pass,Weight class,Method,Round,Time
0,win,"Demetrious Johnson , Ian McCall",90 57,2 1,0 0,0 0,Flyweight,U-DEC,3,5:00
1,win,"Erick Silva , Charlie Brenneman",10 3,0 1,2 0,1 0,Welterweight,SUB Rear Naked Choke,1,4:33
2,win,"Mike Pyle , Josh Neer",18 19,1 0,0 0,1 0,Welterweight,KO/TKO Punch,1,4:56
3,win,"Eddie Wineland , Scott Jorgensen",61 48,1 0,0 0,0 0,Bantamweight,KO/TKO Punch,2,4:10
4,win,"Mike Pierce , Carlos Eduardo Rocha",32 17,3 0,0 0,0 0,Welterweight,U-DEC,3,5:00


In [121]:
# Display the whole sample event table
test

Unnamed: 0,W/L,Fighter,Str,Td,Sub,Pass,Weight class,Method,Round,Time
0,win,"Demetrious Johnson , Ian McCall",90 57,2 1,0 0,0 0,Flyweight,U-DEC,3,5:00
1,win,"Erick Silva , Charlie Brenneman",10 3,0 1,2 0,1 0,Welterweight,SUB Rear Naked Choke,1,4:33
2,win,"Mike Pyle , Josh Neer",18 19,1 0,0 0,1 0,Welterweight,KO/TKO Punch,1,4:56
3,win,"Eddie Wineland , Scott Jorgensen",61 48,1 0,0 0,0 0,Bantamweight,KO/TKO Punch,2,4:10
4,win,"Mike Pierce , Carlos Eduardo Rocha",32 17,3 0,0 0,0 0,Welterweight,U-DEC,3,5:00
5,win,"Seth Baczynski , Lance Benoist",25 35,1 3,0 0,1 0,Welterweight,S-DEC,3,5:00
6,win,"Matt Grice , Leonard Garcia",61 2,4 0,1 0,8 0,Featherweight,U-DEC,3,5:00
7,win,"Dustin Pague , Jared Papazian",6 7,1 0,1 0,2 0,Bantamweight,SUB Rear Naked Choke,1,3:21
8,win,"Tim Means , Justin Salas",26 5,0 0,0 0,0 0,Lightweight,KO/TKO Punches,1,1:06
9,win,"Buddy Roberts , Caio Magalhaes",35 22,0 2,0 0,0 0,Middleweight,U-DEC,3,5:00


In [102]:
def unnesting(df, explode):
    '''Expand list-valued columns so every list element gets its own row.

    Args:
        df: DataFrame whose `explode` columns hold list-like cells
        explode: list of column names to flatten; row counts are taken from
            the first name, so each listed column needs cells of matching length

    Returns:
        DataFrame with the flattened columns first, followed by the remaining
        columns of df repeated for each element; original index labels are
        repeated rather than renumbered
    '''
    # one copy of each original index label per list element
    idx = df.index.repeat(df[explode[0]].str.len())
    df1 = pd.concat([pd.DataFrame({x: np.concatenate(df[x].values)}) for x in explode], axis=1)
    df1.index = idx

    # keyword form: the positional axis argument of drop() was removed in pandas 2.0
    return df1.join(df.drop(columns=explode), how='left')

In [103]:
# Columns whose cells hold one whitespace-separated value per fighter
target_columns = ['Str', 'Td', 'Sub', 'Pass']
# Collects one unnested DataFrame per event
transformed_df = []


# Only the first five events are processed here
for url in event_url_list[:5]:
    # NOTE(review): each url is downloaded twice, once by soup_generator and once by pd.read_html
    event_soup = soup_generator(url)
    frame_holder = pd.read_html(url)[0]   
    # Tag every fight row with the event title and date
    frame_holder['Event'] = event_name_fetcher(event_soup)
    frame_holder['Date'] = event_date_fetcher(event_soup)
    
    # Split each paired stat cell into a list before unnesting
    for column in target_columns:
        column_transformer(frame_holder, column)
           
    transformed_df.append(unnesting(frame_holder, target_columns))

In [104]:
# Stack the per-event frames into one DataFrame; the name is rebound from a list to that frame
transformed_df = pd.concat(transformed_df)
transformed_df

Unnamed: 0,Str,Td,Sub,Pass,W/L,Fighter,Weight class,Method,Round,Time,Event,Date
0,6,0,0,0,win,Jimi Manuwa Corey Anderson,Light Heavyweight,KO/TKO Punch,1,3:05,UFC Fight Night: Manuwa vs. Anderson,"March 18, 2017"
0,6,0,0,0,win,Jimi Manuwa Corey Anderson,Light Heavyweight,KO/TKO Punch,1,3:05,UFC Fight Night: Manuwa vs. Anderson,"March 18, 2017"
1,8,1,1,5,win,Gunnar Nelson Alan Jouban,Welterweight,SUB Guillotine Choke,2,0:46,UFC Fight Night: Manuwa vs. Anderson,"March 18, 2017"
1,17,0,0,0,win,Gunnar Nelson Alan Jouban,Welterweight,SUB Guillotine Choke,2,0:46,UFC Fight Night: Manuwa vs. Anderson,"March 18, 2017"
2,67,0,0,0,win,Marlon Vera Brad Pickett,Catch Weight,KO/TKO Kick,3,3:50,UFC Fight Night: Manuwa vs. Anderson,"March 18, 2017"
...,...,...,...,...,...,...,...,...,...,...,...,...
6,11,0,2,4,win,Jeremy Stephens Diego Saraiva,Lightweight,U-DEC,3,5:00,UFC 76: Knockout,"September 22, 2007"
7,6,1,2,0,win,Christian Wellisch Scott Junk,Heavyweight,SUB Heel Hook,1,3:19,UFC 76: Knockout,"September 22, 2007"
7,17,0,0,0,win,Christian Wellisch Scott Junk,Heavyweight,SUB Heel Hook,1,3:19,UFC 76: Knockout,"September 22, 2007"
8,59,4,0,5,win,Matt Wiman Michihiro Omigawa,Lightweight,U-DEC,3,5:00,UFC 76: Knockout,"September 22, 2007"


In [105]:
# Select rows containing at least one null value
transformed_df[transformed_df.isnull().any(axis=1)]

Unnamed: 0,Str,Td,Sub,Pass,W/L,Fighter,Weight class,Method,Round,Time,Event,Date


In [36]:
# Index labels repeat across events after the concat, so label 8 selects rows from several events
transformed_df.loc[8]

Unnamed: 0,Str,Td,Sub,Pass,W/L,Fighter,Weight class,Method,Round,Time,Event,Date
8,6,0,0,0,win,Marc Diakiese Teemu Packalen,Lightweight,KO/TKO Punch,1,0:30,UFC Fight Night: Manuwa vs. Anderson,"March 18, 2017"
8,1,0,0,0,win,Marc Diakiese Teemu Packalen,Lightweight,KO/TKO Punch,1,0:30,UFC Fight Night: Manuwa vs. Anderson,"March 18, 2017"
8,41,0,1,2,win,Sage Northcutt Enrique Marin,Lightweight,U-DEC,3,5:00,UFC 200: Tate vs Nunes,"July 09, 2016"
8,16,4,2,4,win,Sage Northcutt Enrique Marin,Lightweight,U-DEC,3,5:00,UFC 200: Tate vs Nunes,"July 09, 2016"
8,15,2,0,3,win,Alexander Yakovlev George Sullivan,Welterweight,KO/TKO Punches,1,3:59,UFC on FOX: Johnson vs. Bader,"January 30, 2016"
8,14,0,0,0,win,Alexander Yakovlev George Sullivan,Welterweight,KO/TKO Punches,1,3:59,UFC on FOX: Johnson vs. Bader,"January 30, 2016"
8,59,4,0,5,win,Matt Wiman Michihiro Omigawa,Lightweight,U-DEC,3,5:00,UFC 76: Knockout,"September 22, 2007"
8,33,1,1,0,win,Matt Wiman Michihiro Omigawa,Lightweight,U-DEC,3,5:00,UFC 76: Knockout,"September 22, 2007"


***************************************************************************************************************

In [81]:
# Name pairs for the sample event page
fighter_pairs = event_fight_participants_fetcher(test_soup)

# Overwrite the Fighter column with a shallow copy of the pair list
test.Fighter = fighter_pairs.copy()

In [95]:
# Render each name pair as a single string using ' , ' as the separator
fighter_df_pairs = [' , '.join(map(str, pair)) for pair in fighter_pairs]

test.Fighter = fighter_df_pairs
test.head()

Unnamed: 0,W/L,Fighter,Str,Td,Sub,Pass,Weight class,Method,Round,Time
0,win,"Demetrious Johnson , Ian McCall",90 57,2 1,0 0,0 0,Flyweight,U-DEC,3,5:00
1,win,"Erick Silva , Charlie Brenneman",10 3,0 1,2 0,1 0,Welterweight,SUB Rear Naked Choke,1,4:33
2,win,"Mike Pyle , Josh Neer",18 19,1 0,0 0,1 0,Welterweight,KO/TKO Punch,1,4:56
3,win,"Eddie Wineland , Scott Jorgensen",61 48,1 0,0 0,0 0,Bantamweight,KO/TKO Punch,2,4:10
4,win,"Mike Pierce , Carlos Eduardo Rocha",32 17,3 0,0 0,0 0,Welterweight,U-DEC,3,5:00


In [96]:
def fighter_column_transformer(df):

    '''Split every df['Fighter'] cell on ',' and return the pieces.

    The DataFrame itself is not modified. Pieces are not stripped, so
    whitespace around the comma stays attached to the names.

    Args:

    df: DataFrame with a 'Fighter' column of comma-separated name strings

    Returns:
        list with one list of name strings per row
    '''

    return [cell.split(',') for cell in df['Fighter']]

In [97]:
# Show the split name pairs for the sample event table
fighter_column_transformer(test)

[['Demetrious Johnson ', ' Ian McCall'],
 ['Erick Silva ', ' Charlie Brenneman'],
 ['Mike Pyle ', ' Josh Neer'],
 ['Eddie Wineland ', ' Scott Jorgensen'],
 ['Mike Pierce ', ' Carlos Eduardo Rocha'],
 ['Seth Baczynski ', ' Lance Benoist'],
 ['Matt Grice ', ' Leonard Garcia'],
 ['Dustin Pague ', ' Jared Papazian'],
 ['Tim Means ', ' Justin Salas'],
 ['Buddy Roberts ', ' Caio Magalhaes'],
 ['Henry Martinez ', ' Bernardo Magalhaes'],
 ['Sean Pierson ', ' Jake Hecht']]