# UFC_Stats Web Scraping

## importing pertinent libraries

In [1]:
#importing pertinent libraries
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests, re
import json
import itertools
from datetime import datetime
import pickle

In [2]:
# Build a midnight-of-today timestamp; a later cell compares event dates
# against it to find events that have not happened yet.

today = datetime.today()
# Human-readable form of today's date, e.g. "March 05, 2020".
current_date = today.strftime("%B %d, %Y")
# Truncate to midnight directly rather than formatting the date to a string
# and parsing that string back into a datetime.
current_datetime = today.replace(hour=0, minute=0, second=0, microsecond=0)

## Fetching Fighter Url List

In [18]:
# Load the previously pickled list of fighter profile URLs from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must only ever come from a trusted source.
with open('fighter_url_list', 'rb') as f:
    fighter_url_list = pickle.load(f)

In [19]:
fighter_url_list[:5]

['http://www.ufcstats.com/fighter-details/ee0b69e307c857e5',
 'http://www.ufcstats.com/fighter-details/269d103c96a4c3a5',
 'http://www.ufcstats.com/fighter-details/8753e125f4499816',
 'http://www.ufcstats.com/fighter-details/0e9869d712e81f8f',
 'http://www.ufcstats.com/fighter-details/3738e68d2261e60f']

## Creating Dataframe Building Blocks

### Generating basic statistics lists

In [20]:
def fighter_soup_generator(url, timeout=30):
    '''Request a url and parse the response body with BeautifulSoup.

    Args:
        url(str): url that will be requested and parsed
        timeout(float): seconds to wait for the server before giving up;
            without a timeout a stalled connection blocks indefinitely

    Returns:
        BeautifulSoup object built from the response content using the
        'html.parser' backend

    Raises:
        requests.exceptions.Timeout: the server did not answer in time
    '''

    response = requests.get(url, timeout=timeout)

    return BeautifulSoup(response.content, 'html.parser')

In [21]:
# Parse every fighter page once up front so that later steps reuse the soups
# instead of requesting the website again.
fighter_url_soup_list = [fighter_soup_generator(url) for url in fighter_url_list]

In [22]:
def base_stats_soup_generator(soup):
    '''Collect the <li> elements whose class matches 'b-list' from a soup.

    Args:
        soup(BeautifulSoup object): must be an unaltered soup

    Returns:
        the matching elements, exactly as returned by soup.findChildren
    '''

    list_item_class = re.compile('b-list')

    return soup.findChildren('li', attrs={'class': list_item_class})

In [23]:
# One list of stat <li> elements per fighter page, in the same order as the soups.
fighter_stats_objects = [base_stats_soup_generator(page) for page in fighter_url_soup_list]

### Construction of Fighter Physical Statistics DataFrame 

In [26]:
def fighter_name(soup):
    '''Return the fighter name shown in the title of a profile page.

    Args:
        soup(BeautifulSoup object): soup of a url hosting a fighter profile

    Returns:
        str: text of the first <span> whose class matches
        'b-content__title-highlight', with surrounding whitespace removed
    '''

    title_class = re.compile('b-content__title-highlight')

    name_tag = soup.find('span', attrs={'class': title_class})

    return name_tag.text.strip()

In [27]:
def fighter_record(soup):
    '''Return the record shown in the title of a fighter profile page.

    Args:
        soup(BeautifulSoup object): soup of a url hosting a fighter profile

    Returns:
        str: the text after the first colon of the first <span> whose class
        matches 'b-content__title-record', stripped of whitespace
    '''

    record_class = re.compile('b-content__title-record')

    first_match = soup.findChildren('span', attrs={'class': record_class})[0]

    label_and_value = first_match.text.strip().split(':')

    return label_and_value[1].strip()

In [28]:
def basic_stats_height(soup):

    '''Convert the height entry of a fighter's stats list to total inches.

    soups run through this function should be run through the stats_soup
    function beforehand.

    Args:
        soup: list of stat elements; element 0 must expose a .text of the
            form <label>: <feet>' <inches>"

    Returns:
        int: feet * 12 + inches, or None when the height is missing
        (shown as '--' on the page, or empty)
    '''

    # Everything after the label's colon, e.g. 5' 11"
    value = soup[0].text.strip().split(':')[1].strip()

    tokens = value.split()

    # Missing height: make the None result explicit instead of falling off
    # the end of the function.
    if not tokens or tokens[0] == '--':

        return None

    feet = int(tokens[0].replace("'", ""))

    # Treat an absent inches token like an unknown ('--') one: zero inches.
    inches_text = tokens[1] if len(tokens) > 1 else "0"

    inches = int(inches_text.replace("'", "").replace('"', '').replace("--", "0"))

    return feet * 12 + inches

In [29]:
def basic_stats_weight(soup):

    '''Read the weight entry (element 1) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 1 must expose a .text of the
            form <label>: <number> lbs.

    Returns:
        int weight with the ' lbs.' suffix removed, or the string '--'
        when the page shows no weight
    '''

    raw_value = soup[1].text.strip().split(':')[1].strip()

    pounds = raw_value.replace(' lbs.', '')

    return pounds if pounds == '--' else int(pounds)

In [30]:
def basic_stats_reach(soup):

    '''Read the reach entry (element 2) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 2 must expose a .text of the
            form <label>: <number>"

    Returns:
        int reach with the inch mark removed, or the string '--' when the
        page shows no reach
    '''

    reach_text = soup[2].text.strip().split(':')[1].strip().replace('"', '')

    # Guard clause: a missing value is passed through untouched.
    if reach_text == '--':

        return reach_text

    return int(reach_text)

In [31]:
def basic_stats_stance(soup):

    '''Read the stance entry (element 3) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 3 must expose a .text of the
            form <label>: <stance>

    Returns:
        str: the text after the first colon, stripped of whitespace
    '''

    label_and_value = soup[3].text.strip().split(':')

    return label_and_value[1].strip()

In [32]:
def basic_stats_DOB(soup):

    '''Read the date-of-birth entry (element 4) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 4 must expose a .text of the
            form <label>: <date>

    Returns:
        str: the text after the first colon, stripped of whitespace; the
        date is returned as text and is not parsed
    '''

    label_and_value = soup[4].text.strip().split(':')

    return label_and_value[1].strip()

In [33]:
# Names and records are read from the full page soups ...
fighter_names = [fighter_name(page) for page in fighter_url_soup_list]

fighter_records = [fighter_record(page) for page in fighter_url_soup_list]

# ... while the physical attributes are read from each page's stat elements.
fighter_heights = [basic_stats_height(stats) for stats in fighter_stats_objects]

fighter_weights = [basic_stats_weight(stats) for stats in fighter_stats_objects]

fighter_reachs = [basic_stats_reach(stats) for stats in fighter_stats_objects]

fighter_stances = [basic_stats_stance(stats) for stats in fighter_stats_objects]

fighter_DOBs = [basic_stats_DOB(stats) for stats in fighter_stats_objects]

In [74]:
# Assemble the per-fighter lists into one frame; the dict's insertion order
# fixes the column order.
fighter_basic_stats_df = pd.DataFrame({
    'name': fighter_names,
    'record': fighter_records,
    'height': fighter_heights,
    'weight': fighter_weights,
    'reach': fighter_reachs,
    'stance': fighter_stances,
    'DOB': fighter_DOBs,
})

In [75]:
fighter_basic_stats_df.head()

Unnamed: 0,name,record,height,weight,reach,stance,DOB
0,Marcio Alexandre Junior,17-4-0,72.0,185,75,Southpaw,"May 05, 1989"
1,Yoshihiro Akiyama,14-6-0 (2 NC),70.0,170,75,Orthodox,"Jul 29, 1975"
2,Rostem Akman,6-2-0,70.0,170,72,Switch,"Dec 19, 1991"
3,Marcus Aurelio,22-10-0,70.0,155,74,Orthodox,"Aug 18, 1973"
4,Junior Albini,14-6-0,75.0,264,74,Orthodox,"Mar 15, 1991"


In [30]:
# Persist the physical-stats frame without the index column.
# NOTE(review): the following cell reads 'fighter_physical_stats_csv', which is
# a different file name from the one written here — confirm which is intended.
fighter_basic_stats_df.to_csv('fighter_basic_stats_csv', index=False)

In [3]:
pd.read_csv('fighter_physical_stats_csv').head()

Unnamed: 0,name,record,height,weight,reach,stance,DOB
0,Chris Amarante,2-0-0,,185,--,,--
1,Kenji Arai,15-15-5,70.0,145,--,Southpaw,"Oct 05, 1979"
2,Blas Avena,8-7-0 (1 NC),72.0,170,74,Orthodox,"Jun 30, 1983"
3,Sam Adkins,7-20-2,75.0,225,--,Orthodox,"Apr 26, 1965"
4,Andrei Arlovski,29-19-0 (2 NC),75.0,240,77,Orthodox,"Feb 04, 1979"


In [4]:
pd.read_csv('fighter_career_stats_csv').head()

Unnamed: 0,name,SLpMs,StrAccs,SApMs,StrDefs,TDAvgs,TDAccs,TDDefs,SubAvgs
0,Edwin Aguilar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Romie Aram,1.8,35.0,2.83,50.0,2.5,62.0,83.0,0.0
2,JJ Aldrich,3.88,40.0,4.68,62.0,0.31,25.0,53.0,0.0
3,Jaime Alvarez,2.73,40.0,4.53,56.0,1.0,33.0,50.0,0.0
4,Juan Adams,7.09,55.0,4.06,34.0,0.91,66.0,57.0,0.0


***************************************************************************************************************

### Career Statistics DataFrame Construction

In [5]:
def career_stats_soup(soup):

    '''Return every <li> element whose class matches 'b-list'.

    The list is returned unsliced. The career_stats_* parsers in this
    notebook read positions 5 through 13 of a list of this shape, so
    trimming the leading entries here would shift those positions.

    Args:
        soup(BeautifulSoup object): soup of a fighter profile page

    Returns:
        the matching elements, exactly as returned by soup.findChildren
    '''

    # The original also computed a [5:] slice that was never returned and
    # named a local after the function itself; both are removed.
    return soup.findChildren('li', attrs={'class': re.compile('b-list')})

In [6]:
def career_stats_SLpM(soup):

    '''Read the SLpM entry (element 5) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 5 must expose a .text of the
            form <label>: <value>

    Returns:
        float value, or the string '--' when the page shows no value
    '''

    after_label = soup[5].text.split(':')[1]

    value = after_label.replace('%', '').strip()

    if value == '--':

        return value

    return float(value)

In [7]:
def career_stats_StrAcc(soup):

    '''Read the StrAcc entry (element 6) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 6 must expose a .text of the
            form <label>: <value>%

    Returns:
        float value with the percent sign removed, or the string '--'
        when the page shows no value
    '''

    after_label = soup[6].text.split(':')[1]

    value = after_label.replace('%', '').strip()

    return value if value == '--' else float(value)

In [8]:
def career_stats_SApM(soup):

    '''Read the SApM entry (element 7) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 7 must expose a .text of the
            form <label>: <value>

    Returns:
        float value, or the string '--' when the page shows no value
    '''

    value = soup[7].text.split(':')[1].strip()

    if value == '--':

        return value

    return float(value)

In [9]:
def career_stats_StrDef(soup):
    '''Read the StrDef entry (element 8) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 8 must expose a .text of the
            form <label>: <value>%

    Returns:
        float value with the percent sign removed, or the string '--'
        when the page shows no value
    '''
    after_label = soup[8].text.split(':')[1]
    value = after_label.replace('%', '').strip()

    return value if value == '--' else float(value)

In [10]:
def career_stats_TDAvg(soup):
    '''Read the TDAvg entry (element 10) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 10 must expose a .text of the
            form <label>: <value>

    Returns:
        float value, or the string '--' when the page shows no value
    '''
    value = soup[10].text.split(':')[1].strip()

    if value == '--':
        return value

    return float(value)

In [11]:
def career_stats_TDAcc(soup):

    '''Read the TDAcc entry (element 11) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 11 must expose a .text of the
            form <label>: <value>%

    Returns:
        float value with the percent sign removed, or the string '--'
        when the page shows no value
    '''

    trimmed = soup[11].text.split(':')[1].strip()

    value = trimmed.replace('%', '')

    return value if value == '--' else float(value)

In [12]:
def career_stats_TDDef(soup):

    '''Read the TDDef entry (element 12) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 12 must expose a .text of the
            form <label>: <value>%

    Returns:
        float value with the percent sign removed, or the string '--'
        when the page shows no value
    '''

    after_label = soup[12].text.split(':')[1]

    value = after_label.replace('%', '').strip()

    if value == '--':

        return value

    return float(value)

In [13]:
def career_stats_SubAvg(soup):

    '''Read the SubAvg entry (element 13) of a fighter's stats list.

    Args:
        soup: list of stat elements; element 13 must expose a .text of the
            form <label>: <value>

    Returns:
        float value (any percent sign removed), or the string '--' when
        the page shows no value
    '''

    trimmed = soup[13].text.split(':')[1].strip()

    value = trimmed.replace('%', '')

    if value == '--':

        return value

    return float(value)

In [24]:
# One list per career statistic, each in the same order as fighter_stats_objects.
fighter_SLpMs = [career_stats_SLpM(stats) for stats in fighter_stats_objects]

fighter_StrAccs = [career_stats_StrAcc(stats) for stats in fighter_stats_objects]

fighter_SApMs = [career_stats_SApM(stats) for stats in fighter_stats_objects]

fighter_StrDefs = [career_stats_StrDef(stats) for stats in fighter_stats_objects]

fighter_TDAvgs = [career_stats_TDAvg(stats) for stats in fighter_stats_objects]

fighter_TDAccs = [career_stats_TDAcc(stats) for stats in fighter_stats_objects]

fighter_TDDefs = [career_stats_TDDef(stats) for stats in fighter_stats_objects]

fighter_SubAvgs = [career_stats_SubAvg(stats) for stats in fighter_stats_objects]

In [34]:
# Assemble the per-fighter career statistics into one frame; the dict's
# insertion order fixes the column order.
fighter_career_stats_df = pd.DataFrame({
    'name': fighter_names,
    'SLpMs': fighter_SLpMs,
    'StrAccs': fighter_StrAccs,
    'SApMs': fighter_SApMs,
    'StrDefs': fighter_StrDefs,
    'TDAvgs': fighter_TDAvgs,
    'TDAccs': fighter_TDAccs,
    'TDDefs': fighter_TDDefs,
    'SubAvgs': fighter_SubAvgs,
})

In [35]:
fighter_career_stats_df.head()

Unnamed: 0,name,SLpMs,StrAccs,SApMs,StrDefs,TDAvgs,TDAccs,TDDefs,SubAvgs
0,Chris Amarante,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Kenji Arai,8.79,27.0,10.29,39.0,0.0,0.0,0.0,0.0
2,Blas Avena,2.34,47.0,3.36,43.0,2.37,80.0,50.0,1.8
3,Sam Adkins,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Andrei Arlovski,3.54,44.0,2.69,59.0,0.5,36.0,79.0,0.3


In [46]:
fighter_career_stats_df.to_csv('fighter_career_stats_csv', index=False)

### Constructing Fight Event and Fight List Database

In [11]:
# Collect the unique URLs linked from the completed-events listing page.
response = requests.get('http://www.ufcstats.com/statistics/events/completed?page=all')
soup = BeautifulSoup(response.content, 'html.parser')

# Only anchors whose href contains 'http:' are considered.
events = soup.findAll('a', attrs={'href': re.compile('http:')})

# A set comprehension gathers the hrefs and drops duplicates in one step.
event_url_list = list({anchor.get('href') for anchor in events})

In [12]:
#removing invalid URLs
# Build a filtered list instead of calling remove() while iterating over the
# same list: each removal shifts the remaining items left, so the loop skips
# the element right after it and can leave invalid URLs behind.
# A URL is kept when it points at an event-details page and is at least 54
# characters long (the length of the event URLs shown in this notebook).
event_url_list = [
    url for url in event_url_list
    if 'event-details' in url and len(url) >= 54
]

In [14]:
event_url_list

['http://www.ufcstats.com/event-details/243b07fc65ccbb16',
 'http://www.ufcstats.com/event-details/6ca68b636fbc1f18',
 'http://www.ufcstats.com/event-details/4c8d6fde2dde07c4',
 'http://www.ufcstats.com/event-details/1e13936d708bcff7',
 'http://www.ufcstats.com/event-details/e8efeb9cf33b1941',
 'http://www.ufcstats.com/event-details/31ceaf0e670c1578',
 'http://www.ufcstats.com/event-details/45a2ba3ef82b9700',
 'http://www.ufcstats.com/event-details/5cde96e0a1a1fffe',
 'http://www.ufcstats.com/event-details/f5990c11974d8e9c',
 'http://www.ufcstats.com/event-details/f3155a94ca420126',
 'http://www.ufcstats.com/event-details/e0b74df14f52cd15',
 'http://www.ufcstats.com/event-details/c0c1bc0766df4c00',
 'http://www.ufcstats.com/event-details/b0550072e5f0afa7',
 'http://www.ufcstats.com/event-details/35dc6220b113b7ec',
 'http://www.ufcstats.com/event-details/319fa1bd3176bded',
 'http://www.ufcstats.com/event-details/eed2b71d77d95416',
 'http://www.ufcstats.com/event-details/a196332ee4aa8a82

In [402]:
test = pd.read_html('http://www.ufcstats.com/event-details/e670f8cc2969a789')[0]

In [403]:
test.head()

Unnamed: 0,W/L,Fighter,Str,Td,Sub,Pass,Weight class,Method,Round,Time
0,win,Matt Hughes Frank Trigg,0 0,1 1,2 1,2 0,Welterweight,SUB Rear Naked Choke,1,3:54
1,win,Matt Lindland Falaniko Vitale,35 3,1 0,1 0,3 2,Middleweight,KO/TKO Punches,3,4:23
2,win,Wesley Correira David Abbott,13 11,0 0,0 0,0 0,Heavyweight,KO/TKO,1,2:14
3,win,Evan Tanner Phil Baroni,28 17,1 0,0 0,2 0,Middleweight,KO/TKO Elbows,1,4:42
4,win,Robbie Lawler Chris Lytle,30 37,3 1,0 1,2 0,Welterweight,U-DEC,3,5:00


In [440]:
def fighter_column_transformer(df):

    '''Split each df['Fighter'] entry into names of two words each.

    Every cell is split on whitespace and the words are re-joined two at a
    time, so 'Matt Hughes Frank Trigg' becomes ['Matt Hughes', 'Frank Trigg'].
    A trailing odd word forms an entry of its own.

    Args:
    df: DataFrame with a 'Fighter' column of strings

    Returns:
    list holding one list of joined names per row of df '''
    names_per_row = []
    for cell in df['Fighter']:

        words = cell.split()

        # Step through the words two at a time and glue each pair back together.
        names_per_row.append(
            [' '.join(words[start:start + 2]) for start in range(0, len(words), 2)]
        )

    return names_per_row

In [442]:
fighter_column_transformer(test)

[['Matt Hughes', 'Frank Trigg'],
 ['Matt Lindland', 'Falaniko Vitale'],
 ['Wesley Correira', 'David Abbott'],
 ['Evan Tanner', 'Phil Baroni'],
 ['Robbie Lawler', 'Chris Lytle'],
 ['Pedro Rizzo', 'Ricco Rodriguez'],
 ['Keith Rockel', 'Chris Liguori'],
 ['Yves Edwards', 'Nick Agallar']]

In [435]:
def column_transformer(df, column):

    '''Convert a column of space-separated integers into lists of ints.

    Args:
    df: DataFrame that holds the column
    column: name of the column whose string cells are split on whitespace

    Returns:
    list holding one list of ints per row, e.g. '35 3' becomes [35, 3]

    Raises:
    ValueError: a cell contains a token that is not an integer '''

    # Read from the df argument. The original looked the rows up in a global
    # named `test`, which ignored the frame passed in and failed whenever
    # that global did not exist.
    return [[int(value) for value in cell.split()] for cell in df[column]]

In [436]:
column_transformer(test, 'Str')

[[0, 0], [35, 3], [13, 11], [28, 17], [30, 37], [30, 15], [5, 1], [33, 5]]

***************************************************************************************************************

In [446]:
# Replace the 'Str' column with the per-fight lists of ints.
test.Str = column_transformer(test, 'Str')
# Display one row per list element; explode() returns a new frame, so `test`
# itself keeps the list-valued column.
test.explode('Str')

Unnamed: 0,W/L,Fighter,Str,Td,Sub,Pass,Weight class,Method,Round,Time
0,win,"[Matt Hughes, Frank Trigg]",0,1 1,2 1,2 0,Welterweight,SUB Rear Naked Choke,1,3:54
0,win,"[Matt Hughes, Frank Trigg]",0,1 1,2 1,2 0,Welterweight,SUB Rear Naked Choke,1,3:54
1,win,"[Matt Lindland, Falaniko Vitale]",35,1 0,1 0,3 2,Middleweight,KO/TKO Punches,3,4:23
1,win,"[Matt Lindland, Falaniko Vitale]",3,1 0,1 0,3 2,Middleweight,KO/TKO Punches,3,4:23
2,win,"[Wesley Correira, David Abbott]",13,0 0,0 0,0 0,Heavyweight,KO/TKO,1,2:14
2,win,"[Wesley Correira, David Abbott]",11,0 0,0 0,0 0,Heavyweight,KO/TKO,1,2:14
3,win,"[Evan Tanner, Phil Baroni]",28,1 0,0 0,2 0,Middleweight,KO/TKO Elbows,1,4:42
3,win,"[Evan Tanner, Phil Baroni]",17,1 0,0 0,2 0,Middleweight,KO/TKO Elbows,1,4:42
4,win,"[Robbie Lawler, Chris Lytle]",30,3 1,0 1,2 0,Welterweight,U-DEC,3,5:00
4,win,"[Robbie Lawler, Chris Lytle]",37,3 1,0 1,2 0,Welterweight,U-DEC,3,5:00


In [53]:
# Request and parse every event page once; later cells reuse these soups.
event_soups = [fighter_soup_generator(url) for url in event_url_list]

In [54]:
len(event_soups)

522

In [6]:
event_database_df.head()

Unnamed: 0,date,name,event_fight_list,matchups,strike_counts,takedown_counts,submission_counts,pass_counts,weight_class,win_method,final_round,final_round_time
0,2019-10-26,UFC Fight Night: Maia vs. Askren,['http://www.ufcstats.com/fight-details/241a08...,"[('Demian Maia', 'Ben Askren'), ('Stevie Ray',...","[['69', '63'], ['85', '78'], ['13', '3'], ['94...","[['0', '4'], ['1', '0'], ['2', '0'], ['3', '0'...","[['1', '0'], ['0', '0'], ['1', '0'], ['1', '0'...","[['4', '1'], ['1', '0'], ['1', '0'], ['1', '0'...","['Welterweight', 'Lightweight', 'Lightweight',...","['SUB', 'M-DEC', 'SUB', 'SUB', 'U-DEC', 'S-DEC...","[3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3]","['3:54', '5:00', '2:02', '4:46', '5:00', '5:00..."
1,2016-04-10,UFC Fight Night: Rothwell vs Dos Santos,['http://www.ufcstats.com/fight-details/5f6b8e...,"[('Junior Dos Santos', 'Ben Rothwell'), ('Derr...","[['157', '77'], ['17', '5'], ['36', '24'], ['4...","[['0', '0'], ['0', '1'], ['0', '2'], ['0', '1'...","[['0', '0'], ['0', '0'], ['0', '0'], ['0', '0'...","[['0', '0'], ['0', '2'], ['0', '1'], ['0', '1'...","['Heavyweight', 'Heavyweight', 'Heavyweight', ...","['U-DEC', 'KO/TKO', 'KO/TKO', 'U-DEC', 'U-DEC'...","[5, 1, 2, 3, 3, 3, 3, 1, 1, 1, 3, 1, 3]","['5:00', '4:48', '5:00', '5:00', '5:00', '5:00..."
2,2012-10-05,UFC on FX: Browne vs Bigfoot,['http://www.ufcstats.com/fight-details/0cced3...,"[('Antonio Silva', 'Travis Browne'), ('Jake El...","[['15', '8'], ['25', '32'], ['33', '9'], ['6',...","[['0', '0'], ['2', '0'], ['0', '0'], ['0', '0'...","[['0', '0'], ['0', '0'], ['0', '0'], ['1', '0'...","[['0', '0'], ['0', '0'], ['0', '0'], ['0', '0'...","['Heavyweight', 'Welterweight', 'Flyweight', '...","['KO/TKO', 'U-DEC', 'KO/TKO', 'SUB', 'KO/TKO',...","[1, 3, 2, 1, 2, 2, 3, 1, 3, 2]","['3:27', '5:00', '4:35', '0:45', '1:06', '0:29..."
3,2002-07-13,UFC 38: Brawl at the Hall,['http://www.ufcstats.com/fight-details/0f6dd5...,"[('Matt Hughes', 'Carlos Newton'), ('Ian Freem...","[['58', '3'], ['34', '6'], ['4', '0'], ['9', '...","[['4', '0'], ['0', '1'], ['0', '0'], ['2', '0'...","[['1', '1'], ['0', '3'], ['0', '0'], ['3', '0'...","[['12', '1'], ['0', '0'], ['0', '0'], ['2', '0...","['Welterweight', 'Heavyweight', 'Middleweight'...","['KO/TKO', 'KO/TKO', 'KO/TKO', 'SUB', 'U-DEC',...","[4, 1, 1, 2, 3, 3, 3]","['3:37', '4:35', '0:10', '1:38', '5:00', '5:00..."
4,2013-10-09,UFC Fight Night: Maia vs Shields,['http://www.ufcstats.com/fight-details/a41ba2...,"[('Jake Shields', 'Demian Maia'), ('Dong Hyun ...","[['35', '24'], ['11', '28'], ['96', '70'], ['8...","[['1', '3'], ['1', '0'], ['0', '3'], ['0', '1'...","[['0', '0'], ['0', '0'], ['0', '0'], ['1', '0'...","[['5', '1'], ['1', '0'], ['0', '0'], ['0', '0'...","['Welterweight', 'Welterweight', 'Light Heavyw...","['S-DEC', 'KO/TKO', 'U-DEC', 'S-DEC', 'SUB', '...","[5, 2, 3, 3, 1, 3, 3, 3, 2, 1]","['5:00', '3:01', '5:00', '5:00', '0:31', '5:00..."


In [55]:
def event_date_fetcher(event_soup):
    '''Return the date text of an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        str: the text after the first colon of the first <li> whose class
        matches 'b-list__', stripped of whitespace; None when no such
        element exists or its text contains no colon
    '''
    date_holder = event_soup.findChildren('li', attrs={'class': re.compile('b-list__')})

    try:
        return date_holder[0].text.split(':')[1].strip()

    # Only the lookups above are expected to fail; a bare except would also
    # hide unrelated errors such as KeyboardInterrupt.
    except (IndexError, AttributeError):

        return None

In [56]:
def event_fights_parser(event_soup):
    '''List the fight-details links found on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list: the href of every <a> whose href matches 'fight-details',
        in the order findAll returns them
    '''

    fight_links = event_soup.findAll('a', attrs={'href': re.compile('fight-details')})

    return [link.get('href') for link in fight_links]

In [57]:
def event_name_fetcher(event_soup):
    '''Return the stripped text of the first <h2> whose class matches 'b-content__'.'''
    heading = event_soup.find('h2', attrs={'class': re.compile('b-content__')})
    return heading.text.strip()

In [78]:
# Gather the date text and the name of every event page.
event_dates_list = [event_date_fetcher(page) for page in event_soups]

event_names_list = [event_name_fetcher(page) for page in event_soups]

# Start the event frame from both lists, then turn the date text into
# datetime values.
event_database_df = pd.DataFrame({'date': event_dates_list, 'name': event_names_list})

event_database_df['date'] = pd.to_datetime(event_database_df['date'])

In [59]:
event_database_df

Unnamed: 0,event,date,name
0,http://www.ufcstats.com/event-details/df05aa15...,2019-10-26,UFC Fight Night: Maia vs. Askren
1,http://www.ufcstats.com/event-details/4887e5bc...,2016-04-10,UFC Fight Night: Rothwell vs Dos Santos
2,http://www.ufcstats.com/event-details/6a0b80a2...,2012-10-05,UFC on FX: Browne vs Bigfoot
3,http://www.ufcstats.com/event-details/1dc56b59...,2002-07-13,UFC 38: Brawl at the Hall
4,http://www.ufcstats.com/event-details/43612456...,2013-10-09,UFC Fight Night: Maia vs Shields
...,...,...,...
517,http://www.ufcstats.com/event-details/ae58685c...,2003-04-25,UFC 42: Sudden Impact
518,http://www.ufcstats.com/event-details/1979c801...,2017-09-16,UFC Fight Night: Rockhold vs. Branch
519,http://www.ufcstats.com/event-details/46f11d15...,2008-07-19,UFC: Silva vs Irvin
520,http://www.ufcstats.com/event-details/db1f2ed6...,2016-03-05,UFC 196: McGregor vs Diaz


In [60]:
# Events dated after today are treated as pending: find their rows and discard
# the matching soups.
# NOTE(review): assumes event_database_df rows are in the same order as
# event_soups and carry the default 0..n-1 index — confirm.
event_database_df_dropped = event_database_df[event_database_df.date > current_datetime]

pending_events_indexes = event_database_df_dropped.index.tolist()

# Pop from the highest position down. Removing a lower position first shifts
# every later soup one slot left, so the remaining indexes would then point at
# the wrong elements (or past the end of the list).
for index in sorted(pending_events_indexes, reverse=True):
    event_soups.pop(index)

In [61]:
event_database_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   event   522 non-null    object        
 1   date    522 non-null    datetime64[ns]
 2   name    522 non-null    object        
dtypes: datetime64[ns](1), object(2)
memory usage: 12.4+ KB


In [62]:
def event_fight_participants_fetcher(event_soup):

    '''Pair up the names linked on an event page, two consecutive links per fight.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list of (name, name) tuples built from the stripped text of every
        <a> whose class matches 'b-link'; links at even positions supply the
        first name and links at odd positions the second. A trailing link
        without a partner is dropped.
    '''

    links = event_soup.findChildren('a', attrs={'class': re.compile('b-link')})

    names = [link.text.strip() for link in links]

    return list(zip(names[0::2], names[1::2]))

In [63]:
def event_fight_strikes(event_soup):

    '''Collect the strike counts of every fight row on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list with one entry per fight row: the whitespace-split text of the
        row's cell at position 2
    '''

    row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    cell_class = re.compile('b-fight-details')

    strikes_per_fight = []

    for row in event_soup.findAll('tr', attrs={'class': row_class}):

        cells = row.findAll('td', attrs={'class': cell_class})

        strikes_per_fight.append(cells[2].text.split())

    return strikes_per_fight

In [64]:
def event_fight_takedowns(event_soup):

    '''Collect the takedown counts of every fight row on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list with one entry per fight row: the whitespace-split text of the
        row's cell at position 3
    '''

    row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    cell_class = re.compile('b-fight-details')

    fight_rows = event_soup.findAll('tr', attrs={'class': row_class})

    return [
        row.findAll('td', attrs={'class': cell_class})[3].text.split()
        for row in fight_rows
    ]

In [65]:
def event_fight_submissions(event_soup):

    '''Collect the submission counts of every fight row on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list with one entry per fight row: the whitespace-split text of the
        row's cell at position 4
    '''

    row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    cell_class = re.compile('b-fight-details')

    submissions_per_fight = []

    for row in event_soup.findAll('tr', attrs={'class': row_class}):

        submission_cell = row.findAll('td', attrs={'class': cell_class})[4]

        submissions_per_fight.append(submission_cell.text.split())

    return submissions_per_fight


In [66]:
def event_fight_passes(event_soup):

    '''Collect the pass counts of every fight row on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list with one entry per fight row: the whitespace-split text of the
        row's cell at position 5
    '''

    row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    cell_class = re.compile('b-fight-details')

    fight_rows = event_soup.findAll('tr', attrs={'class': row_class})

    return [
        row.findAll('td', attrs={'class': cell_class})[5].text.split()
        for row in fight_rows
    ]

In [67]:
def event_fight_weight_classes(event_soup):

    '''Collect the weight class of every fight row on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list with one string per fight row: the stripped text of the row's
        cell at position 6
    '''

    row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    cell_class = re.compile('b-fight-details')

    weight_classes = []

    for row in event_soup.findAll('tr', attrs={'class': row_class}):

        cells = row.findAll('td', attrs={'class': cell_class})

        weight_classes.append(cells[6].text.strip())

    return weight_classes

In [68]:
def event_fight_win_method(event_soup):

    '''Collect the win method of every fight row on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list with one string per fight row: the stripped text of the first
        <p> (class matching 'b-fight-details') inside the row's cell at
        position 7
    '''

    row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    detail_class = re.compile('b-fight-details')

    methods = []

    for row in event_soup.findAll('tr', attrs={'class': row_class}):

        method_cell = row.findAll('td', attrs={'class': detail_class})[7]

        paragraphs = method_cell.findAll('p', attrs={'class': detail_class})

        methods.append(paragraphs[0].text.strip())

    return methods

In [69]:
def event_fight_win_method_details(event_soup):

    '''Collect the win-method detail of every fight row on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list with one string per fight row: the stripped text of the second
        <p> (class matching 'b-fight-details') inside the row's cell at
        position 7
    '''

    row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    detail_class = re.compile('b-fight-details')

    details = []

    for row in event_soup.findAll('tr', attrs={'class': row_class}):

        method_cell = row.findAll('td', attrs={'class': detail_class})[7]

        paragraphs = method_cell.findAll('p', attrs={'class': detail_class})

        details.append(paragraphs[1].text.strip())

    return details

In [70]:
def event_fight_final_round(event_soup):

    '''Collect the final round number of every fight row on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list with one int per fight row, parsed from the stripped text of
        the row's cell at position 8
    '''

    row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    cell_class = re.compile('b-fight-details')

    fight_rows = event_soup.findAll('tr', attrs={'class': row_class})

    return [
        int(row.findAll('td', attrs={'class': cell_class})[8].text.strip())
        for row in fight_rows
    ]
    

In [71]:
def event_fight_final_round_time(event_soup):

    '''Collect the final-round time of every fight row on an event page.

    Args:
        event_soup(BeautifulSoup object): soup of an event-details page

    Returns:
        list with one string per fight row: the stripped text of the row's
        last cell
    '''

    row_class = 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'
    cell_class = re.compile('b-fight-details')

    fight_rows = event_soup.findAll('tr', attrs={'class': row_class})

    return [
        row.findAll('td', attrs={'class': cell_class})[-1].text.strip()
        for row in fight_rows
    ]

In [79]:
# Run every per-event extractor over the event soups; each resulting list has
# one entry per event, in the same order as event_soups.
event_fight_list = [event_fights_parser(page) for page in event_soups]

matchup_list = [event_fight_participants_fetcher(page) for page in event_soups]

fighter_strike_count_list = [event_fight_strikes(page) for page in event_soups]

fight_takedown_list = [event_fight_takedowns(page) for page in event_soups]

fight_submission_list = [event_fight_submissions(page) for page in event_soups]

fight_pass_list = [event_fight_passes(page) for page in event_soups]

fight_weight_class_list = [event_fight_weight_classes(page) for page in event_soups]

fight_win_method_list = [event_fight_win_method(page) for page in event_soups]

fight_final_round_list = [event_fight_final_round(page) for page in event_soups]

fight_final_round_time_list = [event_fight_final_round_time(page) for page in event_soups]

In [80]:
# Attach the per-event lists to the event frame, one column per list, in the
# order given here.
fight_columns = {
    'event_fight_list': event_fight_list,
    'matchups': matchup_list,
    'strike_counts': fighter_strike_count_list,
    'takedown_counts': fight_takedown_list,
    'submission_counts': fight_submission_list,
    'pass_counts': fight_pass_list,
    'weight_class': fight_weight_class_list,
    'win_method': fight_win_method_list,
    'final_round': fight_final_round_list,
    'final_round_time': fight_final_round_time_list,
}

for column_name, column_values in fight_columns.items():
    event_database_df[column_name] = column_values

event_database_df

Unnamed: 0,date,name,event_fight_list,matchups,strike_counts,takedown_counts,submission_counts,pass_counts,weight_class,win_method,final_round,final_round_time
0,2019-10-26,UFC Fight Night: Maia vs. Askren,[http://www.ufcstats.com/fight-details/241a083...,"[(Demian Maia, Ben Askren), (Stevie Ray, Micha...","[[69, 63], [85, 78], [13, 3], [94, 32], [63, 4...","[[0, 4], [1, 0], [2, 0], [3, 0], [3, 0], [0, 2...","[[1, 0], [0, 0], [1, 0], [1, 0], [0, 0], [0, 0...","[[4, 1], [1, 0], [1, 0], [1, 0], [0, 0], [1, 3...","[Welterweight, Lightweight, Lightweight, Heavy...","[SUB, M-DEC, SUB, SUB, U-DEC, S-DEC, U-DEC, U-...","[3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3]","[3:54, 5:00, 2:02, 4:46, 5:00, 5:00, 5:00, 5:0..."
1,2016-04-10,UFC Fight Night: Rothwell vs Dos Santos,[http://www.ufcstats.com/fight-details/5f6b8e4...,"[(Junior Dos Santos, Ben Rothwell), (Derrick L...","[[157, 77], [17, 5], [36, 24], [45, 48], [98, ...","[[0, 0], [0, 1], [0, 2], [0, 1], [4, 0], [1, 1...","[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 2...","[[0, 0], [0, 2], [0, 1], [0, 1], [4, 3], [3, 0...","[Heavyweight, Heavyweight, Heavyweight, Heavyw...","[U-DEC, KO/TKO, KO/TKO, U-DEC, U-DEC, U-DEC, U...","[5, 1, 2, 3, 3, 3, 3, 1, 1, 1, 3, 1, 3]","[5:00, 4:48, 5:00, 5:00, 5:00, 5:00, 5:00, 4:0..."
2,2012-10-05,UFC on FX: Browne vs Bigfoot,[http://www.ufcstats.com/fight-details/0cced3b...,"[(Antonio Silva, Travis Browne), (Jake Ellenbe...","[[15, 8], [25, 32], [33, 9], [6, 0], [17, 17],...","[[0, 0], [2, 0], [0, 0], [0, 0], [0, 0], [0, 1...","[[0, 0], [0, 0], [0, 0], [1, 0], [0, 1], [0, 1...","[[0, 0], [0, 0], [0, 0], [0, 0], [0, 2], [0, 1...","[Heavyweight, Welterweight, Flyweight, Welterw...","[KO/TKO, U-DEC, KO/TKO, SUB, KO/TKO, KO/TKO, S...","[1, 3, 2, 1, 2, 2, 3, 1, 3, 2]","[3:27, 5:00, 4:35, 0:45, 1:06, 0:29, 5:00, 2:3..."
3,2002-07-13,UFC 38: Brawl at the Hall,[http://www.ufcstats.com/fight-details/0f6dd5d...,"[(Matt Hughes, Carlos Newton), (Ian Freeman, F...","[[58, 3], [34, 6], [4, 0], [9, 5], [30, 29], [...","[[4, 0], [0, 1], [0, 0], [2, 0], [4, 0], [4, 0...","[[1, 1], [0, 3], [0, 0], [3, 0], [0, 1], [0, 0...","[[12, 1], [0, 0], [0, 0], [2, 0], [9, 0], [0, ...","[Welterweight, Heavyweight, Middleweight, Ligh...","[KO/TKO, KO/TKO, KO/TKO, SUB, U-DEC, U-DEC, U-...","[4, 1, 1, 2, 3, 3, 3]","[3:37, 4:35, 0:10, 1:38, 5:00, 5:00, 5:00]"
4,2013-10-09,UFC Fight Night: Maia vs Shields,[http://www.ufcstats.com/fight-details/a41ba26...,"[(Jake Shields, Demian Maia), (Dong Hyun Kim, ...","[[35, 24], [11, 28], [96, 70], [87, 57], [0, 2...","[[1, 3], [1, 0], [0, 3], [0, 1], [0, 1], [0, 2...","[[0, 0], [0, 0], [0, 0], [1, 0], [2, 0], [1, 3...","[[5, 1], [1, 0], [0, 0], [0, 0], [0, 0], [0, 1...","[Welterweight, Welterweight, Light Heavyweight...","[S-DEC, KO/TKO, U-DEC, S-DEC, SUB, S-DEC, U-DE...","[5, 2, 3, 3, 1, 3, 3, 3, 2, 1]","[5:00, 3:01, 5:00, 5:00, 0:31, 5:00, 5:00, 5:0..."
...,...,...,...,...,...,...,...,...,...,...,...,...
516,2003-04-25,UFC 42: Sudden Impact,[http://www.ufcstats.com/fight-details/00a5e03...,"[(Matt Hughes, Sean Sherk), (Pete Spratt, Robb...","[[44, 19], [19, 15], [60, 24], [22, 6], [16, 6...","[[5, 3], [1, 2], [1, 2], [0, 0], [0, 0], [0, 3...","[[1, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0...","[[10, 2], [1, 3], [1, 1], [0, 0], [0, 0], [0, ...","[Welterweight, Welterweight, Welterweight, Hea...","[U-DEC, SUB, U-DEC, KO/TKO, KO/TKO, U-DEC, U-D...","[5, 2, 3, 2, 1, 3, 3, 1]","[5:00, 2:28, 5:00, 1:46, 2:40, 5:00, 5:00, 3:55]"
517,2017-09-16,UFC Fight Night: Rockhold vs. Branch,[http://www.ufcstats.com/fight-details/24438a6...,"[(Luke Rockhold, David Branch), (Mike Perry, A...","[[58, 25], [11, 6], [35, 65], [20, 22], [16, 1...","[[2, 1], [0, 0], [0, 1], [4, 0], [0, 0], [0, 0...","[[0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0...","[[2, 0], [0, 0], [0, 0], [6, 0], [0, 0], [0, 0...","[Middleweight, Welterweight, Middleweight, Lig...","[KO/TKO, KO/TKO, KO/TKO, SUB, KO/TKO, S-DEC, S...","[2, 1, 3, 2, 1, 3, 3, 1, 2, 2]","[4:05, 1:19, 2:33, 2:11, 2:48, 5:00, 5:00, 0:2..."
518,2008-07-19,UFC: Silva vs Irvin,[http://www.ufcstats.com/fight-details/4eafb26...,"[(Anderson Silva, James Irvin), (Brandon Vera,...","[[13, 0], [38, 30], [69, 18], [24, 1], [37, 51...","[[0, 0], [2, 1], [10, 0], [1, 0], [0, 5], [0, ...","[[0, 0], [0, 0], [0, 1], [0, 0], [0, 0], [2, 0...","[[0, 0], [2, 0], [0, 0], [1, 0], [0, 0], [1, 0...","[Light Heavyweight, Light Heavyweight, Lightwe...","[KO/TKO, U-DEC, U-DEC, KO/TKO, KO/TKO, SUB, KO...","[1, 3, 3, 1, 3, 1, 1, 1, 1, 2, 1]","[1:01, 5:00, 5:00, 2:02, 3:35, 3:58, 1:54, 1:3..."
519,2016-03-05,UFC 196: McGregor vs Diaz,[http://www.ufcstats.com/fight-details/4ace70b...,"[(Nate Diaz, Conor McGregor), (Miesha Tate, Ho...","[[77, 61], [40, 59], [30, 23], [55, 31], [39, ...","[[1, 0], [2, 0], [3, 0], [1, 1], [2, 1], [3, 1...","[[2, 0], [2, 0], [0, 0], [0, 0], [2, 0], [1, 0...","[[1, 0], [3, 0], [1, 0], [1, 0], [4, 1], [4, 0...","[Welterweight, Women's Bantamweight, Light Hea...","[SUB, SUB, U-DEC, U-DEC, U-DEC, SUB, KO/TKO, K...","[2, 5, 3, 3, 3, 3, 2, 2, 3, 3, 1, 2]","[4:12, 3:30, 5:00, 5:00, 5:00, 4:11, 1:34, 1:0..."


In [4]:
event_database_df.to_csv('event_database_csv', index=False)

NameError: name 'event_database_df' is not defined

In [5]:
event_database_df = pd.read_csv('event_database_csv')

In [7]:
test = pd.read_html('http://www.ufcstats.com/fighter-details/d28dee5c705991df')
test_df = test[0]

In [8]:
test_df.drop([0, 1], inplace=True)

--------------------------------------FIX IT FIX IT FIX IT FIX IT FIX IT FIX IT--------------------------------------------

In [46]:
def fighter_history_fetcher(soup):
    '''Collects the fight-details links found in a fighter page soup.

    Args:
        soup(BeautifulSoup object): soup that will be searched for anchor tags.

    Returns:
        list: one single-element list per matching anchor, each holding that
        anchor's href value, in the order the anchors were found.
    '''

    # every <a> tag whose href matches the 'fight-details' pattern
    fight_anchors = soup.findAll('a', attrs={'href': re.compile('fight-details')})

    # each href stays wrapped in its own list, so the result is nested one level
    return [[anchor.get('href')] for anchor in fight_anchors]

In [48]:
fighter_history_fetcher(fighter_url_soup_list[0])

[['http://www.ufcstats.com/fight-details/889207e3c622a5aa'],
 ['http://www.ufcstats.com/fight-details/7c23770c639dcd22'],
 ['http://www.ufcstats.com/fight-details/ac656cebef8cf0a4'],
 ['http://www.ufcstats.com/fight-details/8d9b460143ae19bc'],
 ['http://www.ufcstats.com/fight-details/3c6659f1b09760ba'],
 ['http://www.ufcstats.com/fight-details/a9d8ddb49dbe414b'],
 ['http://www.ufcstats.com/fight-details/78f1457371e2ef6b'],
 ['http://www.ufcstats.com/fight-details/49b05e23e3a2af13']]

In [14]:
# Run fighter_history_fetcher over every cached fighter soup, giving one
# nested list of fight urls per fighter.
fighter_history_url_list = list(map(fighter_history_fetcher, fighter_url_soup_list))
# preview the first three fighters
fighter_history_url_list[:3]

[[['http://www.ufcstats.com/fight-details/1206dc7d5228f5bd']],
 [['http://www.ufcstats.com/fight-details/9f5f9173666ce1b7'],
  ['http://www.ufcstats.com/fight-details/010793ad35d686d5']],
 [['http://www.ufcstats.com/fight-details/96fe84d051847591'],
  ['http://www.ufcstats.com/fight-details/64bee58391921b4b'],
  ['http://www.ufcstats.com/fight-details/55b53fdad307b708'],
  ['http://www.ufcstats.com/fight-details/d2f6b87189908088'],
  ['http://www.ufcstats.com/fight-details/9f10afab94cafc45'],
  ['http://www.ufcstats.com/fight-details/bd4a648bebaefab3'],
  ['http://www.ufcstats.com/fight-details/87b6c507a7684ec2']]]

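fighter_history_url_list now holds one entry per fighter, and each entry is a list of single-url lists, so it is flattened by one level below.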

In [74]:
def compound_flattener(fight_url_list):
    '''Reduces the nesting of a list of lists by exactly one level.

    Args:
        fight_url_list(list): iterable whose items are themselves iterable,
            e.g. a list of single-url lists.

    Returns:
        list: the items of every inner iterable, in their original order.
    '''

    # walk the outer list, then each inner group, keeping the items in order
    return [item for group in fight_url_list for item in group]

In [16]:
# Flatten each fighter's list of single-url lists by one level, so every
# fighter maps to a plain list of fight urls.
fighter_history_url_list = list(map(compound_flattener, fighter_history_url_list))

In [17]:
fighter_history_url_list[:3]

[['http://www.ufcstats.com/fight-details/1206dc7d5228f5bd'],
 ['http://www.ufcstats.com/fight-details/9f5f9173666ce1b7',
  'http://www.ufcstats.com/fight-details/010793ad35d686d5'],
 ['http://www.ufcstats.com/fight-details/96fe84d051847591',
  'http://www.ufcstats.com/fight-details/64bee58391921b4b',
  'http://www.ufcstats.com/fight-details/55b53fdad307b708',
  'http://www.ufcstats.com/fight-details/d2f6b87189908088',
  'http://www.ufcstats.com/fight-details/9f10afab94cafc45',
  'http://www.ufcstats.com/fight-details/bd4a648bebaefab3',
  'http://www.ufcstats.com/fight-details/87b6c507a7684ec2']]

In [450]:
# Toy dataframe: the 'Noise' column mixes list cells with one plain string,
# while the scalar 'Cause' value is repeated on every row.
# NOTE(review): presumably a scratch example for handling list-valued cells - confirm.
loud_noises_df = pd.DataFrame({'Noise': [['boom', 'ka-bloom'], 'bang', ['pow', 'kapow'], ['boom', 'badaboom', 'boosh']], 'Cause': 'explosion'})
loud_noises_df

Unnamed: 0,Noise,Cause
0,"[boom, ka-bloom]",explosion
1,bang,explosion
2,"[pow, kapow]",explosion
3,"[boom, badaboom, boosh]",explosion
