# UFC_Stats Web Scraping

## importing pertinent libraries

In [33]:
#importing pertinent libraries
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests, re
import json
import itertools
import dask.dataframe as dd
from datetime import datetime
import pickle

In [34]:
# Build a midnight-truncated timestamp for today; it is compared against event dates
# further down to separate out events that have not taken place yet.

today = datetime.today()
current_date = today.strftime("%B %d, %Y")
current_datetime = today.replace(hour=0, minute=0, second=0, microsecond=0)

## Extracting UFC urls

In [37]:
# Collect every absolute link found on the fighters index page requested below
response = requests.get('http://www.ufcstats.com/statistics/fighters?char=a&page=all')
soup = BeautifulSoup(response.content, 'html.parser')

# anchors whose href contains "http:" -- this also picks up non-fighter links,
# which are weeded out in a later cell
fighters = soup.findAll('a', attrs={'href': re.compile('http:')})

# the set comprehension drops duplicate URLs; convert back to a list for slicing
fighter_url_list = list({fighter.get('href') for fighter in fighters})

In [38]:
fighter_url_list[:15]

['http://www.ufcstats.com/fighter-details/2b074403b7c6cdb4',
 'http://www.ufcstats.com/fighter-details/0541480fbf719d86',
 'http://www.ufcstats.com/fighter-details/e17770faae3ca54c',
 'http://www.ufcstats.com/fighter-details/9b28292abe3166d5',
 'http://www.ufcstats.com/fighter-details/d317a5e2b3f88c5f',
 'http://www.ufcstats.com/fighter-details/770b9d4813c25902',
 'http://www.ufcstats.com/fighter-details/d26934530dc5b248',
 'http://www.ufcstats.com/statistics/fighters?char=a&page=2',
 'http://www.ufcstats.com/fighter-details/4665cbf36b08193b',
 'http://www.ufcstats.com/fighter-details/1ffc38f67785797b',
 'http://www.ufcstats.com/fighter-details/b0550072e5f0afa7',
 'http://www.ufcstats.com/fighter-details/a0e75f4a13eb73f1',
 'http://www.ufcstats.com/fighter-details/c487223b0289bda9',
 'http://www.ufcstats.com/fighter-details/b757c73f443d4fca',
 'http://www.ufcstats.com/fighter-details/fd7acf42bd6e7e95']

Looks like there are some URLs we don't need in our list. Let's weed them out.

In [39]:
#removing invalid URLs
# Build a new list instead of calling .remove() inside the loop: removing an item
# while iterating shifts the remaining items left, so the element right after each
# removed URL was never inspected and could survive the filter.
fighter_url_list = [url for url in fighter_url_list if 'fighter-details' in url]

In [40]:
fighter_url_list[:5]

['http://www.ufcstats.com/fighter-details/2b074403b7c6cdb4',
 'http://www.ufcstats.com/fighter-details/0541480fbf719d86',
 'http://www.ufcstats.com/fighter-details/e17770faae3ca54c',
 'http://www.ufcstats.com/fighter-details/9b28292abe3166d5',
 'http://www.ufcstats.com/fighter-details/d317a5e2b3f88c5f']

In [41]:
# Persist the filtered URL list to a local file named 'fighter_url_list'
with open('fighter_url_list', 'wb') as f:
    pickle.dump(fighter_url_list, f)

In [42]:
# Read the pickled list back in. pickle.load can execute arbitrary code, so only
# load files produced by this notebook. NOTE(review): `mylist` is not referenced
# again in the cells shown here -- confirm whether it is still needed.
with open('fighter_url_list', 'rb') as f:
    mylist = pickle.load(f)

## Creating Dataframe Building Blocks

### Generating basic statistics lists

In [44]:
def info_generator(url, timeout=30):
    '''Download a page and parse it with BeautifulSoup.
    
    Args:
        url(str): url that will be fetched and parsed using BeautifulSoup
        timeout(float): seconds to wait for the server before giving up
        
    Returns:
        BeautifulSoup object built from the response body with 'html.parser'
        
    Raises:
        requests.exceptions.RequestException: on timeout, connection failure
            or an HTTP error status
        '''
    
    # without a timeout one stalled connection would hang the whole scrape
    response = requests.get(url, timeout=timeout)
    
    # fail here rather than hand an error page to the parsing helpers
    response.raise_for_status()
    
    return BeautifulSoup(response.content, 'html.parser')

In [45]:
# Fetch and parse every fighter page once, so later cells work from the cached
# soups instead of requesting the website again
basic_soup_list = [info_generator(fighter_url) for fighter_url in fighter_url_list]

In [12]:
def base_stats_soup_generator(soup):
    '''Collect the <li> elements of a page whose class matches 'b-list'.
    
    Args:
        
        soup(BeautifulSoup object) : must be an unaltered soup
        
    Returns:
        
        whatever soup.findChildren yields for those <li> elements'''
    
    list_item_class = re.compile('b-list')
    
    return soup.findChildren('li', attrs={'class': list_item_class})

In [13]:
# one list of 'b-list' <li> elements per fighter page
stats_soups = [base_stats_soup_generator(page_soup) for page_soup in basic_soup_list]

### Basic Statistics DataFrame Construction

In [18]:
def fighter_name(soup):
    '''Return the fighter's name shown on a fighter profile page.
    Args:
    
        soup(BeautifulSoup object): BeautifulSoup object must originate from a url hosting a fighter profile
        
    Returns:
    
        str: stripped text of the first <span> whose class matches 'b-content__title-highlight'
    '''
    
    title_span = soup.find('span', attrs={'class': re.compile('b-content__title-highlight')})
    
    raw_name = title_span.text
    
    return raw_name.strip()

In [19]:
def fighter_record(soup):
    '''Return the text that follows the first ':' in the page's record span.
    
    Args:
    
        soup(BeautifulSoup object): parsed fighter profile page
        
    Returns:
    
        str: stripped value taken from the first <span> whose class matches 'b-content__title-record'
    '''
    
    record_spans = soup.findChildren('span', attrs={'class': re.compile('b-content__title-record')})
    
    first_span_text = record_spans[0].text.strip()
    
    after_label = first_span_text.split(':')[1]
    
    return after_label.strip()

In [20]:
def basic_stats_height(soup):
    '''Convert a fighter's listed height to total inches.
    
    soups run through this function should be run through base_stats_soup_generator beforehand.
    
    Args:
    
        soup(list): list of elements with a .text attribute; element 0 is read and is
            presumably of the form `Height: 5' 10"` -- confirm against the page markup
            
    Returns:
    
        int: feet * 12 + inches, or None when the feet part is the '--' placeholder
    '''
    
    # text after the first ':' split on spaces -> [feet, inches]
    parts = soup[0].text.strip().split(':')[1].strip().split(' ')
    
    feet = parts[0].replace("'", "")
    
    if feet == '--':
        # height not listed
        return None
    
    # an inches placeholder of '--' counts as 0
    inches = int(parts[1].replace("'", "").replace('"', '').replace("--", "0"))
    
    return int(feet) * 12 + inches

In [21]:
def basic_stats_weight(soup):
    '''Read element 1 of a stats list and return the weight as an int.
    
    Args:
    
        soup(list): list of elements with a .text attribute, as produced by base_stats_soup_generator
        
    Returns:
    
        int: value after the first ':' with the ' lbs.' suffix removed,
        or the string '--' when that is what the page lists
    '''
    
    raw_weight = soup[1].text.strip().split(':')[1].strip().replace(' lbs.', '')
    
    return raw_weight if raw_weight == '--' else int(raw_weight)

In [22]:
def basic_stats_reach(soup):
    '''Read element 2 of a stats list and return the reach as an int.
    
    Args:
    
        soup(list): list of elements with a .text attribute, as produced by base_stats_soup_generator
        
    Returns:
    
        int: value after the first ':' with the '"' mark removed,
        or the string '--' when that is what the page lists
    '''
    
    raw_reach = soup[2].text.strip().split(':')[1].strip().replace('"', '')
    
    return raw_reach if raw_reach == '--' else int(raw_reach)

In [23]:
def basic_stats_stance(soup):
    '''Return the stripped text after the first ':' in element 3 of a stats list.
    
    Args:
    
        soup(list): list of elements with a .text attribute, as produced by base_stats_soup_generator
        
    Returns:
    
        str: the stance value; an empty string when nothing follows the ':'
    '''
    
    label_and_value = soup[3].text.strip().split(':')
    
    return label_and_value[1].strip()

In [24]:
def basic_stats_DOB(soup):
    '''Return the stripped text after the first ':' in element 4 of a stats list.
    
    Args:
    
        soup(list): list of elements with a .text attribute, as produced by base_stats_soup_generator
        
    Returns:
    
        str: the date-of-birth text exactly as listed (no date parsing is done here)
    '''
    
    label_and_value = soup[4].text.strip().split(':')
    
    return label_and_value[1].strip()

In [25]:
# name and record come from the full page soups
fighter_names = [fighter_name(page_soup) for page_soup in basic_soup_list]

fighter_records = [fighter_record(page_soup) for page_soup in basic_soup_list]

# the remaining basic stats come from the pre-extracted 'b-list' items
fighter_heights = [basic_stats_height(stat_items) for stat_items in stats_soups]

fighter_weights = [basic_stats_weight(stat_items) for stat_items in stats_soups]

fighter_reachs = [basic_stats_reach(stat_items) for stat_items in stats_soups]

fighter_stances = [basic_stats_stance(stat_items) for stat_items in stats_soups]

fighter_DOBs = [basic_stats_DOB(stat_items) for stat_items in stats_soups]

In [28]:
# one row per fighter; dict order fixes the column order
fighter_basic_stats_df = pd.DataFrame({
    'name': fighter_names,
    'record': fighter_records,
    'height': fighter_heights,
    'weight': fighter_weights,
    'reach': fighter_reachs,
    'stance': fighter_stances,
    'DOB': fighter_DOBs,
})

In [29]:
fighter_basic_stats_df.head()

Unnamed: 0,name,record,height,weight,reach,stance,DOB
0,Edwin Aguilar,26-19-0 (1 NC),70.0,185,--,,--
1,Romie Aram,7-1-0,70.0,170,--,Orthodox,"Dec 02, 1977"
2,JJ Aldrich,8-4-0,65.0,125,67,Southpaw,"Sep 29, 1992"
3,Jaime Alvarez,6-1-0,68.0,125,69,Orthodox,"Mar 08, 1988"
4,Juan Adams,5-3-0,77.0,265,80,Orthodox,"Jan 16, 1992"


In [30]:
fighter_basic_stats_df.to_csv('fighter_basic_stats_csv', index=False)

In [31]:
dd.read_csv('fighter_basic_stats_csv').head()

Unnamed: 0,name,record,height,weight,reach,stance,DOB
0,Edwin Aguilar,26-19-0 (1 NC),70.0,185,--,,--
1,Romie Aram,7-1-0,70.0,170,--,Orthodox,"Dec 02, 1977"
2,JJ Aldrich,8-4-0,65.0,125,67,Southpaw,"Sep 29, 1992"
3,Jaime Alvarez,6-1-0,68.0,125,69,Orthodox,"Mar 08, 1988"
4,Juan Adams,5-3-0,77.0,265,80,Orthodox,"Jan 16, 1992"


### Career Statistics DataFrame Construction

In [32]:
def career_stats_soup(soup):
    '''Collect the <li> elements of a page whose class matches 'b-list'.
    
    Args:
    
        soup(BeautifulSoup object): parsed fighter profile page
        
    Returns:
    
        the complete, unsliced result of soup.findChildren. The original body also
        built a [5:] slice that was never returned; it is removed here and the return
        value is unchanged. The career_stats_* helpers index positions 5-13, which
        only line up with the unsliced list.
    '''
    
    return soup.findChildren('li', attrs={'class': re.compile('b-list')})

In [33]:
def career_stats_SLpM(soup):
    '''Read element 5 of a stats list and return the value after the first ':'.
    
    Returns:
    
        float, or the string '--' when that is what the page lists
    '''

    raw_value = soup[5].text.split(':')[1].replace('%', '').strip()
    
    return raw_value if raw_value == '--' else float(raw_value)

In [34]:
def career_stats_StrAcc(soup):
    '''Read element 6 of a stats list and return the value after the first ':' with '%' removed.
    
    Returns:
    
        float, or the string '--' when that is what the page lists
    '''
    raw_value = soup[6].text.split(':')[1].replace('%', '').strip()
    
    return raw_value if raw_value == '--' else float(raw_value)

In [35]:
def career_stats_SApM(soup):
    '''Read element 7 of a stats list and return the value after the first ':'.
    
    Returns:
    
        float, or the string '--' when that is what the page lists
    '''
    raw_value = soup[7].text.split(':')[1].strip()
    
    return raw_value if raw_value == '--' else float(raw_value)

In [36]:
def career_stats_StrDef(soup):
    '''Read element 8 of a stats list and return the value after the first ':' with '%' removed.
    
    Returns:
    
        float, or the string '--' when that is what the page lists
    '''
    raw_value = soup[8].text.split(':')[1].replace('%', '').strip()
    
    return raw_value if raw_value == '--' else float(raw_value)

In [37]:
def career_stats_TDAvg(soup):
    '''Read element 10 of a stats list and return the value after the first ':'.
    
    Returns:
    
        float, or the string '--' when that is what the page lists
    '''
    raw_value = soup[10].text.split(':')[1].strip()
    
    return raw_value if raw_value == '--' else float(raw_value)

In [38]:
def career_stats_TDAcc(soup):
    '''Read element 11 of a stats list and return the value after the first ':' with '%' removed.
    
    Returns:
    
        float, or the string '--' when that is what the page lists
    '''
    raw_value = soup[11].text.split(':')[1].strip().replace('%', '')
    
    return raw_value if raw_value == '--' else float(raw_value)

In [39]:
def career_stats_TDDef(soup):
    '''Read element 12 of a stats list and return the value after the first ':' with '%' removed.
    
    Returns:
    
        float, or the string '--' when that is what the page lists
    '''
    raw_value = soup[12].text.split(':')[1].replace('%', '').strip()
    
    return raw_value if raw_value == '--' else float(raw_value)

In [40]:
def career_stats_SubAvg(soup):
    '''Read element 13 of a stats list and return the value after the first ':' with '%' removed.
    
    Returns:
    
        float, or the string '--' when that is what the page lists
    '''
    raw_value = soup[13].text.split(':')[1].strip().replace('%', '')
    
    return raw_value if raw_value == '--' else float(raw_value)

In [41]:
# one value per fighter for each career statistic, all read from the 'b-list' items
fighter_SLpMs = [career_stats_SLpM(stat_items) for stat_items in stats_soups]

fighter_StrAccs = [career_stats_StrAcc(stat_items) for stat_items in stats_soups]

fighter_SApMs = [career_stats_SApM(stat_items) for stat_items in stats_soups]

fighter_StrDefs = [career_stats_StrDef(stat_items) for stat_items in stats_soups]

fighter_TDAvgs = [career_stats_TDAvg(stat_items) for stat_items in stats_soups]

fighter_TDAccs = [career_stats_TDAcc(stat_items) for stat_items in stats_soups]

fighter_TDDefs = [career_stats_TDDef(stat_items) for stat_items in stats_soups]

fighter_SubAvgs = [career_stats_SubAvg(stat_items) for stat_items in stats_soups]

In [42]:
# one row per fighter; dict order fixes the column order
fighter_career_stats_df = pd.DataFrame({
    'name': fighter_names,
    'SLpMs': fighter_SLpMs,
    'StrAccs': fighter_StrAccs,
    'SApMs': fighter_SApMs,
    'StrDefs': fighter_StrDefs,
    'TDAvgs': fighter_TDAvgs,
    'TDAccs': fighter_TDAccs,
    'TDDefs': fighter_TDDefs,
    'SubAvgs': fighter_SubAvgs,
})

In [43]:
fighter_career_stats_df.head()

Unnamed: 0,name,SLpMs,StrAccs,SApMs,StrDefs,TDAvgs,TDAccs,TDDefs,SubAvgs
0,Edwin Aguilar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Romie Aram,1.8,35.0,2.83,50.0,2.5,62.0,83.0,0.0
2,JJ Aldrich,3.88,40.0,4.68,62.0,0.31,25.0,53.0,0.0
3,Jaime Alvarez,2.73,40.0,4.53,56.0,1.0,33.0,50.0,0.0
4,Juan Adams,7.09,55.0,4.06,34.0,0.91,66.0,57.0,0.0


In [46]:
fighter_career_stats_df.to_csv('fighter_career_stats_csv', index=False)

### Constructing Fight Event and Fight List Database

In [49]:
# Collect every absolute link found on the completed-events index page
response = requests.get('http://www.ufcstats.com/statistics/events/completed?page=all')
soup = BeautifulSoup(response.content, 'html.parser')

# anchors whose href contains "http:"; links that are not event pages are removed in the next cell
events = soup.findAll('a', attrs={'href': re.compile('http:')})

# the set comprehension drops duplicate URLs; convert back to a list for slicing
event_url_list = list({event.get('href') for event in events})

In [50]:
#removing invalid URLs
# Build a new list instead of calling .remove() inside the loop: removing an item
# while iterating shifts the remaining items left, so the element right after each
# removed URL was never inspected.
# The length check drops truncated links: the event URLs listed in this notebook
# are 54 characters long (38-character prefix + 16-character id).
event_url_list = [
    url for url in event_url_list
    if 'event-details' in url and len(url) >= 54
]

In [51]:
event_url_list[:15]

['http://www.ufcstats.com/event-details/df05aa15b2d66f57',
 'http://www.ufcstats.com/event-details/4887e5bc4dbb73ff',
 'http://www.ufcstats.com/event-details/6a0b80a24f22e152',
 'http://www.ufcstats.com/event-details/1dc56b59cb28425d',
 'http://www.ufcstats.com/event-details/43612456979e5d5e',
 'http://www.ufcstats.com/event-details/e5c38954c006f15c',
 'http://www.ufcstats.com/event-details/304fcd812f12c589',
 'http://www.ufcstats.com/event-details/c6e6926a81adcd00',
 'http://www.ufcstats.com/event-details/997b4f52f76a0b53',
 'http://www.ufcstats.com/event-details/232c582f29f8f65e',
 'http://www.ufcstats.com/event-details/4d74641fac830182',
 'http://www.ufcstats.com/event-details/e8efeb9cf33b1941',
 'http://www.ufcstats.com/event-details/a79bfbc01b2264d6',
 'http://www.ufcstats.com/event-details/601cf40c09090853',
 'http://www.ufcstats.com/event-details/d29b5c4f22c6357d']

In [52]:
len(event_url_list)

522

In [53]:
# fetch and parse every event page once; later cells reuse these soups
event_soups = [info_generator(event_url) for event_url in event_url_list]

In [54]:
len(event_soups)

522

In [55]:
def event_date_fetcher(event_soup):
    '''Return the text after the first ':' in the first 'b-list__' <li> of an event page.
    
    Args:
    
        event_soup(BeautifulSoup object): parsed event page
        
    Returns:
    
        str: the stripped date text, or None when no matching <li> exists or its
        text contains no ':' separator
    '''
    date_holder = event_soup.findChildren('li', attrs={'class': re.compile('b-list__')})
    
    try:
        return date_holder[0].text.split(':')[1].strip()
        
    except IndexError:
        # only the "nothing to parse" case is tolerated; a bare except would also
        # hide real errors such as KeyboardInterrupt or a wrong argument type
        return None

In [56]:
def event_fights_parser(event_soup):
    '''List the href of every <a> on an event page whose href matches 'fight-details'.
    
    Args:
    
        event_soup(BeautifulSoup object): parsed event page
        
    Returns:
    
        list: hrefs in document order
    '''
        
    fight_links = event_soup.findAll('a', attrs={'href': re.compile('fight-details')})
    
    return [link.get('href') for link in fight_links]

In [57]:
def event_name_fetcher(event_soup):
    '''Return the stripped text of the first <h2> whose class matches 'b-content__'.
    
    Args:
    
        event_soup(BeautifulSoup object): parsed event page
    '''
    title_tag = event_soup.find('h2', attrs={'class': re.compile('b-content__')})
    return title_tag.text.strip()

In [78]:
# gather the date text and title of every event page
event_dates_list = [event_date_fetcher(event_soup) for event_soup in event_soups]

event_names_list = [event_name_fetcher(event_soup) for event_soup in event_soups]

# one row per event, in the same order as event_soups
event_database_df = pd.DataFrame({'date': event_dates_list, 'name': event_names_list})

# convert the scraped date strings to time series objects
event_database_df['date'] = pd.to_datetime(event_database_df['date'])

In [59]:
event_database_df

Unnamed: 0,event,date,name
0,http://www.ufcstats.com/event-details/df05aa15...,2019-10-26,UFC Fight Night: Maia vs. Askren
1,http://www.ufcstats.com/event-details/4887e5bc...,2016-04-10,UFC Fight Night: Rothwell vs Dos Santos
2,http://www.ufcstats.com/event-details/6a0b80a2...,2012-10-05,UFC on FX: Browne vs Bigfoot
3,http://www.ufcstats.com/event-details/1dc56b59...,2002-07-13,UFC 38: Brawl at the Hall
4,http://www.ufcstats.com/event-details/43612456...,2013-10-09,UFC Fight Night: Maia vs Shields
...,...,...,...
517,http://www.ufcstats.com/event-details/ae58685c...,2003-04-25,UFC 42: Sudden Impact
518,http://www.ufcstats.com/event-details/1979c801...,2017-09-16,UFC Fight Night: Rockhold vs. Branch
519,http://www.ufcstats.com/event-details/46f11d15...,2008-07-19,UFC: Silva vs Irvin
520,http://www.ufcstats.com/event-details/db1f2ed6...,2016-03-05,UFC 196: McGregor vs Diaz


In [60]:
# Events dated after today have not taken place yet: remove them from BOTH the
# soup list and the dataframe so the two stay row-aligned.
event_database_df_dropped = event_database_df[event_database_df.date > current_datetime]

pending_events_indexes = event_database_df_dropped.index.tolist()

# Filter by position in one pass. Popping inside a loop shifts every later
# position after each removal, so a second pop would hit the wrong soup.
pending_positions = set(pending_events_indexes)
event_soups = [
    event_soup
    for position, event_soup in enumerate(event_soups)
    if position not in pending_positions
]

# Without this the dataframe keeps the pending rows and the per-event lists built
# from event_soups no longer match its length when assigned as columns.
event_database_df = event_database_df.drop(index=pending_events_indexes).reset_index(drop=True)

In [61]:
event_database_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   event   522 non-null    object        
 1   date    522 non-null    datetime64[ns]
 2   name    522 non-null    object        
dtypes: datetime64[ns](1), object(2)
memory usage: 12.4+ KB


In [62]:
def event_fight_participants_fetcher(event_soup):
    '''Pair up the 'b-link' anchors of an event page two at a time.
    
    Args:
    
        event_soup(BeautifulSoup object): parsed event page
        
    Returns:
    
        list of tuple: (first, second) stripped link texts for each consecutive
        pair of anchors; a trailing unpaired anchor is dropped
    '''
    
    links = event_soup.findChildren('a', attrs={'class': re.compile('b-link')})
    
    names = [link.text.strip() for link in links]
    
    # even positions are the first fighter, odd positions the second
    return list(zip(names[0::2], names[1::2]))

In [63]:
def event_fight_strikes(event_soup):
    '''For every fight row of an event page, split the text of its third 'b-fight-details' cell.
    
    Returns:
    
        list of list of str: one whitespace-split token list per fight row
    '''
        
    fight_rows = event_soup.findAll('tr', attrs={'class': 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})
    
    cell_class = re.compile('b-fight-details')
    
    return [row.findAll('td', attrs={'class': cell_class})[2].text.split() for row in fight_rows]

In [64]:
def event_fight_takedowns(event_soup):
    '''For every fight row of an event page, split the text of its fourth 'b-fight-details' cell.
    
    Returns:
    
        list of list of str: one whitespace-split token list per fight row
    '''
        
    fight_rows = event_soup.findAll('tr', attrs={'class': 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})
    
    cell_class = re.compile('b-fight-details')
    
    return [row.findAll('td', attrs={'class': cell_class})[3].text.split() for row in fight_rows]

In [65]:
def event_fight_submissions(event_soup):
    '''For every fight row of an event page, split the text of its fifth 'b-fight-details' cell.
    
    Returns:
    
        list of list of str: one whitespace-split token list per fight row
    '''
    
    fight_rows = event_soup.findAll('tr', attrs={'class': 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})

    cell_class = re.compile('b-fight-details')

    return [row.findAll('td', attrs={'class': cell_class})[4].text.split() for row in fight_rows]


In [66]:
def event_fight_passes(event_soup):
    '''For every fight row of an event page, split the text of its sixth 'b-fight-details' cell.
    
    Returns:
    
        list of list of str: one whitespace-split token list per fight row
    '''
    
    fight_rows = event_soup.findAll('tr', attrs={'class': 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})

    cell_class = re.compile('b-fight-details')

    return [row.findAll('td', attrs={'class': cell_class})[5].text.split() for row in fight_rows]

In [67]:
def event_fight_weight_classes(event_soup):
    '''For every fight row of an event page, return the stripped text of its seventh 'b-fight-details' cell.
    
    Returns:
    
        list of str: one entry per fight row
    '''
    
    fight_rows = event_soup.findAll('tr', attrs={'class': 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})

    cell_class = re.compile('b-fight-details')

    return [row.findAll('td', attrs={'class': cell_class})[6].text.strip() for row in fight_rows]

In [68]:
def event_fight_win_method(event_soup):
    '''For every fight row, return the stripped text of the first 'b-fight-details' <p> inside its eighth cell.
    
    Returns:
    
        list of str: one entry per fight row
    '''
        
    fight_rows = event_soup.findAll('tr', attrs={'class': 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})
    
    methods = []
    
    for row in fight_rows:
        
        method_cell = row.findAll('td', attrs={'class': re.compile('b-fight-details')})[7]
        
        paragraphs = method_cell.findAll('p', attrs={'class': re.compile('b-fight-details')})
        
        methods.append(paragraphs[0].text.strip())
    
    return methods

In [69]:
def event_fight_win_method_details(event_soup):
    '''For every fight row, return the stripped text of the second 'b-fight-details' <p> inside its eighth cell.
    
    Returns:
    
        list of str: one entry per fight row
    '''
        
    fight_rows = event_soup.findAll('tr', attrs={'class': 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})
    
    details = []
    
    for row in fight_rows:
        
        method_cell = row.findAll('td', attrs={'class': re.compile('b-fight-details')})[7]
        
        paragraphs = method_cell.findAll('p', attrs={'class': re.compile('b-fight-details')})
        
        details.append(paragraphs[1].text.strip())
    
    return details

In [70]:
def event_fight_final_round(event_soup):
    '''For every fight row of an event page, return its ninth 'b-fight-details' cell as an int.
    
    Returns:
    
        list of int: one entry per fight row
    '''
    
    fight_rows = event_soup.findAll('tr', attrs={'class': 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})
    
    cell_class = re.compile('b-fight-details')
    
    return [int(row.findAll('td', attrs={'class': cell_class})[8].text.strip()) for row in fight_rows]
    

In [71]:
def event_fight_final_round_time(event_soup):
    '''For every fight row of an event page, return the stripped text of its last 'b-fight-details' cell.
    
    Returns:
    
        list of str: one entry per fight row
    '''
    
    fight_rows = event_soup.findAll('tr', attrs={'class': 'b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click'})
    
    cell_class = re.compile('b-fight-details')
    
    return [row.findAll('td', attrs={'class': cell_class})[-1].text.strip() for row in fight_rows]

In [79]:
# run every per-event extractor over the cached event soups; each list has one entry per event
event_fight_list = [event_fights_parser(event_soup) for event_soup in event_soups]

matchup_list = [event_fight_participants_fetcher(event_soup) for event_soup in event_soups]

fighter_strike_count_list = [event_fight_strikes(event_soup) for event_soup in event_soups]

fight_takedown_list = [event_fight_takedowns(event_soup) for event_soup in event_soups]

fight_submission_list = [event_fight_submissions(event_soup) for event_soup in event_soups]

fight_pass_list = [event_fight_passes(event_soup) for event_soup in event_soups]

fight_weight_class_list = [event_fight_weight_classes(event_soup) for event_soup in event_soups]

fight_win_method_list = [event_fight_win_method(event_soup) for event_soup in event_soups]

fight_final_round_list = [event_fight_final_round(event_soup) for event_soup in event_soups]

fight_final_round_time_list = [event_fight_final_round_time(event_soup) for event_soup in event_soups]

In [80]:
# attach one list-valued column per fight statistic, in this order
fight_columns = {
    'event_fight_list': event_fight_list,
    'matchups': matchup_list,
    'strike_counts': fighter_strike_count_list,
    'takedown_counts': fight_takedown_list,
    'submission_counts': fight_submission_list,
    'pass_counts': fight_pass_list,
    'weight_class': fight_weight_class_list,
    'win_method': fight_win_method_list,
    'final_round': fight_final_round_list,
    'final_round_time': fight_final_round_time_list,
}

for column_name, column_values in fight_columns.items():
    event_database_df[column_name] = column_values

event_database_df

Unnamed: 0,date,name,event_fight_list,matchups,strike_counts,takedown_counts,submission_counts,pass_counts,weight_class,win_method,final_round,final_round_time
0,2019-10-26,UFC Fight Night: Maia vs. Askren,[http://www.ufcstats.com/fight-details/241a083...,"[(Demian Maia, Ben Askren), (Stevie Ray, Micha...","[[69, 63], [85, 78], [13, 3], [94, 32], [63, 4...","[[0, 4], [1, 0], [2, 0], [3, 0], [3, 0], [0, 2...","[[1, 0], [0, 0], [1, 0], [1, 0], [0, 0], [0, 0...","[[4, 1], [1, 0], [1, 0], [1, 0], [0, 0], [1, 3...","[Welterweight, Lightweight, Lightweight, Heavy...","[SUB, M-DEC, SUB, SUB, U-DEC, S-DEC, U-DEC, U-...","[3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3]","[3:54, 5:00, 2:02, 4:46, 5:00, 5:00, 5:00, 5:0..."
1,2016-04-10,UFC Fight Night: Rothwell vs Dos Santos,[http://www.ufcstats.com/fight-details/5f6b8e4...,"[(Junior Dos Santos, Ben Rothwell), (Derrick L...","[[157, 77], [17, 5], [36, 24], [45, 48], [98, ...","[[0, 0], [0, 1], [0, 2], [0, 1], [4, 0], [1, 1...","[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 2...","[[0, 0], [0, 2], [0, 1], [0, 1], [4, 3], [3, 0...","[Heavyweight, Heavyweight, Heavyweight, Heavyw...","[U-DEC, KO/TKO, KO/TKO, U-DEC, U-DEC, U-DEC, U...","[5, 1, 2, 3, 3, 3, 3, 1, 1, 1, 3, 1, 3]","[5:00, 4:48, 5:00, 5:00, 5:00, 5:00, 5:00, 4:0..."
2,2012-10-05,UFC on FX: Browne vs Bigfoot,[http://www.ufcstats.com/fight-details/0cced3b...,"[(Antonio Silva, Travis Browne), (Jake Ellenbe...","[[15, 8], [25, 32], [33, 9], [6, 0], [17, 17],...","[[0, 0], [2, 0], [0, 0], [0, 0], [0, 0], [0, 1...","[[0, 0], [0, 0], [0, 0], [1, 0], [0, 1], [0, 1...","[[0, 0], [0, 0], [0, 0], [0, 0], [0, 2], [0, 1...","[Heavyweight, Welterweight, Flyweight, Welterw...","[KO/TKO, U-DEC, KO/TKO, SUB, KO/TKO, KO/TKO, S...","[1, 3, 2, 1, 2, 2, 3, 1, 3, 2]","[3:27, 5:00, 4:35, 0:45, 1:06, 0:29, 5:00, 2:3..."
3,2002-07-13,UFC 38: Brawl at the Hall,[http://www.ufcstats.com/fight-details/0f6dd5d...,"[(Matt Hughes, Carlos Newton), (Ian Freeman, F...","[[58, 3], [34, 6], [4, 0], [9, 5], [30, 29], [...","[[4, 0], [0, 1], [0, 0], [2, 0], [4, 0], [4, 0...","[[1, 1], [0, 3], [0, 0], [3, 0], [0, 1], [0, 0...","[[12, 1], [0, 0], [0, 0], [2, 0], [9, 0], [0, ...","[Welterweight, Heavyweight, Middleweight, Ligh...","[KO/TKO, KO/TKO, KO/TKO, SUB, U-DEC, U-DEC, U-...","[4, 1, 1, 2, 3, 3, 3]","[3:37, 4:35, 0:10, 1:38, 5:00, 5:00, 5:00]"
4,2013-10-09,UFC Fight Night: Maia vs Shields,[http://www.ufcstats.com/fight-details/a41ba26...,"[(Jake Shields, Demian Maia), (Dong Hyun Kim, ...","[[35, 24], [11, 28], [96, 70], [87, 57], [0, 2...","[[1, 3], [1, 0], [0, 3], [0, 1], [0, 1], [0, 2...","[[0, 0], [0, 0], [0, 0], [1, 0], [2, 0], [1, 3...","[[5, 1], [1, 0], [0, 0], [0, 0], [0, 0], [0, 1...","[Welterweight, Welterweight, Light Heavyweight...","[S-DEC, KO/TKO, U-DEC, S-DEC, SUB, S-DEC, U-DE...","[5, 2, 3, 3, 1, 3, 3, 3, 2, 1]","[5:00, 3:01, 5:00, 5:00, 0:31, 5:00, 5:00, 5:0..."
...,...,...,...,...,...,...,...,...,...,...,...,...
516,2003-04-25,UFC 42: Sudden Impact,[http://www.ufcstats.com/fight-details/00a5e03...,"[(Matt Hughes, Sean Sherk), (Pete Spratt, Robb...","[[44, 19], [19, 15], [60, 24], [22, 6], [16, 6...","[[5, 3], [1, 2], [1, 2], [0, 0], [0, 0], [0, 3...","[[1, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0...","[[10, 2], [1, 3], [1, 1], [0, 0], [0, 0], [0, ...","[Welterweight, Welterweight, Welterweight, Hea...","[U-DEC, SUB, U-DEC, KO/TKO, KO/TKO, U-DEC, U-D...","[5, 2, 3, 2, 1, 3, 3, 1]","[5:00, 2:28, 5:00, 1:46, 2:40, 5:00, 5:00, 3:55]"
517,2017-09-16,UFC Fight Night: Rockhold vs. Branch,[http://www.ufcstats.com/fight-details/24438a6...,"[(Luke Rockhold, David Branch), (Mike Perry, A...","[[58, 25], [11, 6], [35, 65], [20, 22], [16, 1...","[[2, 1], [0, 0], [0, 1], [4, 0], [0, 0], [0, 0...","[[0, 0], [0, 0], [0, 0], [1, 0], [0, 0], [0, 0...","[[2, 0], [0, 0], [0, 0], [6, 0], [0, 0], [0, 0...","[Middleweight, Welterweight, Middleweight, Lig...","[KO/TKO, KO/TKO, KO/TKO, SUB, KO/TKO, S-DEC, S...","[2, 1, 3, 2, 1, 3, 3, 1, 2, 2]","[4:05, 1:19, 2:33, 2:11, 2:48, 5:00, 5:00, 0:2..."
518,2008-07-19,UFC: Silva vs Irvin,[http://www.ufcstats.com/fight-details/4eafb26...,"[(Anderson Silva, James Irvin), (Brandon Vera,...","[[13, 0], [38, 30], [69, 18], [24, 1], [37, 51...","[[0, 0], [2, 1], [10, 0], [1, 0], [0, 5], [0, ...","[[0, 0], [0, 0], [0, 1], [0, 0], [0, 0], [2, 0...","[[0, 0], [2, 0], [0, 0], [1, 0], [0, 0], [1, 0...","[Light Heavyweight, Light Heavyweight, Lightwe...","[KO/TKO, U-DEC, U-DEC, KO/TKO, KO/TKO, SUB, KO...","[1, 3, 3, 1, 3, 1, 1, 1, 1, 2, 1]","[1:01, 5:00, 5:00, 2:02, 3:35, 3:58, 1:54, 1:3..."
519,2016-03-05,UFC 196: McGregor vs Diaz,[http://www.ufcstats.com/fight-details/4ace70b...,"[(Nate Diaz, Conor McGregor), (Miesha Tate, Ho...","[[77, 61], [40, 59], [30, 23], [55, 31], [39, ...","[[1, 0], [2, 0], [3, 0], [1, 1], [2, 1], [3, 1...","[[2, 0], [2, 0], [0, 0], [0, 0], [2, 0], [1, 0...","[[1, 0], [3, 0], [1, 0], [1, 0], [4, 1], [4, 0...","[Welterweight, Women's Bantamweight, Light Hea...","[SUB, SUB, U-DEC, U-DEC, U-DEC, SUB, KO/TKO, K...","[2, 5, 3, 3, 3, 3, 2, 2, 3, 3, 1, 2]","[4:12, 3:30, 5:00, 5:00, 5:00, 4:11, 1:34, 1:0..."


In [81]:
event_database_df.to_csv('event_database_csv', index=False)

In [82]:
# NOTE: this rebinds event_database_df from the pandas frame built above to a dask
# dataframe read back from disk. As the .head() output below shows, the list-valued
# columns come back as their string representations after the CSV round trip.
event_database_df = dd.read_csv('event_database_csv')

In [83]:
event_database_df.head()

Unnamed: 0,date,name,event_fight_list,matchups,strike_counts,takedown_counts,submission_counts,pass_counts,weight_class,win_method,final_round,final_round_time
0,2019-10-26,UFC Fight Night: Maia vs. Askren,['http://www.ufcstats.com/fight-details/241a08...,"[('Demian Maia', 'Ben Askren'), ('Stevie Ray',...","[['69', '63'], ['85', '78'], ['13', '3'], ['94...","[['0', '4'], ['1', '0'], ['2', '0'], ['3', '0'...","[['1', '0'], ['0', '0'], ['1', '0'], ['1', '0'...","[['4', '1'], ['1', '0'], ['1', '0'], ['1', '0'...","['Welterweight', 'Lightweight', 'Lightweight',...","['SUB', 'M-DEC', 'SUB', 'SUB', 'U-DEC', 'S-DEC...","[3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3]","['3:54', '5:00', '2:02', '4:46', '5:00', '5:00..."
1,2016-04-10,UFC Fight Night: Rothwell vs Dos Santos,['http://www.ufcstats.com/fight-details/5f6b8e...,"[('Junior Dos Santos', 'Ben Rothwell'), ('Derr...","[['157', '77'], ['17', '5'], ['36', '24'], ['4...","[['0', '0'], ['0', '1'], ['0', '2'], ['0', '1'...","[['0', '0'], ['0', '0'], ['0', '0'], ['0', '0'...","[['0', '0'], ['0', '2'], ['0', '1'], ['0', '1'...","['Heavyweight', 'Heavyweight', 'Heavyweight', ...","['U-DEC', 'KO/TKO', 'KO/TKO', 'U-DEC', 'U-DEC'...","[5, 1, 2, 3, 3, 3, 3, 1, 1, 1, 3, 1, 3]","['5:00', '4:48', '5:00', '5:00', '5:00', '5:00..."
2,2012-10-05,UFC on FX: Browne vs Bigfoot,['http://www.ufcstats.com/fight-details/0cced3...,"[('Antonio Silva', 'Travis Browne'), ('Jake El...","[['15', '8'], ['25', '32'], ['33', '9'], ['6',...","[['0', '0'], ['2', '0'], ['0', '0'], ['0', '0'...","[['0', '0'], ['0', '0'], ['0', '0'], ['1', '0'...","[['0', '0'], ['0', '0'], ['0', '0'], ['0', '0'...","['Heavyweight', 'Welterweight', 'Flyweight', '...","['KO/TKO', 'U-DEC', 'KO/TKO', 'SUB', 'KO/TKO',...","[1, 3, 2, 1, 2, 2, 3, 1, 3, 2]","['3:27', '5:00', '4:35', '0:45', '1:06', '0:29..."
3,2002-07-13,UFC 38: Brawl at the Hall,['http://www.ufcstats.com/fight-details/0f6dd5...,"[('Matt Hughes', 'Carlos Newton'), ('Ian Freem...","[['58', '3'], ['34', '6'], ['4', '0'], ['9', '...","[['4', '0'], ['0', '1'], ['0', '0'], ['2', '0'...","[['1', '1'], ['0', '3'], ['0', '0'], ['3', '0'...","[['12', '1'], ['0', '0'], ['0', '0'], ['2', '0...","['Welterweight', 'Heavyweight', 'Middleweight'...","['KO/TKO', 'KO/TKO', 'KO/TKO', 'SUB', 'U-DEC',...","[4, 1, 1, 2, 3, 3, 3]","['3:37', '4:35', '0:10', '1:38', '5:00', '5:00..."
4,2013-10-09,UFC Fight Night: Maia vs Shields,['http://www.ufcstats.com/fight-details/a41ba2...,"[('Jake Shields', 'Demian Maia'), ('Dong Hyun ...","[['35', '24'], ['11', '28'], ['96', '70'], ['8...","[['1', '3'], ['1', '0'], ['0', '3'], ['0', '1'...","[['0', '0'], ['0', '0'], ['0', '0'], ['1', '0'...","[['5', '1'], ['1', '0'], ['0', '0'], ['0', '0'...","['Welterweight', 'Welterweight', 'Light Heavyw...","['S-DEC', 'KO/TKO', 'U-DEC', 'S-DEC', 'SUB', '...","[5, 2, 3, 3, 1, 3, 3, 3, 2, 1]","['5:00', '3:01', '5:00', '5:00', '0:31', '5:00..."


In [84]:
test = pd.read_html('http://www.ufcstats.com/fighter-details/d28dee5c705991df')
test_df = test[0]

In [None]:
# Derive the trimmed frame from the freshly read table rather than dropping in place:
# an in-place drop raises KeyError when the cell is run a second time, because
# labels 0 and 1 are already gone.
test_df = test[0].drop([0, 1])

In [None]:
test_df.iloc[0].Fighter.split('  ')

In [None]:
test_df.iloc[0].Method.split()

--------------------------------------FIX IT FIX IT FIX IT FIX IT FIX IT FIX IT--------------------------------------------

In [46]:
def fighter_history_fetcher(soup):
    '''Extract the fight-detail urls linked from a fighter page.
    
    Args:
    
        soup(BeautifulSoup object): parsed fighter profile page
        
    Returns:
    
        list of list of str: one single-element list per <a> whose href matches
        'fight-details', in document order
    '''
    
    fight_links = soup.findAll('a', attrs={'href': re.compile('fight-details')})
    
    # each url is wrapped in its own list
    return [[link.get('href')] for link in fight_links]

In [48]:
fighter_history_fetcher(basic_soup_list[0])

[['http://www.ufcstats.com/fight-details/889207e3c622a5aa'],
 ['http://www.ufcstats.com/fight-details/7c23770c639dcd22'],
 ['http://www.ufcstats.com/fight-details/ac656cebef8cf0a4'],
 ['http://www.ufcstats.com/fight-details/8d9b460143ae19bc'],
 ['http://www.ufcstats.com/fight-details/3c6659f1b09760ba'],
 ['http://www.ufcstats.com/fight-details/a9d8ddb49dbe414b'],
 ['http://www.ufcstats.com/fight-details/78f1457371e2ef6b'],
 ['http://www.ufcstats.com/fight-details/49b05e23e3a2af13']]

In [14]:
# one nested list of fight urls per fighter page
fighter_history_url_list = [fighter_history_fetcher(page_soup) for page_soup in basic_soup_list]
fighter_history_url_list[:3]

[[['http://www.ufcstats.com/fight-details/1206dc7d5228f5bd']],
 [['http://www.ufcstats.com/fight-details/9f5f9173666ce1b7'],
  ['http://www.ufcstats.com/fight-details/010793ad35d686d5']],
 [['http://www.ufcstats.com/fight-details/96fe84d051847591'],
  ['http://www.ufcstats.com/fight-details/64bee58391921b4b'],
  ['http://www.ufcstats.com/fight-details/55b53fdad307b708'],
  ['http://www.ufcstats.com/fight-details/d2f6b87189908088'],
  ['http://www.ufcstats.com/fight-details/9f10afab94cafc45'],
  ['http://www.ufcstats.com/fight-details/bd4a648bebaefab3'],
  ['http://www.ufcstats.com/fight-details/87b6c507a7684ec2']]]

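`fighter_history_url_list` holds one list per fighter, but each fight URL inside it is wrapped in its own single-element list. Let's flatten each fighter's list by one level.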

In [15]:
def compound_flattener(fight_url_list):
    '''Will reduce the dimensionality of lists by a factor of one.
    
    Args:
    
        fight_url_list(list): list whose items are themselves iterables
        
    Returns:
    
        list: the items of every inner iterable, concatenated in order
    '''
    
    return list(itertools.chain.from_iterable(fight_url_list))

In [16]:
# flatten each fighter's nested url list by one level
fighter_history_url_list = [compound_flattener(nested_urls) for nested_urls in fighter_history_url_list]

In [17]:
fighter_history_url_list[:3]

[['http://www.ufcstats.com/fight-details/1206dc7d5228f5bd'],
 ['http://www.ufcstats.com/fight-details/9f5f9173666ce1b7',
  'http://www.ufcstats.com/fight-details/010793ad35d686d5'],
 ['http://www.ufcstats.com/fight-details/96fe84d051847591',
  'http://www.ufcstats.com/fight-details/64bee58391921b4b',
  'http://www.ufcstats.com/fight-details/55b53fdad307b708',
  'http://www.ufcstats.com/fight-details/d2f6b87189908088',
  'http://www.ufcstats.com/fight-details/9f10afab94cafc45',
  'http://www.ufcstats.com/fight-details/bd4a648bebaefab3',
  'http://www.ufcstats.com/fight-details/87b6c507a7684ec2']]