In [33]:
#importing pertinent libraries
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import sqlite3, requests, re
import json

In [34]:
conn = sqlite3.connect('UFC_Fighter_Encyclopedia.db')
cur = conn.cursor()

In [35]:
#extracting UFC data

#creating a list of unique fighter URLs available on website
fighter_urls = []
response = requests.get('http://www.ufcstats.com/statistics/fighters?char=a&page=all')
soup = BeautifulSoup(response.content, 'html.parser')

#targeting URLs that contain fighter data
fighters = soup.findAll('a', attrs={'href': re.compile('http:')})
for fighter in fighters:
    fighter_urls.append(fighter.get('href'))

#removing duplicate URLs
fighter_urls = list(set(fighter_urls))
fighter_urls

['http://www.ufcstats.com/fighter-details/9bcfb40dbcd50568',
 'http://www.ufcstats.com/fighter-details/b757c73f443d4fca',
 'http://www.ufcstats.com/fighter-details/86d9dbe1bfcbade7',
 'http://www.ufcstats.com/fighter-details/79ded75550efc139',
 'http://www.ufcstats.com/fighter-details/9abc648e76c4493a',
 'http://www.ufcstats.com/fighter-details/87a1dc546b1c5caf',
 'http://www.ufcstats.com/fighter-details/7c6e87729e824ef4',
 'http://www.ufcstats.com/fighter-details/dc5a6b2fdb27e7dc',
 'http://www.ufcstats.com/fighter-details/d562b12b8fe88336',
 'http://www.ufcstats.com/fighter-details/d343df8ba11f4c4e',
 'http://www.ufcstats.com/fighter-details/d802174b0c0c1f4e',
 'http://www.ufcstats.com/fighter-details/1ccff7f0cfdf85eb',
 'http://www.ufcstats.com/fighter-details/cf946e03ba2e7666',
 'http://www.ufcstats.com/statistics/fighters?char=a&page=2',
 'http://www.ufcstats.com/fighter-details/36541f1e6c5d4955',
 'http://www.ufcstats.com/fighter-details/0b31f87be71ebbb1',
 'http://www.ufcstats.c

Looks like there are some URLs we don't need in out list. Let's weed them out.

In [36]:
#removing invalid URLs
for url in fighter_urls:
    if 'fighter-details' not in url:
        fighter_urls.remove(url)

In [37]:
fighter_urls

['http://www.ufcstats.com/fighter-details/9bcfb40dbcd50568',
 'http://www.ufcstats.com/fighter-details/b757c73f443d4fca',
 'http://www.ufcstats.com/fighter-details/86d9dbe1bfcbade7',
 'http://www.ufcstats.com/fighter-details/79ded75550efc139',
 'http://www.ufcstats.com/fighter-details/9abc648e76c4493a',
 'http://www.ufcstats.com/fighter-details/87a1dc546b1c5caf',
 'http://www.ufcstats.com/fighter-details/7c6e87729e824ef4',
 'http://www.ufcstats.com/fighter-details/dc5a6b2fdb27e7dc',
 'http://www.ufcstats.com/fighter-details/d562b12b8fe88336',
 'http://www.ufcstats.com/fighter-details/d343df8ba11f4c4e',
 'http://www.ufcstats.com/fighter-details/d802174b0c0c1f4e',
 'http://www.ufcstats.com/fighter-details/1ccff7f0cfdf85eb',
 'http://www.ufcstats.com/fighter-details/cf946e03ba2e7666',
 'http://www.ufcstats.com/fighter-details/36541f1e6c5d4955',
 'http://www.ufcstats.com/fighter-details/0b31f87be71ebbb1',
 'http://www.ufcstats.com/fighter-details/d0f3959b4a9747e6',
 'http://www.ufcstats.co

In [74]:
#creating data frames and lists  
fighter_basic_stats = pd.DataFrame()

name_list = [] 
height_list = []
record_list = []
weight_list = []
reach_list = []
stance_list = []
DOB_list  = []

In [75]:
for url in fighter_urls:

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})

    #Retrieving fighter name/appending fighter_basic_stats
    name = soup.findChildren('span', attrs={'class': re.compile('b-content__title-highlight')})
    name_list.append(name[0].text.strip())
    
    #Retrieving fighter record
    record = soup.findChildren('span', attrs={'class': re.compile('b-content__title-record')})
    record_list.append(record[0].text.strip().split(':')[1].strip())
    
    base_stats_titles = soup.findChildren('li', attrs={'class': re.compile('b-list')})

    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})
    
    #feet
    height = int(base_stats[0].text.strip().split(':')[1].strip().split(' ')[0].replace("'", "").replace("--", "0"))
    
    #inches
    if height == 0:
        pass
    else:
        inches = int(base_stats[0].text.strip().split(':')[1].strip().split(' ')[1].replace("'", "").replace('"', '').replace("--", "0"))

    height_inches = (height * 12) 
    height_list.append(height_inches)
    
    #Weight
    weight = base_stats[1].text.strip().split(':')[1].strip().replace(' lbs.', '')
    weight_list.append(weight)
    
    #Reach
    reach = base_stats[2].text.strip().split(':')[1].strip().replace('"', '')
    reach_list.append(reach)
    
    #Stance
    stance = base_stats[3].text.strip().split(':')[1].strip()
    stance_list.append(stance)
    
    #DOB
    DOB = base_stats[4].text.strip().split(':')[1].strip()
    DOB_list.append(DOB)

In [76]:
#populating basic fighter stats dataframe
fighter_basic_stats['name'] = name_list
fighter_basic_stats['height'] = height_list
fighter_basic_stats['record'] = record_list
fighter_basic_stats['weight'] =  weight_list
fighter_basic_stats['reach'] = reach_list
fighter_basic_stats['stance'] = stance_list
fighter_basic_stats['DOB'] = DOB_list

fighter_basic_stats

Unnamed: 0,name,height,record,weight,reach,stance,DOB
0,Luciano Azevedo,72,16-9-1,161,--,Orthodox,"Jun 25, 1981"
1,Olaf Alfonso,72,8-11-0,170,--,Orthodox,"Aug 06, 1974"
2,Julia Avila,60,7-1-0,135,68,Orthodox,"May 11, 1988"
3,Shinya Aoki,60,30-5-0 (1 NC),154,--,Southpaw,"May 09, 1983"
4,Bill Algeo,72,12-3-0,145,73,Switch,"Jun 09, 1989"
...,...,...,...,...,...,...,...
158,Saad Awad,60,9-5-0,155,--,,"Jun 07, 1983"
159,Scott Askham,72,14-4-0,185,75,Southpaw,"May 20, 1988"
160,Israel Albuquerque,0,0-3-0,185,--,Orthodox,--
161,Marcelo Aguiar,60,2-3-1,170,--,Orthodox,--


In [41]:
fighter_basic_stats.to_csv('fighter_basic_stats')

In [77]:
#Retrieving careeer stats suite:
career_stats = base_stats[5:]

#Significan Strkes Landed per Minute
print(career_stats[0].text.split(':')[1].replace('%', '').strip())

#Striking Accuracy %
print('Significant Strike Accuracy:', career_stats[1].text.split(':')[1].replace('%', '').strip())

#Striking Accuracy %
print('Significant Strikes Absorbed per Minute:', career_stats[2].text.split(':')[1].strip())

#Striking Defence %
print('Significant Strike Defence:', career_stats[3].text.split(':')[1].replace('%', '').strip())

#Average Takedowns Landed per 15 min
print('Takedown Average:', career_stats[5].text.split(':')[1].strip())

#Takedown Accuracy
print('Takedown Accuracy:', career_stats[6].text.split(':')[1].strip())

#Takedown Defense %
print('Takedown Defence:', career_stats[7].text.split(':')[1].replace('%', '').strip())

#Average Submissiongs Attempted per 15 min
print('Submission averages:', career_stats[7].text.split(':')[1].strip())

1.41
Significant Strike Accuracy: 45
Significant Strikes Absorbed per Minute: 2.20
Significant Strike Defence: 56
Takedown Average: 1.76
Takedown Accuracy: 38%
Takedown Defence: 100
Submission averages: 100%


In [78]:
#creating data frames and lists  
fighter_career_stats = pd.DataFrame()


SLpM_list = []
StrAcc_list = []
SApM_list = []
StrDef_list = []
TDAvg_list = []
TDAcc_list  = []
TDDef_list  = []
SubAvg_list  = []

In [79]:
for url in fighter_urls:

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    base_stats = soup.findChildren('li', attrs={'class': re.compile('b-list')})

    #Retrieving careeer stats suite:
    career_stats = base_stats
    
    #Significant Strkes Landed per Minute
    SLpM = career_stats[0].text.split(':')[1].replace('%', '').strip()
    SLpM_list.append(SLpM)
    
    #Striking Accuracy %
    StrAcc =  career_stats[1].text.split(':')[1].replace('%', '').strip()
    StrAcc_list.append(StrAcc)
    
    #Striking Accuracy %
    SApM = career_stats[1].text.split(':')[1].strip()
    SApM_list.append(SApM)
    
    #Striking Defence %
    StrDef = career_stats[2].text.split(':')[1].replace('%', '').strip()
    StrDef_list.append(StrDef)
    
    #Average Takedowns Landed per 15 min
    TDAvg = career_stats[5].text.split(':')[1].strip()
    TDAvg_list.append(TDAvg)
    
    #Takedown Accuracy
    TDAcc = career_stats[6].text.split(':')[1].strip()
    TDAcc_list.append(TDAcc)
    
    #Takedown Defense %
    TDDef = career_stats[7].text.split(':')[1].replace('%', '').strip()
    TDDef_list.append(TDDef)
    
    #Average Submissiongs Attempted per 15 min
    SubAvg =  career_stats[8].text.split(':')[1].strip()   
    SubAvg_list.append(SubAvg)

In [80]:
fighter_career_stats['name'] = name_list
fighter_career_stats['SLpM'] = SLpM_list
fighter_career_stats['StrAcc'] = StrAcc_list
fighter_career_stats['SApM'] = SApM_list
fighter_career_stats['StrDef'] = StrDef_list
fighter_career_stats['TDAvg'] = TDAvg_list
fighter_career_stats['TDAcc'] = TDAcc_list
fighter_career_stats['TDDef'] = TDDef_list
fighter_career_stats['SubAvg'] = SubAvg_list

fighter_career_stats

Unnamed: 0,name,SLpM,StrAcc,SApM,StrDef,TDAvg,TDAcc,TDDef,SubAvg
0,Luciano Azevedo,"6' 3""",161 lbs.,161 lbs.,--,0.76,45%,1.97,27%
1,Olaf Alfonso,"6' 2""",170 lbs.,170 lbs.,--,1.65,38%,4.77,53%
2,Julia Avila,"5' 7""",135 lbs.,135 lbs.,"68""",3.27,40%,2.20,60%
3,Shinya Aoki,"5' 11""",154 lbs.,154 lbs.,--,0.97,58%,1.25,58%
4,Bill Algeo,"6' 0""",145 lbs.,145 lbs.,"73""",9.80,53%,7.47,36%
...,...,...,...,...,...,...,...,...,...
158,Saad Awad,"5' 11""",155 lbs.,155 lbs.,--,2.13,26%,1.87,60%
159,Scott Askham,"6' 3""",185 lbs.,185 lbs.,"75""",3.03,46%,3.52,52%
160,Israel Albuquerque,--,185 lbs.,185 lbs.,--,0.32,13%,3.34,25%
161,Marcelo Aguiar,"5' 10""",170 lbs.,170 lbs.,--,0.00,0%,0.00,0%


In [97]:
fighter_career_stats.to_csv('fighter_career_stats')

In [12]:
fights = soup.findAll('a', attrs={'href': re.compile('fight-details')})
fights

[<a class="b-flag b-flag_style_bordered" href="http://www.ufcstats.com/fight-details/435c93cb89016fd1"><i class="b-flag__inner"><i class="b-flag__text">loss<i class="b-flag__corner"></i><i class="b-flag__corner-substrate"></i></i></i></a>,
 <a class="b-flag b-flag_style_bordered" href="http://www.ufcstats.com/fight-details/b2857e799319f931"><i class="b-flag__inner"><i class="b-flag__text">nc<i class="b-flag__corner"></i><i class="b-flag__corner-substrate"></i></i></i></a>,
 <a class="b-flag b-flag_style_bordered" href="http://www.ufcstats.com/fight-details/b91807add93763c9"><i class="b-flag__inner"><i class="b-flag__text">loss<i class="b-flag__corner"></i><i class="b-flag__corner-substrate"></i></i></i></a>,
 <a class="b-flag b-flag_style_green" href="http://www.ufcstats.com/fight-details/f7fac3df5c6dc91f"><i class="b-flag__inner"><i class="b-flag__text">win<i class="b-flag__corner"></i><i class="b-flag__corner-substrate"></i></i></i></a>]

In [17]:
for fight in fights:
    print(fight.get('href'))

http://www.ufcstats.com/fight-details/435c93cb89016fd1
http://www.ufcstats.com/fight-details/b2857e799319f931
http://www.ufcstats.com/fight-details/b91807add93763c9
http://www.ufcstats.com/fight-details/f7fac3df5c6dc91f


In [88]:
fight_list = []
fight_history = pd.DataFrame()

for url in fighter_urls: 
   
    fight_holder = []
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    fights = soup.findAll('a', attrs={'href': re.compile('fight-details')})
    
    for fight in fights:
        fight_holder.append(fight.get('href'))
    fight_list.append(fight_holder) 
fight_history['name'] = name_list
fight_history['fight_history'] = fight_list

In [96]:
fight_history['fight_history'][80]

['http://www.ufcstats.com/fight-details/57961c6adebfb13d',
 'http://www.ufcstats.com/fight-details/88b1d4f951070f83',
 'http://www.ufcstats.com/fight-details/0bc354fbb6e99ab6',
 'http://www.ufcstats.com/fight-details/c4f443509308dec6']

In [98]:
fight_history.to_csv('fight_history')