In [1]:
#importing pertinent libraries
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import sqlite3, requests, re
import json

In [2]:
#connection to the SQLite database file, plus a cursor bound to that connection
conn = sqlite3.connect(database='UFC_Fighter_Encyclopedia.db')
cur = conn.cursor()

In [3]:
#extracting UFC data

#creating a list of unique fighter URLs available on website
response = requests.get(
    'http://www.ufcstats.com/statistics/fighters?char=a&page=all',
    timeout=30,  #never hang the kernel on a stalled connection
)
#fail loudly on an HTTP error instead of silently parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

#targeting URLs that contain fighter data
#(find_all is the current name of the method; findAll is a deprecated alias)
fighters = soup.find_all('a', attrs={'href': re.compile('http:')})

#removing duplicate URLs
#dict.fromkeys keeps the first occurrence of each URL in page order, so
#fighter_urls[0] is the same on every run; list(set(...)) returned the URLs
#in an arbitrary order that could change between kernel restarts
fighter_urls = list(dict.fromkeys(fighter.get('href') for fighter in fighters))
fighter_urls

['http://www.ufcstats.com/fighter-details/0e9869d712e81f8f',
 'http://www.ufcstats.com/fighter-details/e70de1859b7ee78e',
 'http://www.ufcstats.com/fighter-details/cad24459b28592ca',
 'http://www.ufcstats.com/fighter-details/8f382b3baa954d2a',
 'http://www.ufcstats.com/fighter-details/1ffc38f67785797b',
 'http://www.ufcstats.com/fighter-details/1562b12763cc8d67',
 'http://www.ufcstats.com/fighter-details/73ef22f25d0f70e2',
 'http://www.ufcstats.com/fighter-details/61fb8098ccf81c7f',
 'http://www.ufcstats.com/fighter-details/b1d19449397541dc',
 'http://www.ufcstats.com/fighter-details/15df64c02b6b0fde',
 'http://www.ufcstats.com/fighter-details/44aa652b181bcf68',
 'http://www.ufcstats.com/fighter-details/1e327b281ef6a745',
 'http://www.ufcstats.com/fighter-details/16690d100f995f8f',
 'http://www.ufcstats.com/fighter-details/bd92cf5da5413d2a',
 'http://www.ufcstats.com/fighter-details/9199e0735b83dd32',
 'http://www.ufcstats.com/fighter-details/8753e125f4499816',
 'http://www.ufcstats.co

Looks like there are some URLs we don't need in our list. Let's weed them out.

In [4]:
#removing invalid URLs
#Build a new list instead of calling .remove() inside a loop over the same
#list: removing while iterating shifts the remaining items left, so the
#element right after each removed URL is never examined and consecutive
#invalid URLs stay in the list.
fighter_urls = [url for url in fighter_urls if 'fighter-details' in url]

In [5]:
#re-display the list to check the result of the URL filtering
fighter_urls

['http://www.ufcstats.com/fighter-details/0e9869d712e81f8f',
 'http://www.ufcstats.com/fighter-details/e70de1859b7ee78e',
 'http://www.ufcstats.com/fighter-details/cad24459b28592ca',
 'http://www.ufcstats.com/fighter-details/8f382b3baa954d2a',
 'http://www.ufcstats.com/fighter-details/1ffc38f67785797b',
 'http://www.ufcstats.com/fighter-details/1562b12763cc8d67',
 'http://www.ufcstats.com/fighter-details/73ef22f25d0f70e2',
 'http://www.ufcstats.com/fighter-details/61fb8098ccf81c7f',
 'http://www.ufcstats.com/fighter-details/b1d19449397541dc',
 'http://www.ufcstats.com/fighter-details/15df64c02b6b0fde',
 'http://www.ufcstats.com/fighter-details/44aa652b181bcf68',
 'http://www.ufcstats.com/fighter-details/1e327b281ef6a745',
 'http://www.ufcstats.com/fighter-details/16690d100f995f8f',
 'http://www.ufcstats.com/fighter-details/bd92cf5da5413d2a',
 'http://www.ufcstats.com/fighter-details/9199e0735b83dd32',
 'http://www.ufcstats.com/fighter-details/8753e125f4499816',
 'http://www.ufcstats.co

In [6]:
#Building the skeleton of a function that will parse each URL and develop a comprehensive fighter profile

tester = fighter_urls[0]
response = requests.get(tester, timeout=30)
#stop on an HTTP error rather than scraping an error page
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
tester_info = soup.find_all('h2', attrs={'class': re.compile('b-content')})

#Retrieving fighter name
name = soup.find_all('span', attrs={'class': re.compile('b-content__title-highlight')})
print(name[0].text.strip())

#Retrieving fighter record
record = soup.find_all('span', attrs={'class': re.compile('b-content__title-record')})
print(record[0].text.strip())


#Every <li> whose class matches 'b-list'. The page is static once parsed, so
#the list is queried a single time and reused below instead of repeating the
#identical search on each loop iteration.
base_stats = soup.find_all('li', attrs={'class': re.compile('b-list')})
#kept as a second name for the same list so both names stay available
base_stats_titles = base_stats

#first list item (Height in the recorded output)
print(base_stats_titles[0].text.strip())


#item 13 (Sub. Avg. in the recorded output); also raises IndexError early if
#the page has fewer list items than the slices below expect
print(base_stats[13].text.strip())


#items 0-4: the basic profile fields
for stat_item in base_stats[0:5]:
    print(stat_item.text.strip())

#Retrieving career stats:
career_stat_title = soup.find_all('i', attrs={'class': re.compile('b-list__box-item-title')})
print(career_stat_title[5].text.strip().title())
#items 6-13: the career statistics entries (item 5 is not printed)
for stat_item in base_stats[6:14]:
    print(stat_item.text.strip())

Sam Adkins
Record: 7-20-2
Height:
      
      6' 3"
Sub. Avg.:
          
          0.0
Height:
      
      6' 3"
Weight:
      
      225 lbs.
Reach:
      
      --
STANCE:
      
      Orthodox
DOB:
      
      
        Apr 26, 1965
Career Statistics:
Str. Acc.:
          
          0%
SApM:
          
          0.00
Str. Def:
          
          0%

TD Avg.:
          
          0.00
TD Acc.:
          
          0%
TD Def.:
          
          0%
Sub. Avg.:
          
          0.0


In [7]:
#creating data frames and lists
#empty frame that is later given one column per list below
fighter_basic_stats = pd.DataFrame()

#one independent, initially empty list per basic-stat field
(
    name_list,
    height_list,
    record_list,
    weight_list,
    reach_list,
    stance_list,
    DOB_list,
) = ([] for _ in range(7))

In [9]:
#populating basic fighter stats dataframe
#column name -> source list; the dict order is the resulting column order
_basic_stat_columns = {
    'name': name_list,
    'height': height_list,
    'record': record_list,
    'weight': weight_list,
    'reach': reach_list,
    'stance': stance_list,
    'DOB': DOB_list,
}
for _column_name, _column_values in _basic_stat_columns.items():
    fighter_basic_stats[_column_name] = _column_values

fighter_basic_stats

Unnamed: 0,name,height,record,weight,reach,stance,DOB
0,Sam Adkins,72,7-20-2,225,--,Orthodox,"Apr 26, 1965"
1,Sultan Aliev,60,15-3-0,170,74,Orthodox,"Sep 17, 1984"
2,Eryk Anders,72,13-5-0,185,75,Southpaw,"Apr 21, 1987"
3,Jessica Aguilar,60,20-8-0,115,63,Orthodox,"May 08, 1982"
4,Lowell Anderson,60,0-1-0,160,--,Orthodox,--
...,...,...,...,...,...,...,...
158,Erik Apple,72,10-3-0,170,--,Orthodox,"Aug 26, 1977"
159,Amir Aliakbari,72,10-1-0,250,--,,"Jun 10, 1984"
160,Jessin Ayari,72,16-5-0,155,73,Orthodox,"May 31, 1992"
161,Royce Alger,60,3-2-0,199,--,Orthodox,--


In [8]:
#Retrieving career stats suite:
career_stats = base_stats[6:]


def _career_stat_value(stat_item, percent=False):
    """Return the value part of one "Label: value" career-stat list item.

    Args:
        stat_item: object exposing the item's text through ``.text``.
        percent: when True, the '%' sign is removed so only the number is left.

    Returns:
        The text between the first and second ':' of ``stat_item.text`` with
        surrounding whitespace stripped.

    Raises:
        IndexError: if the item text contains no ':'.
    """
    value = stat_item.text.split(':')[1]
    if percent:
        value = value.replace('%', '')
    return value.strip()


#Striking Accuracy %
print('Significant Strike Accuracy:', _career_stat_value(career_stats[0], percent=True))

#Strikes Absorbed per Minute
print('Significant Strikes Absorbed per Minute:', _career_stat_value(career_stats[1]))

#Striking Defence %
print('Significant Strike Defence:', _career_stat_value(career_stats[2], percent=True))

#career_stats[3] is skipped on purpose: in the recorded page output the entry
#between Str. Def and TD Avg. is blank

#Average Takedowns Landed per 15 min
print('Takedown Average:', _career_stat_value(career_stats[4]))

#Takedown Accuracy %
#the '%' sign is stripped here too, matching the other percentage stats
print('Takedown Accuracy:', _career_stat_value(career_stats[5], percent=True))

#Takedown Defense %
print('Takedown Defence:', _career_stat_value(career_stats[6], percent=True))

#Average Submissions Attempted per 15 min
print('Submission averages:', _career_stat_value(career_stats[7]))

Significant Strike Accuracy: 40
Significant Strikes Absorbed per Minute: 4.38
Significant Strike Defence: 56
Takedown Average: 2.26
Takedown Accuracy: 41%
Takedown Defence: 80
Submission averages: 0.3


In [12]:
#every link in the currently parsed page whose href contains 'fight-details'
#(find_all is the current name of the method; findAll is a deprecated alias)
fights = soup.find_all('a', attrs={'href': re.compile('fight-details')})
fights

[<a class="b-flag b-flag_style_bordered" href="http://www.ufcstats.com/fight-details/435c93cb89016fd1"><i class="b-flag__inner"><i class="b-flag__text">loss<i class="b-flag__corner"></i><i class="b-flag__corner-substrate"></i></i></i></a>,
 <a class="b-flag b-flag_style_bordered" href="http://www.ufcstats.com/fight-details/b2857e799319f931"><i class="b-flag__inner"><i class="b-flag__text">nc<i class="b-flag__corner"></i><i class="b-flag__corner-substrate"></i></i></i></a>,
 <a class="b-flag b-flag_style_bordered" href="http://www.ufcstats.com/fight-details/b91807add93763c9"><i class="b-flag__inner"><i class="b-flag__text">loss<i class="b-flag__corner"></i><i class="b-flag__corner-substrate"></i></i></i></a>,
 <a class="b-flag b-flag_style_green" href="http://www.ufcstats.com/fight-details/f7fac3df5c6dc91f"><i class="b-flag__inner"><i class="b-flag__text">win<i class="b-flag__corner"></i><i class="b-flag__corner-substrate"></i></i></i></a>]

In [17]:
#print the href of each fight link, one per line
fight_hrefs = [fight_link.get('href') for fight_link in fights]
for fight_href in fight_hrefs:
    print(fight_href)

http://www.ufcstats.com/fight-details/435c93cb89016fd1
http://www.ufcstats.com/fight-details/b2857e799319f931
http://www.ufcstats.com/fight-details/b91807add93763c9
http://www.ufcstats.com/fight-details/f7fac3df5c6dc91f


In [None]:
test fighter = 