# Scraping Wikipedia

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import html5lib

import requests 
from bs4 import BeautifulSoup
import urllib
import time
import ast

Firstly we got the relevant url

In [2]:
a = pd.read_html('https://en.wikipedia.org/wiki/List_of_current_UFC_fighters', flavor = 'html5lib')

Next we put each table into a seperate dataframe and added a weight class column and index. These will be useful later. We then merged all the dataframes into one big one.

In [3]:
a11 = a[11]
a11['Weight Class']='Heavyweight'
a11['Weight Class Index']= 0

a12 = a[12]
a12['Weight Class']='Light heavyweight'
a12['Weight Class Index']= 1

a13 = a[13]
a13['Weight Class']='Middleweight'
a13['Weight Class Index']= 2

a14 = a[14]
a14['Weight Class']='Welterweight'
a14['Weight Class Index']= 3

a15 = a[15]
a15['Weight Class']='Lightweight'
a15['Weight Class Index']= 4

a16 = a[16]
a16['Weight Class']='Featherweight'
a16['Weight Class Index']= 5

a17 = a[17]
a17['Weight Class']='Bantamweight'
a17['Weight Class Index']= 6

a18 = a[18]
a18['Weight Class']='Flyweight'
a18['Weight Class Index']= 7

a19 = a[19]
a19['Weight Class']='Women"s Featherweight'
a19['Weight Class Index']= 8

a20 = a[20]
a20['Weight Class']='Women"s Bantamweight'
a20['Weight Class Index']= 9

a21 = a[21]
a21['Weight Class']='Women"s Flyweight'
a21['Weight Class Index']= 10

a22 = a[22]
a22['Weight Class']='Women"s Strawweight'
a22['Weight Class Index']= 11


fighters = pd.concat([a11,a12,a13,a14,a15,a16,a17,a18,a19,a20,a21,a22])
fighters.head(5)


Unnamed: 0,0,1,2,3,4,5,6,7,8,Weight Class,Weight Class Index
0,ISO,Name,Age,Ht.,Nickname,Result / next fight / status,Ref,Endeavor record,MMA record,Heavyweight,0
1,,Andrei Arlovski *,40,6 ft 3 in (1.91 m),The Pit Bull,Loss - UFC on ESPN+ 8 (Sunrise) - Augusto Sakai,[114],16–12 (1 NC),27–18 (2 NC),Heavyweight,0
2,,Alistair Overeem *,38,6 ft 5 in (1.96 m),The Reem,Win - UFC on ESPN+ 7 (St. Peterburg) - Alexey ...,[115],11–6,45–17 (1 NC),Heavyweight,0
3,,Cain Velasquez,36,6 ft 1 in (1.85 m),,Loss - UFC on ESPN 1 (Phoenix) - Francis Ngannou,[116],12–3,14–3,Heavyweight,0
4,,Fabrício Werdum *,41,6 ft 4 in (1.93 m),Vai Cavalo,USADA suspension for 2 years - for tested posi...,[79],11–6,23–8–1,Heavyweight,0


Once again the first row had be changed to the column titles

In [4]:
fighters.columns = fighters.iloc[0]
fighters = fighters.drop(fighters.index[[0]])
fighters.reset_index(drop = True)
fighters.head(4)


Unnamed: 0,ISO,Name,Age,Ht.,Nickname,Result / next fight / status,Ref,Endeavor record,MMA record,Heavyweight,0
1,,Andrei Arlovski *,40,6 ft 3 in (1.91 m),The Pit Bull,Loss - UFC on ESPN+ 8 (Sunrise) - Augusto Sakai,[114],16–12 (1 NC),27–18 (2 NC),Heavyweight,0
2,,Alistair Overeem *,38,6 ft 5 in (1.96 m),The Reem,Win - UFC on ESPN+ 7 (St. Peterburg) - Alexey ...,[115],11–6,45–17 (1 NC),Heavyweight,0
3,,Cain Velasquez,36,6 ft 1 in (1.85 m),,Loss - UFC on ESPN 1 (Phoenix) - Francis Ngannou,[116],12–3,14–3,Heavyweight,0
4,,Fabrício Werdum *,41,6 ft 4 in (1.93 m),Vai Cavalo,USADA suspension for 2 years - for tested posi...,[79],11–6,23–8–1,Heavyweight,0


We then dropped any row without a name

In [5]:
fighters = fighters[fighters['Name'].notnull()]

The name column had to be cleaned with asterixs and "(C)" for champions removed

In [6]:
fighters['Name'] = fighters['Name'].map(lambda x: x.split('*')[0].strip())
fighters['Name'] = fighters['Name'].map(lambda x: x.split('(C)')[0].strip())

fighters.head(5)

Unnamed: 0,ISO,Name,Age,Ht.,Nickname,Result / next fight / status,Ref,Endeavor record,MMA record,Heavyweight,0
1,,Andrei Arlovski,40,6 ft 3 in (1.91 m),The Pit Bull,Loss - UFC on ESPN+ 8 (Sunrise) - Augusto Sakai,[114],16–12 (1 NC),27–18 (2 NC),Heavyweight,0
2,,Alistair Overeem,38,6 ft 5 in (1.96 m),The Reem,Win - UFC on ESPN+ 7 (St. Peterburg) - Alexey ...,[115],11–6,45–17 (1 NC),Heavyweight,0
3,,Cain Velasquez,36,6 ft 1 in (1.85 m),,Loss - UFC on ESPN 1 (Phoenix) - Francis Ngannou,[116],12–3,14–3,Heavyweight,0
4,,Fabrício Werdum,41,6 ft 4 in (1.93 m),Vai Cavalo,USADA suspension for 2 years - for tested posi...,[79],11–6,23–8–1,Heavyweight,0
5,,Junior dos Santos,35,6 ft 4 in (1.93 m),Cigano,UFC 239 (Las Vegas) - Francis Ngannou,[117],15–4,21–5,Heavyweight,0


These rows where not relevant to our project so we dropped them

In [7]:
to_drop = ['Result / next fight / status',
            'Ref',
           'ISO']
fighters.drop(to_drop, inplace=True, axis=1)
            

Changing the first row the the column header messed up the column we added earlier so they needed renaming

In [8]:
fighters.rename(columns={'Heavyweight':'Weight Class'}, inplace=True)
fighters.rename(columns={0:'Weight Class Index'}, inplace=True)
fighters.head(5)

Unnamed: 0,Name,Age,Ht.,Nickname,Endeavor record,MMA record,Weight Class,Weight Class Index
1,Andrei Arlovski,40,6 ft 3 in (1.91 m),The Pit Bull,16–12 (1 NC),27–18 (2 NC),Heavyweight,0
2,Alistair Overeem,38,6 ft 5 in (1.96 m),The Reem,11–6,45–17 (1 NC),Heavyweight,0
3,Cain Velasquez,36,6 ft 1 in (1.85 m),,12–3,14–3,Heavyweight,0
4,Fabrício Werdum,41,6 ft 4 in (1.93 m),Vai Cavalo,11–6,23–8–1,Heavyweight,0
5,Junior dos Santos,35,6 ft 4 in (1.93 m),Cigano,15–4,21–5,Heavyweight,0


The records needed to be cleaned. We removed the dash between the numbers. We then converted them to strings and split them into wins/losses based on their position in the string.

In [9]:
fighters['MMA record'] = fighters['MMA record'].str.replace('–',' ')
fighters['Endeavor record'] = fighters['Endeavor record'].str.replace('–',' ')

fighters['x'] = fighters['MMA record'].astype(str)
fighters['y'] = fighters['Endeavor record'].astype(str)

# make the new columns using string indexing
fighters['Overall Wins'] = fighters['x'].str[0:2]
fighters['Overall Losses'] = fighters['x'].str[2:5]

fighters['Overall Losses'] = fighters['Overall Losses'].str[0:3]
fighters['Overall Losses'] = fighters['Overall Losses'].str.replace('(',' ')

fighters['UFC Wins'] = fighters['y'].str[0:2]
fighters['UFC Losses'] = fighters['y'].str[2:5]
fighters['UFC Losses'] = fighters['UFC Losses'].str[0:3]
fighters['UFC Losses'] = fighters['UFC Losses'].str.replace('(',' ')

fighters.drop('x', axis=1, inplace=True)
fighters.drop('y', axis=1, inplace=True)

fighters.head(100)

Unnamed: 0,Name,Age,Ht.,Nickname,Endeavor record,MMA record,Weight Class,Weight Class Index,Overall Wins,Overall Losses,UFC Wins,UFC Losses
1,Andrei Arlovski,40,6 ft 3 in (1.91 m),The Pit Bull,16 12 (1 NC),27 18 (2 NC),Heavyweight,0,27,18,16,12
2,Alistair Overeem,38,6 ft 5 in (1.96 m),The Reem,11 6,45 17 (1 NC),Heavyweight,0,45,17,11,6
3,Cain Velasquez,36,6 ft 1 in (1.85 m),,12 3,14 3,Heavyweight,0,14,3,12,3
4,Fabrício Werdum,41,6 ft 4 in (1.93 m),Vai Cavalo,11 6,23 8 1,Heavyweight,0,23,8,11,6
5,Junior dos Santos,35,6 ft 4 in (1.93 m),Cigano,15 4,21 5,Heavyweight,0,21,5,15,4
6,Todd Duffee,33,6 ft 4 in (1.93 m),Irish Car Bomb,3 2,9 3,Heavyweight,0,9,3,3,2
7,Ben Rothwell,37,6 ft 4 in (1.93 m),,6 5,36 11,Heavyweight,0,36,11,6,5
8,Daniel Cormier,40,5 ft 11 in (1.80 m),DC,15 1 (1 NC),22 1 (1 NC),Heavyweight,0,22,1,15,1
9,Stipe Miočić,36,6 ft 4 in (1.93 m),,12 3,18 3,Heavyweight,0,18,3,12,3
10,Walt Harris,35,6 ft 5 in (1.96 m),The Big Ticket,5 6 (1 NC),12 7 (1 NC),Heavyweight,0,12,7,5,6


Daniel Cormier appeares in two weight classes, which is a rare occurence.This was messing up a graph later on so he was dropped from his old weight class.

In [10]:
fighters = fighters[~((fighters['Name'] == 'Daniel Cormier') & (fighters['Weight Class'] == 'Light heavyweight'))]

fighters.loc[fighters['Name'] == 'Daniel Cormier']


Unnamed: 0,Name,Age,Ht.,Nickname,Endeavor record,MMA record,Weight Class,Weight Class Index,Overall Wins,Overall Losses,UFC Wins,UFC Losses
8,Daniel Cormier,40,5 ft 11 in (1.80 m),DC,15 1 (1 NC),22 1 (1 NC),Heavyweight,0,22,1,15,1


These fighters names were different on the UFC website than on wikipedia for various reasons so I changed them for the merge

In [11]:
fighters['Name'] = fighters['Name'].replace('Khalil Rountree', 'Khalil Rountree Jr.')
fighters['Name'] = fighters['Name'].replace('Joseph Duffy', 'Joe Duffy')
fighters['Name'] = fighters['Name'].replace('Emil Weber Meek', 'Emil Meek')
fighters['Name'] = fighters['Name'].replace('Sergey Pavlovich', 'Sergei Pavlovich')
fighters['Name'] = fighters['Name'].replace('Jan Błachowicz', 'Jan Blachowicz')
fighters['Name'] = fighters['Name'].replace('Yaozong Hu', 'Hu Yaozong') 
fighters['Name'] = fighters['Name'].replace('Josh Burkman', 'Joshua Burkman')  
fighters['Name'] = fighters['Name'].replace('Brad Scott', 'Bradley Scott')  
fighters['Name'] = fighters['Name'].replace('Jingliang Li', 'Li Jingliang')  
fighters['Name'] = fighters['Name'].replace('Kenan Song', 'Song Kenan')  
fighters['Name'] = fighters['Name'].replace('Carlo Pedersoli Jr.', 'Carlo Pedersoli')  
fighters['Name'] = fighters['Name'].replace('B.J.-Penn', 'BJ Penn')  
fighters['Name'] = fighters['Name'].replace('Diego Ferreira', 'Diego Ferreira')  
fighters['Name'] = fighters['Name'].replace('Marco Reyes', 'Marco Polo Reyes')  
fighters['Name'] = fighters['Name'].replace('Doo ho Choi', 'Dooho Choi')  
fighters['Name'] = fighters['Name'].replace('Guan Wang', 'Wang Guan')  
fighters['Name'] = fighters['Name'].replace('Yadong Song', 'Song Yadong') 
fighters['Name'] = fighters['Name'].replace('Yadong Song', 'Sean O"Malley')
fighters['Name'] = fighters['Name'].replace('France Kai Kara', 'Kai Kara France')
fighters['Name'] = fighters['Name'].replace('Aleksandra Albu', 'Alexandra Albu')
fighters['Name'] = fighters['Name'].replace('Xiaonan Yan', 'Yan Xiaonan')
fighters['Name'] = fighters['Name'].replace('Souza Livinha', 'Livinha Souza') 
fighters['Name'] = fighters['Name'].replace('Fabricio Werdum ', 'Fabricio Werdum') 
                                            


## Height Cleaning

The metric part of the height was removed and the imoerial was changed from feet to inches

In [12]:
fighters = fighters[pd.notnull(fighters["Ht."])]
fighters['Ht.'] = fighters['Ht.'].map(lambda x: x.split('(')[1].strip())
fighters['Ht.'] = fighters['Ht.'].map(lambda x: x.split('m')[0].strip())
fighters['Ht.'] = fighters['Ht.'].apply(ast.literal_eval)
fighters['Ht.']= fighters['Ht.'].apply(lambda x: x* 39.37)

In [13]:
fighters.head(5)

Unnamed: 0,Name,Age,Ht.,Nickname,Endeavor record,MMA record,Weight Class,Weight Class Index,Overall Wins,Overall Losses,UFC Wins,UFC Losses
1,Andrei Arlovski,40,75.1967,The Pit Bull,16 12 (1 NC),27 18 (2 NC),Heavyweight,0,27,18,16,12
2,Alistair Overeem,38,77.1652,The Reem,11 6,45 17 (1 NC),Heavyweight,0,45,17,11,6
3,Cain Velasquez,36,72.8345,,12 3,14 3,Heavyweight,0,14,3,12,3
4,Fabrício Werdum,41,75.9841,Vai Cavalo,11 6,23 8 1,Heavyweight,0,23,8,11,6
5,Junior dos Santos,35,75.9841,Cigano,15 4,21 5,Heavyweight,0,21,5,15,4


In [14]:
fighters.shape

(587, 12)

Export the dataframe to a csv

In [15]:
 fighters.to_csv('wiki_scrape.csv',index = False)