In [1]:
import time

In [8]:
%%time
# STEP 1 - GET FIGHTERS' GENERAL INFO FROM UFCSTATS.COM


# imports for webscraping
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# import for dataframes
import pandas as pd


# iterating through every page with stats
import string  
# Lowercase letters a-z; one listing URL is requested per letter.
alphabet = list(string.ascii_lowercase)

# Maps each letter to a one-element list holding that letter's parsed page
# (later code reads it back as value[0]).
pages = {letter: [] for letter in alphabet}

# Download and parse the fighters listing for every letter.
for letter in alphabet:

    my_url = "http://ufcstats.com/statistics/fighters?char=" + letter + "&page=all"

    # The context manager closes the connection even if read() raises,
    # which a trailing uClient.close() call would not do.
    with uReq(my_url) as uClient:
        page_html = uClient.read()

    pages[letter].append(soup(page_html, 'html.parser'))
    
    
    

# Accumulator for the fighters' general info scraped from the listing pages:
# column name -> list of values (each column starts out as its own empty list).
fighter_columns = (
    "first name",
    "last name",
    "nickname",
    "link",
    "height(ft)",
    "weight(lb)",
    "reach(inch)",
    "stance",
    "wins",
    "losses",
    "draws",
    "belt",
)
fighters = {column: [] for column in fighter_columns}


# Walk every parsed listing page, copy each table row into the `fighters`
# dictionary, then turn the dictionary into a dataframe.

# (column name, index of the table cell) for the cells stored as stripped text
text_cells = (
    ("height(ft)", 3),
    ("weight(lb)", 4),
    ("reach(inch)", 5),
    ("stance", 6),
    ("wins", 7),
    ("losses", 8),
    ("draws", 9),
)

# only the parsed pages are needed here, not the letters they are keyed by
for value in pages.values():

    # every table row of the parsed page
    containers = value[0].findAll("tr", {"class": "b-statistics__table-row"})

    # the first two rows don't contain any relevant information
    del containers[:2]

    for container in containers:

        # look the row's cells up once instead of once per field
        cells = container.findAll("td", {"class": "b-statistics__table-col"})

        # name cells wrap their text in a link; the first-name link also
        # carries the URL of the fighter's personal page
        fighters["first name"].append(cells[0].a.text)
        fighters["last name"].append(cells[1].a.text)
        fighters["nickname"].append(cells[2].a.text)
        fighters["link"].append(cells[0].a['href'])

        for column, index in text_cells:
            fighters[column].append(cells[index].text.strip())

        # champions have a belt image in the last cell, contenders don't:
        # store 1 when an image is present, 0 otherwise
        fighters["belt"].append(0 if cells[10].img is None else 1)


# creating a dataframe for later use
fighters_data = pd.DataFrame(data=fighters)


# Accumulator for the statistics taken from each fighter's personal page on
# ufcstats.com (later added to fighters_data): column name -> list of values.
fighter_stat_columns = (
    "SLpM",
    "Str. Acc",
    "SApM",
    "Str. Def",
    "TD Avg",
    "TD Acc.",
    "TD Def",
    "Sub. Avg",
)
fighter_stats = {column: [] for column in fighter_stat_columns}


# Visit every fighter's personal page (links collected in fighters_data) and
# append the career statistics found there to `fighter_stats`.

stat_item_attrs = {"class": "b-list__box-list-item b-list__box-list-item_type_block"}

# (column, index of the stats list on the page, index of the item in that list).
# Item 0 of the second list is never read.
stat_positions = (
    ("SLpM", 0, 0),
    ("Str. Acc", 0, 1),
    ("SApM", 0, 2),
    ("Str. Def", 0, 3),
    ("TD Avg", 1, 1),
    ("TD Acc.", 1, 2),
    ("TD Def", 1, 3),
    ("Sub. Avg", 1, 4),
)

for link in fighters_data['link']:

    # download the page; the context manager closes the connection even if
    # read() raises
    with uReq(link) as uClient:
        page_html = uClient.read()

    fighter_page = soup(page_html, "html.parser")

    # the <ul> lists that hold the fighter's stats
    containers = fighter_page.findAll("ul", {"class": "b-list__box-list b-list__box-list_margin-top"})

    # look the items of each list up once instead of once per statistic
    stat_items = [
        containers[0].findAll("li", stat_item_attrs),
        containers[1].findAll("li", stat_item_attrs),
    ]

    # each item reads "<label>: <value>"; keep the part after the first colon
    for column, list_index, item_index in stat_positions:
        stat_text = stat_items[list_index][item_index].text.split(':')[1].strip()
        fighter_stats[column].append(stat_text)
    

    
    
# Attach every scraped statistic to fighters_data as a new column, in this order.
for stat_column in (
    "SLpM",
    "Str. Acc",
    "SApM",
    "Str. Def",
    "TD Avg",
    "TD Acc.",
    "TD Def",
    "Sub. Avg",
):
    fighters_data[stat_column] = fighter_stats[stat_column]


# Write the dataframe to a .csv file (will have to be updated regularly).
fighters_data.to_csv('fighters_data.csv', index=False)


# STEP 2 - EVERY EVENT'S GENERAL INFO FROM UFCSTATS.COM

# webpage to scrape from
my_url = "http://ufcstats.com/statistics/events/completed?page=all"

# download the page; the context manager guarantees the connection is closed
# once the body has been read
with uReq(my_url) as uClient:
    page_html = uClient.read()

fight_events = soup(page_html, "html.parser")

# the page is a table with every event's general info (name, date, place and
# link to the event's stats); collect all of its cells in one flat list
containers = fight_events.findAll("td", {"class": "b-statistics__table-col"})

# the first two cells don't contain any relevant information
del containers[:2]

# Accumulator for the events' info: column name -> list of values.
event_columns = ("event name", "event date", "event place", "event link")
events = {column: [] for column in event_columns}


# The cells alternate: even positions carry an event's name, date and link,
# odd positions carry the event's place.
for position, cell in enumerate(containers):

    if position % 2:
        events["event place"].append(cell.text.strip())
        continue

    anchor = cell.a
    events["event name"].append(anchor.text.strip())
    events["event date"].append(cell.span.text.strip())
    events["event link"].append(anchor['href'])

# Dataframe for later use.
events_data = pd.DataFrame(events)

# Write the dataframe to a .csv file (will have to be updated regularly).
events_data.to_csv("events_data.csv", index=False)
# Step 3 - every event's detailed fight information


# Accumulator for all the scraped fight information: column name -> list of values.
# Layout: general fight columns, "Winner" followed by its per-fighter
# statistics (suffix " W"), then "Loser" followed by the same statistics
# (suffix " L").
general_fight_columns = ["fight date", "fight weightclass", "win method", "rounds", "time"]

per_fighter_stats = [
    "KD",
    "TD succeeded",
    "TD attempted",
    "TD %",
    "sub attempts",
    "pass",
    "str landed",
    "str total",
    "sig str landed",
    "sig str total",
    "sig str %",
    "dist str landed",
    "dist str total",
    "clinch str landed",
    "clinch str total",
    "ground str landed",
    "ground str total",
]

fight_columns = (
    general_fight_columns
    + ["Winner"] + [stat + " W" for stat in per_fighter_stats]
    + ["Loser"] + [stat + " L" for stat in per_fighter_stats]
)

fights = {column: [] for column in fight_columns}
    

# Use the links from the events_data dataframe to scrape every fight of every
# event: general info comes from the event page, detailed statistics from the
# fight's own page.

# attribute filter shared by all text-cell lookups below
text_cell_attrs = {"class": "b-fight-details__table-text"}

# In every source read below the two fighters' cells are adjacent: the cell at
# `index` feeds the " W" column and the cell at `index + 1` the " L" column.
sides = ((" W", 0), (" L", 1))

# (column stem, index): stripped text taken from the fight's row on the event page
event_row_stats = (("sub attempts", 6), ("pass", 8))

# (column stem, index): stripped text taken from the first table of the fight page
totals_plain = (("KD", 2),)

# (column stem, index): first table, last character dropped (presumably the "%" sign)
totals_percent = (("sig str %", 6), ("TD %", 12))

# (landed stem, total stem, index): "X of Y" cells of the first table
totals_ratio = (
    ("sig str landed", "sig str total", 4),
    ("str landed", "str total", 8),
    ("TD succeeded", "TD attempted", 10),
)

# (landed stem, total stem, index): "X of Y" cells of the third table
position_ratio = (
    ("dist str landed", "dist str total", 12),
    ("clinch str landed", "clinch str total", 14),
    ("ground str landed", "ground str total", 16),
)

# every detailed column; all of them receive NaN when a fight page has no tables
detail_stems = [stem for stem, _ in event_row_stats + totals_plain + totals_percent]
for landed_stem, total_stem, _ in totals_ratio + position_ratio:
    detail_stems += [landed_stem, total_stem]
detail_columns = [stem + suffix for suffix, _ in sides for stem in detail_stems]


for link in events_data["event link"]:

    # download the event page; the context manager closes the connection even
    # if read() raises
    with uReq(link) as uClient:
        page_html = uClient.read()

    # parsed webpage of the event
    event_page = soup(page_html, "html.parser")

    # one table row per fight of the event (general info)
    containers = event_page.findAll("tr", {"class": "b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click"})

    # nothing to record for an event without fight rows
    if not containers:
        continue

    # the date is the same for every fight of the event, so read it once
    event_date = event_page.find("li", {"class": "b-list__box-list-item"}).text.split(":")[1].strip()

    for container in containers:

        # every <p> text cell of the row; some table cells are split in two
        stats = container.findAll("p", text_cell_attrs)

        # The row starts with the result flag. A 'win' flag is a single
        # element; any other result (e.g. draw / draw) occupies two. Drop the
        # flag so that the remaining indices are the same for every fight.
        if stats[0].i.text == 'win':
            del stats[0]
        else:
            del stats[0:2]

        # general info; "Winner" / "Loser" hold the first / second listed
        # fighter whatever the result flag was
        fights["fight date"].append(event_date)
        fights["Winner"].append(stats[0].a.text.strip())
        fights["Loser"].append(stats[1].a.text.strip())
        fights["fight weightclass"].append(stats[10].text.strip())
        fights["win method"].append(stats[11].text.strip())
        fights["rounds"].append(int(stats[13].text.strip()))
        fights["time"].append(stats[14].text.strip())

        # download and parse the page with the detailed fight statistics
        with uReq(container.a["href"]) as uClient_fight:
            fight_page_html = uClient_fight.read()
        fight_page = soup(fight_page_html, "html.parser")

        # there are 4 stats tables on the page; the first (index 0) and the
        # third (index 2) are used. When no detailed info exists for the
        # fight the list is empty.
        fight_tables = fight_page.findAll("tbody", {"class": "b-fight-details__table-body"})

        if fight_tables:

            # look each table's cells up once instead of once per statistic
            totals_cells = fight_tables[0].findAll("p", text_cell_attrs)
            position_cells = fight_tables[2].findAll("p", text_cell_attrs)

            for suffix, offset in sides:

                for stem, index in event_row_stats:
                    fights[stem + suffix].append(stats[index + offset].text.strip())

                for stem, index in totals_plain:
                    fights[stem + suffix].append(totals_cells[index + offset].text.strip())

                for stem, index in totals_percent:
                    fights[stem + suffix].append(totals_cells[index + offset].text.strip()[:-1])

                # "X of Y" cells: the two pieces are stored exactly as split
                # (not stripped again), so the saved values are unchanged
                for cells, specs in ((totals_cells, totals_ratio), (position_cells, position_ratio)):
                    for landed_stem, total_stem, index in specs:
                        pieces = cells[index + offset].text.strip().split("of")
                        fights[landed_stem + suffix].append(pieces[0])
                        fights[total_stem + suffix].append(pieces[1])

        else:

            # no tables on the page: keep every column the same length by
            # recording NaN for all detailed statistics
            for column in detail_columns:
                fights[column].append(float("NaN"))
            

        
# Build the dataframe from the scraped dictionary.
fights_data = pd.DataFrame(fights)

# Write the data out as CSV text.
# NOTE(review): the target name has no ".csv" extension, unlike
# 'fighters_data.csv' and "events_data.csv" - confirm whether that is intended
# before renaming, since other code may read this exact path.
fights_data.to_csv("fights_data", index=False)

Wall time: 2h 15min 28s
