In [None]:
# enable saving scraper results to drive
# THIS NOTEBOOK ASSUMES THAT YOU HAVE "Colab Notebooks/UFCScraper/bout_htmls" AS GOOGLE DRIVE DIRECTORIES
# MAKE SURE TO CREATE THIS DIRECTORY STRUCTURE FIRST!!!
from google.colab import drive
drive.mount("/content/drive")
# run pip installations into colab environment
!pip install MechanicalSoup
# import required modules here
import mechanicalsoup
from bs4 import BeautifulSoup
import json
import pandas as pd

In [None]:
# 3 MINUTES TO RUN
# download the ufcstats index page that lists every completed card event
browser = mechanicalsoup.Browser()
url = "http://ufcstats.com/statistics/events/completed?page=all"
page = browser.get(url)
main_html = page.soup
# collect the link of every listed card event (1994 up to the latest completed card);
# table rows that carry no anchor are skipped
table_rows = main_html.find_all("tr", "b-statistics__table-row")
card_event_urls = []
for table_row in table_rows:
  event_anchor = table_row.find("a")
  if event_anchor is not None:
    card_event_urls.append(event_anchor["href"])
# fetch each card event page and keep its prettified html as text
card_event_htmls = []
for card_event_url in card_event_urls:
  card_event_htmls.append(browser.get(card_event_url).soup.prettify())
# persist the list of pages to Google Drive as a single json file
json_name = "/content/drive/My Drive/Colab Notebooks/UFCScraper/bout_htmls/card_event_htmls.json"
with open(json_name, "w") as f:
  json.dump(card_event_htmls, f)

In [None]:
# 60 MINUTES TO RUN
# visit every card event page and download the page of each bout listed on it
bout_urls, bout_htmls, index = [], [], 0
bout_row_class = "b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click"
json_prefix = "/content/drive/My Drive/Colab Notebooks/UFCScraper/bout_htmls/bout_htmls"
for card_event_html in card_event_htmls:
  card_event_html = BeautifulSoup(card_event_html, "html.parser")
  table_rows = card_event_html.find_all("tr", bout_row_class)
  # each bout row carries the bout page address in its data-link attribute
  bout_urls = [table_row["data-link"] for table_row in table_rows]
  bout_htmls = [browser.get(bout_url).soup.prettify() for bout_url in bout_urls]
  # write one json file per card event so that only a single event's pages
  # are held in System RAM at any time
  json_name = json_prefix + str(index) + ".json"
  with open(json_name, "w") as f:
    json.dump(bout_htmls, f)
  index += 1
  bout_htmls = []

In [None]:
# 12 MINUTES TO RUN
import traceback
# column order of the exported statistics table (one row per bout)
ufcstats = ['card_name', 'red_name', 'blue_name', 'winner_name', 'loser_name', 'winner_color', 'loser_color',
            'match_result', 'num_rounds', 'time', 'method', 'time_format', 'referee_name', 'division', 'sig_str_total_red',
            'sig_str_landed_red', 'sig_str_total_blue', 'sig_str_landed_blue', 'total_str_total_red', 'total_str_landed_red',
            'total_str_total_blue', 'total_str_landed_blue', 'td_total_red', 'td_landed_red', 'td_total_blue', 'td_landed_blue',
            'sub_att_red', 'sub_att_blue', 'rev_red', 'rev_blue', 'ctrl_red', 'ctrl_blue', 'head_str_total_red', 'head_str_landed_red',
            'head_str_total_blue', 'head_str_landed_blue', 'body_str_total_red', 'body_str_landed_red', 'body_str_total_blue',
            'body_str_landed_blue', 'leg_str_total_red', 'leg_str_landed_red', 'leg_str_total_blue', 'leg_str_landed_blue',
            'distance_str_total_red', 'distance_str_landed_red', 'distance_str_total_blue', 'distance_str_landed_blue',
            'clinch_str_total_red', 'clinch_str_landed_red', 'clinch_str_total_blue', 'clinch_str_landed_blue',
            'ground_str_total_red', 'ground_str_landed_red', 'ground_str_total_blue', 'ground_str_landed_blue']

# (column prefix, cell index) of the totals-row cells whose text reads "<landed> of <total>"
_TOTALS_LANDED_OF_TOTAL = [("sig_str", 2), ("total_str", 4), ("td", 5)]
# (column prefix, cell index) of the totals-row cells holding one plain value per fighter
# (the knockdown cell at index 1 is not part of the exported columns, so it is not read)
_TOTALS_SINGLE_VALUE = [("sub_att", 7), ("rev", 8), ("ctrl", 9)]
# (column prefix, cell index) of the significant-strikes cells, all "<landed> of <total>"
_SIG_STRIKES_LANDED_OF_TOTAL = [("head_str", 3), ("body_str", 4), ("leg_str", 5),
                                ("distance_str", 6), ("clinch_str", 7), ("ground_str", 8)]


def _corner_texts(cell):
  """Return the stripped texts of the first two per-fighter <p> entries of a stats cell.

  The first entry belongs to the fighter listed first ("top") in the page
  header, the second to the other ("bot") fighter.
  """
  entries = cell.find_all("p", "b-fight-details__table-text")
  return entries[0].text.strip(), entries[1].text.strip()


def _parse_bout(bout_html):
  """Extract one row of statistics from a parsed bout page.

  Args:
    bout_html: BeautifulSoup document of a single bout page.

  Returns:
    A dict with one entry per name in `ufcstats`, or None when the page has
    no left chart-name element (such pages are not exported).

  Raises:
    ValueError: when the fighter listed first in the header matches neither
      chart name, so the statistics cannot be assigned to a corner.
    IndexError, AttributeError: when an expected page element is missing.
  """
  left_chart_name = bout_html.find("i", "b-fight-details__charts-name b-fight-details__charts-name_pos_left js-chart-name")
  if left_chart_name is None:
    return None
  # the left chart name is treated as the red fighter, the right one as blue
  red_name = left_chart_name.text.strip()
  blue_name = bout_html.find("i", "b-fight-details__charts-name b-fight-details__charts-name_pos_right js-chart-name").text.strip()

  # info from the top of the page
  person_links = bout_html.find_all("a", "b-link b-fight-details__person-link")
  top_name = person_links[0].text.strip()
  bot_name = person_links[1].text.strip()
  card_name = bout_html.find("a", "b-link").text.strip()
  top_result = bout_html.select("i[class^='b-fight-details__person-status b-fight-details__person-status']")[0].text.strip()

  winner_name = loser_name = "None"
  winner_color = loser_color = "none"
  if top_result in ("W", "L"):
    match_result = "W/L"
    if top_result == "W":
      winner_name, loser_name = top_name, bot_name
    else:
      winner_name, loser_name = bot_name, top_name
    if winner_name == red_name:
      winner_color, loser_color = "red", "blue"
    else:
      winner_color, loser_color = "blue", "red"
  else:  # no winner: keep the status text as shown on the page (e.g. NC or D)
    match_result = top_result

  # map the header order (top / bot) onto the corner colors; without a match
  # the statistics cannot be attributed, so the bout is rejected instead of
  # silently inheriting values from a previously parsed bout
  if top_name == red_name:
    top_color, bot_color = "red", "blue"
  elif top_name == blue_name:
    top_color, bot_color = "blue", "red"
  else:
    raise ValueError("header fighter " + repr(top_name) + " matches neither chart name ("
                     + repr(red_name) + ", " + repr(blue_name) + ")")

  summary_html = bout_html.find("div", "b-fight-details__content")
  text_items = summary_html.find_all("i", "b-fight-details__text-item")
  row = {
    'card_name': card_name,
    'red_name': red_name,
    'blue_name': blue_name,
    'winner_name': winner_name,
    'loser_name': loser_name,
    'winner_color': winner_color,
    'loser_color': loser_color,
    'match_result': match_result,
    'num_rounds': text_items[0].text.split()[1],
    'time': text_items[1].text.split()[1],
    'method': summary_html.find("i", style="font-style: normal").text.strip(),
    'time_format': text_items[2].text.split()[2],
    'referee_name': text_items[3].find("span").text.strip(),
    # the fight title minus its last word
    'division': " ".join(bout_html.find("i", "b-fight-details__fight-title").text.split()[0:-1]),
  }

  # cells of the totals summary and of the significant strikes summary
  totals_cells = bout_html.find_all("section", "b-fight-details__section js-fight-section")[1].find_all("td", "b-fight-details__table-col")
  sig_strikes_cells = bout_html.find_all("table", style="width: 745px")[1].find_all("td", "b-fight-details__table-col")

  layouts = ((totals_cells, _TOTALS_LANDED_OF_TOTAL), (sig_strikes_cells, _SIG_STRIKES_LANDED_OF_TOTAL))
  for cells, layout in layouts:
    for prefix, cell_index in layout:
      top_text, bot_text = _corner_texts(cells[cell_index])
      for color, text in ((top_color, top_text), (bot_color, bot_text)):
        parts = text.split()  # ["<landed>", "of", "<total>"]
        row[prefix + "_landed_" + color] = parts[0]
        row[prefix + "_total_" + color] = parts[2]

  for prefix, cell_index in _TOTALS_SINGLE_VALUE:
    top_text, bot_text = _corner_texts(totals_cells[cell_index])
    row[prefix + "_" + top_color] = top_text
    row[prefix + "_" + bot_color] = bot_text

  return row


# statistics extraction from the bout html files written by the scraping cell.
# Rows are collected in a list and the dataframe is built once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row is
# quadratic in the number of bouts.
rows = []
skipped = 0
index = 0
while True:
  json_name = "/content/drive/My Drive/Colab Notebooks/UFCScraper/bout_htmls/bout_htmls" + str(index) + ".json"
  try:
    with open(json_name) as f:
      bout_htmls = json.load(f)
  except FileNotFoundError:
    # files are numbered consecutively from 0, so the first missing one ends the run
    break
  except json.JSONDecodeError:
    # e.g. a file left incomplete by an interrupted scrape: report it and move on
    print("skipping unreadable file: " + json_name)
    traceback.print_exc()
    bout_htmls = []
  for position, html in enumerate(bout_htmls):
    try:
      new_row = _parse_bout(BeautifulSoup(html, 'html.parser'))
    except Exception:
      # one malformed page must not discard the remaining bouts; report and skip it
      skipped += 1
      print("skipping bout " + str(position) + " of " + json_name)
      traceback.print_exc()
      continue
    if new_row is not None:
      rows.append(new_row)
  index = index + 1

# local pandas dataframe that stores the statistics per bout
df = pd.DataFrame(rows, columns=ufcstats)
print("parsed " + str(len(rows)) + " bouts from " + str(index) + " files; skipped " + str(skipped))

In [None]:
# download the index page of completed events again to read each card's date
dfn = df.copy()
browser = mechanicalsoup.Browser()
url = "http://ufcstats.com/statistics/events/completed?page=all"
page = browser.get(url)
main_html = page.soup
# start with a blank card_date column placed in front of the copied statistics
dfn.insert(0, 'card_date', [""] * len(dfn))
table_rows = main_html.find_all("tr", "b-statistics__table-row")
for table_row in table_rows:
  event_anchor = table_row.find("a")
  if event_anchor is None:
    # rows without a link do not describe a card event
    continue
  card_name = event_anchor.text.strip()
  card_date = table_row.find("span").text.strip()
  # stamp the date on every bout that belongs to this card
  dfn.loc[dfn['card_name'] == card_name, 'card_date'] = card_date

In [None]:
# write the dated bout statistics to Google Drive as csv, leaving out the row index
path = "/content/drive/My Drive/Colab Notebooks/UFCScraper/bout_statistics.csv"
dfn.to_csv(
  path,
  index=False,
)